From 36fe4fced0e5b8c7c42da7336157671c92367cce Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:32:52 +0000 Subject: [PATCH 001/200] elf: os: deprioritize .ident strategy due to potential for FPs --- capa/features/extractors/elf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index 74e91117a..8574b533c 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -1045,6 +1045,11 @@ def detect_elf_os(f) -> str: elif symtab_guess: ret = symtab_guess + elif ident_guess: + # at the bottom because we don't trust this too much + # due to potential for bugs with cross-compilation. + ret = ident_guess + return ret.value if ret is not None else "unknown" From 044e4d62fc65373c42696e0fcc179c72e600a954 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:33:15 +0000 Subject: [PATCH 002/200] elf: os: same as parent, fix .ident FP --- capa/features/extractors/elf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index 8574b533c..cc1810fc0 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -1023,10 +1023,6 @@ def detect_elf_os(f) -> str: if osabi_guess: ret = osabi_guess - elif ident_guess: - # we don't trust this too much due to non-cross-compilation assumptions - ret = ident_guess - elif ph_notes_guess: ret = ph_notes_guess From ad732fc352d0a42a8ec469885d1e05fe92ca1d0e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:33:37 +0000 Subject: [PATCH 003/200] elf: os: detect Android via clang compiler .ident note --- capa/features/extractors/elf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index cc1810fc0..a0329411a 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -866,6 +866,8 @@ def 
guess_os_from_ident_directive(elf: ELF) -> Optional[OS]: return OS.LINUX elif "Red Hat" in comment: return OS.LINUX + elif "Android" in comment: + return OS.ANDROID return None From 270956b67274478265f82ec3ef493fa5b822e4e0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:33:58 +0000 Subject: [PATCH 004/200] elf: os: detect Android via dependency on liblog.so --- capa/features/extractors/elf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index a0329411a..b969463df 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -923,6 +923,8 @@ def guess_os_from_needed_dependencies(elf: ELF) -> Optional[OS]: return OS.HURD if needed.startswith("libandroid.so"): return OS.ANDROID + if needed.startswith("liblog.so"): + return OS.ANDROID return None From e064ce81bb52e55d9b152989132ac800ad729e2e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:48:23 +0000 Subject: [PATCH 005/200] main: split main into a bunch of "main routines" [wip] since there are a few references to BinExport2 that are in progress elsewhre. Next commit will remove them. 
--- capa/features/extractors/common.py | 4 +- capa/main.py | 632 ++++++++++++++++++++--------- 2 files changed, 433 insertions(+), 203 deletions(-) diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index b7bb3c399..bf5a3e7b4 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -45,7 +45,7 @@ MATCH_JSON_OBJECT = b'{"' -def extract_file_strings(buf, **kwargs) -> Iterator[Tuple[String, Address]]: +def extract_file_strings(buf: bytes, **kwargs) -> Iterator[Tuple[String, Address]]: """ extract ASCII and UTF-16 LE strings from file """ @@ -56,7 +56,7 @@ def extract_file_strings(buf, **kwargs) -> Iterator[Tuple[String, Address]]: yield String(s.s), FileOffsetAddress(s.offset) -def extract_format(buf) -> Iterator[Tuple[Feature, Address]]: +def extract_format(buf: bytes) -> Iterator[Tuple[Feature, Address]]: if buf.startswith(MATCH_PE): yield Format(FORMAT_PE), NO_ADDRESS elif buf.startswith(MATCH_ELF): diff --git a/capa/main.py b/capa/main.py index e5ee92a2a..6b0899821 100644 --- a/capa/main.py +++ b/capa/main.py @@ -73,6 +73,7 @@ OS_MACOS, FORMAT_PE, FORMAT_ELF, + FORMAT_BINEXPORT2, OS_WINDOWS, FORMAT_AUTO, FORMAT_CAPE, @@ -93,10 +94,14 @@ RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" + BACKEND_VIV = "vivisect" BACKEND_DOTNET = "dotnet" BACKEND_BINJA = "binja" BACKEND_PEFILE = "pefile" +BACKEND_CAPE = "cape" +BACKEND_BINEXPORT2 = "binexport2" +BACKEND_DEFAULT = "(default) use default backend for given file type" E_MISSING_RULES = 10 E_MISSING_FILE = 11 @@ -225,7 +230,7 @@ def get_default_signatures() -> List[Path]: return ret -def get_workspace(path: Path, format_: str, sigpaths: List[Path]): +def get_workspace(path: Path, input_format: str, sigpaths: List[Path]): """ load the program at the given path into a vivisect workspace using the given format. also apply the given FLIRT signatures. 
@@ -246,21 +251,21 @@ def get_workspace(path: Path, format_: str, sigpaths: List[Path]): import viv_utils.flirt logger.debug("generating vivisect workspace for: %s", path) - if format_ == FORMAT_AUTO: + if input_format == FORMAT_AUTO: if not is_supported_format(path): raise UnsupportedFormatError() # don't analyze, so that we can add our Flirt function analyzer first. vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) - elif format_ in {FORMAT_PE, FORMAT_ELF}: + elif input_format in {FORMAT_PE, FORMAT_ELF}: vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) - elif format_ == FORMAT_SC32: + elif input_format == FORMAT_SC32: # these are not analyzed nor saved. vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="i386", analyze=False) - elif format_ == FORMAT_SC64: + elif input_format == FORMAT_SC64: vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False) else: - raise ValueError("unexpected format: " + format_) + raise ValueError("unexpected format: " + input_format) viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths]) @@ -271,13 +276,14 @@ def get_workspace(path: Path, format_: str, sigpaths: List[Path]): def get_extractor( - path: Path, - format_: str, + input_path: Path, + input_format: str, os_: str, backend: str, sigpaths: List[Path], should_save_workspace=False, disable_progress=False, + sample_path: Optional[Path]=None, ) -> FeatureExtractor: """ raises: @@ -285,30 +291,22 @@ def get_extractor( UnsupportedArchError UnsupportedOSError """ - - if format_ not in (FORMAT_SC32, FORMAT_SC64, FORMAT_CAPE): - if not is_supported_format(path): - raise UnsupportedFormatError() - - if not is_supported_arch(path): - raise UnsupportedArchError() - - if os_ == OS_AUTO and not is_supported_os(path): - raise UnsupportedOSError() - - if format_ == FORMAT_CAPE: + if backend == BACKEND_CAPE: import capa.features.extractors.cape.extractor - report = 
json.load(Path(path).open(encoding="utf-8")) + report = json.load(Path(input_path).open(encoding="utf-8")) return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report) - elif format_ == FORMAT_DOTNET: + elif backend == BACKEND_DOTNET: import capa.features.extractors.dnfile.extractor - return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path) + if input_format not in (FORMAT_PE, FORMAT_DOTNET): + raise UnsupportedFormatError() + + return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path) elif backend == BACKEND_BINJA: - from capa.features.extractors.binja.find_binja_api import find_binja_path + from capa.features.extractors.binaryninja.find_binja_api import find_binja_path # When we are running as a standalone executable, we cannot directly import binaryninja # We need to fist find the binja API installation path and add it into sys.path @@ -326,25 +324,45 @@ def get_extractor( + "https://docs.binary.ninja/dev/batch.html#install-the-api)." 
) - import capa.features.extractors.binja.extractor + import capa.features.extractors.binaryninja.extractor + + if input_format not in (FORMAT_SC32, FORMAT_SC64): + if not is_supported_format(input_path): + raise UnsupportedFormatError() + + if not is_supported_arch(input_path): + raise UnsupportedArchError() + + if os_ == OS_AUTO and not is_supported_os(input_path): + raise UnsupportedOSError() with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): - bv: BinaryView = binaryninja.load(str(path)) + bv: BinaryView = binaryninja.load(str(input_path)) if bv is None: - raise RuntimeError(f"Binary Ninja cannot open file {path}") + raise RuntimeError(f"Binary Ninja cannot open file {input_path}") - return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) + return capa.features.extractors.binaryninja.extractor.BinjaFeatureExtractor(bv) elif backend == BACKEND_PEFILE: import capa.features.extractors.pefile - return capa.features.extractors.pefile.PefileFeatureExtractor(path) + return capa.features.extractors.pefile.PefileFeatureExtractor(input_path) elif backend == BACKEND_VIV: import capa.features.extractors.viv.extractor + if input_format not in (FORMAT_SC32, FORMAT_SC64): + if not is_supported_format(input_path): + raise UnsupportedFormatError() + + if not is_supported_arch(input_path): + raise UnsupportedArchError() + + if os_ == OS_AUTO and not is_supported_os(input_path): + raise UnsupportedOSError() + with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): - vw = get_workspace(path, format_, sigpaths) + vw = get_workspace(input_path, input_format, sigpaths) if should_save_workspace: logger.debug("saving workspace") @@ -356,29 +374,46 @@ def get_extractor( else: logger.debug("CAPA_SAVE_WORKSPACE unset, not saving workspace") - return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, path, os_) + return 
capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) + + elif backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 + import capa.features.extractors.binexport2.extractor + + be2 = capa.features.extractors.binexport2.get_binexport2(input_path) + assert sample_path is not None + # we let BinExport support a wide array of Arch/OS/etc. + # it can be an intermediate representation for us. + # therefore, don't restrict format/arch/OS. + buf = sample_path.read_bytes() + + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) else: raise ValueError("unexpected backend: " + backend) -def get_file_extractors(sample: Path, format_: str) -> List[FeatureExtractor]: +def get_file_extractors(input: Path, input_format: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = [] - if format_ == FORMAT_PE: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample)) + if input_format == FORMAT_PE: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input)) - elif format_ == FORMAT_DOTNET: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample)) - file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(sample)) + elif input_format == FORMAT_DOTNET: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input)) + file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input)) - elif format_ == capa.features.common.FORMAT_ELF: - file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample)) + elif input_format == FORMAT_ELF: + file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input)) - elif format_ == FORMAT_CAPE: - report = json.load(Path(sample).open(encoding="utf-8")) + elif input_format == FORMAT_CAPE: + report = json.load(Path(input).open(encoding="utf-8")) 
file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) + elif input_format == FORMAT_BINEXPORT2: + # TODO(wb): 1755 + pass + return file_extractors @@ -548,7 +583,7 @@ def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): def collect_metadata( argv: List[str], sample_path: Path, - format_: str, + input_format: str, os_: str, rules_path: List[Path], extractor: FeatureExtractor, @@ -564,7 +599,7 @@ def collect_metadata( extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)] extractor_os = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)] - format_ = str(extractor_format[0]) if extractor_format else "unknown" if format_ == FORMAT_AUTO else format_ + input_format = str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ @@ -588,7 +623,7 @@ def collect_metadata( path=Path(sample_path).resolve().as_posix(), ), analysis=get_sample_analysis( - format_, + input_format, arch, os_, extractor, @@ -731,11 +766,32 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: raise ValueError("extractor must be either a static or dynamic extracotr") +def simple_message_exception_handler(exctype, value: BaseException, traceback: TracebackType): + """ + prints friendly message on unexpected exceptions to regular users (debug mode shows regular stack trace) + + args: + # TODO(aaronatp): Once capa drops support for Python 3.8, move the exctype type annotation to + # the function parameters and remove the "# type: ignore[assignment]" from the relevant place + # in the main function, see (https://github.com/mandiant/capa/issues/1896) + exctype (type[BaseException]): exception class + """ + + if exctype is KeyboardInterrupt: + 
print("KeyboardInterrupt detected, program terminated") + else: + print( + f"Unexpected exception raised: {exctype}. Please run capa in debug mode (-d/--debug) " + + "to see the stack trace. Please also report your issue on the capa GitHub page so we " + + "can improve the code! (https://github.com/mandiant/capa/issues)" + ) + + def install_common_args(parser, wanted=None): """ register a common set of command line arguments for re-use by main & scripts. these are things like logging/coloring/etc. - also enable callers to opt-in to common arguments, like specifying the input sample. + also enable callers to opt-in to common arguments, like specifying the input file. this routine lets many script use the same language for cli arguments. see `handle_common_args` to do common configuration. @@ -743,7 +799,7 @@ def install_common_args(parser, wanted=None): args: parser (argparse.ArgumentParser): a parser to update in place, adding common arguments. wanted (Set[str]): collection of arguments to opt-into, including: - - "sample": required positional argument to input file. + - "input": required positional argument to input file. - "format": flag to override file format. - "os": flag to override file operating system. - "backend": flag to override analysis backend. 
@@ -777,18 +833,18 @@ def install_common_args(parser, wanted=None): # # arguments that may be opted into: # - # - sample + # - input # - format # - os # - rules # - tag # - if "sample" in wanted: + if "input" in wanted: parser.add_argument( - "sample", + "input", type=str, - help="path to sample to analyze", + help="path to file to analyze", ) if "format" in wanted: @@ -808,7 +864,7 @@ def install_common_args(parser, wanted=None): "--format", choices=[f[0] for f in formats], default=FORMAT_AUTO, - help=f"select sample format, {format_help}", + help=f"select input format, {format_help}", ) if "backend" in wanted: @@ -817,8 +873,8 @@ def install_common_args(parser, wanted=None): "--backend", type=str, help="select the backend to use", - choices=(BACKEND_VIV, BACKEND_BINJA, BACKEND_PEFILE), - default=BACKEND_VIV, + choices=(BACKEND_VIV, BACKEND_BINJA, BACKEND_PEFILE, BACKEND_CAPE, BACKEND_BINEXPORT2), + default=BACKEND_DEFAULT, ) if "os" in wanted: @@ -859,6 +915,32 @@ def install_common_args(parser, wanted=None): parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values") +############################################################################### +# +# "main routines" +# +# All of the following routines are considered "main routines". +# That is, they rely upon the given CLI arguments and write to output streams. +# We prefer to keep as much logic away from input/output as possible; +# however, capa does handle many combinations of flags/switches/overrides, +# so these routines deal with that logic. +# +# Other scripts may use this routines, but should also prefer to invoke them +# directly within `main()`, not within library code. +# +# These main routines may raise `ShouldExitError` to indicate the program +# ...should exit. Its a tiny step away from doing `sys.exit()` directly. +# I'm not sure if we should just do that. In the meantime, programs should +# handle `ShoudlExitError` and pass the status code to `sys.exit()`. 
+# + + +class ShouldExitError(Exception): + """raised when a main-related routine indicates the program should exit.""" + def __init__(self, status_code: int): + self.status_code = status_code + + def handle_common_args(args): """ handle the global config specified by `install_common_args`, @@ -871,7 +953,10 @@ def handle_common_args(args): - is_default_rules: if the default rules were used. args: - args (argparse.Namespace): parsed arguments that included at least `install_common_args` args. + args: The parsed command line arguments from `install_common_args`. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. """ if args.quiet: logging.basicConfig(level=logging.WARNING) @@ -914,8 +999,11 @@ def handle_common_args(args): else: raise RuntimeError("unexpected --color value: " + args.color) - if hasattr(args, "sample"): - args.sample = Path(args.sample) + if not args.debug: + sys.excepthook = simple_message_exception_handler # type: ignore[assignment] + + if hasattr(args, "input"): + args.input = Path(args.input) if hasattr(args, "rules"): rules_paths: List[Path] = [] @@ -937,7 +1025,7 @@ def handle_common_args(args): # so in this case, we require the user to use -r to specify the rule directory. logger.error("default embedded rules not found! (maybe you installed capa as a library?)") logger.error("provide your own rule set via the `-r` option.") - return E_MISSING_RULES + raise ShouldExitError(E_MISSING_RULES) rules_paths.append(default_rule_path) args.is_default_rules = True @@ -978,94 +1066,134 @@ def handle_common_args(args): args.signatures = sigs_path -def simple_message_exception_handler(exctype, value: BaseException, traceback: TracebackType): +def ensure_input_exists_from_args(args): """ - prints friendly message on unexpected exceptions to regular users (debug mode shows regular stack trace) + args: + args: The parsed command line arguments from `install_common_args`. 
+ + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. + """ + try: + _ = get_file_taste(args.input) + except IOError as e: + # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we + # handle the IOError separately and reach into the args + logger.error("%s", e.args[0]) + raise ShouldExitError(E_MISSING_FILE) from e + + +def get_input_format_from_args(args) -> str: + """ + Determine the format of the input file. + + Note: this may not be the same as the format of the sample. + Cape, Freeze, etc. formats describe a sample without being the sample itself. args: - # TODO(aaronatp): Once capa drops support for Python 3.8, move the exctype type annotation to - # the function parameters and remove the "# type: ignore[assignment]" from the relevant place - # in the main function, see (https://github.com/mandiant/capa/issues/1896) - exctype (type[BaseException]): exception class + args: The parsed command line arguments from `install_common_args`. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. """ + format = args.format + + if format != FORMAT_AUTO: + return format + + try: + return get_auto_format(args.input) + except PEFormatError as e: + logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) + raise ShouldExitError(E_CORRUPT_FILE) from e + except UnsupportedFormatError as e: + log_unsupported_format_error() + raise ShouldExitError(E_INVALID_FILE_TYPE) from e + + +def get_backend_from_args(args, input_format: str) -> str: + """ + Determine the backend that should be used for the given input file. + Respects an override provided by the user, otherwise, use a good default. + + args: + args: The parsed command line arguments from `install_common_args`. + input_format: The file format of the input file. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. 
+ """ + if args.backend != BACKEND_DEFAULT: + return args.backend + + if input_format == FORMAT_CAPE: + return BACKEND_CAPE + + elif input_format == FORMAT_BINEXPORT2: + return BACKEND_BINEXPORT2 + + elif input_format == FORMAT_DOTNET: + return BACKEND_DOTNET - if exctype is KeyboardInterrupt: - print("KeyboardInterrupt detected, program terminated") else: - print( - f"Unexpected exception raised: {exctype}. Please run capa in debug mode (-d/--debug) " - + "to see the stack trace. Please also report your issue on the capa GitHub page so we " - + "can improve the code! (https://github.com/mandiant/capa/issues)" - ) + return BACKEND_VIV -def main(argv: Optional[List[str]] = None): - if sys.version_info < (3, 8): - raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+") +def get_sample_path_from_args(args, backend: str) -> Optional[Path]: + """ + Determine the path to the underlying sample, if it exists. - if argv is None: - argv = sys.argv[1:] + Note: this may not be the same as the input file. + Cape, Freeze, etc. formats describe a sample without being the sample itself. - desc = "The FLARE team's open-source tool to identify capabilities in executable files." - epilog = textwrap.dedent( - """ - By default, capa uses a default set of embedded rules. - You can see the rule set here: - https://github.com/mandiant/capa-rules + args: + args: The parsed command line arguments from `install_common_args`. + backend: The backend that will handle the input file. - To provide your own rule set, use the `-r` flag: - capa --rules /path/to/rules suspicious.exe - capa -r /path/to/rules suspicious.exe + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. 
+ """ + if backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 - examples: - identify capabilities in a binary - capa suspicious.exe + be2 = capa.features.extractors.binexport2.get_binexport2(args.input) + return capa.features.extractors.binexport2.get_sample_from_binexport2(be2) + elif backend == BACKEND_CAPE: + return None + else: + return args.input - identify capabilities in 32-bit shellcode, see `-f` for all supported formats - capa -f sc32 shellcode.bin - report match locations - capa -v suspicious.exe +def get_os_from_args(args, backend) -> str: + """ + Determine the OS for the given sample. + Respects an override provided by the user, otherwise, use heuristics and + algorithms to detect the OS. - report all feature match details - capa -vv suspicious.exe + args: + args: The parsed command line arguments from `install_common_args`. + backend: The backend that will handle the input file. - filter rules by meta fields, e.g. rule name or namespace - capa -t "create TCP socket" suspicious.exe - """ - ) + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. 
+ """ + if args.os: + return args.os - parser = argparse.ArgumentParser( - description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter - ) - install_common_args(parser, {"sample", "format", "backend", "os", "signatures", "rules", "tag"}) - parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") - args = parser.parse_args(args=argv) - if not args.debug: - sys.excepthook = simple_message_exception_handler # type: ignore[assignment] - ret = handle_common_args(args) - if ret is not None and ret != 0: - return ret + sample_path = get_sample_path_from_args(args, backend) + if sample_path is None: + return "unknown" + return get_os(sample_path) - try: - _ = get_file_taste(args.sample) - except IOError as e: - # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we - # handle the IOError separately and reach into the args - logger.error("%s", e.args[0]) - return E_MISSING_FILE - format_ = args.format - if format_ == FORMAT_AUTO: - try: - format_ = get_auto_format(args.sample) - except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) - return E_CORRUPT_FILE - except UnsupportedFormatError: - log_unsupported_format_error() - return E_INVALID_FILE_TYPE +def get_rules_from_args(args) -> str: + """ + args: + args: The parsed command line arguments from `install_common_args`. + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. + """ try: if is_running_standalone() and args.is_default_rules: cache_dir = get_default_root() / "cache" @@ -1073,20 +1201,6 @@ def main(argv: Optional[List[str]] = None): cache_dir = capa.rules.cache.get_default_cache_directory() rules = get_rules(args.rules, cache_dir=cache_dir) - - logger.debug( - "successfully loaded %s rules", - # during the load of the RuleSet, we extract subscope statements into their own rules - # that are subsequently `match`ed upon. 
this inflates the total rule count. - # so, filter out the subscope rules when reporting total number of loaded rules. - len(list(filter(lambda r: not (r.is_subscope_rule()), rules.rules.values()))), - ) - if args.tag: - rules = rules.filter_rules_by_meta(args.tag) - logger.debug("selected %d rules", len(rules)) - for i, r in enumerate(rules.rules, 1): - logger.debug(" %d. %s", i, r) - except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) logger.error( @@ -1101,8 +1215,34 @@ def main(argv: Optional[List[str]] = None): "Or, for more details, see the rule set documentation here: %s", "https://github.com/mandiant/capa/blob/master/doc/rules.md", ) - return E_INVALID_RULE + raise ShouldExitError(E_INVALID_RULE) from e + + logger.debug( + "successfully loaded %s rules", + # during the load of the RuleSet, we extract subscope statements into their own rules + # that are subsequently `match`ed upon. this inflates the total rule count. + # so, filter out the subscope rules when reporting total number of loaded rules. + len(list(filter(lambda r: not (r.is_subscope_rule()), rules.rules.values()))), + ) + + if args.tag: + rules = rules.filter_rules_by_meta(args.tag) + logger.debug("selected %d rules", len(rules)) + for i, r in enumerate(rules.rules, 1): + logger.debug(" %d. %s", i, r) + + return rules + +def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtractor]: + """ + args: + args: The parsed command line arguments from `install_common_args`. + input_format: The file format of the input file. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. + """ # file feature extractors are pretty lightweight: they don't do any code analysis. 
# so we can fairly quickly determine if the given file has "pure" file-scope rules # that indicate a limitation (like "file is packed based on section names") @@ -1111,27 +1251,36 @@ def main(argv: Optional[List[str]] = None): # this pass can inspect multiple file extractors, e.g., dotnet and pe to identify # various limitations try: - file_extractors = get_file_extractors(args.sample, format_) + return get_file_extractors(args.input, input_format) except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) + logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) return E_CORRUPT_FILE except (ELFError, OverflowError) as e: - logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) + logger.error("Input file '%s' is not a valid ELF file: %s", args.input, str(e)) return E_CORRUPT_FILE except UnsupportedFormatError as e: - if format_ == FORMAT_CAPE: + if input_format == FORMAT_CAPE: log_unsupported_cape_report_error(str(e)) else: log_unsupported_format_error() - return E_INVALID_FILE_TYPE + raise ShouldExitError(E_INVALID_FILE_TYPE) from e except EmptyReportError as e: - if format_ == FORMAT_CAPE: + if input_format == FORMAT_CAPE: log_empty_cape_report_error(str(e)) - return E_EMPTY_REPORT + raise ShouldExitError(E_EMPTY_REPORT) from e else: log_unsupported_format_error() - return E_INVALID_FILE_TYPE + raise ShouldExitError(E_INVALID_FILE_TYPE) from e + +def find_file_limitations_from_args(args, rules: RuleSet, file_extractors: List[FeatureExtractor]) -> bool: + """ + args: + args: The parsed command line arguments from `install_common_args`. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. 
+ """ found_file_limitation = False for file_extractor in file_extractors: if isinstance(file_extractor, DynamicFeatureExtractor): @@ -1141,11 +1290,11 @@ def main(argv: Optional[List[str]] = None): try: pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {}) except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) - return E_CORRUPT_FILE + logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) + raise ShouldExitError(E_CORRUPT_FILE) from e except (ELFError, OverflowError) as e: - logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) - return E_CORRUPT_FILE + logger.error("Input file '%s' is not a valid ELF file: %s", args.input, str(e)) + raise ShouldExitError(E_CORRUPT_FILE) from e # file limitations that rely on non-file scope won't be detected here. # nor on FunctionName features, because pefile doesn't support this. @@ -1155,70 +1304,150 @@ def main(argv: Optional[List[str]] = None): # do show the output in verbose mode, though. if not (args.verbose or args.vverbose or args.json): logger.debug("file limitation short circuit, won't analyze fully.") - return E_FILE_LIMITATION + raise ShouldExitError(E_FILE_LIMITATION) + return found_file_limitation + + +def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExtractor: + """ + args: + args: The parsed command line arguments from `install_common_args`. + input_format: The file format of the input file. + backend: The backend that will handle the input file. + + raises: + ShouldExitError: if the program is invoked incorrectly and should exit. + """ + if input_format == FORMAT_FREEZE: + # freeze format deserializes directly into an extractor + return frz.load(Path(args.input).read_bytes()) + else: + # all other formats we must create an extractor, + # such as viv, binary ninja, etc. workspaces + # and use those for extracting. 
+ + try: + sig_paths = [] + if backend != BACKEND_VIV: + logger.debug("skipping library code matching: only supported by the vivisect backend") + elif input_format != FORMAT_PE: + logger.debug("skipping library code matching: signatures only supports PE files") + else: + sig_paths = get_signatures(args.signatures) + except IOError as e: + logger.error("%s", str(e)) + raise ShouldExitError(E_INVALID_SIG) from e + + should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) + + os_ = get_os_from_args(args, backend) + sample_path = get_sample_path_from_args(args, backend) + + # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere + # see same code and show-features above examples + # https://github.com/mandiant/capa/issues/1813 + try: + return get_extractor( + args.input, + input_format, + os_, + backend, + sig_paths, + should_save_workspace=should_save_workspace, + disable_progress=args.quiet or args.debug, + sample_path=sample_path, + ) + except UnsupportedFormatError as e: + if input_format == FORMAT_CAPE: + log_unsupported_cape_report_error(str(e)) + else: + log_unsupported_format_error() + raise ShouldExitError(E_INVALID_FILE_TYPE) from e + except UnsupportedArchError as e: + log_unsupported_arch_error() + raise ShouldExitError(E_INVALID_FILE_ARCH) from e + except UnsupportedOSError as e: + log_unsupported_os_error() + raise ShouldExitError(E_INVALID_FILE_OS) from e + + +def main(argv: Optional[List[str]] = None): + if sys.version_info < (3, 8): + raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+") + + if argv is None: + argv = sys.argv[1:] + + desc = "The FLARE team's open-source tool to identify capabilities in executable files." + epilog = textwrap.dedent( + """ + By default, capa uses a default set of embedded rules. 
+ You can see the rule set here: + https://github.com/mandiant/capa-rules + + To provide your own rule set, use the `-r` flag: + capa --rules /path/to/rules suspicious.exe + capa -r /path/to/rules suspicious.exe + + examples: + identify capabilities in a binary + capa suspicious.exe + + identify capabilities in 32-bit shellcode, see `-f` for all supported formats + capa -f sc32 shellcode.bin + + report match locations + capa -v suspicious.exe + + report all feature match details + capa -vv suspicious.exe + + filter rules by meta fields, e.g. rule name or namespace + capa -t "create TCP socket" suspicious.exe + """ + ) + + parser = argparse.ArgumentParser( + description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter + ) + install_common_args(parser, {"input", "format", "backend", "os", "signatures", "rules", "tag"}) + parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") + args = parser.parse_args(args=argv) + + try: + handle_common_args(args) + ensure_input_exists_from_args(args) + input_format = get_input_format_from_args(args) + rules = get_rules_from_args(args) + file_extractors = get_file_extractors_from_args(args, input_format) + found_file_limitation = find_file_limitations_from_args(args, rules, file_extractors) + except ShouldExitError as e: + return e.status_code meta: rdoc.Metadata capabilities: MatchResults counts: Dict[str, Any] - if format_ == FORMAT_RESULT: + if input_format == FORMAT_RESULT: # result document directly parses into meta, capabilities - result_doc = capa.render.result_document.ResultDocument.from_file(Path(args.sample)) + result_doc = capa.render.result_document.ResultDocument.from_file(Path(args.input)) meta, capabilities = result_doc.to_capa() else: # all other formats we must create an extractor # and use that to extract meta and capabilities - if format_ == FORMAT_FREEZE: - # freeze format deserializes directly into an extractor - extractor: FeatureExtractor = 
frz.load(Path(args.sample).read_bytes()) - else: - # all other formats we must create an extractor, - # such as viv, binary ninja, etc. workspaces - # and use those for extracting. - - try: - if format_ == FORMAT_PE: - sig_paths = get_signatures(args.signatures) - else: - sig_paths = [] - logger.debug("skipping library code matching: only have native PE signatures") - except IOError as e: - logger.error("%s", str(e)) - return E_INVALID_SIG - - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - - # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere - # see same code and show-features above examples - # https://github.com/mandiant/capa/issues/1813 - try: - extractor = get_extractor( - args.sample, - format_, - args.os, - args.backend, - sig_paths, - should_save_workspace, - disable_progress=args.quiet or args.debug, - ) - except UnsupportedFormatError as e: - if format_ == FORMAT_CAPE: - log_unsupported_cape_report_error(str(e)) - else: - log_unsupported_format_error() - return E_INVALID_FILE_TYPE - except UnsupportedArchError: - log_unsupported_arch_error() - return E_INVALID_FILE_ARCH - except UnsupportedOSError: - log_unsupported_os_error() - return E_INVALID_FILE_OS + try: + backend = get_backend_from_args(args, input_format) + sample_path = get_sample_path_from_args(args, backend) + os = get_os(sample_path) + extractor = get_extractor_from_args(args, input_format, backend) + except ShouldExitError as e: + return e.status_code capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor, counts) + meta = collect_metadata(argv, args.input, input_format, os, args.rules, extractor, counts) meta.analysis.layout = compute_layout(rules, extractor, capabilities) if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation: @@ -1226,6 +1455,7 @@ def 
main(argv: Optional[List[str]] = None): # do show the output in verbose mode, though. if not (args.verbose or args.vverbose or args.json): return E_FILE_LIMITATION + if args.json: print(capa.render.json.render(meta, rules, capabilities)) elif args.vverbose: From e1186c5b4b30ea477e51bef144413f90b7f72798 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:50:24 +0000 Subject: [PATCH 006/200] features: add BinExport2 declarations --- capa/features/common.py | 5 ++++- capa/helpers.py | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 0cb1396de..0609cdfec 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -407,9 +407,10 @@ def get_value_str(self): # other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types ARCH_I386 = "i386" ARCH_AMD64 = "amd64" +ARCH_AARCH64 = "aarch64" # dotnet ARCH_ANY = "any" -VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY) +VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY, ARCH_AARCH64) class Arch(Feature): @@ -457,6 +458,7 @@ def evaluate(self, ctx, **kwargs): FORMAT_AUTO = "auto" FORMAT_SC32 = "sc32" FORMAT_SC64 = "sc64" +FORMAT_BINEXPORT2 = "binexport2" FORMAT_CAPE = "cape" STATIC_FORMATS = { FORMAT_SC32, @@ -464,6 +466,7 @@ def evaluate(self, ctx, **kwargs): FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, + FORMAT_BINEXPORT2, } DYNAMIC_FORMATS = { FORMAT_CAPE, diff --git a/capa/helpers.py b/capa/helpers.py index 89dad8b91..6ea3de5a5 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -16,10 +16,11 @@ import tqdm from capa.exceptions import UnsupportedFormatError -from capa.features.common import FORMAT_PE, FORMAT_CAPE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format +from capa.features.common import FORMAT_PE, FORMAT_CAPE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format, FORMAT_BINEXPORT2 EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") 
EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") +EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2") EXTENSIONS_DYNAMIC = ("json", "json_") EXTENSIONS_ELF = "elf_" @@ -81,15 +82,8 @@ def get_format_from_extension(sample: Path) -> str: format_ = FORMAT_SC64 elif sample.name.endswith(EXTENSIONS_DYNAMIC): format_ = get_format_from_report(sample) - return format_ - - -def get_auto_format(path: Path) -> str: - format_ = get_format(path) - if format_ == FORMAT_UNKNOWN: - format_ = get_format_from_extension(path) - if format_ == FORMAT_UNKNOWN: - raise UnsupportedFormatError() + elif sample.name.endswith(EXTENSIONS_BINEXPORT2): + format_ = FORMAT_BINEXPORT2 return format_ @@ -112,6 +106,15 @@ def get_format(sample: Path) -> str: return FORMAT_UNKNOWN +def get_auto_format(path: Path) -> str: + format_ = get_format(path) + if format_ == FORMAT_UNKNOWN: + format_ = get_format_from_extension(path) + if format_ == FORMAT_UNKNOWN: + raise UnsupportedFormatError() + return format_ + + @contextlib.contextmanager def redirecting_print_to_tqdm(disable_progress): """ From 3acdd2842d166ab5102f8be00535ff3b8bf2e70a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:51:14 +0000 Subject: [PATCH 007/200] BinExport2: initial skeleton of feature extraction --- .../extractors/binexport2/__init__.py | 33 + .../extractors/binexport2/basicblock.py | 36 + .../extractors/binexport2/binexport2_pb2.py | 72 ++ .../extractors/binexport2/binexport2_pb2.pyi | 784 ++++++++++++++++++ .../extractors/binexport2/extractor.py | 70 ++ capa/features/extractors/binexport2/file.py | 56 ++ .../extractors/binexport2/function.py | 41 + .../features/extractors/binexport2/global_.py | 33 + capa/features/extractors/binexport2/insn.py | 93 +++ 9 files changed, 1218 insertions(+) create mode 100644 capa/features/extractors/binexport2/__init__.py create mode 100644 capa/features/extractors/binexport2/basicblock.py create mode 100644 capa/features/extractors/binexport2/binexport2_pb2.py create mode 
100644 capa/features/extractors/binexport2/binexport2_pb2.pyi create mode 100644 capa/features/extractors/binexport2/extractor.py create mode 100644 capa/features/extractors/binexport2/file.py create mode 100644 capa/features/extractors/binexport2/function.py create mode 100644 capa/features/extractors/binexport2/global_.py create mode 100644 capa/features/extractors/binexport2/insn.py diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py new file mode 100644 index 000000000..858cea593 --- /dev/null +++ b/capa/features/extractors/binexport2/__init__.py @@ -0,0 +1,33 @@ +""" +Proto files generated via protobuf v24.4: + + protoc --python_out=. --mypy_out=. binexport2.proto +""" +import os +import logging +from pathlib import Path + +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +logger = logging.getLogger(__name__) + + +def get_binexport2(sample: Path) -> BinExport2: + be2 = BinExport2() + be2.ParseFromString(sample.read_bytes()) + return be2 + + +def get_sample_from_binexport2(be2: BinExport2) -> Path: + # $CAPA_SAMPLE_DIR/ + base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) + + sha256 = be2.meta_information.executable_id.lower() + + logger.debug("searching for sample in: %s", base) + path = base / sha256 + if path.exists(): + return path + else: + raise ValueError("cannot find sample") diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py new file mode 100644 index 000000000..bf59f4121 --- /dev/null +++ b/capa/features/extractors/binexport2/basicblock.py @@ -0,0 +1,36 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +from typing import Tuple, Iterator, Any + +from capa.features.common import Feature +from capa.features.address import Address +from capa.features.basicblock import BasicBlock +from capa.features.extractors.base_extractor import BBHandle, FunctionHandle + + +# TODO(wb): 1755 +TODOType = Any + + +def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + """extract basic block features""" + for bb_handler in BASIC_BLOCK_HANDLERS: + for feature, addr in bb_handler(fh, bbh): + yield feature, addr + yield BasicBlock(), bbh.address + + +BASIC_BLOCK_HANDLERS = ( + extract_bb_tight_loop, +) diff --git a/capa/features/extractors/binexport2/binexport2_pb2.py b/capa/features/extractors/binexport2/binexport2_pb2.py new file mode 100644 index 000000000..4d11d1a6e --- /dev/null +++ b/capa/features/extractors/binexport2/binexport2_pb2.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: binexport2.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x62inexport2.proto\"\xa5\x17\n\nBinExport2\x12*\n\x10meta_information\x18\x01 \x01(\x0b\x32\x10.BinExport2.Meta\x12*\n\nexpression\x18\x02 \x03(\x0b\x32\x16.BinExport2.Expression\x12$\n\x07operand\x18\x03 \x03(\x0b\x32\x13.BinExport2.Operand\x12&\n\x08mnemonic\x18\x04 \x03(\x0b\x32\x14.BinExport2.Mnemonic\x12,\n\x0binstruction\x18\x05 \x03(\x0b\x32\x17.BinExport2.Instruction\x12+\n\x0b\x62\x61sic_block\x18\x06 \x03(\x0b\x32\x16.BinExport2.BasicBlock\x12)\n\nflow_graph\x18\x07 \x03(\x0b\x32\x15.BinExport2.FlowGraph\x12)\n\ncall_graph\x18\x08 \x01(\x0b\x32\x15.BinExport2.CallGraph\x12\x14\n\x0cstring_table\x18\t \x03(\t\x12\x32\n\x0f\x61\x64\x64ress_comment\x18\n \x03(\x0b\x32\x15.BinExport2.ReferenceB\x02\x18\x01\x12$\n\x07\x63omment\x18\x11 \x03(\x0b\x32\x13.BinExport2.Comment\x12/\n\x10string_reference\x18\x0b \x03(\x0b\x32\x15.BinExport2.Reference\x12\x36\n\x17\x65xpression_substitution\x18\x0c \x03(\x0b\x32\x15.BinExport2.Reference\x12$\n\x07section\x18\r \x03(\x0b\x32\x13.BinExport2.Section\x12$\n\x07library\x18\x0e \x03(\x0b\x32\x13.BinExport2.Library\x12\x31\n\x0e\x64\x61ta_reference\x18\x0f \x03(\x0b\x32\x19.BinExport2.DataReference\x12\"\n\x06module\x18\x10 \x03(\x0b\x32\x12.BinExport2.Module\x1aj\n\x04Meta\x12\x17\n\x0f\x65xecutable_name\x18\x01 \x01(\t\x12\x15\n\rexecutable_id\x18\x02 \x01(\t\x12\x19\n\x11\x61rchitecture_name\x18\x03 \x01(\t\x12\x11\n\ttimestamp\x18\x04 \x01(\x03J\x04\x08\x05\x10\x06\x1a\x9c\x03\n\tCallGraph\x12,\n\x06vertex\x18\x01 
\x03(\x0b\x32\x1c.BinExport2.CallGraph.Vertex\x12(\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x1a.BinExport2.CallGraph.Edge\x1a\xf4\x01\n\x06Vertex\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x37\n\x04type\x18\x02 \x01(\x0e\x32!.BinExport2.CallGraph.Vertex.Type:\x06NORMAL\x12\x14\n\x0cmangled_name\x18\x03 \x01(\t\x12\x16\n\x0e\x64\x65mangled_name\x18\x04 \x01(\t\x12\x15\n\rlibrary_index\x18\x05 \x01(\x05\x12\x14\n\x0cmodule_index\x18\x06 \x01(\x05\"E\n\x04Type\x12\n\n\x06NORMAL\x10\x00\x12\x0b\n\x07LIBRARY\x10\x01\x12\x0c\n\x08IMPORTED\x10\x02\x12\t\n\x05THUNK\x10\x03\x12\x0b\n\x07INVALID\x10\x04\x1a@\n\x04\x45\x64ge\x12\x1b\n\x13source_vertex_index\x18\x01 \x01(\x05\x12\x1b\n\x13target_vertex_index\x18\x02 \x01(\x05\x1a\x90\x02\n\nExpression\x12\x38\n\x04type\x18\x01 \x01(\x0e\x32\x1b.BinExport2.Expression.Type:\rIMMEDIATE_INT\x12\x0e\n\x06symbol\x18\x02 \x01(\t\x12\x11\n\timmediate\x18\x03 \x01(\x04\x12\x14\n\x0cparent_index\x18\x04 \x01(\x05\x12\x15\n\ris_relocation\x18\x05 \x01(\x08\"x\n\x04Type\x12\n\n\x06SYMBOL\x10\x01\x12\x11\n\rIMMEDIATE_INT\x10\x02\x12\x13\n\x0fIMMEDIATE_FLOAT\x10\x03\x12\x0c\n\x08OPERATOR\x10\x04\x12\x0c\n\x08REGISTER\x10\x05\x12\x0f\n\x0bSIZE_PREFIX\x10\x06\x12\x0f\n\x0b\x44\x45REFERENCE\x10\x07\x1a#\n\x07Operand\x12\x18\n\x10\x65xpression_index\x18\x01 \x03(\x05\x1a\x18\n\x08Mnemonic\x12\x0c\n\x04name\x18\x01 \x01(\t\x1a\x8f\x01\n\x0bInstruction\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x13\n\x0b\x63\x61ll_target\x18\x02 \x03(\x04\x12\x19\n\x0emnemonic_index\x18\x03 \x01(\x05:\x01\x30\x12\x15\n\roperand_index\x18\x04 \x03(\x05\x12\x11\n\traw_bytes\x18\x05 \x01(\x0c\x12\x15\n\rcomment_index\x18\x06 \x03(\x05\x1a\x80\x01\n\nBasicBlock\x12<\n\x11instruction_index\x18\x01 \x03(\x0b\x32!.BinExport2.BasicBlock.IndexRange\x1a\x34\n\nIndexRange\x12\x13\n\x0b\x62\x65gin_index\x18\x01 \x01(\x05\x12\x11\n\tend_index\x18\x02 \x01(\x05\x1a\xe9\x02\n\tFlowGraph\x12\x19\n\x11\x62\x61sic_block_index\x18\x01 
\x03(\x05\x12\x1f\n\x17\x65ntry_basic_block_index\x18\x03 \x01(\x05\x12(\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x1a.BinExport2.FlowGraph.Edge\x1a\xf5\x01\n\x04\x45\x64ge\x12 \n\x18source_basic_block_index\x18\x01 \x01(\x05\x12 \n\x18target_basic_block_index\x18\x02 \x01(\x05\x12<\n\x04type\x18\x03 \x01(\x0e\x32\x1f.BinExport2.FlowGraph.Edge.Type:\rUNCONDITIONAL\x12\x1b\n\x0cis_back_edge\x18\x04 \x01(\x08:\x05\x66\x61lse\"N\n\x04Type\x12\x12\n\x0e\x43ONDITION_TRUE\x10\x01\x12\x13\n\x0f\x43ONDITION_FALSE\x10\x02\x12\x11\n\rUNCONDITIONAL\x10\x03\x12\n\n\x06SWITCH\x10\x04\x1a\x8d\x01\n\tReference\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12$\n\x19instruction_operand_index\x18\x02 \x01(\x05:\x01\x30\x12#\n\x18operand_expression_index\x18\x03 \x01(\x05:\x01\x30\x12\x1a\n\x12string_table_index\x18\x04 \x01(\x05\x1a;\n\rDataReference\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12\x0f\n\x07\x61\x64\x64ress\x18\x02 \x01(\x04\x1a\xd4\x02\n\x07\x43omment\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12$\n\x19instruction_operand_index\x18\x02 \x01(\x05:\x01\x30\x12#\n\x18operand_expression_index\x18\x03 \x01(\x05:\x01\x30\x12\x1a\n\x12string_table_index\x18\x04 \x01(\x05\x12\x12\n\nrepeatable\x18\x05 \x01(\x08\x12/\n\x04type\x18\x06 \x01(\x0e\x32\x18.BinExport2.Comment.Type:\x07\x44\x45\x46\x41ULT\"\x81\x01\n\x04Type\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\x0c\n\x08\x41NTERIOR\x10\x01\x12\r\n\tPOSTERIOR\x10\x02\x12\x0c\n\x08\x46UNCTION\x10\x03\x12\x08\n\x04\x45NUM\x10\x04\x12\x0c\n\x08LOCATION\x10\x05\x12\x14\n\x10GLOBAL_REFERENCE\x10\x06\x12\x13\n\x0fLOCAL_REFERENCE\x10\x07\x1aX\n\x07Section\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x0c\n\x04size\x18\x02 \x01(\x04\x12\x0e\n\x06\x66lag_r\x18\x03 \x01(\x08\x12\x0e\n\x06\x66lag_w\x18\x04 \x01(\x08\x12\x0e\n\x06\x66lag_x\x18\x05 \x01(\x08\x1a\x43\n\x07Library\x12\x11\n\tis_static\x18\x01 \x01(\x08\x12\x17\n\x0cload_address\x18\x02 \x01(\x04:\x01\x30\x12\x0c\n\x04name\x18\x03 
\x01(\t\x1a\x16\n\x06Module\x12\x0c\n\x04name\x18\x01 \x01(\t*\x0b\x08\x80\xc2\xd7/\x10\x80\x80\x80\x80\x02\x42)\n\x1c\x63om.google.security.zynamicsB\tBinExport') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'binexport2_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\034com.google.security.zynamicsB\tBinExport' + _BINEXPORT2.fields_by_name['address_comment']._options = None + _BINEXPORT2.fields_by_name['address_comment']._serialized_options = b'\030\001' + _BINEXPORT2._serialized_start=21 + _BINEXPORT2._serialized_end=3002 + _BINEXPORT2_META._serialized_start=758 + _BINEXPORT2_META._serialized_end=864 + _BINEXPORT2_CALLGRAPH._serialized_start=867 + _BINEXPORT2_CALLGRAPH._serialized_end=1279 + _BINEXPORT2_CALLGRAPH_VERTEX._serialized_start=969 + _BINEXPORT2_CALLGRAPH_VERTEX._serialized_end=1213 + _BINEXPORT2_CALLGRAPH_VERTEX_TYPE._serialized_start=1144 + _BINEXPORT2_CALLGRAPH_VERTEX_TYPE._serialized_end=1213 + _BINEXPORT2_CALLGRAPH_EDGE._serialized_start=1215 + _BINEXPORT2_CALLGRAPH_EDGE._serialized_end=1279 + _BINEXPORT2_EXPRESSION._serialized_start=1282 + _BINEXPORT2_EXPRESSION._serialized_end=1554 + _BINEXPORT2_EXPRESSION_TYPE._serialized_start=1434 + _BINEXPORT2_EXPRESSION_TYPE._serialized_end=1554 + _BINEXPORT2_OPERAND._serialized_start=1556 + _BINEXPORT2_OPERAND._serialized_end=1591 + _BINEXPORT2_MNEMONIC._serialized_start=1593 + _BINEXPORT2_MNEMONIC._serialized_end=1617 + _BINEXPORT2_INSTRUCTION._serialized_start=1620 + _BINEXPORT2_INSTRUCTION._serialized_end=1763 + _BINEXPORT2_BASICBLOCK._serialized_start=1766 + _BINEXPORT2_BASICBLOCK._serialized_end=1894 + _BINEXPORT2_BASICBLOCK_INDEXRANGE._serialized_start=1842 + _BINEXPORT2_BASICBLOCK_INDEXRANGE._serialized_end=1894 + _BINEXPORT2_FLOWGRAPH._serialized_start=1897 + _BINEXPORT2_FLOWGRAPH._serialized_end=2258 + 
_BINEXPORT2_FLOWGRAPH_EDGE._serialized_start=2013 + _BINEXPORT2_FLOWGRAPH_EDGE._serialized_end=2258 + _BINEXPORT2_FLOWGRAPH_EDGE_TYPE._serialized_start=2180 + _BINEXPORT2_FLOWGRAPH_EDGE_TYPE._serialized_end=2258 + _BINEXPORT2_REFERENCE._serialized_start=2261 + _BINEXPORT2_REFERENCE._serialized_end=2402 + _BINEXPORT2_DATAREFERENCE._serialized_start=2404 + _BINEXPORT2_DATAREFERENCE._serialized_end=2463 + _BINEXPORT2_COMMENT._serialized_start=2466 + _BINEXPORT2_COMMENT._serialized_end=2806 + _BINEXPORT2_COMMENT_TYPE._serialized_start=2677 + _BINEXPORT2_COMMENT_TYPE._serialized_end=2806 + _BINEXPORT2_SECTION._serialized_start=2808 + _BINEXPORT2_SECTION._serialized_end=2896 + _BINEXPORT2_LIBRARY._serialized_start=2898 + _BINEXPORT2_LIBRARY._serialized_end=2965 + _BINEXPORT2_MODULE._serialized_start=2967 + _BINEXPORT2_MODULE._serialized_end=2989 +# @@protoc_insertion_point(module_scope) diff --git a/capa/features/extractors/binexport2/binexport2_pb2.pyi b/capa/features/extractors/binexport2/binexport2_pb2.pyi new file mode 100644 index 000000000..1620aee7a --- /dev/null +++ b/capa/features/extractors/binexport2/binexport2_pb2.pyi @@ -0,0 +1,784 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +The representation is generic to accommodate various source architectures. +In particular 32 and 64 bit versions of x86, ARM, PowerPC and MIPS have been +tested. + +Multiple levels of deduping have been applied to make the format more compact +and avoid redundant data duplication. Some of this due to hard-earned +experience trying to cope with intentionally obfuscated malicious binaries. +Note in particular that the same instruction may occur in multiple basic +blocks and the same basic block in multiple functions (instruction and basic +block sharing). Implemented naively, malware can use this to cause +combinatorial explosion in memory usage, DOSing the analyst. 
This format +should store every unique expression, mnemonic, operand, instruction and +basic block only once instead of duplicating the information for every +instance of it. + +This format does _not_ try to be 100% backwards compatible with the old +version. In particular, we do not store IDA's comment types, making lossless +porting of IDA comments impossible. We do however, store comments and +expression substitutions, so porting the actual data is possible, just not +the exact IDA type. + +While it would be more natural to use addresses when defining call graph and +flow graph edges and other such references, it is more efficient to employ +one more level of indirection and use indices into the basic block or +function arrays instead. This is because addresses will usually use most of +the available 64 bit space while indices will be much smaller and compress +much better (less randomly distributed). + +We omit all fields that are set to their default value anyways. Note that +this has two side effects: + - changing the defaults in this proto file will, in effect, change what's + read from disk + - the generated code has_* methods are somewhat less useful +WARNING: We omit the defaults manually in the code writing the data. Do not + change the defaults here without changing the code! + +TODO(cblichmann): Link flow graphs to call graph nodes. The connection is + there via the address, but tricky to extract. 
+""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class BinExport2(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Meta(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + EXECUTABLE_NAME_FIELD_NUMBER: builtins.int + EXECUTABLE_ID_FIELD_NUMBER: builtins.int + ARCHITECTURE_NAME_FIELD_NUMBER: builtins.int + TIMESTAMP_FIELD_NUMBER: builtins.int + executable_name: builtins.str + """Input binary filename including file extension but excluding file path. + example: "insider_gcc.exe" + """ + executable_id: builtins.str + """Application defined executable id. Often the SHA256 hash of the input + binary. + """ + architecture_name: builtins.str + """Input architecture name, e.g. x86-32.""" + timestamp: builtins.int + """When did this file get created? Unix time. This may be used for some + primitive versioning in case the file format ever changes. + """ + def __init__( + self, + *, + executable_name: builtins.str | None = ..., + executable_id: builtins.str | None = ..., + architecture_name: builtins.str | None = ..., + timestamp: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["architecture_name", b"architecture_name", "executable_id", b"executable_id", "executable_name", b"executable_name", "timestamp", b"timestamp"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["architecture_name", b"architecture_name", "executable_id", b"executable_id", "executable_name", b"executable_name", "timestamp", b"timestamp"]) -> None: ... + + @typing_extensions.final + class CallGraph(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Vertex(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.CallGraph.Vertex._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + NORMAL: BinExport2.CallGraph.Vertex._Type.ValueType # 0 + """Regular function with full disassembly.""" + LIBRARY: BinExport2.CallGraph.Vertex._Type.ValueType # 1 + """This function is a well known library function.""" + IMPORTED: BinExport2.CallGraph.Vertex._Type.ValueType # 2 + """Imported from a dynamic link library (e.g. dll).""" + THUNK: BinExport2.CallGraph.Vertex._Type.ValueType # 3 + """A thunk function, forwarding its work via an unconditional jump.""" + INVALID: BinExport2.CallGraph.Vertex._Type.ValueType # 4 + """An invalid function (a function that contained invalid code or was + considered invalid by some heuristics). + """ + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + NORMAL: BinExport2.CallGraph.Vertex.Type.ValueType # 0 + """Regular function with full disassembly.""" + LIBRARY: BinExport2.CallGraph.Vertex.Type.ValueType # 1 + """This function is a well known library function.""" + IMPORTED: BinExport2.CallGraph.Vertex.Type.ValueType # 2 + """Imported from a dynamic link library (e.g. 
dll).""" + THUNK: BinExport2.CallGraph.Vertex.Type.ValueType # 3 + """A thunk function, forwarding its work via an unconditional jump.""" + INVALID: BinExport2.CallGraph.Vertex.Type.ValueType # 4 + """An invalid function (a function that contained invalid code or was + considered invalid by some heuristics). + """ + + ADDRESS_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + MANGLED_NAME_FIELD_NUMBER: builtins.int + DEMANGLED_NAME_FIELD_NUMBER: builtins.int + LIBRARY_INDEX_FIELD_NUMBER: builtins.int + MODULE_INDEX_FIELD_NUMBER: builtins.int + address: builtins.int + """The function's entry point address. Messages need to be sorted, see + comment below on `vertex`. + """ + type: global___BinExport2.CallGraph.Vertex.Type.ValueType + mangled_name: builtins.str + """If the function has a user defined, real name it will be given here. + main() is a proper name, sub_BAADF00D is not (auto generated dummy + name). + """ + demangled_name: builtins.str + """Demangled name if the function is a mangled C++ function and we could + demangle it. + """ + library_index: builtins.int + """If this is a library function, what is its index in library arrays.""" + module_index: builtins.int + """If module name, such as class name for DEX files, is present - index in + module table. + """ + def __init__( + self, + *, + address: builtins.int | None = ..., + type: global___BinExport2.CallGraph.Vertex.Type.ValueType | None = ..., + mangled_name: builtins.str | None = ..., + demangled_name: builtins.str | None = ..., + library_index: builtins.int | None = ..., + module_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "demangled_name", b"demangled_name", "library_index", b"library_index", "mangled_name", b"mangled_name", "module_index", b"module_index", "type", b"type"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "demangled_name", b"demangled_name", "library_index", b"library_index", "mangled_name", b"mangled_name", "module_index", b"module_index", "type", b"type"]) -> None: ... + + @typing_extensions.final + class Edge(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SOURCE_VERTEX_INDEX_FIELD_NUMBER: builtins.int + TARGET_VERTEX_INDEX_FIELD_NUMBER: builtins.int + source_vertex_index: builtins.int + """source and target index into the vertex repeated field.""" + target_vertex_index: builtins.int + def __init__( + self, + *, + source_vertex_index: builtins.int | None = ..., + target_vertex_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["source_vertex_index", b"source_vertex_index", "target_vertex_index", b"target_vertex_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["source_vertex_index", b"source_vertex_index", "target_vertex_index", b"target_vertex_index"]) -> None: ... + + VERTEX_FIELD_NUMBER: builtins.int + EDGE_FIELD_NUMBER: builtins.int + @property + def vertex(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.CallGraph.Vertex]: + """vertices == functions in the call graph. + Important: Most downstream tooling (notably BinDiff), need these to be + sorted by `Vertex::address` (ascending). For C++, the + `BinExport2Writer` class enforces this invariant. + """ + @property + def edge(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.CallGraph.Edge]: + """edges == calls in the call graph.""" + def __init__( + self, + *, + vertex: collections.abc.Iterable[global___BinExport2.CallGraph.Vertex] | None = ..., + edge: collections.abc.Iterable[global___BinExport2.CallGraph.Edge] | None = ..., + ) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["edge", b"edge", "vertex", b"vertex"]) -> None: ... + + @typing_extensions.final + class Expression(google.protobuf.message.Message): + """An operand consists of 1 or more expressions, linked together as a tree.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.Expression._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + SYMBOL: BinExport2.Expression._Type.ValueType # 1 + IMMEDIATE_INT: BinExport2.Expression._Type.ValueType # 2 + IMMEDIATE_FLOAT: BinExport2.Expression._Type.ValueType # 3 + OPERATOR: BinExport2.Expression._Type.ValueType # 4 + REGISTER: BinExport2.Expression._Type.ValueType # 5 + SIZE_PREFIX: BinExport2.Expression._Type.ValueType # 6 + DEREFERENCE: BinExport2.Expression._Type.ValueType # 7 + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + SYMBOL: BinExport2.Expression.Type.ValueType # 1 + IMMEDIATE_INT: BinExport2.Expression.Type.ValueType # 2 + IMMEDIATE_FLOAT: BinExport2.Expression.Type.ValueType # 3 + OPERATOR: BinExport2.Expression.Type.ValueType # 4 + REGISTER: BinExport2.Expression.Type.ValueType # 5 + SIZE_PREFIX: BinExport2.Expression.Type.ValueType # 6 + DEREFERENCE: BinExport2.Expression.Type.ValueType # 7 + + TYPE_FIELD_NUMBER: builtins.int + SYMBOL_FIELD_NUMBER: builtins.int + IMMEDIATE_FIELD_NUMBER: builtins.int + PARENT_INDEX_FIELD_NUMBER: builtins.int + IS_RELOCATION_FIELD_NUMBER: builtins.int + type: global___BinExport2.Expression.Type.ValueType + """IMMEDIATE_INT is by far the most common type and thus we can save some + space by omitting it as the default. + """ + symbol: builtins.str + """Symbol for this expression. Interpretation depends on type. 
Examples + include: "eax", "[", "+" + """ + immediate: builtins.int + """If the expression can be interpreted as an integer value (IMMEDIATE_INT) + the value is given here. + """ + parent_index: builtins.int + """The parent expression. Example expression tree for the second operand of: + mov eax, b4 [ebx + 12] + "b4" --- "[" --- "+" --- "ebx" + \\ "12" + """ + is_relocation: builtins.bool + """true if the expression has entry in relocation table""" + def __init__( + self, + *, + type: global___BinExport2.Expression.Type.ValueType | None = ..., + symbol: builtins.str | None = ..., + immediate: builtins.int | None = ..., + parent_index: builtins.int | None = ..., + is_relocation: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["immediate", b"immediate", "is_relocation", b"is_relocation", "parent_index", b"parent_index", "symbol", b"symbol", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["immediate", b"immediate", "is_relocation", b"is_relocation", "parent_index", b"parent_index", "symbol", b"symbol", "type", b"type"]) -> None: ... + + @typing_extensions.final + class Operand(google.protobuf.message.Message): + """An instruction may have 0 or more operands.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + @property + def expression_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Contains all expressions constituting this operand. All expressions + should be linked into a single tree, i.e. there should only be one + expression in this list with parent_index == NULL and all others should + descend from that. Rendering order for expressions on the same tree level + (siblings) is implicitly given by the order they are referenced in this + repeated field. 
+ Implicit: expression sequence + """ + def __init__( + self, + *, + expression_index: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["expression_index", b"expression_index"]) -> None: ... + + @typing_extensions.final + class Mnemonic(google.protobuf.message.Message): + """An instruction has exactly 1 mnemonic.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + name: builtins.str + """Literal representation of the mnemonic, e.g.: "mov".""" + def __init__( + self, + *, + name: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name"]) -> None: ... + + @typing_extensions.final + class Instruction(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ADDRESS_FIELD_NUMBER: builtins.int + CALL_TARGET_FIELD_NUMBER: builtins.int + MNEMONIC_INDEX_FIELD_NUMBER: builtins.int + OPERAND_INDEX_FIELD_NUMBER: builtins.int + RAW_BYTES_FIELD_NUMBER: builtins.int + COMMENT_INDEX_FIELD_NUMBER: builtins.int + address: builtins.int + """This will only be filled for instructions that do not just flow from the + immediately preceding instruction. Regular instructions will have to + calculate their own address by adding raw_bytes.size() to the previous + instruction's address. + """ + @property + def call_target(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """If this is a call instruction and call targets could be determined + they'll be given here. Note that we may or may not have a flow graph for + the target and thus cannot use an index into the flow graph table here. + We could potentially use call graph nodes, but linking instructions to + the call graph directly does not seem a good choice. 
+ """ + mnemonic_index: builtins.int + """Index into the mnemonic array of strings. Used for de-duping the data. + The default value is used for the most common mnemonic in the executable. + """ + @property + def operand_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Indices into the operand tree. On X86 this can be 0, 1 or 2 elements + long, 3 elements with VEX/EVEX. + Implicit: operand sequence + """ + raw_bytes: builtins.bytes + """The unmodified input bytes corresponding to this instruction.""" + @property + def comment_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Implicit: comment sequence""" + def __init__( + self, + *, + address: builtins.int | None = ..., + call_target: collections.abc.Iterable[builtins.int] | None = ..., + mnemonic_index: builtins.int | None = ..., + operand_index: collections.abc.Iterable[builtins.int] | None = ..., + raw_bytes: builtins.bytes | None = ..., + comment_index: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "mnemonic_index", b"mnemonic_index", "raw_bytes", b"raw_bytes"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "call_target", b"call_target", "comment_index", b"comment_index", "mnemonic_index", b"mnemonic_index", "operand_index", b"operand_index", "raw_bytes", b"raw_bytes"]) -> None: ... + + @typing_extensions.final + class BasicBlock(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class IndexRange(google.protobuf.message.Message): + """This is a space optimization. The instructions for an individual basic + block will usually be in a continuous index range. Thus it is more + efficient to store the range instead of individual indices. 
However, this + does not hold true for all basic blocks, so we need to be able to store + multiple index ranges per block. + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + BEGIN_INDEX_FIELD_NUMBER: builtins.int + END_INDEX_FIELD_NUMBER: builtins.int + begin_index: builtins.int + """These work like begin and end iterators, i.e. the sequence is + [begin_index, end_index). If the sequence only contains a single + element end_index will be omitted. + """ + end_index: builtins.int + def __init__( + self, + *, + begin_index: builtins.int | None = ..., + end_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["begin_index", b"begin_index", "end_index", b"end_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["begin_index", b"begin_index", "end_index", b"end_index"]) -> None: ... + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + @property + def instruction_index(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.BasicBlock.IndexRange]: + """Implicit: instruction sequence""" + def __init__( + self, + *, + instruction_index: collections.abc.Iterable[global___BinExport2.BasicBlock.IndexRange] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index"]) -> None: ... 
+ + @typing_extensions.final + class FlowGraph(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Edge(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.FlowGraph.Edge._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + CONDITION_TRUE: BinExport2.FlowGraph.Edge._Type.ValueType # 1 + CONDITION_FALSE: BinExport2.FlowGraph.Edge._Type.ValueType # 2 + UNCONDITIONAL: BinExport2.FlowGraph.Edge._Type.ValueType # 3 + SWITCH: BinExport2.FlowGraph.Edge._Type.ValueType # 4 + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + CONDITION_TRUE: BinExport2.FlowGraph.Edge.Type.ValueType # 1 + CONDITION_FALSE: BinExport2.FlowGraph.Edge.Type.ValueType # 2 + UNCONDITIONAL: BinExport2.FlowGraph.Edge.Type.ValueType # 3 + SWITCH: BinExport2.FlowGraph.Edge.Type.ValueType # 4 + + SOURCE_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + TARGET_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + IS_BACK_EDGE_FIELD_NUMBER: builtins.int + source_basic_block_index: builtins.int + """Source instruction will always be the last instruction of the source + basic block, target instruction the first instruction of the target + basic block. 
+ """ + target_basic_block_index: builtins.int + type: global___BinExport2.FlowGraph.Edge.Type.ValueType + is_back_edge: builtins.bool + """Indicates whether this is a loop edge as determined by Lengauer-Tarjan.""" + def __init__( + self, + *, + source_basic_block_index: builtins.int | None = ..., + target_basic_block_index: builtins.int | None = ..., + type: global___BinExport2.FlowGraph.Edge.Type.ValueType | None = ..., + is_back_edge: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["is_back_edge", b"is_back_edge", "source_basic_block_index", b"source_basic_block_index", "target_basic_block_index", b"target_basic_block_index", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["is_back_edge", b"is_back_edge", "source_basic_block_index", b"source_basic_block_index", "target_basic_block_index", b"target_basic_block_index", "type", b"type"]) -> None: ... + + BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + ENTRY_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + EDGE_FIELD_NUMBER: builtins.int + @property + def basic_block_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Basic blocks are sorted by address.""" + entry_basic_block_index: builtins.int + """The flow graph's entry point address is the first instruction of the + entry_basic_block. + """ + @property + def edge(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.FlowGraph.Edge]: ... + def __init__( + self, + *, + basic_block_index: collections.abc.Iterable[builtins.int] | None = ..., + entry_basic_block_index: builtins.int | None = ..., + edge: collections.abc.Iterable[global___BinExport2.FlowGraph.Edge] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["entry_basic_block_index", b"entry_basic_block_index"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["basic_block_index", b"basic_block_index", "edge", b"edge", "entry_basic_block_index", b"entry_basic_block_index"]) -> None: ... + + @typing_extensions.final + class Reference(google.protobuf.message.Message): + """Generic reference class used for address comments (deprecated), string + references and expression substitutions. It allows referencing from an + instruction, operand, expression subtree tuple to a de-duped string in the + string table. + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + INSTRUCTION_OPERAND_INDEX_FIELD_NUMBER: builtins.int + OPERAND_EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + STRING_TABLE_INDEX_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table.""" + instruction_operand_index: builtins.int + """Index into the operand array local to an instruction.""" + operand_expression_index: builtins.int + """Index into the expression array local to an operand.""" + string_table_index: builtins.int + """Index into the global string table.""" + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + instruction_operand_index: builtins.int | None = ..., + operand_expression_index: builtins.int | None = ..., + string_table_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "string_table_index", b"string_table_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "string_table_index", b"string_table_index"]) -> None: ... 
+ + @typing_extensions.final + class DataReference(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + ADDRESS_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table.""" + address: builtins.int + """Address being referred.""" + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + address: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "instruction_index", b"instruction_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "instruction_index", b"instruction_index"]) -> None: ... + + @typing_extensions.final + class Comment(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.Comment._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + DEFAULT: BinExport2.Comment._Type.ValueType # 0 + """A regular instruction comment. Typically displayed next to the + instruction disassembly. + """ + ANTERIOR: BinExport2.Comment._Type.ValueType # 1 + """A comment line that is typically displayed before (above) the + instruction it refers to. + """ + POSTERIOR: BinExport2.Comment._Type.ValueType # 2 + """Like ANTERIOR, but a typically displayed after (below).""" + FUNCTION: BinExport2.Comment._Type.ValueType # 3 + """Similar to an ANTERIOR comment, but applies to the beginning of an + identified function. Programs displaying the proto may choose to render + these differently (e.g. above an inferred function signature). 
+ """ + ENUM: BinExport2.Comment._Type.ValueType # 4 + """Named constants, bitfields and similar.""" + LOCATION: BinExport2.Comment._Type.ValueType # 5 + """Named locations, usually the target of a jump.""" + GLOBAL_REFERENCE: BinExport2.Comment._Type.ValueType # 6 + """Data cross references.""" + LOCAL_REFERENCE: BinExport2.Comment._Type.ValueType # 7 + """Local/stack variables.""" + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + DEFAULT: BinExport2.Comment.Type.ValueType # 0 + """A regular instruction comment. Typically displayed next to the + instruction disassembly. + """ + ANTERIOR: BinExport2.Comment.Type.ValueType # 1 + """A comment line that is typically displayed before (above) the + instruction it refers to. + """ + POSTERIOR: BinExport2.Comment.Type.ValueType # 2 + """Like ANTERIOR, but a typically displayed after (below).""" + FUNCTION: BinExport2.Comment.Type.ValueType # 3 + """Similar to an ANTERIOR comment, but applies to the beginning of an + identified function. Programs displaying the proto may choose to render + these differently (e.g. above an inferred function signature). + """ + ENUM: BinExport2.Comment.Type.ValueType # 4 + """Named constants, bitfields and similar.""" + LOCATION: BinExport2.Comment.Type.ValueType # 5 + """Named locations, usually the target of a jump.""" + GLOBAL_REFERENCE: BinExport2.Comment.Type.ValueType # 6 + """Data cross references.""" + LOCAL_REFERENCE: BinExport2.Comment.Type.ValueType # 7 + """Local/stack variables.""" + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + INSTRUCTION_OPERAND_INDEX_FIELD_NUMBER: builtins.int + OPERAND_EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + STRING_TABLE_INDEX_FIELD_NUMBER: builtins.int + REPEATABLE_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table. This is here to enable + comment processing without having to iterate over all instructions. 
+ There is an N:M mapping of instructions to comments. + """ + instruction_operand_index: builtins.int + """Index into the operand array local to an instruction.""" + operand_expression_index: builtins.int + """Index into the expression array local to an operand, like in Reference. + This is not currently used, but allows to implement expression + substitutions. + """ + string_table_index: builtins.int + """Index into the global string table.""" + repeatable: builtins.bool + """Comment is propagated to all locations that reference the original + location. + """ + type: global___BinExport2.Comment.Type.ValueType + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + instruction_operand_index: builtins.int | None = ..., + operand_expression_index: builtins.int | None = ..., + string_table_index: builtins.int | None = ..., + repeatable: builtins.bool | None = ..., + type: global___BinExport2.Comment.Type.ValueType | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "repeatable", b"repeatable", "string_table_index", b"string_table_index", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "repeatable", b"repeatable", "string_table_index", b"string_table_index", "type", b"type"]) -> None: ... 
+ + @typing_extensions.final + class Section(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ADDRESS_FIELD_NUMBER: builtins.int + SIZE_FIELD_NUMBER: builtins.int + FLAG_R_FIELD_NUMBER: builtins.int + FLAG_W_FIELD_NUMBER: builtins.int + FLAG_X_FIELD_NUMBER: builtins.int + address: builtins.int + """Section start address.""" + size: builtins.int + """Section size.""" + flag_r: builtins.bool + """Read flag of the section, True when section is readable.""" + flag_w: builtins.bool + """Write flag of the section, True when section is writable.""" + flag_x: builtins.bool + """Execute flag of the section, True when section is executable.""" + def __init__( + self, + *, + address: builtins.int | None = ..., + size: builtins.int | None = ..., + flag_r: builtins.bool | None = ..., + flag_w: builtins.bool | None = ..., + flag_x: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "flag_r", b"flag_r", "flag_w", b"flag_w", "flag_x", b"flag_x", "size", b"size"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "flag_r", b"flag_r", "flag_w", b"flag_w", "flag_x", b"flag_x", "size", b"size"]) -> None: ... + + @typing_extensions.final + class Library(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + IS_STATIC_FIELD_NUMBER: builtins.int + LOAD_ADDRESS_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + is_static: builtins.bool + """If this library is statically linked.""" + load_address: builtins.int + """Address where this library was loaded, 0 if unknown.""" + name: builtins.str + """Name of the library (format is platform-dependent).""" + def __init__( + self, + *, + is_static: builtins.bool | None = ..., + load_address: builtins.int | None = ..., + name: builtins.str | None = ..., + ) -> None: ... 
+ def HasField(self, field_name: typing_extensions.Literal["is_static", b"is_static", "load_address", b"load_address", "name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["is_static", b"is_static", "load_address", b"load_address", "name", b"name"]) -> None: ... + + @typing_extensions.final + class Module(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + name: builtins.str + """Name, such as Java class name. Platform-dependent.""" + def __init__( + self, + *, + name: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name"]) -> None: ... + + META_INFORMATION_FIELD_NUMBER: builtins.int + EXPRESSION_FIELD_NUMBER: builtins.int + OPERAND_FIELD_NUMBER: builtins.int + MNEMONIC_FIELD_NUMBER: builtins.int + INSTRUCTION_FIELD_NUMBER: builtins.int + BASIC_BLOCK_FIELD_NUMBER: builtins.int + FLOW_GRAPH_FIELD_NUMBER: builtins.int + CALL_GRAPH_FIELD_NUMBER: builtins.int + STRING_TABLE_FIELD_NUMBER: builtins.int + ADDRESS_COMMENT_FIELD_NUMBER: builtins.int + COMMENT_FIELD_NUMBER: builtins.int + STRING_REFERENCE_FIELD_NUMBER: builtins.int + EXPRESSION_SUBSTITUTION_FIELD_NUMBER: builtins.int + SECTION_FIELD_NUMBER: builtins.int + LIBRARY_FIELD_NUMBER: builtins.int + DATA_REFERENCE_FIELD_NUMBER: builtins.int + MODULE_FIELD_NUMBER: builtins.int + @property + def meta_information(self) -> global___BinExport2.Meta: ... + @property + def expression(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Expression]: ... + @property + def operand(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Operand]: ... 
+ @property + def mnemonic(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Mnemonic]: ... + @property + def instruction(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Instruction]: ... + @property + def basic_block(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.BasicBlock]: ... + @property + def flow_graph(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.FlowGraph]: ... + @property + def call_graph(self) -> global___BinExport2.CallGraph: ... + @property + def string_table(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + @property + def address_comment(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: + """No longer written. This is here so that BinDiff can work with older + BinExport files. + """ + @property + def comment(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Comment]: + """Rich comment index used for BinDiff's comment porting.""" + @property + def string_reference(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: ... + @property + def expression_substitution(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: ... + @property + def section(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Section]: ... + @property + def library(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Library]: ... + @property + def data_reference(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.DataReference]: ... 
+ @property + def module(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Module]: ... + def __init__( + self, + *, + meta_information: global___BinExport2.Meta | None = ..., + expression: collections.abc.Iterable[global___BinExport2.Expression] | None = ..., + operand: collections.abc.Iterable[global___BinExport2.Operand] | None = ..., + mnemonic: collections.abc.Iterable[global___BinExport2.Mnemonic] | None = ..., + instruction: collections.abc.Iterable[global___BinExport2.Instruction] | None = ..., + basic_block: collections.abc.Iterable[global___BinExport2.BasicBlock] | None = ..., + flow_graph: collections.abc.Iterable[global___BinExport2.FlowGraph] | None = ..., + call_graph: global___BinExport2.CallGraph | None = ..., + string_table: collections.abc.Iterable[builtins.str] | None = ..., + address_comment: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + comment: collections.abc.Iterable[global___BinExport2.Comment] | None = ..., + string_reference: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + expression_substitution: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + section: collections.abc.Iterable[global___BinExport2.Section] | None = ..., + library: collections.abc.Iterable[global___BinExport2.Library] | None = ..., + data_reference: collections.abc.Iterable[global___BinExport2.DataReference] | None = ..., + module: collections.abc.Iterable[global___BinExport2.Module] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["call_graph", b"call_graph", "meta_information", b"meta_information"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["address_comment", b"address_comment", "basic_block", b"basic_block", "call_graph", b"call_graph", "comment", b"comment", "data_reference", b"data_reference", "expression", b"expression", "expression_substitution", b"expression_substitution", "flow_graph", b"flow_graph", "instruction", b"instruction", "library", b"library", "meta_information", b"meta_information", "mnemonic", b"mnemonic", "module", b"module", "operand", b"operand", "section", b"section", "string_reference", b"string_reference", "string_table", b"string_table"]) -> None: ... + +global___BinExport2 = BinExport2 diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py new file mode 100644 index 000000000..1c748d61b --- /dev/null +++ b/capa/features/extractors/binexport2/extractor.py @@ -0,0 +1,70 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from typing import List, Tuple, Iterator, Any + +import capa.features.extractors.elf +import capa.features.extractors.binexport2.file +import capa.features.extractors.binexport2.insn +import capa.features.extractors.binexport2.global_ +import capa.features.extractors.binexport2.function +import capa.features.extractors.binexport2.basicblock +from capa.features.common import Feature +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) + + +# TODO(wb): 1755 +TODOType = Any + + +class BinExport2FeatureExtractor(StaticFeatureExtractor): + def __init__(self, be2: TODOType, buf: TODOType): + super().__init__(hashes=SampleHashes.from_bytes(buf)) + self.be2 = be2 + self.buf = buf + self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend(capa.features.extractors.binexport2.file.extract_file_format(self.be2, self.buf)) + self.global_features.extend(capa.features.extractors.binexport2.global_.extract_os(self.be2)) + self.global_features.extend(capa.features.extractors.binexport2.global_.extract_arch(self.be2)) + + def get_base_address(self): + # TODO(wb): 1755 + return AbsoluteVirtualAddress(0x0) + + def extract_global_features(self): + yield from self.global_features + + def extract_file_features(self): + yield from capa.features.extractors.binexport2.file.extract_features(self.be2, self.buf) + + def get_functions(self) -> Iterator[FunctionHandle]: + # TODO(wb): 1755 + yield from () + + def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.binexport2.function.extract_features(fh) + + def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: + # TODO(wb): 1755 + yield from () + + def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + yield 
from capa.features.extractors.binexport2.basicblock.extract_features(fh, bbh) + + def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: + # TODO(wb): 1755 + yield from () + + def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): + yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih) diff --git a/capa/features/extractors/binexport2/file.py b/capa/features/extractors/binexport2/file.py new file mode 100644 index 000000000..47ddb7654 --- /dev/null +++ b/capa/features/extractors/binexport2/file.py @@ -0,0 +1,56 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +from typing import Tuple, Iterator, Any + +from capa.features.common import Feature +from capa.features.address import Address + +# TODO(wb): 1755 +TODOType = Any + + +def extract_file_export_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_file_import_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_file_section_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_file_strings(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_file_format(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + # TODO(wb): 1755 + yield from () + + +def extract_features(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + """extract file features""" + for file_handler in FILE_HANDLERS: + for feature, addr in file_handler(be2, buf): + yield feature, addr + + +FILE_HANDLERS = ( + extract_file_export_names, + extract_file_import_names, + extract_file_strings, + extract_file_section_names, + extract_file_format, +) diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py new file mode 100644 index 000000000..53a0088b7 --- /dev/null +++ b/capa/features/extractors/binexport2/function.py @@ -0,0 +1,41 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. +from typing import Tuple, Iterator + +from capa.features.common import Feature +from capa.features.address import Address +from capa.features.extractors.base_extractor import FunctionHandle + + +def extract_function_calls_to(fh: FunctionHandle): + # TODO(wb): 1755 + yield from () + + +def extract_function_loop(fh: FunctionHandle): + # TODO(wb): 1755 + yield from () + + +def extract_recursive_call(fh: FunctionHandle): + # TODO(wb): 1755 + yield from () + + +def extract_function_name(fh: FunctionHandle): + # TODO(wb): 1755 + yield from () + + +def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + for func_handler in FUNCTION_HANDLERS: + for feature, addr in func_handler(fh): + yield feature, addr + + +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name) diff --git a/capa/features/extractors/binexport2/global_.py b/capa/features/extractors/binexport2/global_.py new file mode 100644 index 000000000..48f39d392 --- /dev/null +++ b/capa/features/extractors/binexport2/global_.py @@ -0,0 +1,33 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import Tuple, Iterator, Any + +from capa.features.common import Feature, Arch, ARCH_AARCH64 +from capa.features.address import Address, NO_ADDRESS +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +logger = logging.getLogger(__name__) + + +def extract_os(be2: BinExport2) -> Iterator[Tuple[Feature, Address]]: + # fetch from the buf. + # TODO(wb): 1755 + yield from () + + +def extract_arch(be2: BinExport2) -> Iterator[Tuple[Feature, Address]]: + arch = be2.meta_information.architecture_name + # TODO: where does this come from? is it from the BinExport extractor? is there any schema?? + if arch == "aarch64": + yield Arch(ARCH_AARCH64), NO_ADDRESS + # TODO: x86, etc. + else: + logger.debug("unsupported architecture: %s", arch) + return diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py new file mode 100644 index 000000000..c7e6f6126 --- /dev/null +++ b/capa/features/extractors/binexport2/insn.py @@ -0,0 +1,93 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from typing import Any, List, Tuple, Iterator, Optional
+
+from capa.features.common import Feature
+from capa.features.address import Address
+from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
+
+
+def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_number_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_string_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_offset_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_nzxor_characteristic_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_insn_mnemonic_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
+    """extract function calls from features
+
+    most relevant at the function scope, however, it's most efficient to extract at the instruction scope
+    """
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_function_indirect_call_characteristic_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    # TODO(wb): 1755
+    yield from ()
+
+
+def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> 
Iterator[Tuple[Feature, Address]]: + """extract instruction features""" + for inst_handler in INSTRUCTION_HANDLERS: + for feature, ea in inst_handler(f, bbh, insn): + yield feature, ea + + +INSTRUCTION_HANDLERS = ( + extract_insn_api_features, + extract_insn_number_features, + extract_insn_bytes_features, + extract_insn_string_features, + extract_insn_offset_features, + extract_insn_nzxor_characteristic_features, + extract_insn_mnemonic_features, + extract_function_calls_from, + extract_function_indirect_call_characteristic_features, +) From ebdc5fc5849bd389d3d78476225347e2ea365683 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:54:49 +0000 Subject: [PATCH 008/200] main: remove references to wip BinExport2 code --- capa/main.py | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/capa/main.py b/capa/main.py index 6b0899821..3151f2f4c 100644 --- a/capa/main.py +++ b/capa/main.py @@ -73,7 +73,6 @@ OS_MACOS, FORMAT_PE, FORMAT_ELF, - FORMAT_BINEXPORT2, OS_WINDOWS, FORMAT_AUTO, FORMAT_CAPE, @@ -100,7 +99,6 @@ BACKEND_BINJA = "binja" BACKEND_PEFILE = "pefile" BACKEND_CAPE = "cape" -BACKEND_BINEXPORT2 = "binexport2" BACKEND_DEFAULT = "(default) use default backend for given file type" E_MISSING_RULES = 10 @@ -376,19 +374,6 @@ def get_extractor( return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) - elif backend == BACKEND_BINEXPORT2: - import capa.features.extractors.binexport2 - import capa.features.extractors.binexport2.extractor - - be2 = capa.features.extractors.binexport2.get_binexport2(input_path) - assert sample_path is not None - # we let BinExport support a wide array of Arch/OS/etc. - # it can be an intermediate representation for us. - # therefore, don't restrict format/arch/OS. 
- buf = sample_path.read_bytes() - - return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) - else: raise ValueError("unexpected backend: " + backend) @@ -410,10 +395,6 @@ def get_file_extractors(input: Path, input_format: str) -> List[FeatureExtractor report = json.load(Path(input).open(encoding="utf-8")) file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) - elif input_format == FORMAT_BINEXPORT2: - # TODO(wb): 1755 - pass - return file_extractors @@ -873,7 +854,7 @@ def install_common_args(parser, wanted=None): "--backend", type=str, help="select the backend to use", - choices=(BACKEND_VIV, BACKEND_BINJA, BACKEND_PEFILE, BACKEND_CAPE, BACKEND_BINEXPORT2), + choices=(BACKEND_VIV, BACKEND_BINJA, BACKEND_PEFILE, BACKEND_CAPE), default=BACKEND_DEFAULT, ) @@ -1129,9 +1110,6 @@ def get_backend_from_args(args, input_format: str) -> str: if input_format == FORMAT_CAPE: return BACKEND_CAPE - elif input_format == FORMAT_BINEXPORT2: - return BACKEND_BINEXPORT2 - elif input_format == FORMAT_DOTNET: return BACKEND_DOTNET @@ -1153,12 +1131,7 @@ def get_sample_path_from_args(args, backend: str) -> Optional[Path]: raises: ShouldExitError: if the program is invoked incorrectly and should exit. 
""" - if backend == BACKEND_BINEXPORT2: - import capa.features.extractors.binexport2 - - be2 = capa.features.extractors.binexport2.get_binexport2(args.input) - return capa.features.extractors.binexport2.get_sample_from_binexport2(be2) - elif backend == BACKEND_CAPE: + if backend == BACKEND_CAPE: return None else: return args.input From 4b039cdd15caa284a54bdc15af6f1a3b026a2ad8 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 11:56:36 +0000 Subject: [PATCH 009/200] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4931831ad..ca689b03d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ - protobuf: deprecate `Metadata.analysis` in favor of `Metadata.analysis2` that is dynamic analysis aware @williballenthin - update freeze format to v3, adding support for dynamic analysis @williballenthin - extractor: ignore DLL name for api features #1815 @mr-tz +- main: introduce wrapping routines within main for working with CLI args #1813 @williballenthin ### New Rules (41) From dfa6c287c7824b047132ca0316f397d3445b1479 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 15:15:47 +0000 Subject: [PATCH 010/200] main: rename first position argument "input_file" closes #1946 --- capa/main.py | 58 ++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/capa/main.py b/capa/main.py index 3151f2f4c..c41291f18 100644 --- a/capa/main.py +++ b/capa/main.py @@ -292,7 +292,7 @@ def get_extractor( if backend == BACKEND_CAPE: import capa.features.extractors.cape.extractor - report = json.load(Path(input_path).open(encoding="utf-8")) + report = json.loads(input_path.read_text(encoding="utf-8")) return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report) elif backend == BACKEND_DOTNET: @@ -378,21 +378,21 @@ def get_extractor( raise ValueError("unexpected backend: " + backend) -def get_file_extractors(input: 
Path, input_format: str) -> List[FeatureExtractor]: +def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = [] if input_format == FORMAT_PE: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input)) + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) elif input_format == FORMAT_DOTNET: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input)) - file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input)) + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) + file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) elif input_format == FORMAT_ELF: - file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input)) + file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) elif input_format == FORMAT_CAPE: - report = json.load(Path(input).open(encoding="utf-8")) + report = json.loads(input_file.read_text(encoding="utf-8")) file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) return file_extractors @@ -563,7 +563,7 @@ def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): def collect_metadata( argv: List[str], - sample_path: Path, + input_path: Path, input_format: str, os_: str, rules_path: List[Path], @@ -601,7 +601,7 @@ def collect_metadata( md5=md5, sha1=sha1, sha256=sha256, - path=Path(sample_path).resolve().as_posix(), + path=input_path.resolve().as_posix(), ), analysis=get_sample_analysis( input_format, @@ -780,7 +780,7 @@ def install_common_args(parser, wanted=None): args: parser (argparse.ArgumentParser): a parser to update in place, adding common arguments. 
wanted (Set[str]): collection of arguments to opt-into, including: - - "input": required positional argument to input file. + - "input_file": required positional argument to input file. - "format": flag to override file format. - "os": flag to override file operating system. - "backend": flag to override analysis backend. @@ -814,16 +814,16 @@ def install_common_args(parser, wanted=None): # # arguments that may be opted into: # - # - input + # - input_file # - format # - os # - rules # - tag # - if "input" in wanted: + if "input_file" in wanted: parser.add_argument( - "input", + "input_file", type=str, help="path to file to analyze", ) @@ -983,8 +983,8 @@ def handle_common_args(args): if not args.debug: sys.excepthook = simple_message_exception_handler # type: ignore[assignment] - if hasattr(args, "input"): - args.input = Path(args.input) + if hasattr(args, "input_file"): + args.input_file = Path(args.input_file) if hasattr(args, "rules"): rules_paths: List[Path] = [] @@ -1056,7 +1056,7 @@ def ensure_input_exists_from_args(args): ShouldExitError: if the program is invoked incorrectly and should exit. 
""" try: - _ = get_file_taste(args.input) + _ = get_file_taste(args.input_file) except IOError as e: # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we # handle the IOError separately and reach into the args @@ -1083,9 +1083,9 @@ def get_input_format_from_args(args) -> str: return format try: - return get_auto_format(args.input) + return get_auto_format(args.input_file) except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) + logger.error("Input file '%s' is not a valid PE file: %s", args.input_file, str(e)) raise ShouldExitError(E_CORRUPT_FILE) from e except UnsupportedFormatError as e: log_unsupported_format_error() @@ -1134,7 +1134,7 @@ def get_sample_path_from_args(args, backend: str) -> Optional[Path]: if backend == BACKEND_CAPE: return None else: - return args.input + return args.input_file def get_os_from_args(args, backend) -> str: @@ -1224,12 +1224,12 @@ def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtrac # this pass can inspect multiple file extractors, e.g., dotnet and pe to identify # various limitations try: - return get_file_extractors(args.input, input_format) + return get_file_extractors(args.input_file, input_format) except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) + logger.error("Input file '%s' is not a valid PE file: %s", args.input_file, str(e)) return E_CORRUPT_FILE except (ELFError, OverflowError) as e: - logger.error("Input file '%s' is not a valid ELF file: %s", args.input, str(e)) + logger.error("Input file '%s' is not a valid ELF file: %s", args.input_file, str(e)) return E_CORRUPT_FILE except UnsupportedFormatError as e: if input_format == FORMAT_CAPE: @@ -1263,10 +1263,10 @@ def find_file_limitations_from_args(args, rules: RuleSet, file_extractors: List[ try: pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {}) except 
PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.input, str(e)) + logger.error("Input file '%s' is not a valid PE file: %s", args.input_file, str(e)) raise ShouldExitError(E_CORRUPT_FILE) from e except (ELFError, OverflowError) as e: - logger.error("Input file '%s' is not a valid ELF file: %s", args.input, str(e)) + logger.error("Input file '%s' is not a valid ELF file: %s", args.input_file, str(e)) raise ShouldExitError(E_CORRUPT_FILE) from e # file limitations that rely on non-file scope won't be detected here. @@ -1293,7 +1293,7 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt """ if input_format == FORMAT_FREEZE: # freeze format deserializes directly into an extractor - return frz.load(Path(args.input).read_bytes()) + return frz.load(args.input_file.read_bytes()) else: # all other formats we must create an extractor, # such as viv, binary ninja, etc. workspaces @@ -1321,7 +1321,7 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt # https://github.com/mandiant/capa/issues/1813 try: return get_extractor( - args.input, + args.input_file, input_format, os_, backend, @@ -1383,7 +1383,7 @@ def main(argv: Optional[List[str]] = None): parser = argparse.ArgumentParser( description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter ) - install_common_args(parser, {"input", "format", "backend", "os", "signatures", "rules", "tag"}) + install_common_args(parser, {"input_file", "format", "backend", "os", "signatures", "rules", "tag"}) parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") args = parser.parse_args(args=argv) @@ -1403,7 +1403,7 @@ def main(argv: Optional[List[str]] = None): if input_format == FORMAT_RESULT: # result document directly parses into meta, capabilities - result_doc = capa.render.result_document.ResultDocument.from_file(Path(args.input)) + result_doc = 
capa.render.result_document.ResultDocument.from_file(args.input_file) meta, capabilities = result_doc.to_capa() else: @@ -1420,7 +1420,7 @@ def main(argv: Optional[List[str]] = None): capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta = collect_metadata(argv, args.input, input_format, os, args.rules, extractor, counts) + meta = collect_metadata(argv, args.input_file, input_format, os, args.rules, extractor, counts) meta.analysis.layout = compute_layout(rules, extractor, capabilities) if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation: From 8c2c486ca8fe1863060b78473dbbba19aa294675 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 15:22:59 +0000 Subject: [PATCH 011/200] main: linters --- capa/main.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/capa/main.py b/capa/main.py index c41291f18..ec34e581f 100644 --- a/capa/main.py +++ b/capa/main.py @@ -281,7 +281,7 @@ def get_extractor( sigpaths: List[Path], should_save_workspace=False, disable_progress=False, - sample_path: Optional[Path]=None, + sample_path: Optional[Path] = None, ) -> FeatureExtractor: """ raises: @@ -580,7 +580,9 @@ def collect_metadata( extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)] extractor_os = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)] - input_format = str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format + input_format = ( + str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format + ) arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ @@ -918,6 +920,7 @@ def install_common_args(parser, wanted=None): class ShouldExitError(Exception): """raised when a main-related routine indicates 
the program should exit.""" + def __init__(self, status_code: int): self.status_code = status_code @@ -1159,7 +1162,7 @@ def get_os_from_args(args, backend) -> str: return get_os(sample_path) -def get_rules_from_args(args) -> str: +def get_rules_from_args(args) -> RuleSet: """ args: args: The parsed command line arguments from `install_common_args`. @@ -1227,10 +1230,10 @@ def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtrac return get_file_extractors(args.input_file, input_format) except PEFormatError as e: logger.error("Input file '%s' is not a valid PE file: %s", args.input_file, str(e)) - return E_CORRUPT_FILE + raise ShouldExitError(E_CORRUPT_FILE) from e except (ELFError, OverflowError) as e: logger.error("Input file '%s' is not a valid ELF file: %s", args.input_file, str(e)) - return E_CORRUPT_FILE + raise ShouldExitError(E_CORRUPT_FILE) from e except UnsupportedFormatError as e: if input_format == FORMAT_CAPE: log_unsupported_cape_report_error(str(e)) @@ -1413,14 +1416,17 @@ def main(argv: Optional[List[str]] = None): try: backend = get_backend_from_args(args, input_format) sample_path = get_sample_path_from_args(args, backend) - os = get_os(sample_path) + if sample_path is None: + os_ = "unknown" + else: + os_ = get_os(sample_path) extractor = get_extractor_from_args(args, input_format, backend) except ShouldExitError as e: return e.status_code capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta = collect_metadata(argv, args.input_file, input_format, os, args.rules, extractor, counts) + meta = collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) meta.analysis.layout = compute_layout(rules, extractor, capabilities) if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation: From a9e1fd90d506c06a55311034cedfe1a459715c4b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 15:36:07 +0000 Subject: [PATCH 012/200] main: 
move rule-related routines to capa.rules ref #1821 --- capa/ghidra/capa_ghidra.py | 4 +- capa/ida/plugin/form.py | 2 +- capa/main.py | 112 +---------------------- capa/rules/__init__.py | 105 ++++++++++++++++++++- scripts/bulk-process.py | 2 +- scripts/cache-ruleset.py | 2 +- scripts/capa2yara.py | 2 +- scripts/capa_as_library.py | 2 +- scripts/detect_duplicate_features.py | 2 +- scripts/lint.py | 2 +- scripts/profile-time.py | 2 +- scripts/show-capabilities-by-function.py | 2 +- scripts/show-unused-features.py | 2 +- 13 files changed, 121 insertions(+), 120 deletions(-) diff --git a/capa/ghidra/capa_ghidra.py b/capa/ghidra/capa_ghidra.py index 70b98df56..2594edb71 100644 --- a/capa/ghidra/capa_ghidra.py +++ b/capa/ghidra/capa_ghidra.py @@ -69,7 +69,7 @@ def run_headless(): rules_path = pathlib.Path(args.rules) logger.debug("rule path: %s", rules_path) - rules = capa.main.get_rules([rules_path]) + rules = capa.rules.get_rules([rules_path]) meta = capa.ghidra.helpers.collect_metadata([rules_path]) extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor() @@ -119,7 +119,7 @@ def run_ui(): rules_path: pathlib.Path = pathlib.Path(rules_dir) logger.info("running capa using rules from %s", str(rules_path)) - rules = capa.main.get_rules([rules_path]) + rules = capa.rules.get_rules([rules_path]) meta = capa.ghidra.helpers.collect_metadata([rules_path]) extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor() diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 4e1bd572a..e9249a77f 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -636,7 +636,7 @@ def on_load_rule(_, i, total): if ida_kernwin.user_cancelled(): raise UserCancelledError("user cancelled") - return capa.main.get_rules([rule_path], on_load_rule=on_load_rule) + return capa.rules.get_rules([rule_path], on_load_rule=on_load_rule) except UserCancelledError: logger.info("User cancelled analysis.") return None diff --git a/capa/main.py 
b/capa/main.py index ec34e581f..f5dd205d7 100644 --- a/capa/main.py +++ b/capa/main.py @@ -19,7 +19,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Set, Dict, List, Callable, Optional +from typing import Any, Set, Dict, List, Optional from pathlib import Path import halo @@ -49,7 +49,7 @@ import capa.features.extractors.dotnetfile import capa.features.extractors.base_extractor import capa.features.extractors.cape.extractor -from capa.rules import Rule, RuleSet +from capa.rules import RuleSet from capa.engine import MatchResults from capa.helpers import ( get_file_taste, @@ -398,108 +398,6 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr return file_extractors -def is_nursery_rule_path(path: Path) -> bool: - """ - The nursery is a spot for rules that have not yet been fully polished. - For example, they may not have references to public example of a technique. - Yet, we still want to capture and report on their matches. - The nursery is currently a subdirectory of the rules directory with that name. - - When nursery rules are loaded, their metadata section should be updated with: - `nursery=True`. - """ - return "nursery" in path.parts - - -def collect_rule_file_paths(rule_paths: List[Path]) -> List[Path]: - """ - collect all rule file paths, including those in subdirectories. 
- """ - rule_file_paths = [] - for rule_path in rule_paths: - if not rule_path.exists(): - raise IOError(f"rule path {rule_path} does not exist or cannot be accessed") - - if rule_path.is_file(): - rule_file_paths.append(rule_path) - elif rule_path.is_dir(): - logger.debug("reading rules from directory %s", rule_path) - for root, _, files in os.walk(rule_path): - if ".git" in root: - # the .github directory contains CI config in capa-rules - # this includes some .yml files - # these are not rules - # additionally, .git has files that are not .yml and generate the warning - # skip those too - continue - for file in files: - if not file.endswith(".yml"): - if not (file.startswith(".git") or file.endswith((".git", ".md", ".txt"))): - # expect to see .git* files, readme.md, format.md, and maybe a .git directory - # other things maybe are rules, but are mis-named. - logger.warning("skipping non-.yml file: %s", file) - continue - rule_file_paths.append(Path(root) / file) - return rule_file_paths - - -# TypeAlias. note: using `foo: TypeAlias = bar` is Python 3.10+ -RulePath = Path - - -def on_load_rule_default(_path: RulePath, i: int, _total: int) -> None: - return - - -def get_rules( - rule_paths: List[RulePath], - cache_dir=None, - on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default, -) -> RuleSet: - """ - args: - rule_paths: list of paths to rules files or directories containing rules files - cache_dir: directory to use for caching rules, or will use the default detected cache directory if None - on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation - """ - if cache_dir is None: - cache_dir = capa.rules.cache.get_default_cache_directory() - # rule_paths may contain directory paths, - # so search for file paths recursively. - rule_file_paths = collect_rule_file_paths(rule_paths) - - # this list is parallel to `rule_file_paths`: - # rule_file_paths[i] corresponds to rule_contents[i]. 
- rule_contents = [file_path.read_bytes() for file_path in rule_file_paths] - - ruleset = capa.rules.cache.load_cached_ruleset(cache_dir, rule_contents) - if ruleset is not None: - return ruleset - - rules: List[Rule] = [] - - total_rule_count = len(rule_file_paths) - for i, (path, content) in enumerate(zip(rule_file_paths, rule_contents)): - on_load_rule(path, i, total_rule_count) - - try: - rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) - except capa.rules.InvalidRule: - raise - else: - rule.meta["capa/path"] = path.as_posix() - rule.meta["capa/nursery"] = is_nursery_rule_path(path) - - rules.append(rule) - logger.debug("loaded rule: '%s' with scope: %s", rule.name, rule.scopes) - - ruleset = capa.rules.RuleSet(rules) - - capa.rules.cache.cache_ruleset(cache_dir, ruleset) - - return ruleset - - def get_signatures(sigs_path: Path) -> List[Path]: if not sigs_path.exists(): raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed") @@ -1176,7 +1074,7 @@ def get_rules_from_args(args) -> RuleSet: else: cache_dir = capa.rules.cache.get_default_cache_directory() - rules = get_rules(args.rules, cache_dir=cache_dir) + rules = capa.rules.get_rules(args.rules, cache_dir=cache_dir) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) logger.error( @@ -1474,7 +1372,7 @@ def ida_main(): rules_path = get_default_root() / "rules" logger.debug("rule path: %s", rules_path) - rules = get_rules([rules_path]) + rules = capa.rules.get_rules([rules_path]) meta = capa.ida.helpers.collect_metadata([rules_path]) @@ -1508,7 +1406,7 @@ def ghidra_main(): rules_path = get_default_root() / "rules" logger.debug("rule path: %s", rules_path) - rules = get_rules([rules_path]) + rules = capa.rules.get_rules([rules_path]) meta = capa.ghidra.helpers.collect_metadata([rules_path]) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index b5423ad92..d9e43dfc5 100644 --- a/capa/rules/__init__.py +++ 
b/capa/rules/__init__.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import io +import os import re import uuid import codecs @@ -25,7 +26,7 @@ # https://github.com/python/mypy/issues/1153 from backports.functools_lru_cache import lru_cache # type: ignore -from typing import Any, Set, Dict, List, Tuple, Union, Iterator, Optional +from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional from dataclasses import asdict, dataclass import yaml @@ -1691,3 +1692,105 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat matches.update(hard_matches) return (features3, matches) + + +def is_nursery_rule_path(path: Path) -> bool: + """ + The nursery is a spot for rules that have not yet been fully polished. + For example, they may not have references to public example of a technique. + Yet, we still want to capture and report on their matches. + The nursery is currently a subdirectory of the rules directory with that name. + + When nursery rules are loaded, their metadata section should be updated with: + `nursery=True`. + """ + return "nursery" in path.parts + + +def collect_rule_file_paths(rule_paths: List[Path]) -> List[Path]: + """ + collect all rule file paths, including those in subdirectories. 
+ """ + rule_file_paths = [] + for rule_path in rule_paths: + if not rule_path.exists(): + raise IOError(f"rule path {rule_path} does not exist or cannot be accessed") + + if rule_path.is_file(): + rule_file_paths.append(rule_path) + elif rule_path.is_dir(): + logger.debug("reading rules from directory %s", rule_path) + for root, _, files in os.walk(rule_path): + if ".git" in root: + # the .github directory contains CI config in capa-rules + # this includes some .yml files + # these are not rules + # additionally, .git has files that are not .yml and generate the warning + # skip those too + continue + for file in files: + if not file.endswith(".yml"): + if not (file.startswith(".git") or file.endswith((".git", ".md", ".txt"))): + # expect to see .git* files, readme.md, format.md, and maybe a .git directory + # other things maybe are rules, but are mis-named. + logger.warning("skipping non-.yml file: %s", file) + continue + rule_file_paths.append(Path(root) / file) + return rule_file_paths + + +# TypeAlias. note: using `foo: TypeAlias = bar` is Python 3.10+ +RulePath = Path + + +def on_load_rule_default(_path: RulePath, i: int, _total: int) -> None: + return + + +def get_rules( + rule_paths: List[RulePath], + cache_dir=None, + on_load_rule: Callable[[RulePath, int, int], None] = on_load_rule_default, +) -> RuleSet: + """ + args: + rule_paths: list of paths to rules files or directories containing rules files + cache_dir: directory to use for caching rules, or will use the default detected cache directory if None + on_load_rule: callback to invoke before a rule is loaded, use for progress or cancellation + """ + if cache_dir is None: + cache_dir = capa.rules.cache.get_default_cache_directory() + # rule_paths may contain directory paths, + # so search for file paths recursively. + rule_file_paths = collect_rule_file_paths(rule_paths) + + # this list is parallel to `rule_file_paths`: + # rule_file_paths[i] corresponds to rule_contents[i]. 
+ rule_contents = [file_path.read_bytes() for file_path in rule_file_paths] + + ruleset = capa.rules.cache.load_cached_ruleset(cache_dir, rule_contents) + if ruleset is not None: + return ruleset + + rules: List[Rule] = [] + + total_rule_count = len(rule_file_paths) + for i, (path, content) in enumerate(zip(rule_file_paths, rule_contents)): + on_load_rule(path, i, total_rule_count) + + try: + rule = capa.rules.Rule.from_yaml(content.decode("utf-8")) + except capa.rules.InvalidRule: + raise + else: + rule.meta["capa/path"] = path.as_posix() + rule.meta["capa/nursery"] = is_nursery_rule_path(path) + + rules.append(rule) + logger.debug("loaded rule: '%s' with scope: %s", rule.name, rule.scopes) + + ruleset = capa.rules.RuleSet(rules) + + capa.rules.cache.cache_ruleset(cache_dir, ruleset) + + return ruleset diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index 8950b8936..0f6422c18 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -161,7 +161,7 @@ def main(argv=None): capa.main.handle_common_args(args) try: - rules = capa.main.get_rules(args.rules) + rules = capa.rules.get_rules(args.rules) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 6630f2eea..89137650d 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -49,7 +49,7 @@ def main(argv=None): try: cache_dir = Path(args.cache) cache_dir.mkdir(parents=True, exist_ok=True) - rules = capa.main.get_rules(args.rules, cache_dir) + rules = capa.rules.get_rules(args.rules, cache_dir) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 5fe5c0849..56fd0e8cb 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -741,7 +741,7 
@@ def main(argv=None): logging.getLogger("capa2yara").setLevel(level) try: - rules = capa.main.get_rules([Path(args.rules)]) + rules = capa.rules.get_rules([Path(args.rules)]) namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) logger.info("successfully loaded %d rules (including subscope rules which will be ignored)", len(rules)) if args.tag: diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 611576908..e6b8bf429 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -170,7 +170,7 @@ def render_dictionary(doc: rd.ResultDocument) -> Dict[str, Any]: # ==== render dictionary helpers def capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): # load rules from disk - rules = capa.main.get_rules([rules_path]) + rules = capa.rules.get_rules([rules_path]) # extract features and find capabilities extractor = capa.main.get_extractor( diff --git a/scripts/detect_duplicate_features.py b/scripts/detect_duplicate_features.py index 6737d7fa9..9561339c2 100644 --- a/scripts/detect_duplicate_features.py +++ b/scripts/detect_duplicate_features.py @@ -48,7 +48,7 @@ def find_overlapping_rules(new_rule_path, rules_path): overlapping_rules = [] # capa.rules.RuleSet stores all rules in given paths - ruleset = capa.main.get_rules(rules_path) + ruleset = capa.rules.get_rules(rules_path) for rule_name, rule in ruleset.rules.items(): rule_features = rule.extract_all_features() diff --git a/scripts/lint.py b/scripts/lint.py index edcf9f563..b24aa1349 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -1002,7 +1002,7 @@ def main(argv=None): time0 = time.time() try: - rules = capa.main.get_rules(args.rules) + rules = capa.rules.get_rules(args.rules) logger.info("successfully loaded %s rules", rules.source_rule_count) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 86590a800..f9615cba6 100644 --- a/scripts/profile-time.py 
+++ b/scripts/profile-time.py @@ -91,7 +91,7 @@ def main(argv=None): try: with capa.main.timing("load rules"): - rules = capa.main.get_rules(args.rules) + rules = capa.rules.get_rules(args.rules) except IOError as e: logger.error("%s", str(e)) return -1 diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 421c6c7e1..c09797ec2 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -153,7 +153,7 @@ def main(argv=None): return -1 try: - rules = capa.main.get_rules(args.rules) + rules = capa.rules.get_rules(args.rules) logger.info("successfully loaded %s rules", len(rules)) if args.tag: rules = rules.filter_rules_by_meta(args.tag) diff --git a/scripts/show-unused-features.py b/scripts/show-unused-features.py index ddd236614..b030995c3 100644 --- a/scripts/show-unused-features.py +++ b/scripts/show-unused-features.py @@ -43,7 +43,7 @@ def format_address(addr: capa.features.address.Address) -> str: def get_rules_feature_set(rules_path) -> Set[Feature]: - ruleset = capa.main.get_rules(rules_path) + ruleset = capa.rules.get_rules(rules_path) rules_feature_set: Set[Feature] = set() for _, rule in ruleset.rules.items(): rules_feature_set.update(rule.extract_all_features()) From d8d7f2800c339d319ca39ac2aac2258089764331 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 25 Jan 2024 16:08:00 +0000 Subject: [PATCH 013/200] main: extract routines to capa.loader module closes #1821 --- capa/features/freeze/__init__.py | 5 +- capa/ghidra/capa_ghidra.py | 4 +- capa/helpers.py | 9 + capa/ida/plugin/form.py | 2 +- capa/main.py | 525 +---------------------- scripts/bulk-process.py | 8 +- scripts/capa_as_library.py | 6 +- scripts/lint.py | 2 +- scripts/profile-time.py | 4 +- scripts/show-capabilities-by-function.py | 8 +- scripts/show-features.py | 5 +- scripts/show-unused-features.py | 4 +- tests/fixtures.py | 6 +- 13 files changed, 53 insertions(+), 535 deletions(-) diff --git 
a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index 9e3f73310..b5b0f7f92 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -21,6 +21,7 @@ # https://github.com/mandiant/capa/issues/1699 from typing_extensions import TypeAlias +import capa.loader import capa.helpers import capa.version import capa.features.file @@ -686,9 +687,9 @@ def main(argv=None): args = parser.parse_args(args=argv) capa.main.handle_common_args(args) - sigpaths = capa.main.get_signatures(args.signatures) + sigpaths = capa.loader.get_signatures(args.signatures) - extractor = capa.main.get_extractor(args.sample, args.format, args.os, args.backend, sigpaths, False) + extractor = capa.loader.get_extractor(args.sample, args.format, args.os, args.backend, sigpaths, False) Path(args.output).write_bytes(dump(extractor)) diff --git a/capa/ghidra/capa_ghidra.py b/capa/ghidra/capa_ghidra.py index 2594edb71..b3ec0183b 100644 --- a/capa/ghidra/capa_ghidra.py +++ b/capa/ghidra/capa_ghidra.py @@ -78,7 +78,7 @@ def run_headless(): meta.analysis.feature_counts = counts["feature_counts"] meta.analysis.library_functions = counts["library_functions"] - meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=True): logger.info("capa encountered warnings during analysis") @@ -128,7 +128,7 @@ def run_ui(): meta.analysis.feature_counts = counts["feature_counts"] meta.analysis.library_functions = counts["library_functions"] - meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=False): logger.info("capa encountered warnings during analysis") diff --git a/capa/helpers.py 
b/capa/helpers.py index 89dad8b91..a85271af1 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import sys import json import inspect import logging @@ -201,3 +202,11 @@ def log_unsupported_runtime_error(): " If you're seeing this message on the command line, please ensure you're running a supported Python version." ) logger.error("-" * 80) + + +def is_running_standalone() -> bool: + """ + are we running from a PyInstaller'd executable? + if so, then we'll be able to access `sys._MEIPASS` for the packaged resources. + """ + return hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS") diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index e9249a77f..ddd4c4e0d 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -775,7 +775,7 @@ def slot_progress_feature_extraction(text): meta.analysis.feature_counts = counts["feature_counts"] meta.analysis.library_functions = counts["library_functions"] - meta.analysis.layout = capa.main.compute_layout(ruleset, self.feature_extractor, capabilities) + meta.analysis.layout = capa.loader.compute_layout(ruleset, self.feature_extractor, capabilities) except UserCancelledError: logger.info("User cancelled analysis.") return False diff --git a/capa/main.py b/capa/main.py index f5dd205d7..de1101ab4 100644 --- a/capa/main.py +++ b/capa/main.py @@ -11,26 +11,23 @@ import io import os import sys -import json import time import logging import argparse -import datetime import textwrap import contextlib from types import TracebackType -from typing import Any, Set, Dict, List, Optional +from typing import Any, Dict, List, Optional from pathlib import Path -import halo import colorama from pefile 
import PEFormatError -from typing_extensions import assert_never from elftools.common.exceptions import ELFError import capa.perf import capa.rules import capa.engine +import capa.loader import capa.helpers import capa.version import capa.render.json @@ -51,6 +48,7 @@ import capa.features.extractors.cape.extractor from capa.rules import RuleSet from capa.engine import MatchResults +from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_PEFILE from capa.helpers import ( get_file_taste, get_auto_format, @@ -82,23 +80,11 @@ FORMAT_FREEZE, FORMAT_RESULT, ) -from capa.features.address import Address from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities -from capa.features.extractors.base_extractor import ( - SampleHashes, - FeatureExtractor, - StaticFeatureExtractor, - DynamicFeatureExtractor, -) +from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" - -BACKEND_VIV = "vivisect" -BACKEND_DOTNET = "dotnet" -BACKEND_BINJA = "binja" -BACKEND_PEFILE = "pefile" -BACKEND_CAPE = "cape" BACKEND_DEFAULT = "(default) use default backend for given file type" E_MISSING_RULES = 10 @@ -137,73 +123,13 @@ def set_vivisect_log_level(level): logging.getLogger("Elf").setLevel(level) -def is_supported_format(sample: Path) -> bool: - """ - Return if this is a supported file based on magic header values - """ - taste = sample.open("rb").read(0x100) - - return len(list(capa.features.extractors.common.extract_format(taste))) == 1 - - -def is_supported_arch(sample: Path) -> bool: - buf = sample.read_bytes() - - return len(list(capa.features.extractors.common.extract_arch(buf))) == 1 - - -def get_arch(sample: Path) -> str: - buf = sample.read_bytes() - - for feature, _ in capa.features.extractors.common.extract_arch(buf): - assert 
isinstance(feature.value, str) - return feature.value - - return "unknown" - - -def is_supported_os(sample: Path) -> bool: - buf = sample.read_bytes() - - return len(list(capa.features.extractors.common.extract_os(buf))) == 1 - - -def get_os(sample: Path) -> str: - buf = sample.read_bytes() - - for feature, _ in capa.features.extractors.common.extract_os(buf): - assert isinstance(feature.value, str) - return feature.value - - return "unknown" - - -def get_meta_str(vw): - """ - Return workspace meta information string - """ - meta = [] - for k in ["Format", "Platform", "Architecture"]: - if k in vw.metadata: - meta.append(f"{k.lower()}: {vw.metadata[k]}") - return f"{', '.join(meta)}, number of functions: {len(vw.getFunctions())}" - - -def is_running_standalone() -> bool: - """ - are we running from a PyInstaller'd executable? - if so, then we'll be able to access `sys._MEIPASS` for the packaged resources. - """ - return hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS") - - def get_default_root() -> Path: """ get the file system path to the default resources directory. under PyInstaller, this comes from _MEIPASS. under source, this is the root directory of the project. """ - if is_running_standalone(): + if capa.helpers.is_running_standalone(): # pylance/mypy don't like `sys._MEIPASS` because this isn't standard. # its injected by pyinstaller. # so we'll fetch this attribute dynamically. @@ -228,425 +154,6 @@ def get_default_signatures() -> List[Path]: return ret -def get_workspace(path: Path, input_format: str, sigpaths: List[Path]): - """ - load the program at the given path into a vivisect workspace using the given format. - also apply the given FLIRT signatures. - - supported formats: - - pe - - elf - - shellcode 32-bit - - shellcode 64-bit - - auto - - this creates and analyzes the workspace; however, it does *not* save the workspace. - this is the responsibility of the caller. 
- """ - - # lazy import enables us to not require viv if user wants SMDA, for example. - import viv_utils - import viv_utils.flirt - - logger.debug("generating vivisect workspace for: %s", path) - if input_format == FORMAT_AUTO: - if not is_supported_format(path): - raise UnsupportedFormatError() - - # don't analyze, so that we can add our Flirt function analyzer first. - vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) - elif input_format in {FORMAT_PE, FORMAT_ELF}: - vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) - elif input_format == FORMAT_SC32: - # these are not analyzed nor saved. - vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="i386", analyze=False) - elif input_format == FORMAT_SC64: - vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False) - else: - raise ValueError("unexpected format: " + input_format) - - viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths]) - - vw.analyze() - - logger.debug("%s", get_meta_str(vw)) - return vw - - -def get_extractor( - input_path: Path, - input_format: str, - os_: str, - backend: str, - sigpaths: List[Path], - should_save_workspace=False, - disable_progress=False, - sample_path: Optional[Path] = None, -) -> FeatureExtractor: - """ - raises: - UnsupportedFormatError - UnsupportedArchError - UnsupportedOSError - """ - if backend == BACKEND_CAPE: - import capa.features.extractors.cape.extractor - - report = json.loads(input_path.read_text(encoding="utf-8")) - return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report) - - elif backend == BACKEND_DOTNET: - import capa.features.extractors.dnfile.extractor - - if input_format not in (FORMAT_PE, FORMAT_DOTNET): - raise UnsupportedFormatError() - - return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path) - - elif backend == BACKEND_BINJA: - from capa.features.extractors.binaryninja.find_binja_api import 
find_binja_path - - # When we are running as a standalone executable, we cannot directly import binaryninja - # We need to fist find the binja API installation path and add it into sys.path - if is_running_standalone(): - bn_api = find_binja_path() - if bn_api.exists(): - sys.path.append(str(bn_api)) - - try: - import binaryninja - from binaryninja import BinaryView - except ImportError: - raise RuntimeError( - "Cannot import binaryninja module. Please install the Binary Ninja Python API first: " - + "https://docs.binary.ninja/dev/batch.html#install-the-api)." - ) - - import capa.features.extractors.binaryninja.extractor - - if input_format not in (FORMAT_SC32, FORMAT_SC64): - if not is_supported_format(input_path): - raise UnsupportedFormatError() - - if not is_supported_arch(input_path): - raise UnsupportedArchError() - - if os_ == OS_AUTO and not is_supported_os(input_path): - raise UnsupportedOSError() - - with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): - bv: BinaryView = binaryninja.load(str(input_path)) - if bv is None: - raise RuntimeError(f"Binary Ninja cannot open file {input_path}") - - return capa.features.extractors.binaryninja.extractor.BinjaFeatureExtractor(bv) - - elif backend == BACKEND_PEFILE: - import capa.features.extractors.pefile - - return capa.features.extractors.pefile.PefileFeatureExtractor(input_path) - - elif backend == BACKEND_VIV: - import capa.features.extractors.viv.extractor - - if input_format not in (FORMAT_SC32, FORMAT_SC64): - if not is_supported_format(input_path): - raise UnsupportedFormatError() - - if not is_supported_arch(input_path): - raise UnsupportedArchError() - - if os_ == OS_AUTO and not is_supported_os(input_path): - raise UnsupportedOSError() - - with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): - vw = get_workspace(input_path, input_format, sigpaths) - - if should_save_workspace: - 
logger.debug("saving workspace") - try: - vw.saveWorkspace() - except IOError: - # see #168 for discussion around how to handle non-writable directories - logger.info("source directory is not writable, won't save intermediate workspace") - else: - logger.debug("CAPA_SAVE_WORKSPACE unset, not saving workspace") - - return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) - - else: - raise ValueError("unexpected backend: " + backend) - - -def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: - file_extractors: List[FeatureExtractor] = [] - - if input_format == FORMAT_PE: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) - - elif input_format == FORMAT_DOTNET: - file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) - file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) - - elif input_format == FORMAT_ELF: - file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) - - elif input_format == FORMAT_CAPE: - report = json.loads(input_file.read_text(encoding="utf-8")) - file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) - - return file_extractors - - -def get_signatures(sigs_path: Path) -> List[Path]: - if not sigs_path.exists(): - raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed") - - paths: List[Path] = [] - if sigs_path.is_file(): - paths.append(sigs_path) - elif sigs_path.is_dir(): - logger.debug("reading signatures from directory %s", sigs_path.resolve()) - for file in sigs_path.rglob("*"): - if file.is_file() and file.suffix.lower() in (".pat", ".pat.gz", ".sig"): - paths.append(file) - - # Convert paths to their absolute and normalized forms - paths = [path.resolve().absolute() for path in paths] - - # load signatures in deterministic order: the alphabetic sorting of 
filename. - # this means that `0_sigs.pat` loads before `1_sigs.pat`. - paths = sorted(paths, key=lambda path: path.name) - - for path in paths: - logger.debug("found signature file: %s", path) - - return paths - - -def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): - if isinstance(extractor, StaticFeatureExtractor): - return rdoc.StaticAnalysis( - format=format_, - arch=arch, - os=os_, - extractor=extractor.__class__.__name__, - rules=tuple(rules_path), - base_address=frz.Address.from_capa(extractor.get_base_address()), - layout=rdoc.StaticLayout( - functions=(), - # this is updated after capabilities have been collected. - # will look like: - # - # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } - ), - feature_counts=counts["feature_counts"], - library_functions=counts["library_functions"], - ) - elif isinstance(extractor, DynamicFeatureExtractor): - return rdoc.DynamicAnalysis( - format=format_, - arch=arch, - os=os_, - extractor=extractor.__class__.__name__, - rules=tuple(rules_path), - layout=rdoc.DynamicLayout( - processes=(), - ), - feature_counts=counts["feature_counts"], - ) - else: - raise ValueError("invalid extractor type") - - -def collect_metadata( - argv: List[str], - input_path: Path, - input_format: str, - os_: str, - rules_path: List[Path], - extractor: FeatureExtractor, - counts: dict, -) -> rdoc.Metadata: - # if it's a binary sample we hash it, if it's a report - # we fetch the hashes from the report - sample_hashes: SampleHashes = extractor.get_sample_hashes() - md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256 - - global_feats = list(extractor.extract_global_features()) - extractor_format = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format)] - extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)] - extractor_os = [f.value for (f, _) in global_feats if isinstance(f, 
capa.features.common.OS)] - - input_format = ( - str(extractor_format[0]) if extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format - ) - arch = str(extractor_arch[0]) if extractor_arch else "unknown" - os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ - - if isinstance(extractor, StaticFeatureExtractor): - meta_class: type = rdoc.StaticMetadata - elif isinstance(extractor, DynamicFeatureExtractor): - meta_class = rdoc.DynamicMetadata - else: - assert_never(extractor) - - rules = tuple(r.resolve().absolute().as_posix() for r in rules_path) - - return meta_class( - timestamp=datetime.datetime.now(), - version=capa.version.__version__, - argv=tuple(argv) if argv else None, - sample=rdoc.Sample( - md5=md5, - sha1=sha1, - sha256=sha256, - path=input_path.resolve().as_posix(), - ), - analysis=get_sample_analysis( - input_format, - arch, - os_, - extractor, - rules, - counts, - ), - ) - - -def compute_dynamic_layout(rules, extractor: DynamicFeatureExtractor, capabilities: MatchResults) -> rdoc.DynamicLayout: - """ - compute a metadata structure that links threads - to the processes in which they're found. - - only collect the threads at which some rule matched. - otherwise, we may pollute the json document with - a large amount of un-referenced data. 
- """ - assert isinstance(extractor, DynamicFeatureExtractor) - - matched_calls: Set[Address] = set() - - def result_rec(result: capa.features.common.Result): - for loc in result.locations: - if isinstance(loc, capa.features.address.DynamicCallAddress): - matched_calls.add(loc) - for child in result.children: - result_rec(child) - - for matches in capabilities.values(): - for _, result in matches: - result_rec(result) - - names_by_process: Dict[Address, str] = {} - names_by_call: Dict[Address, str] = {} - - matched_processes: Set[Address] = set() - matched_threads: Set[Address] = set() - - threads_by_process: Dict[Address, List[Address]] = {} - calls_by_thread: Dict[Address, List[Address]] = {} - - for p in extractor.get_processes(): - threads_by_process[p.address] = [] - - for t in extractor.get_threads(p): - calls_by_thread[t.address] = [] - - for c in extractor.get_calls(p, t): - if c.address in matched_calls: - names_by_call[c.address] = extractor.get_call_name(p, t, c) - calls_by_thread[t.address].append(c.address) - - if calls_by_thread[t.address]: - matched_threads.add(t.address) - threads_by_process[p.address].append(t.address) - - if threads_by_process[p.address]: - matched_processes.add(p.address) - names_by_process[p.address] = extractor.get_process_name(p) - - layout = rdoc.DynamicLayout( - processes=tuple( - rdoc.ProcessLayout( - address=frz.Address.from_capa(p), - name=names_by_process[p], - matched_threads=tuple( - rdoc.ThreadLayout( - address=frz.Address.from_capa(t), - matched_calls=tuple( - rdoc.CallLayout( - address=frz.Address.from_capa(c), - name=names_by_call[c], - ) - for c in calls_by_thread[t] - if c in matched_calls - ), - ) - for t in threads - if t in matched_threads - ) # this object is open to extension in the future, - # such as with the function name, etc. 
- ) - for p, threads in threads_by_process.items() - if p in matched_processes - ) - ) - - return layout - - -def compute_static_layout(rules, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout: - """ - compute a metadata structure that links basic blocks - to the functions in which they're found. - - only collect the basic blocks at which some rule matched. - otherwise, we may pollute the json document with - a large amount of un-referenced data. - """ - functions_by_bb: Dict[Address, Address] = {} - bbs_by_function: Dict[Address, List[Address]] = {} - for f in extractor.get_functions(): - bbs_by_function[f.address] = [] - for bb in extractor.get_basic_blocks(f): - functions_by_bb[bb.address] = f.address - bbs_by_function[f.address].append(bb.address) - - matched_bbs = set() - for rule_name, matches in capabilities.items(): - rule = rules[rule_name] - if capa.rules.Scope.BASIC_BLOCK in rule.scopes: - for addr, _ in matches: - assert addr in functions_by_bb - matched_bbs.add(addr) - - layout = rdoc.StaticLayout( - functions=tuple( - rdoc.FunctionLayout( - address=frz.Address.from_capa(f), - matched_basic_blocks=tuple( - rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in bbs if bb in matched_bbs - ) # this object is open to extension in the future, - # such as with the function name, etc. 
- ) - for f, bbs in bbs_by_function.items() - if len([bb for bb in bbs if bb in matched_bbs]) > 0 - ) - ) - - return layout - - -def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: - if isinstance(extractor, StaticFeatureExtractor): - return compute_static_layout(rules, extractor, capabilities) - elif isinstance(extractor, DynamicFeatureExtractor): - return compute_dynamic_layout(rules, extractor, capabilities) - else: - raise ValueError("extractor must be either a static or dynamic extracotr") - - def simple_message_exception_handler(exctype, value: BaseException, traceback: TracebackType): """ prints friendly message on unexpected exceptions to regular users (debug mode shows regular stack trace) @@ -978,10 +485,10 @@ def get_input_format_from_args(args) -> str: raises: ShouldExitError: if the program is invoked incorrectly and should exit. """ - format = args.format + format_ = args.format - if format != FORMAT_AUTO: - return format + if format_ != FORMAT_AUTO: + return format_ try: return get_auto_format(args.input_file) @@ -1057,7 +564,7 @@ def get_os_from_args(args, backend) -> str: sample_path = get_sample_path_from_args(args, backend) if sample_path is None: return "unknown" - return get_os(sample_path) + return capa.loader.get_os(sample_path) def get_rules_from_args(args) -> RuleSet: @@ -1069,7 +576,7 @@ def get_rules_from_args(args) -> RuleSet: ShouldExitError: if the program is invoked incorrectly and should exit. 
""" try: - if is_running_standalone() and args.is_default_rules: + if capa.helpers.is_running_standalone() and args.is_default_rules: cache_dir = get_default_root() / "cache" else: cache_dir = capa.rules.cache.get_default_cache_directory() @@ -1125,7 +632,7 @@ def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtrac # this pass can inspect multiple file extractors, e.g., dotnet and pe to identify # various limitations try: - return get_file_extractors(args.input_file, input_format) + return capa.loader.get_file_extractors(args.input_file, input_format) except PEFormatError as e: logger.error("Input file '%s' is not a valid PE file: %s", args.input_file, str(e)) raise ShouldExitError(E_CORRUPT_FILE) from e @@ -1207,7 +714,7 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt elif input_format != FORMAT_PE: logger.debug("skipping library code matching: signatures only supports PE files") else: - sig_paths = get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) raise ShouldExitError(E_INVALID_SIG) from e @@ -1221,7 +728,7 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt # see same code and show-features above examples # https://github.com/mandiant/capa/issues/1813 try: - return get_extractor( + return capa.loader.get_extractor( args.input_file, input_format, os_, @@ -1317,15 +824,15 @@ def main(argv: Optional[List[str]] = None): if sample_path is None: os_ = "unknown" else: - os_ = get_os(sample_path) + os_ = capa.loader.get_os(sample_path) extractor = get_extractor_from_args(args, input_format, backend) except ShouldExitError as e: return e.status_code capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta = collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) - meta.analysis.layout = compute_layout(rules, extractor, 
capabilities) + meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation: # bail if capa's static feature extractor encountered file limitation e.g. a packed binary diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index 0f6422c18..82c511c25 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -110,7 +110,7 @@ def get_capa_results(args): should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) logger.info("computing capa results for: %s", path) try: - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( path, format, os_, capa.main.BACKEND_VIV, sigpaths, should_save_workspace, disable_progress=True ) except capa.exceptions.UnsupportedFormatError: @@ -139,8 +139,8 @@ def get_capa_results(args): capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor, disable_progress=True) - meta = capa.main.collect_metadata([], path, format, os_, [], extractor, counts) - meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + meta = capa.loader.collect_metadata([], path, format, os_, [], extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) doc = rd.ResultDocument.from_capa(meta, rules, capabilities) return {"path": path, "status": "ok", "ok": doc.model_dump()} @@ -168,7 +168,7 @@ def main(argv=None): return -1 try: - sig_paths = capa.main.get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) return -1 diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index e6b8bf429..a3a160784 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -173,14 +173,14 @@ def 
capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): rules = capa.rules.get_rules([rules_path]) # extract features and find capabilities - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( file_path, FORMAT_AUTO, OS_AUTO, capa.main.BACKEND_VIV, [], False, disable_progress=True ) capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor, disable_progress=True) # collect metadata (used only to make rendering more complete) - meta = capa.main.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, [rules_path], extractor, counts) - meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + meta = capa.loader.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, [rules_path], extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) capa_output: Any = False diff --git a/scripts/lint.py b/scripts/lint.py index b24aa1349..4eb05f289 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -363,7 +363,7 @@ def get_sample_capabilities(ctx: Context, path: Path) -> Set[str]: format_ = capa.helpers.get_auto_format(nice_path) logger.debug("analyzing sample: %s", nice_path) - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( nice_path, format_, OS_AUTO, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True ) diff --git a/scripts/profile-time.py b/scripts/profile-time.py index f9615cba6..d67ba5385 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -97,7 +97,7 @@ def main(argv=None): return -1 try: - sig_paths = capa.main.get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) return -1 @@ -107,7 +107,7 @@ def main(argv=None): ): extractor = capa.features.freeze.load(Path(args.sample).read_bytes()) else: - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( args.sample, 
args.format, args.os, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False ) diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index c09797ec2..22ebd1e55 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -163,7 +163,7 @@ def main(argv=None): return -1 try: - sig_paths = capa.main.get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) return -1 @@ -176,7 +176,7 @@ def main(argv=None): should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) try: - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( args.sample, args.format, args.os, args.backend, sig_paths, should_save_workspace ) assert isinstance(extractor, StaticFeatureExtractor) @@ -189,8 +189,8 @@ def main(argv=None): capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor) - meta = capa.main.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor, counts) - meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) + meta = capa.loader.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) if capa.capabilities.common.has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. 
a packed binary diff --git a/scripts/show-features.py b/scripts/show-features.py index 2d5a34808..b448efe6c 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -74,6 +74,7 @@ import capa.main import capa.rules import capa.engine +import capa.loader import capa.helpers import capa.features import capa.exceptions @@ -124,7 +125,7 @@ def main(argv=None): return -1 try: - sig_paths = capa.main.get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) return -1 @@ -137,7 +138,7 @@ def main(argv=None): else: should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) try: - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( args.sample, format_, args.os, args.backend, sig_paths, should_save_workspace ) except capa.exceptions.UnsupportedFormatError as e: diff --git a/scripts/show-unused-features.py b/scripts/show-unused-features.py index b030995c3..b045f2613 100644 --- a/scripts/show-unused-features.py +++ b/scripts/show-unused-features.py @@ -123,7 +123,7 @@ def main(argv=None): return -1 try: - sig_paths = capa.main.get_signatures(args.signatures) + sig_paths = capa.loader.get_signatures(args.signatures) except IOError as e: logger.error("%s", str(e)) return -1 @@ -135,7 +135,7 @@ def main(argv=None): else: should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) try: - extractor = capa.main.get_extractor( + extractor = capa.loader.get_extractor( args.sample, args.format, args.os, args.backend, sig_paths, should_save_workspace ) except capa.exceptions.UnsupportedFormatError: diff --git a/tests/fixtures.py b/tests/fixtures.py index a06308a1c..ebfe557a5 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -106,11 +106,11 @@ def get_viv_extractor(path: Path): ] if "raw32" in path.name: - vw = capa.main.get_workspace(path, "sc32", sigpaths=sigpaths) + vw = 
capa.loader.get_workspace(path, "sc32", sigpaths=sigpaths) elif "raw64" in path.name: - vw = capa.main.get_workspace(path, "sc64", sigpaths=sigpaths) + vw = capa.loader.get_workspace(path, "sc64", sigpaths=sigpaths) else: - vw = capa.main.get_workspace(path, FORMAT_AUTO, sigpaths=sigpaths) + vw = capa.loader.get_workspace(path, FORMAT_AUTO, sigpaths=sigpaths) vw.saveWorkspace() extractor = capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, path, OS_AUTO) fixup_viv(path, extractor) From 256d47876052625e164555686f4c378605ac8439 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 08:08:02 +0000 Subject: [PATCH 014/200] add loader module --- capa/loader.py | 540 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 540 insertions(+) create mode 100644 capa/loader.py diff --git a/capa/loader.py b/capa/loader.py new file mode 100644 index 000000000..e1f559cd7 --- /dev/null +++ b/capa/loader.py @@ -0,0 +1,540 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import sys +import json +import logging +import datetime +from typing import Set, Dict, List, Optional +from pathlib import Path + +import halo +from typing_extensions import assert_never + +import capa.perf +import capa.rules +import capa.engine +import capa.helpers +import capa.version +import capa.render.json +import capa.rules.cache +import capa.render.default +import capa.render.verbose +import capa.features.common +import capa.features.freeze as frz +import capa.render.vverbose +import capa.features.extractors +import capa.render.result_document +import capa.render.result_document as rdoc +import capa.features.extractors.common +import capa.features.extractors.pefile +import capa.features.extractors.elffile +import capa.features.extractors.dotnetfile +import capa.features.extractors.base_extractor +import capa.features.extractors.cape.extractor +from capa.rules import RuleSet +from capa.engine import MatchResults +from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError +from capa.features.common import ( + OS_AUTO, + FORMAT_PE, + FORMAT_ELF, + FORMAT_AUTO, + FORMAT_CAPE, + FORMAT_SC32, + FORMAT_SC64, + FORMAT_DOTNET, +) +from capa.features.address import Address +from capa.features.extractors.base_extractor import ( + SampleHashes, + FeatureExtractor, + StaticFeatureExtractor, + DynamicFeatureExtractor, +) + +logger = logging.getLogger(__name__) + +BACKEND_VIV = "vivisect" +BACKEND_DOTNET = "dotnet" +BACKEND_BINJA = "binja" +BACKEND_PEFILE = "pefile" +BACKEND_CAPE = "cape" + + +def is_supported_format(sample: Path) -> bool: + """ + Return if this is a supported file based on magic header values + """ + taste = sample.open("rb").read(0x100) + + return len(list(capa.features.extractors.common.extract_format(taste))) == 1 + + +def is_supported_arch(sample: Path) -> bool: + buf = sample.read_bytes() + + return len(list(capa.features.extractors.common.extract_arch(buf))) == 1 + + +def get_arch(sample: Path) -> str: + buf = 
sample.read_bytes() + + for feature, _ in capa.features.extractors.common.extract_arch(buf): + assert isinstance(feature.value, str) + return feature.value + + return "unknown" + + +def is_supported_os(sample: Path) -> bool: + buf = sample.read_bytes() + + return len(list(capa.features.extractors.common.extract_os(buf))) == 1 + + +def get_os(sample: Path) -> str: + buf = sample.read_bytes() + + for feature, _ in capa.features.extractors.common.extract_os(buf): + assert isinstance(feature.value, str) + return feature.value + + return "unknown" + + +def get_meta_str(vw): + """ + Return workspace meta information string + """ + meta = [] + for k in ["Format", "Platform", "Architecture"]: + if k in vw.metadata: + meta.append(f"{k.lower()}: {vw.metadata[k]}") + return f"{', '.join(meta)}, number of functions: {len(vw.getFunctions())}" + + +def get_workspace(path: Path, input_format: str, sigpaths: List[Path]): + """ + load the program at the given path into a vivisect workspace using the given format. + also apply the given FLIRT signatures. + + supported formats: + - pe + - elf + - shellcode 32-bit + - shellcode 64-bit + - auto + + this creates and analyzes the workspace; however, it does *not* save the workspace. + this is the responsibility of the caller. + """ + + # lazy import enables us to not require viv if user wants SMDA, for example. + import viv_utils + import viv_utils.flirt + + logger.debug("generating vivisect workspace for: %s", path) + if input_format == FORMAT_AUTO: + if not is_supported_format(path): + raise UnsupportedFormatError() + + # don't analyze, so that we can add our Flirt function analyzer first. + vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) + elif input_format in {FORMAT_PE, FORMAT_ELF}: + vw = viv_utils.getWorkspace(str(path), analyze=False, should_save=False) + elif input_format == FORMAT_SC32: + # these are not analyzed nor saved. 
+ vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="i386", analyze=False) + elif input_format == FORMAT_SC64: + vw = viv_utils.getShellcodeWorkspaceFromFile(str(path), arch="amd64", analyze=False) + else: + raise ValueError("unexpected format: " + input_format) + + viv_utils.flirt.register_flirt_signature_analyzers(vw, [str(s) for s in sigpaths]) + + vw.analyze() + + logger.debug("%s", get_meta_str(vw)) + return vw + + +def get_extractor( + input_path: Path, + input_format: str, + os_: str, + backend: str, + sigpaths: List[Path], + should_save_workspace=False, + disable_progress=False, + sample_path: Optional[Path] = None, +) -> FeatureExtractor: + """ + raises: + UnsupportedFormatError + UnsupportedArchError + UnsupportedOSError + """ + if backend == BACKEND_CAPE: + import capa.features.extractors.cape.extractor + + report = json.loads(input_path.read_text(encoding="utf-8")) + return capa.features.extractors.cape.extractor.CapeExtractor.from_report(report) + + elif backend == BACKEND_DOTNET: + import capa.features.extractors.dnfile.extractor + + if input_format not in (FORMAT_PE, FORMAT_DOTNET): + raise UnsupportedFormatError() + + return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path) + + elif backend == BACKEND_BINJA: + import capa.helpers + from capa.features.extractors.binaryninja.find_binja_api import find_binja_path + + # When we are running as a standalone executable, we cannot directly import binaryninja + # We need to fist find the binja API installation path and add it into sys.path + if capa.helpers.is_running_standalone(): + bn_api = find_binja_path() + if bn_api.exists(): + sys.path.append(str(bn_api)) + + try: + import binaryninja + from binaryninja import BinaryView + except ImportError: + raise RuntimeError( + "Cannot import binaryninja module. Please install the Binary Ninja Python API first: " + + "https://docs.binary.ninja/dev/batch.html#install-the-api)." 
+ ) + + import capa.features.extractors.binaryninja.extractor + + if input_format not in (FORMAT_SC32, FORMAT_SC64): + if not is_supported_format(input_path): + raise UnsupportedFormatError() + + if not is_supported_arch(input_path): + raise UnsupportedArchError() + + if os_ == OS_AUTO and not is_supported_os(input_path): + raise UnsupportedOSError() + + with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): + bv: BinaryView = binaryninja.load(str(input_path)) + if bv is None: + raise RuntimeError(f"Binary Ninja cannot open file {input_path}") + + return capa.features.extractors.binaryninja.extractor.BinjaFeatureExtractor(bv) + + elif backend == BACKEND_PEFILE: + import capa.features.extractors.pefile + + return capa.features.extractors.pefile.PefileFeatureExtractor(input_path) + + elif backend == BACKEND_VIV: + import capa.features.extractors.viv.extractor + + if input_format not in (FORMAT_SC32, FORMAT_SC64): + if not is_supported_format(input_path): + raise UnsupportedFormatError() + + if not is_supported_arch(input_path): + raise UnsupportedArchError() + + if os_ == OS_AUTO and not is_supported_os(input_path): + raise UnsupportedOSError() + + with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): + vw = get_workspace(input_path, input_format, sigpaths) + + if should_save_workspace: + logger.debug("saving workspace") + try: + vw.saveWorkspace() + except IOError: + # see #168 for discussion around how to handle non-writable directories + logger.info("source directory is not writable, won't save intermediate workspace") + else: + logger.debug("CAPA_SAVE_WORKSPACE unset, not saving workspace") + + return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) + + else: + raise ValueError("unexpected backend: " + backend) + + +def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: + 
file_extractors: List[FeatureExtractor] = [] + + if input_format == FORMAT_PE: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) + + elif input_format == FORMAT_DOTNET: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) + file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) + + elif input_format == FORMAT_ELF: + file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) + + elif input_format == FORMAT_CAPE: + report = json.loads(input_file.read_text(encoding="utf-8")) + file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) + + return file_extractors + + +def get_signatures(sigs_path: Path) -> List[Path]: + if not sigs_path.exists(): + raise IOError(f"signatures path {sigs_path} does not exist or cannot be accessed") + + paths: List[Path] = [] + if sigs_path.is_file(): + paths.append(sigs_path) + elif sigs_path.is_dir(): + logger.debug("reading signatures from directory %s", sigs_path.resolve()) + for file in sigs_path.rglob("*"): + if file.is_file() and file.suffix.lower() in (".pat", ".pat.gz", ".sig"): + paths.append(file) + + # Convert paths to their absolute and normalized forms + paths = [path.resolve().absolute() for path in paths] + + # load signatures in deterministic order: the alphabetic sorting of filename. + # this means that `0_sigs.pat` loads before `1_sigs.pat`. 
+ paths = sorted(paths, key=lambda path: path.name) + + for path in paths: + logger.debug("found signature file: %s", path) + + return paths + + +def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): + if isinstance(extractor, StaticFeatureExtractor): + return rdoc.StaticAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + base_address=frz.Address.from_capa(extractor.get_base_address()), + layout=rdoc.StaticLayout( + functions=(), + # this is updated after capabilities have been collected. + # will look like: + # + # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } + ), + feature_counts=counts["feature_counts"], + library_functions=counts["library_functions"], + ) + elif isinstance(extractor, DynamicFeatureExtractor): + return rdoc.DynamicAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + layout=rdoc.DynamicLayout( + processes=(), + ), + feature_counts=counts["feature_counts"], + ) + else: + raise ValueError("invalid extractor type") + + +def collect_metadata( + argv: List[str], + input_path: Path, + input_format: str, + os_: str, + rules_path: List[Path], + extractor: FeatureExtractor, + counts: dict, +) -> rdoc.Metadata: + # if it's a binary sample we hash it, if it's a report + # we fetch the hashes from the report + sample_hashes: SampleHashes = extractor.get_sample_hashes() + md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256 + + global_feats = list(extractor.extract_global_features()) + extractor_format = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Format)] + extractor_arch = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.Arch)] + extractor_os = [f.value for (f, _) in global_feats if isinstance(f, capa.features.common.OS)] + + input_format = ( + str(extractor_format[0]) if 
extractor_format else "unknown" if input_format == FORMAT_AUTO else input_format + ) + arch = str(extractor_arch[0]) if extractor_arch else "unknown" + os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ + + if isinstance(extractor, StaticFeatureExtractor): + meta_class: type = rdoc.StaticMetadata + elif isinstance(extractor, DynamicFeatureExtractor): + meta_class = rdoc.DynamicMetadata + else: + assert_never(extractor) + + rules = tuple(r.resolve().absolute().as_posix() for r in rules_path) + + return meta_class( + timestamp=datetime.datetime.now(), + version=capa.version.__version__, + argv=tuple(argv) if argv else None, + sample=rdoc.Sample( + md5=md5, + sha1=sha1, + sha256=sha256, + path=input_path.resolve().as_posix(), + ), + analysis=get_sample_analysis( + input_format, + arch, + os_, + extractor, + rules, + counts, + ), + ) + + +def compute_dynamic_layout( + rules: RuleSet, extractor: DynamicFeatureExtractor, capabilities: MatchResults +) -> rdoc.DynamicLayout: + """ + compute a metadata structure that links threads + to the processes in which they're found. + + only collect the threads at which some rule matched. + otherwise, we may pollute the json document with + a large amount of un-referenced data. 
+ """ + assert isinstance(extractor, DynamicFeatureExtractor) + + matched_calls: Set[Address] = set() + + def result_rec(result: capa.features.common.Result): + for loc in result.locations: + if isinstance(loc, capa.features.address.DynamicCallAddress): + matched_calls.add(loc) + for child in result.children: + result_rec(child) + + for matches in capabilities.values(): + for _, result in matches: + result_rec(result) + + names_by_process: Dict[Address, str] = {} + names_by_call: Dict[Address, str] = {} + + matched_processes: Set[Address] = set() + matched_threads: Set[Address] = set() + + threads_by_process: Dict[Address, List[Address]] = {} + calls_by_thread: Dict[Address, List[Address]] = {} + + for p in extractor.get_processes(): + threads_by_process[p.address] = [] + + for t in extractor.get_threads(p): + calls_by_thread[t.address] = [] + + for c in extractor.get_calls(p, t): + if c.address in matched_calls: + names_by_call[c.address] = extractor.get_call_name(p, t, c) + calls_by_thread[t.address].append(c.address) + + if calls_by_thread[t.address]: + matched_threads.add(t.address) + threads_by_process[p.address].append(t.address) + + if threads_by_process[p.address]: + matched_processes.add(p.address) + names_by_process[p.address] = extractor.get_process_name(p) + + layout = rdoc.DynamicLayout( + processes=tuple( + rdoc.ProcessLayout( + address=frz.Address.from_capa(p), + name=names_by_process[p], + matched_threads=tuple( + rdoc.ThreadLayout( + address=frz.Address.from_capa(t), + matched_calls=tuple( + rdoc.CallLayout( + address=frz.Address.from_capa(c), + name=names_by_call[c], + ) + for c in calls_by_thread[t] + if c in matched_calls + ), + ) + for t in threads + if t in matched_threads + ) # this object is open to extension in the future, + # such as with the function name, etc. 
+ ) + for p, threads in threads_by_process.items() + if p in matched_processes + ) + ) + + return layout + + +def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout: + """ + compute a metadata structure that links basic blocks + to the functions in which they're found. + + only collect the basic blocks at which some rule matched. + otherwise, we may pollute the json document with + a large amount of un-referenced data. + """ + functions_by_bb: Dict[Address, Address] = {} + bbs_by_function: Dict[Address, List[Address]] = {} + for f in extractor.get_functions(): + bbs_by_function[f.address] = [] + for bb in extractor.get_basic_blocks(f): + functions_by_bb[bb.address] = f.address + bbs_by_function[f.address].append(bb.address) + + matched_bbs = set() + for rule_name, matches in capabilities.items(): + rule = rules[rule_name] + if capa.rules.Scope.BASIC_BLOCK in rule.scopes: + for addr, _ in matches: + assert addr in functions_by_bb + matched_bbs.add(addr) + + layout = rdoc.StaticLayout( + functions=tuple( + rdoc.FunctionLayout( + address=frz.Address.from_capa(f), + matched_basic_blocks=tuple( + rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in bbs if bb in matched_bbs + ) # this object is open to extension in the future, + # such as with the function name, etc. 
+ ) + for f, bbs in bbs_by_function.items() + if len([bb for bb in bbs if bb in matched_bbs]) > 0 + ) + ) + + return layout + + +def compute_layout(rules: RuleSet, extractor, capabilities) -> rdoc.Layout: + if isinstance(extractor, StaticFeatureExtractor): + return compute_static_layout(rules, extractor, capabilities) + elif isinstance(extractor, DynamicFeatureExtractor): + return compute_dynamic_layout(rules, extractor, capabilities) + else: + raise ValueError("extractor must be either a static or dynamic extracotr") From 554df5ccb5571705f8f3528d79c4842ead80e50d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 08:40:11 +0000 Subject: [PATCH 015/200] loader: learn to load freeze format --- capa/features/common.py | 8 +++- capa/helpers.py | 14 +++++- capa/loader.py | 4 ++ capa/main.py | 96 ++++++++++++++++++++--------------------- 4 files changed, 70 insertions(+), 52 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 0cb1396de..b6527625f 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -458,18 +458,22 @@ def evaluate(self, ctx, **kwargs): FORMAT_SC32 = "sc32" FORMAT_SC64 = "sc64" FORMAT_CAPE = "cape" +FORMAT_FREEZE = "freeze" +FORMAT_RESULT = "result" STATIC_FORMATS = { FORMAT_SC32, FORMAT_SC64, FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, + FORMAT_FREEZE, + FORMAT_RESULT, } DYNAMIC_FORMATS = { FORMAT_CAPE, + FORMAT_FREEZE, + FORMAT_RESULT, } -FORMAT_FREEZE = "freeze" -FORMAT_RESULT = "result" FORMAT_UNKNOWN = "unknown" diff --git a/capa/helpers.py b/capa/helpers.py index a85271af1..fbde14d74 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -17,12 +17,22 @@ import tqdm from capa.exceptions import UnsupportedFormatError -from capa.features.common import FORMAT_PE, FORMAT_CAPE, FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, FORMAT_UNKNOWN, Format +from capa.features.common import ( + FORMAT_PE, + FORMAT_CAPE, + FORMAT_SC32, + FORMAT_SC64, + FORMAT_DOTNET, + FORMAT_FREEZE, + FORMAT_UNKNOWN, + Format, +) 
EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") EXTENSIONS_DYNAMIC = ("json", "json_") EXTENSIONS_ELF = "elf_" +EXTENSIONS_FREEZE = "frz" logger = logging.getLogger("capa") @@ -82,6 +92,8 @@ def get_format_from_extension(sample: Path) -> str: format_ = FORMAT_SC64 elif sample.name.endswith(EXTENSIONS_DYNAMIC): format_ = get_format_from_report(sample) + elif sample.name.endswith(EXTENSIONS_FREEZE): + format_ = FORMAT_FREEZE return format_ diff --git a/capa/loader.py b/capa/loader.py index e1f559cd7..83a9931af 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -64,6 +64,7 @@ BACKEND_BINJA = "binja" BACKEND_PEFILE = "pefile" BACKEND_CAPE = "cape" +BACKEND_FREEZE = "freeze" def is_supported_format(sample: Path) -> bool: @@ -265,6 +266,9 @@ def get_extractor( return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, input_path, os_) + elif backend == BACKEND_FREEZE: + return frz.load(input_path.read_bytes()) + else: raise ValueError("unexpected backend: " + backend) diff --git a/capa/main.py b/capa/main.py index de1101ab4..76a7cc28f 100644 --- a/capa/main.py +++ b/capa/main.py @@ -35,7 +35,6 @@ import capa.render.default import capa.render.verbose import capa.features.common -import capa.features.freeze as frz import capa.render.vverbose import capa.features.extractors import capa.render.result_document @@ -48,7 +47,7 @@ import capa.features.extractors.cape.extractor from capa.rules import RuleSet from capa.engine import MatchResults -from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_PEFILE +from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE from capa.helpers import ( get_file_taste, get_auto_format, @@ -521,6 +520,9 @@ def get_backend_from_args(args, input_format: str) -> str: elif input_format == FORMAT_DOTNET: return BACKEND_DOTNET + elif input_format == FORMAT_FREEZE: + return BACKEND_FREEZE + else: return 
BACKEND_VIV @@ -699,57 +701,53 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt raises: ShouldExitError: if the program is invoked incorrectly and should exit. """ - if input_format == FORMAT_FREEZE: - # freeze format deserializes directly into an extractor - return frz.load(args.input_file.read_bytes()) - else: - # all other formats we must create an extractor, - # such as viv, binary ninja, etc. workspaces - # and use those for extracting. + # all other formats we must create an extractor, + # such as viv, binary ninja, etc. workspaces + # and use those for extracting. - try: - sig_paths = [] - if backend != BACKEND_VIV: - logger.debug("skipping library code matching: only supported by the vivisect backend") - elif input_format != FORMAT_PE: - logger.debug("skipping library code matching: signatures only supports PE files") - else: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - raise ShouldExitError(E_INVALID_SIG) from e + try: + sig_paths = [] + if backend != BACKEND_VIV: + logger.debug("skipping library code matching: only supported by the vivisect backend") + elif input_format != FORMAT_PE: + logger.debug("skipping library code matching: signatures only supports PE files") + else: + sig_paths = capa.loader.get_signatures(args.signatures) + except IOError as e: + logger.error("%s", str(e)) + raise ShouldExitError(E_INVALID_SIG) from e - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) + should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - os_ = get_os_from_args(args, backend) - sample_path = get_sample_path_from_args(args, backend) + os_ = get_os_from_args(args, backend) + sample_path = get_sample_path_from_args(args, backend) - # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere - # see same code and show-features above examples - 
# https://github.com/mandiant/capa/issues/1813 - try: - return capa.loader.get_extractor( - args.input_file, - input_format, - os_, - backend, - sig_paths, - should_save_workspace=should_save_workspace, - disable_progress=args.quiet or args.debug, - sample_path=sample_path, - ) - except UnsupportedFormatError as e: - if input_format == FORMAT_CAPE: - log_unsupported_cape_report_error(str(e)) - else: - log_unsupported_format_error() - raise ShouldExitError(E_INVALID_FILE_TYPE) from e - except UnsupportedArchError as e: - log_unsupported_arch_error() - raise ShouldExitError(E_INVALID_FILE_ARCH) from e - except UnsupportedOSError as e: - log_unsupported_os_error() - raise ShouldExitError(E_INVALID_FILE_OS) from e + # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere + # see same code and show-features above examples + # https://github.com/mandiant/capa/issues/1813 + try: + return capa.loader.get_extractor( + args.input_file, + input_format, + os_, + backend, + sig_paths, + should_save_workspace=should_save_workspace, + disable_progress=args.quiet or args.debug, + sample_path=sample_path, + ) + except UnsupportedFormatError as e: + if input_format == FORMAT_CAPE: + log_unsupported_cape_report_error(str(e)) + else: + log_unsupported_format_error() + raise ShouldExitError(E_INVALID_FILE_TYPE) from e + except UnsupportedArchError as e: + log_unsupported_arch_error() + raise ShouldExitError(E_INVALID_FILE_ARCH) from e + except UnsupportedOSError as e: + log_unsupported_os_error() + raise ShouldExitError(E_INVALID_FILE_OS) from e def main(argv: Optional[List[str]] = None): From 2d190aafd1908f4ad1883c47090f51007090081b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 08:40:27 +0000 Subject: [PATCH 016/200] freeze: use new cli arg handling --- capa/features/freeze/__init__.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/capa/features/freeze/__init__.py 
b/capa/features/freeze/__init__.py index b5b0f7f92..cbaafee61 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -682,14 +682,18 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="save capa features to a file") - capa.main.install_common_args(parser, {"sample", "format", "backend", "os", "signatures"}) + capa.main.install_common_args(parser, {"input_file", "format", "backend", "os", "signatures"}) parser.add_argument("output", type=str, help="Path to output file") args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) - sigpaths = capa.loader.get_signatures(args.signatures) - - extractor = capa.loader.get_extractor(args.sample, args.format, args.os, args.backend, sigpaths, False) + try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + input_format = capa.main.get_input_format_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + return e.status_code Path(args.output).write_bytes(dump(extractor)) From 247209b90249c5160da8b1efb36ad361b1cf54a9 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 10:37:56 +0100 Subject: [PATCH 017/200] Update capa/loader.py Co-authored-by: Moritz --- capa/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/loader.py b/capa/loader.py index 83a9931af..9f6cfcde8 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -135,7 +135,7 @@ def get_workspace(path: Path, input_format: str, sigpaths: List[Path]): this is the responsibility of the caller. """ - # lazy import enables us to not require viv if user wants SMDA, for example. + # lazy import enables us to not require viv if user wants another backend. 
import viv_utils import viv_utils.flirt From 0a979a3ace12129064576674eee8128ccf2b1c5c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 09:44:58 +0000 Subject: [PATCH 018/200] main: remove duplicate documentation --- capa/main.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/capa/main.py b/capa/main.py index 76a7cc28f..6d6a1cbf4 100644 --- a/capa/main.py +++ b/capa/main.py @@ -217,16 +217,6 @@ def install_common_args(parser, wanted=None): help="enable ANSI color codes in results, default: only during interactive session", ) - # - # arguments that may be opted into: - # - # - input_file - # - format - # - os - # - rules - # - tag - # - if "input_file" in wanted: parser.add_argument( "input_file", From 4183f862751a5730c906383ef32f1224541bfde4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 09:48:16 +0000 Subject: [PATCH 019/200] main: add doc about where some functions live --- capa/helpers.py | 5 +++++ capa/main.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/capa/helpers.py b/capa/helpers.py index fbde14d74..ad27f3903 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -221,4 +221,9 @@ def is_running_standalone() -> bool: are we running from a PyInstaller'd executable? if so, then we'll be able to access `sys._MEIPASS` for the packaged resources. """ + # typically we only expect capa.main to be packaged via PyInstaller. + # therefore, this *should* be in capa.main; however, + # the Binary Ninja extractor uses this to resolve the BN API code, + # so we keep this in a common area. + # generally, other library code should not use this function. return hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS") diff --git a/capa/main.py b/capa/main.py index 6d6a1cbf4..b8b9eab86 100644 --- a/capa/main.py +++ b/capa/main.py @@ -128,6 +128,9 @@ def get_default_root() -> Path: under PyInstaller, this comes from _MEIPASS. under source, this is the root directory of the project. 
""" + # we only expect capa.main to be packaged within PyInstaller, + # so we don't put this in a more common place, like capa.helpers. + if capa.helpers.is_running_standalone(): # pylance/mypy don't like `sys._MEIPASS` because this isn't standard. # its injected by pyinstaller. From d536b9afcf592ac633ec3c2473d42dc6934f099a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 10:24:37 +0000 Subject: [PATCH 020/200] scripts: migrate to new main wrapper helper functions --- scripts/capa2yara.py | 33 ++++------------ scripts/capa_as_library.py | 11 +++--- scripts/capafmt.py | 17 +++----- scripts/lint.py | 28 +++++++------ scripts/show-features.py | 69 +++++++++------------------------ scripts/show-unused-features.py | 56 ++++++++------------------ 6 files changed, 70 insertions(+), 144 deletions(-) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 56fd0e8cb..5f9c1d806 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -726,32 +726,15 @@ def main(argv=None): parser.add_argument("rules", type=str, help="Path to rules") parser.add_argument("--private", "-p", action="store_true", help="Create private rules", default=False) capa.main.install_common_args(parser, wanted={"tag"}) - args = parser.parse_args(args=argv) - make_priv = args.private - - if args.verbose: - level = logging.DEBUG - elif args.quiet: - level = logging.ERROR - else: - level = logging.INFO - - logging.basicConfig(level=level) - logging.getLogger("capa2yara").setLevel(level) try: - rules = capa.rules.get_rules([Path(args.rules)]) - namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) - logger.info("successfully loaded %d rules (including subscope rules which will be ignored)", len(rules)) - if args.tag: - rules = rules.filter_rules_by_meta(args.tag) - logger.debug("selected %d rules", len(rules)) - for i, r in enumerate(rules.rules, 1): - logger.debug(" %d. 
%s", i, r) - except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: - logger.error("%s", str(e)) - return -1 + capa.main.handle_common_args(args) + rules = capa.main.get_rules_from_args(args) + except capa.main.ShouldExitError as e: + return e.status_code + + namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) output_yar( "// Rules from Mandiant's https://github.com/mandiant/capa-rules converted to YARA using https://github.com/mandiant/capa/blob/master/scripts/capa2yara.py by Arnim Rupp" @@ -780,10 +763,10 @@ def main(argv=None): cround += 1 logger.info("doing convert_rules(), round: %d", cround) num_rules = len(converted_rules) - count_incomplete += convert_rules(rules, namespaces, cround, make_priv) + count_incomplete += convert_rules(rules, namespaces, cround, args.private) # one last round to collect all unconverted rules - count_incomplete += convert_rules(rules, namespaces, 9000, make_priv) + count_incomplete += convert_rules(rules, namespaces, 9000, args.private) stats = "\n// converted rules : " + str(len(converted_rules)) stats += "\n// among those are incomplete : " + str(count_incomplete) diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index a3a160784..cc3228f9f 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -15,6 +15,7 @@ import capa.main import capa.rules import capa.engine +import capa.loader import capa.features import capa.render.json import capa.render.utils as rutils @@ -168,18 +169,18 @@ def render_dictionary(doc: rd.ResultDocument) -> Dict[str, Any]: # ==== render dictionary helpers -def capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): +def capa_details(rules_path: Path, input_file: Path, output_format="dictionary"): # load rules from disk rules = capa.rules.get_rules([rules_path]) # extract features and find capabilities extractor = capa.loader.get_extractor( - file_path, FORMAT_AUTO, OS_AUTO, capa.main.BACKEND_VIV, [], 
False, disable_progress=True + input_file, FORMAT_AUTO, OS_AUTO, capa.main.BACKEND_VIV, [], should_save_workspace=False, disable_progress=True ) capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor, disable_progress=True) # collect metadata (used only to make rendering more complete) - meta = capa.loader.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, [rules_path], extractor, counts) + meta = capa.loader.collect_metadata([], input_file, FORMAT_AUTO, OS_AUTO, [rules_path], extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) capa_output: Any = False @@ -206,7 +207,7 @@ def capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): RULES_PATH = capa.main.get_default_root() / "rules" parser = argparse.ArgumentParser(description="Extract capabilities from a file") - parser.add_argument("file", help="file to extract capabilities from") + parser.add_argument("input_file", help="file to extract capabilities from") parser.add_argument("--rules", help="path to rules directory", default=RULES_PATH) parser.add_argument( "--output", help="output format", choices=["dictionary", "json", "texttable"], default="dictionary" @@ -214,5 +215,5 @@ def capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): args = parser.parse_args() if args.rules != RULES_PATH: args.rules = Path(args.rules) - print(capa_details(args.rules, Path(args.file), args.output)) + print(capa_details(args.rules, Path(args.input_file), args.output)) sys.exit(0) diff --git a/scripts/capafmt.py b/scripts/capafmt.py index fa8298007..b420b8efb 100644 --- a/scripts/capafmt.py +++ b/scripts/capafmt.py @@ -19,6 +19,7 @@ import argparse from pathlib import Path +import capa.main import capa.rules logger = logging.getLogger("capafmt") @@ -29,6 +30,7 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Capa rule formatter.") + capa.main.install_common_args(parser, 
wanted={}) parser.add_argument("path", type=str, help="Path to rule to format") parser.add_argument( "-i", @@ -37,8 +39,6 @@ def main(argv=None): dest="in_place", help="Format the rule in place, otherwise, write formatted rule to STDOUT", ) - parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging") - parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors") parser.add_argument( "-c", "--check", @@ -47,15 +47,10 @@ def main(argv=None): ) args = parser.parse_args(args=argv) - if args.verbose: - level = logging.DEBUG - elif args.quiet: - level = logging.ERROR - else: - level = logging.INFO - - logging.basicConfig(level=level) - logging.getLogger("capafmt").setLevel(level) + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code rule = capa.rules.Rule.from_yaml_file(args.path, use_ruamel=True) reformatted_rule = rule.to_yaml() diff --git a/scripts/lint.py b/scripts/lint.py index 4eb05f289..4f3f98a01 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -39,6 +39,7 @@ import capa.main import capa.rules import capa.engine +import capa.loader import capa.helpers import capa.features.insn import capa.capabilities.common @@ -364,7 +365,13 @@ def get_sample_capabilities(ctx: Context, path: Path) -> Set[str]: logger.debug("analyzing sample: %s", nice_path) extractor = capa.loader.get_extractor( - nice_path, format_, OS_AUTO, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True + nice_path, + format_, + OS_AUTO, + capa.main.BACKEND_VIV, + DEFAULT_SIGNATURES, + should_save_workspace=False, + disable_progress=True, ) capabilities, _ = capa.capabilities.common.find_capabilities(ctx.rules, extractor, disable_progress=True) @@ -990,7 +997,11 @@ def main(argv=None): help="Enable thorough linting - takes more time, but does a better job", ) args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) + + try: + 
capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code if args.debug: logging.getLogger("capa").setLevel(logging.DEBUG) @@ -1002,16 +1013,9 @@ def main(argv=None): time0 = time.time() try: - rules = capa.rules.get_rules(args.rules) - logger.info("successfully loaded %s rules", rules.source_rule_count) - if args.tag: - rules = rules.filter_rules_by_meta(args.tag) - logger.debug("selected %s rules", len(rules)) - for i, r in enumerate(rules.rules, 1): - logger.debug(" %d. %s", i, r) - except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: - logger.error("%s", str(e)) - return -1 + rules = capa.main.get_rules_from_args(args) + except capa.main.ShouldExitError as e: + return e.status_code logger.info("collecting potentially referenced samples") samples_path = Path(args.samples) diff --git a/scripts/show-features.py b/scripts/show-features.py index b448efe6c..92f857f7d 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -64,12 +64,10 @@ insn: 0x10001027: mnemonic(shl) ... 
""" -import os import sys import logging import argparse from typing import Tuple -from pathlib import Path import capa.main import capa.rules @@ -82,17 +80,9 @@ import capa.features.freeze import capa.features.address import capa.features.extractors.pefile -from capa.helpers import get_auto_format, log_unsupported_runtime_error +from capa.helpers import assert_never from capa.features.insn import API, Number -from capa.features.common import ( - FORMAT_AUTO, - FORMAT_CAPE, - FORMAT_FREEZE, - DYNAMIC_FORMATS, - String, - Feature, - is_global_feature, -) +from capa.features.common import String, Feature, is_global_feature from capa.features.extractors.base_extractor import FunctionHandle, StaticFeatureExtractor, DynamicFeatureExtractor logger = logging.getLogger("capa.show-features") @@ -107,56 +97,33 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Show the features that capa extracts from the given sample") - capa.main.install_common_args(parser, wanted={"format", "os", "sample", "signatures", "backend"}) + capa.main.install_common_args(parser, wanted={"input_file", "format", "os", "signatures", "backend"}) parser.add_argument("-F", "--function", type=str, help="Show features for specific function") parser.add_argument("-P", "--process", type=str, help="Show features for specific process name") args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) - - if args.function and args.backend == "pefile": - print("pefile backend does not support extracting function features") - return -1 try: - _ = capa.helpers.get_file_taste(Path(args.sample)) - except IOError as e: - logger.error("%s", str(e)) - return -1 + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) - try: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - format_ = args.format if args.format != FORMAT_AUTO else get_auto_format(args.sample) - if 
format_ == FORMAT_FREEZE: - # this should be moved above the previous if clause after implementing - # feature freeze for the dynamic analysis flavor - extractor = capa.features.freeze.load(Path(args.sample).read_bytes()) - else: - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - try: - extractor = capa.loader.get_extractor( - args.sample, format_, args.os, args.backend, sig_paths, should_save_workspace - ) - except capa.exceptions.UnsupportedFormatError as e: - if format_ == FORMAT_CAPE: - capa.helpers.log_unsupported_cape_report_error(str(e)) - else: - capa.helpers.log_unsupported_format_error() - return -1 - except capa.exceptions.UnsupportedRuntimeError: - log_unsupported_runtime_error() + if args.function and args.backend == "pefile": + print("pefile backend does not support extracting function features") return -1 - if format_ in DYNAMIC_FORMATS: - assert isinstance(extractor, DynamicFeatureExtractor) + input_format = capa.main.get_input_format_from_args(args) + + backend = capa.main.get_backend_from_args(args, input_format) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + return e.status_code + + if isinstance(extractor, DynamicFeatureExtractor): print_dynamic_analysis(extractor, args) - else: - assert isinstance(extractor, StaticFeatureExtractor) + elif isinstance(extractor, StaticFeatureExtractor): print_static_analysis(extractor, args) + else: + assert_never(extractor) return 0 diff --git a/scripts/show-unused-features.py b/scripts/show-unused-features.py index b045f2613..dc9dc4b42 100644 --- a/scripts/show-unused-features.py +++ b/scripts/show-unused-features.py @@ -8,13 +8,11 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" -import os import sys import typing import logging import argparse from typing import Set, Tuple -from pathlib import Path from collections import Counter import tabulate @@ -31,8 +29,7 @@ import capa.features.address import capa.features.extractors.pefile import capa.features.extractors.base_extractor -from capa.helpers import log_unsupported_runtime_error -from capa.features.common import Feature +from capa.features.common import FORMAT_FREEZE, Feature from capa.features.extractors.base_extractor import FunctionHandle, StaticFeatureExtractor logger = logging.getLogger("show-unused-features") @@ -42,10 +39,9 @@ def format_address(addr: capa.features.address.Address) -> str: return v.format_address(capa.features.freeze.Address.from_capa((addr))) -def get_rules_feature_set(rules_path) -> Set[Feature]: - ruleset = capa.rules.get_rules(rules_path) +def get_rules_feature_set(rules: capa.rules.RuleSet) -> Set[Feature]: rules_feature_set: Set[Feature] = set() - for _, rule in ruleset.rules.items(): + for _, rule in rules.rules.items(): rules_feature_set.update(rule.extract_all_features()) return rules_feature_set @@ -106,44 +102,23 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Show the features that capa doesn't have rules for yet") - capa.main.install_common_args(parser, wanted={"format", "os", "sample", "signatures", "backend", "rules"}) - + capa.main.install_common_args(parser, wanted={"format", "os", "input_file", "signatures", "backend", "rules"}) parser.add_argument("-F", "--function", type=str, help="Show features for specific function") args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) if args.function and args.backend == "pefile": print("pefile backend does not support extracting function features") return -1 try: - taste = capa.helpers.get_file_taste(Path(args.sample)) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - try: - sig_paths = 
capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - if (args.format == "freeze") or ( - args.format == capa.features.common.FORMAT_AUTO and capa.features.freeze.is_freeze(taste) - ): - extractor = capa.features.freeze.load(Path(args.sample).read_bytes()) - else: - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - try: - extractor = capa.loader.get_extractor( - args.sample, args.format, args.os, args.backend, sig_paths, should_save_workspace - ) - except capa.exceptions.UnsupportedFormatError: - capa.helpers.log_unsupported_format_error() - return -1 - except capa.exceptions.UnsupportedRuntimeError: - log_unsupported_runtime_error() - return -1 + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + rules = capa.main.get_rules_from_args(args) + input_format = capa.main.get_input_format_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + return e.status_code assert isinstance(extractor, StaticFeatureExtractor), "only static analysis supported today" @@ -159,7 +134,7 @@ def main(argv=None): function_handles = tuple(extractor.get_functions()) if args.function: - if args.format == "freeze": + if input_format == FORMAT_FREEZE: function_handles = tuple(filter(lambda fh: fh.address == args.function, function_handles)) else: function_handles = tuple(filter(lambda fh: format_address(fh.address) == args.function, function_handles)) @@ -174,7 +149,7 @@ def main(argv=None): feature_map.update(get_file_features(function_handles, extractor)) - rules_feature_set = get_rules_feature_set(args.rules) + rules_feature_set = get_rules_feature_set(rules) print_unused_features(feature_map, rules_feature_set) return 0 @@ -206,7 +181,8 @@ def ida_main(): 
feature_map.update(get_file_features(function_handles, extractor)) rules_path = capa.main.get_default_root() / "rules" - rules_feature_set = get_rules_feature_set([rules_path]) + rules = capa.rules.get_rules([rules_path]) + rules_feature_set = get_rules_feature_set(rules) print_unused_features(feature_map, rules_feature_set) From 43bb9e72f56b5c0ab26f80efd725ef0c287be409 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 10:47:23 +0000 Subject: [PATCH 021/200] scripts: port to main routines --- capa/main.py | 32 ++++++----- scripts/detect-elf-os.py | 28 +++------- scripts/match-function-id.py | 37 ++++-------- scripts/profile-time.py | 40 ++++--------- scripts/show-capabilities-by-function.py | 71 ++++++------------------ 5 files changed, 65 insertions(+), 143 deletions(-) diff --git a/capa/main.py b/capa/main.py index b8b9eab86..b8e931621 100644 --- a/capa/main.py +++ b/capa/main.py @@ -684,6 +684,22 @@ def find_file_limitations_from_args(args, rules: RuleSet, file_extractors: List[ return found_file_limitation +def get_signatures_from_args(args, input_format: str, backend: str) -> List[Path]: + if backend != BACKEND_VIV: + logger.debug("skipping library code matching: only supported by the vivisect backend") + return [] + + if input_format != FORMAT_PE: + logger.debug("skipping library code matching: signatures only supports PE files") + return [] + + try: + return capa.loader.get_signatures(args.signatures) + except IOError as e: + logger.error("%s", str(e)) + raise ShouldExitError(E_INVALID_SIG) from e + + def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExtractor: """ args: @@ -694,21 +710,7 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt raises: ShouldExitError: if the program is invoked incorrectly and should exit. """ - # all other formats we must create an extractor, - # such as viv, binary ninja, etc. workspaces - # and use those for extracting. 
- - try: - sig_paths = [] - if backend != BACKEND_VIV: - logger.debug("skipping library code matching: only supported by the vivisect backend") - elif input_format != FORMAT_PE: - logger.debug("skipping library code matching: signatures only supports PE files") - else: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - raise ShouldExitError(E_INVALID_SIG) from e + sig_paths = get_signatures_from_args(args, input_format, backend) should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) diff --git a/scripts/detect-elf-os.py b/scripts/detect-elf-os.py index 5adf85de7..89cafe499 100644 --- a/scripts/detect-elf-os.py +++ b/scripts/detect-elf-os.py @@ -17,8 +17,8 @@ import argparse import contextlib from typing import BinaryIO -from pathlib import Path +import capa.main import capa.helpers import capa.features.extractors.elf @@ -36,28 +36,16 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Detect the underlying OS for the given ELF file") - parser.add_argument("sample", type=str, help="path to ELF file") - - logging_group = parser.add_argument_group("logging arguments") - - logging_group.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR") - logging_group.add_argument( - "-q", "--quiet", action="store_true", help="disable all status output except fatal errors" - ) - + capa.main.install_common_args(parser, wanted={"input_file"}) args = parser.parse_args(args=argv) - if args.quiet: - logging.basicConfig(level=logging.WARNING) - logging.getLogger().setLevel(logging.WARNING) - elif args.debug: - logging.basicConfig(level=logging.DEBUG) - logging.getLogger().setLevel(logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - logging.getLogger().setLevel(logging.INFO) + try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + except 
capa.main.ShouldExitError as e: + return e.status_code - f = Path(args.sample).open("rb") + f = args.input_file.open("rb") with contextlib.closing(f): try: diff --git a/scripts/match-function-id.py b/scripts/match-function-id.py index 7fe51e238..a3e52c5f0 100644 --- a/scripts/match-function-id.py +++ b/scripts/match-function-id.py @@ -71,40 +71,26 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="FLIRT match each function") - parser.add_argument("sample", type=str, help="Path to sample to analyze") + capa.main.install_common_args(parser, wanted={"input_file", "signatures"}) parser.add_argument( "-F", "--function", type=lambda x: int(x, 0x10), help="match a specific function by VA, rather than add functions", ) - parser.add_argument( - "--signature", - action="append", - dest="signatures", - type=str, - default=[], - help="use the given signatures to identify library functions, file system paths to .sig/.pat files.", - ) - parser.add_argument("-d", "--debug", action="store_true", help="Enable debugging output on STDERR") - parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors") args = parser.parse_args(args=argv) - if args.quiet: - logging.basicConfig(level=logging.ERROR) - logging.getLogger().setLevel(logging.ERROR) - elif args.debug: - logging.basicConfig(level=logging.DEBUG) - logging.getLogger().setLevel(logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - logging.getLogger().setLevel(logging.INFO) - - # disable vivisect-related logging, it's verbose and not relevant for capa users - capa.main.set_vivisect_log_level(logging.CRITICAL) + try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + input_format = capa.main.get_input_format_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) + sig_paths = capa.main.get_signatures_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + 
return e.status_code analyzers = [] - for sigpath in args.signatures: + for sigpath in sig_paths: sigs = viv_utils.flirt.load_flirt_signature(sigpath) with capa.main.timing("flirt: compiling sigs"): @@ -123,7 +109,8 @@ def main(argv=None): for function in functions: logger.debug("matching function: 0x%04x", function) for analyzer in analyzers: - name = viv_utils.flirt.match_function_flirt_signatures(analyzer.matcher, vw, function) + viv_utils.flirt.match_function_flirt_signatures(analyzer.matcher, vw, function) + name = viv_utils.get_function_name(vw, function) if name: print(f"0x{function:04x}: {name}") diff --git a/scripts/profile-time.py b/scripts/profile-time.py index d67ba5385..222da615e 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -41,7 +41,6 @@ import logging import argparse import subprocess -from pathlib import Path import tqdm import tabulate @@ -50,6 +49,7 @@ import capa.perf import capa.rules import capa.engine +import capa.loader import capa.helpers import capa.features import capa.features.common @@ -74,42 +74,22 @@ def main(argv=None): label += " (dirty)" parser = argparse.ArgumentParser(description="Profile capa performance") - capa.main.install_common_args(parser, wanted={"format", "os", "sample", "signatures", "rules"}) - + capa.main.install_common_args(parser, wanted={"format", "os", "input_file", "signatures", "rules"}) parser.add_argument("--number", type=int, default=3, help="batch size of profile collection") parser.add_argument("--repeat", type=int, default=30, help="batch count of profile collection") parser.add_argument("--label", type=str, default=label, help="description of the profile collection") - args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) - - try: - taste = capa.helpers.get_file_taste(Path(args.sample)) - except IOError as e: - logger.error("%s", str(e)) - return -1 try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + input_format = 
capa.main.get_input_format_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) with capa.main.timing("load rules"): - rules = capa.rules.get_rules(args.rules) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - try: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - if (args.format == "freeze") or ( - args.format == capa.features.common.FORMAT_AUTO and capa.features.freeze.is_freeze(taste) - ): - extractor = capa.features.freeze.load(Path(args.sample).read_bytes()) - else: - extractor = capa.loader.get_extractor( - args.sample, args.format, args.os, capa.main.BACKEND_VIV, sig_paths, should_save_workspace=False - ) + rules = capa.main.get_rules_from_args(args) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + return e.status_code with tqdm.tqdm(total=args.number * args.repeat, leave=False) as pbar: diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 22ebd1e55..8ad67605f 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -55,13 +55,11 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" -import os import sys import logging import argparse import collections from typing import Dict -from pathlib import Path import colorama @@ -76,10 +74,7 @@ import capa.features.freeze import capa.capabilities.common import capa.render.result_document as rd -from capa.helpers import get_file_taste -from capa.features.common import FORMAT_AUTO from capa.features.freeze import Address -from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor logger = logging.getLogger("capa.show-capabilities-by-function") @@ -142,67 +137,37 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="detect capabilities in programs.") - capa.main.install_common_args(parser, wanted={"format", "os", "backend", "sample", "signatures", "rules", "tag"}) + capa.main.install_common_args( + parser, wanted={"format", "os", "backend", "input_file", "signatures", "rules", "tag"} + ) args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) try: - taste = get_file_taste(Path(args.sample)) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - try: - rules = capa.rules.get_rules(args.rules) - logger.info("successfully loaded %s rules", len(rules)) - if args.tag: - rules = rules.filter_rules_by_meta(args.tag) - logger.info("selected %s rules", len(rules)) - except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: - logger.error("%s", str(e)) - return -1 - - try: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - return -1 - - if (args.format == "freeze") or (args.format == FORMAT_AUTO and capa.features.freeze.is_freeze(taste)): - format_ = "freeze" - extractor: FeatureExtractor = capa.features.freeze.load(Path(args.sample).read_bytes()) - else: - format_ = args.format - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - - try: - extractor = capa.loader.get_extractor( - 
args.sample, args.format, args.os, args.backend, sig_paths, should_save_workspace - ) - assert isinstance(extractor, StaticFeatureExtractor) - except capa.exceptions.UnsupportedFormatError: - capa.helpers.log_unsupported_format_error() - return -1 - except capa.exceptions.UnsupportedRuntimeError: - capa.helpers.log_unsupported_runtime_error() - return -1 + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + input_format = capa.main.get_input_format_from_args(args) + rules = capa.main.get_rules_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) + sample_path = capa.main.get_sample_path_from_args(args, backend) + if sample_path is None: + os_ = "unknown" + else: + os_ = capa.loader.get_os(sample_path) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + return e.status_code capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor) - meta = capa.loader.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor, counts) + meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) if capa.capabilities.common.has_file_limitation(rules, capabilities): # bail if capa encountered file limitation e.g. a packed binary # do show the output in verbose mode, though. if not (args.verbose or args.vverbose or args.json): - return -1 + return capa.main.E_FILE_LIMITATION - # colorama will detect: - # - when on Windows console, and fixup coloring, and - # - when not an interactive session, and disable coloring - # renderers should use coloring and assume it will be stripped out if necessary. 
- colorama.init() doc = rd.ResultDocument.from_capa(meta, rules, capabilities) print(render_matches_by_function(doc)) colorama.deinit() From 786cbb8ccb66e34e4cf1503e3f7accdc72bacb0c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:28:33 +0000 Subject: [PATCH 022/200] main: better handle auto-detection of backend --- capa/main.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/capa/main.py b/capa/main.py index b8e931621..bd417b830 100644 --- a/capa/main.py +++ b/capa/main.py @@ -84,7 +84,7 @@ RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" -BACKEND_DEFAULT = "(default) use default backend for given file type" +BACKEND_AUTO = "auto" E_MISSING_RULES = 10 E_MISSING_FILE = 11 @@ -239,6 +239,7 @@ def install_common_args(parser, wanted=None): (FORMAT_FREEZE, "features previously frozen by capa"), ] format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats]) + parser.add_argument( "-f", "--format", @@ -248,13 +249,23 @@ def install_common_args(parser, wanted=None): ) if "backend" in wanted: + backends = [ + (BACKEND_AUTO, "(default) detect appropriate backend automatically"), + (BACKEND_VIV, "vivisect"), + (BACKEND_PEFILE, "pefile (file features only)"), + (BACKEND_BINJA, "Binary Ninja"), + (BACKEND_DOTNET, ".NET"), + (BACKEND_FREEZE, "capa freeze"), + (BACKEND_CAPE, "CAPE"), + ] + backend_help = ", ".join([f"{f[0]}: {f[1]}" for f in backends]) parser.add_argument( "-b", "--backend", type=str, - help="select the backend to use", - choices=(BACKEND_VIV, BACKEND_BINJA, BACKEND_PEFILE, BACKEND_CAPE), - default=BACKEND_DEFAULT, + choices=[f[0] for f in backends], + default=BACKEND_AUTO, + help=f"select backend, {backend_help}" ) if "os" in wanted: @@ -504,7 +515,7 @@ def get_backend_from_args(args, input_format: str) -> str: raises: ShouldExitError: if the program is invoked incorrectly and should exit. 
""" - if args.backend != BACKEND_DEFAULT: + if args.backend != BACKEND_AUTO: return args.backend if input_format == FORMAT_CAPE: @@ -601,7 +612,7 @@ def get_rules_from_args(args) -> RuleSet: len(list(filter(lambda r: not (r.is_subscope_rule()), rules.rules.values()))), ) - if args.tag: + if hasattr(args, "tag"): rules = rules.filter_rules_by_meta(args.tag) logger.debug("selected %d rules", len(rules)) for i, r in enumerate(rules.rules, 1): From 0cb1f9e8f89ff0de13b8f15ea74346a123a2a08e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:29:15 +0000 Subject: [PATCH 023/200] scripts: migrate bulk-process to main wrappers --- scripts/bulk-process.py | 100 ++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index 82c511c25..db3795e4d 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -36,7 +36,7 @@ usage: usage: bulk-process.py [-h] [-r RULES] [-d] [-q] [-n PARALLELISM] [--no-mp] - input + input_directory detect capabilities in programs. @@ -62,7 +62,6 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ -import os import sys import json import logging @@ -74,10 +73,10 @@ import capa import capa.main import capa.rules +import capa.loader import capa.render.json import capa.capabilities.common import capa.render.result_document as rd -from capa.features.common import OS_AUTO logger = logging.getLogger("capa") @@ -87,11 +86,8 @@ def get_capa_results(args): run capa against the file at the given path, using the given rules. 
args is a tuple, containing: - rules (capa.rules.RuleSet): the rules to match - signatures (List[str]): list of file system paths to signature files - format (str): the name of the sample file format - os (str): the name of the operating system - path (str): the file system path to the sample to process + rules, signatures, format, backend, os, input_file + as provided via the CLI arguments. args is a tuple because i'm not quite sure how to unpack multiple arguments using `map`. @@ -106,44 +102,58 @@ def get_capa_results(args): meta (dict): the meta analysis results capabilities (dict): the matched capabilities and their result objects """ - rules, sigpaths, format, os_, path = args - should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - logger.info("computing capa results for: %s", path) + rules, signatures, format_, backend, os_, input_file = args + + parser = argparse.ArgumentParser(description="detect capabilities in programs.") + capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os", "backend", "input_file"}) + argv = [ + "--signatures", + signatures, + "--format", + format_, + "--backend", + backend, + "--os", + os_, + input_file, + ] + if rules: + argv += ["--rules", rules] + args = parser.parse_args(args=argv) + try: - extractor = capa.loader.get_extractor( - path, format, os_, capa.main.BACKEND_VIV, sigpaths, should_save_workspace, disable_progress=True - ) - except capa.exceptions.UnsupportedFormatError: - # i'm 100% sure if multiprocessing will reliably raise exceptions across process boundaries. 
+ capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_args(args) + input_format = capa.main.get_input_format_from_args(args) + rules = capa.main.get_rules_from_args(args) + backend = capa.main.get_backend_from_args(args, input_format) + sample_path = capa.main.get_sample_path_from_args(args, backend) + if sample_path is None: + os_ = "unknown" + else: + os_ = capa.loader.get_os(sample_path) + extractor = capa.main.get_extractor_from_args(args, input_format, backend) + except capa.main.ShouldExitError as e: + # i'm not 100% sure if multiprocessing will reliably raise exceptions across process boundaries. # so instead, return an object with explicit success/failure status. # # if success, then status=ok, and results found in property "ok" # if error, then status=error, and human readable message in property "error" - return { - "path": path, - "status": "error", - "error": f"input file does not appear to be a PE file: {path}", - } - except capa.exceptions.UnsupportedRuntimeError: - return { - "path": path, - "status": "error", - "error": "unsupported runtime or Python interpreter", - } + return {"path": input_file, "status": "error", "error": str(e), "status_code": e.status_code} except Exception as e: return { - "path": path, + "path": input_file, "status": "error", "error": f"unexpected error: {e}", } capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor, disable_progress=True) - meta = capa.loader.collect_metadata([], path, format, os_, [], extractor, counts) + meta = capa.loader.collect_metadata(argv, args.input_file, format_, os_, [], extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) doc = rd.ResultDocument.from_capa(meta, rules, capabilities) - return {"path": path, "status": "ok", "ok": doc.model_dump()} + return {"path": input_file, "status": "ok", "ok": doc.model_dump()} def main(argv=None): @@ -151,30 +161,16 @@ def main(argv=None): argv = sys.argv[1:] parser 
= argparse.ArgumentParser(description="detect capabilities in programs.") - capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os"}) - parser.add_argument("input", type=str, help="Path to directory of files to recursively analyze") + capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os", "backend"}) + parser.add_argument("input_directory", type=str, help="Path to directory of files to recursively analyze") parser.add_argument( "-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor" ) parser.add_argument("--no-mp", action="store_true", help="disable subprocesses") args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) - - try: - rules = capa.rules.get_rules(args.rules) - logger.info("successfully loaded %s rules", len(rules)) - except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: - logger.error("%s", str(e)) - return -1 - - try: - sig_paths = capa.loader.get_signatures(args.signatures) - except IOError as e: - logger.error("%s", str(e)) - return -1 samples = [] - for file in Path(args.input).rglob("*"): + for file in Path(args.input_directory).rglob("*"): samples.append(file) cpu_count = multiprocessing.cpu_count() @@ -203,18 +199,24 @@ def map(f, args, parallelism=None): logger.debug("using process mapper") mapper = pmap + rules = args.rules + if rules == [capa.main.RULES_PATH_DEFAULT_STRING]: + rules = None + results = {} for result in mapper( get_capa_results, - [(rules, sig_paths, "pe", OS_AUTO, sample) for sample in samples], + [(rules, args.signatures, args.format, args.backend, args.os, str(sample)) for sample in samples], parallelism=args.parallelism, ): if result["status"] == "error": logger.warning(result["error"]) elif result["status"] == "ok": - results[result["path"].as_posix()] = rd.ResultDocument.model_validate(result["ok"]).model_dump_json( + doc = rd.ResultDocument.model_validate(result["ok"]).model_dump_json( 
exclude_none=True ) + results[result["path"]] = json.loads(doc) + else: raise ValueError(f"unexpected status: {result['status']}") From b3b19432016f79031092f5527dad7e87fc47af6d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:37:43 +0000 Subject: [PATCH 024/200] scripts: migrate scripts to main wrappers --- capa/main.py | 2 +- scripts/bulk-process.py | 4 +--- scripts/cache-ruleset.py | 11 +++++------ scripts/capafmt.py | 2 +- scripts/proto-from-results.py | 23 ++++++----------------- scripts/proto-to-results.py | 23 ++++++----------------- 6 files changed, 20 insertions(+), 45 deletions(-) diff --git a/capa/main.py b/capa/main.py index bd417b830..e029a9322 100644 --- a/capa/main.py +++ b/capa/main.py @@ -265,7 +265,7 @@ def install_common_args(parser, wanted=None): type=str, choices=[f[0] for f in backends], default=BACKEND_AUTO, - help=f"select backend, {backend_help}" + help=f"select backend, {backend_help}", ) if "os" in wanted: diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index db3795e4d..fd21a6720 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -212,9 +212,7 @@ def map(f, args, parallelism=None): if result["status"] == "error": logger.warning(result["error"]) elif result["status"] == "ok": - doc = rd.ResultDocument.model_validate(result["ok"]).model_dump_json( - exclude_none=True - ) + doc = rd.ResultDocument.model_validate(result["ok"]).model_dump_json(exclude_none=True) results[result["path"]] = json.loads(doc) else: diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 89137650d..76990b1c6 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -36,15 +36,14 @@ def main(argv=None): parser = argparse.ArgumentParser(description="Cache ruleset.") capa.main.install_common_args(parser) - parser.add_argument("rules", type=str, action="append", help="Path to rules") + capa.main.install_common_args(parser, wanted={"rules"}) parser.add_argument("cache", type=str, 
help="Path to cache directory") args = parser.parse_args(args=argv) - capa.main.handle_common_args(args) - if args.debug: - logging.getLogger("capa").setLevel(logging.DEBUG) - else: - logging.getLogger("capa").setLevel(logging.ERROR) + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code try: cache_dir = Path(args.cache) diff --git a/scripts/capafmt.py b/scripts/capafmt.py index b420b8efb..de4171ea8 100644 --- a/scripts/capafmt.py +++ b/scripts/capafmt.py @@ -30,7 +30,7 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Capa rule formatter.") - capa.main.install_common_args(parser, wanted={}) + capa.main.install_common_args(parser) parser.add_argument("path", type=str, help="Path to rule to format") parser.add_argument( "-i", diff --git a/scripts/proto-from-results.py b/scripts/proto-from-results.py index 61df56b6e..68b0eefec 100644 --- a/scripts/proto-from-results.py +++ b/scripts/proto-from-results.py @@ -33,6 +33,7 @@ import argparse from pathlib import Path +import capa.main import capa.render.proto import capa.render.result_document @@ -44,26 +45,14 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Convert a capa JSON result document into the protobuf format") + capa.main.install_common_args(parser) parser.add_argument("json", type=str, help="path to JSON result document file, produced by `capa --json`") - - logging_group = parser.add_argument_group("logging arguments") - - logging_group.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR") - logging_group.add_argument( - "-q", "--quiet", action="store_true", help="disable all status output except fatal errors" - ) - args = parser.parse_args(args=argv) - if args.quiet: - logging.basicConfig(level=logging.WARNING) - logging.getLogger().setLevel(logging.WARNING) - elif args.debug: - logging.basicConfig(level=logging.DEBUG) - 
logging.getLogger().setLevel(logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - logging.getLogger().setLevel(logging.INFO) + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code rd = capa.render.result_document.ResultDocument.from_file(Path(args.json)) pb = capa.render.proto.doc_to_pb2(rd) diff --git a/scripts/proto-to-results.py b/scripts/proto-to-results.py index 3bb165704..b413cd9dc 100644 --- a/scripts/proto-to-results.py +++ b/scripts/proto-to-results.py @@ -36,6 +36,7 @@ import argparse from pathlib import Path +import capa.main import capa.render.json import capa.render.proto import capa.render.proto.capa_pb2 @@ -49,28 +50,16 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Convert a capa protobuf result document into the JSON format") + capa.main.install_common_args(parser) parser.add_argument( "pb", type=str, help="path to protobuf result document file, produced by `proto-from-results.py`" ) - - logging_group = parser.add_argument_group("logging arguments") - - logging_group.add_argument("-d", "--debug", action="store_true", help="enable debugging output on STDERR") - logging_group.add_argument( - "-q", "--quiet", action="store_true", help="disable all status output except fatal errors" - ) - args = parser.parse_args(args=argv) - if args.quiet: - logging.basicConfig(level=logging.WARNING) - logging.getLogger().setLevel(logging.WARNING) - elif args.debug: - logging.basicConfig(level=logging.DEBUG) - logging.getLogger().setLevel(logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - logging.getLogger().setLevel(logging.INFO) + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code pb = Path(args.pb).read_bytes() From 189ae242b874b8c8d93465436b5407a6e483611b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:47:11 +0000 Subject: [PATCH 025/200] main: rename 
*_from_args to *_from_cli --- capa/features/freeze/__init__.py | 8 ++-- capa/main.py | 47 ++++++++++++------------ scripts/bulk-process.py | 12 +++--- scripts/capa2yara.py | 2 +- scripts/detect-elf-os.py | 2 +- scripts/lint.py | 2 +- scripts/match-function-id.py | 8 ++-- scripts/profile-time.py | 10 ++--- scripts/show-capabilities-by-function.py | 12 +++--- scripts/show-features.py | 8 ++-- scripts/show-unused-features.py | 10 ++--- 11 files changed, 61 insertions(+), 60 deletions(-) diff --git a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index cbaafee61..2dac7f48e 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -688,10 +688,10 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + capa.main.ensure_input_exists_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: return e.status_code diff --git a/capa/main.py b/capa/main.py index e029a9322..c36fc6dd5 100644 --- a/capa/main.py +++ b/capa/main.py @@ -318,11 +318,12 @@ def install_common_args(parser, wanted=None): # # Other scripts may use this routines, but should also prefer to invoke them # directly within `main()`, not within library code. +# Library code should *not* call these functions. # # These main routines may raise `ShouldExitError` to indicate the program # ...should exit. Its a tiny step away from doing `sys.exit()` directly. # I'm not sure if we should just do that. In the meantime, programs should -# handle `ShoudlExitError` and pass the status code to `sys.exit()`. 
+# handle `ShouldExitError` and pass the status code to `sys.exit()`. # @@ -458,7 +459,7 @@ def handle_common_args(args): args.signatures = sigs_path -def ensure_input_exists_from_args(args): +def ensure_input_exists_from_cli(args): """ args: args: The parsed command line arguments from `install_common_args`. @@ -475,7 +476,7 @@ def ensure_input_exists_from_args(args): raise ShouldExitError(E_MISSING_FILE) from e -def get_input_format_from_args(args) -> str: +def get_input_format_from_cli(args) -> str: """ Determine the format of the input file. @@ -503,7 +504,7 @@ def get_input_format_from_args(args) -> str: raise ShouldExitError(E_INVALID_FILE_TYPE) from e -def get_backend_from_args(args, input_format: str) -> str: +def get_backend_from_cli(args, input_format: str) -> str: """ Determine the backend that should be used for the given input file. Respects an override provided by the user, otherwise, use a good default. @@ -531,7 +532,7 @@ def get_backend_from_args(args, input_format: str) -> str: return BACKEND_VIV -def get_sample_path_from_args(args, backend: str) -> Optional[Path]: +def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: """ Determine the path to the underlying sample, if it exists. @@ -551,7 +552,7 @@ def get_sample_path_from_args(args, backend: str) -> Optional[Path]: return args.input_file -def get_os_from_args(args, backend) -> str: +def get_os_from_cli(args, backend) -> str: """ Determine the OS for the given sample. Respects an override provided by the user, otherwise, use heuristics and @@ -567,13 +568,13 @@ def get_os_from_args(args, backend) -> str: if args.os: return args.os - sample_path = get_sample_path_from_args(args, backend) + sample_path = get_sample_path_from_cli(args, backend) if sample_path is None: return "unknown" return capa.loader.get_os(sample_path) -def get_rules_from_args(args) -> RuleSet: +def get_rules_from_cli(args) -> RuleSet: """ args: args: The parsed command line arguments from `install_common_args`. 
@@ -621,7 +622,7 @@ def get_rules_from_args(args) -> RuleSet: return rules -def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtractor]: +def get_file_extractors_from_cli(args, input_format: str) -> List[FeatureExtractor]: """ args: args: The parsed command line arguments from `install_common_args`. @@ -660,7 +661,7 @@ def get_file_extractors_from_args(args, input_format: str) -> List[FeatureExtrac raise ShouldExitError(E_INVALID_FILE_TYPE) from e -def find_file_limitations_from_args(args, rules: RuleSet, file_extractors: List[FeatureExtractor]) -> bool: +def find_file_limitations_from_cli(args, rules: RuleSet, file_extractors: List[FeatureExtractor]) -> bool: """ args: args: The parsed command line arguments from `install_common_args`. @@ -695,7 +696,7 @@ def find_file_limitations_from_args(args, rules: RuleSet, file_extractors: List[ return found_file_limitation -def get_signatures_from_args(args, input_format: str, backend: str) -> List[Path]: +def get_signatures_from_cli(args, input_format: str, backend: str) -> List[Path]: if backend != BACKEND_VIV: logger.debug("skipping library code matching: only supported by the vivisect backend") return [] @@ -711,7 +712,7 @@ def get_signatures_from_args(args, input_format: str, backend: str) -> List[Path raise ShouldExitError(E_INVALID_SIG) from e -def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExtractor: +def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtractor: """ args: args: The parsed command line arguments from `install_common_args`. @@ -721,12 +722,12 @@ def get_extractor_from_args(args, input_format: str, backend: str) -> FeatureExt raises: ShouldExitError: if the program is invoked incorrectly and should exit. 
""" - sig_paths = get_signatures_from_args(args, input_format, backend) + sig_paths = get_signatures_from_cli(args, input_format, backend) should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) - os_ = get_os_from_args(args, backend) - sample_path = get_sample_path_from_args(args, backend) + os_ = get_os_from_cli(args, backend) + sample_path = get_sample_path_from_cli(args, backend) # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere # see same code and show-features above examples @@ -801,11 +802,11 @@ def main(argv: Optional[List[str]] = None): try: handle_common_args(args) - ensure_input_exists_from_args(args) - input_format = get_input_format_from_args(args) - rules = get_rules_from_args(args) - file_extractors = get_file_extractors_from_args(args, input_format) - found_file_limitation = find_file_limitations_from_args(args, rules, file_extractors) + ensure_input_exists_from_cli(args) + input_format = get_input_format_from_cli(args) + rules = get_rules_from_cli(args) + file_extractors = get_file_extractors_from_cli(args, input_format) + found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) except ShouldExitError as e: return e.status_code @@ -823,13 +824,13 @@ def main(argv: Optional[List[str]] = None): # and use that to extract meta and capabilities try: - backend = get_backend_from_args(args, input_format) - sample_path = get_sample_path_from_args(args, backend) + backend = get_backend_from_cli(args, input_format) + sample_path = get_sample_path_from_cli(args, backend) if sample_path is None: os_ = "unknown" else: os_ = capa.loader.get_os(sample_path) - extractor = get_extractor_from_args(args, input_format, backend) + extractor = get_extractor_from_cli(args, input_format, backend) except ShouldExitError as e: return e.status_code diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index fd21a6720..0cb315035 100644 --- 
a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -123,16 +123,16 @@ def get_capa_results(args): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - rules = capa.main.get_rules_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) - sample_path = capa.main.get_sample_path_from_args(args, backend) + capa.main.ensure_input_exists_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + rules = capa.main.get_rules_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) + sample_path = capa.main.get_sample_path_from_cli(args, backend) if sample_path is None: os_ = "unknown" else: os_ = capa.loader.get_os(sample_path) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: # i'm not 100% sure if multiprocessing will reliably raise exceptions across process boundaries. # so instead, return an object with explicit success/failure status. 
diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 5f9c1d806..592e4b0bc 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -730,7 +730,7 @@ def main(argv=None): try: capa.main.handle_common_args(args) - rules = capa.main.get_rules_from_args(args) + rules = capa.main.get_rules_from_cli(args) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/detect-elf-os.py b/scripts/detect-elf-os.py index 89cafe499..2dfd86b76 100644 --- a/scripts/detect-elf-os.py +++ b/scripts/detect-elf-os.py @@ -41,7 +41,7 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) + capa.main.ensure_input_exists_from_cli(args) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/lint.py b/scripts/lint.py index 4f3f98a01..93440395d 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -1013,7 +1013,7 @@ def main(argv=None): time0 = time.time() try: - rules = capa.main.get_rules_from_args(args) + rules = capa.main.get_rules_from_cli(args) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/match-function-id.py b/scripts/match-function-id.py index a3e52c5f0..6f1da4258 100644 --- a/scripts/match-function-id.py +++ b/scripts/match-function-id.py @@ -82,10 +82,10 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) - sig_paths = capa.main.get_signatures_from_args(args, input_format, backend) + capa.main.ensure_input_exists_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) + sig_paths = capa.main.get_signatures_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/profile-time.py b/scripts/profile-time.py index 
222da615e..dd0107c10 100644 --- a/scripts/profile-time.py +++ b/scripts/profile-time.py @@ -82,12 +82,12 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) + capa.main.ensure_input_exists_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) with capa.main.timing("load rules"): - rules = capa.main.get_rules_from_args(args) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + rules = capa.main.get_rules_from_cli(args) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 8ad67605f..5a1c0ea1c 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -144,16 +144,16 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - rules = capa.main.get_rules_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) - sample_path = capa.main.get_sample_path_from_args(args, backend) + capa.main.ensure_input_exists_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + rules = capa.main.get_rules_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) + sample_path = capa.main.get_sample_path_from_cli(args, backend) if sample_path is None: os_ = "unknown" else: os_ = capa.loader.get_os(sample_path) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: 
return e.status_code diff --git a/scripts/show-features.py b/scripts/show-features.py index 92f857f7d..d70c6815b 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -105,16 +105,16 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) + capa.main.ensure_input_exists_from_cli(args) if args.function and args.backend == "pefile": print("pefile backend does not support extracting function features") return -1 - input_format = capa.main.get_input_format_from_args(args) + input_format = capa.main.get_input_format_from_cli(args) - backend = capa.main.get_backend_from_args(args, input_format) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + backend = capa.main.get_backend_from_cli(args, input_format) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: return e.status_code diff --git a/scripts/show-unused-features.py b/scripts/show-unused-features.py index dc9dc4b42..0390cd640 100644 --- a/scripts/show-unused-features.py +++ b/scripts/show-unused-features.py @@ -112,11 +112,11 @@ def main(argv=None): try: capa.main.handle_common_args(args) - capa.main.ensure_input_exists_from_args(args) - rules = capa.main.get_rules_from_args(args) - input_format = capa.main.get_input_format_from_args(args) - backend = capa.main.get_backend_from_args(args, input_format) - extractor = capa.main.get_extractor_from_args(args, input_format, backend) + capa.main.ensure_input_exists_from_cli(args) + rules = capa.main.get_rules_from_cli(args) + input_format = capa.main.get_input_format_from_cli(args) + backend = capa.main.get_backend_from_cli(args, input_format) + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) except capa.main.ShouldExitError as e: return e.status_code From 7d80c9101d78e8c6c1075f2d2c0ae6b49f394d43 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:47:27 +0000 
Subject: [PATCH 026/200] changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 493aaea87..2caecaebe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ ### Breaking Changes +- main: introduce wrapping routines within main for working with CLI args #1813 @williballenthin +- move functions from `capa.main` to new `capa.loader` namespace #1821 @williballenthin + ### New Rules (0) - @@ -45,7 +48,6 @@ Also a big thanks to the other contributors: @aaronatp, @Aayush-Goel-04, @bkojus - protobuf: deprecate `Metadata.analysis` in favor of `Metadata.analysis2` that is dynamic analysis aware @williballenthin - update freeze format to v3, adding support for dynamic analysis @williballenthin - extractor: ignore DLL name for api features #1815 @mr-tz -- main: introduce wrapping routines within main for working with CLI args #1813 @williballenthin ### New Rules (41) From 435a3ca55a573833520e1a371d3882c2b4a9d77e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:48:35 +0000 Subject: [PATCH 027/200] cache-ruleset: remove duplication --- scripts/cache-ruleset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 76990b1c6..5f3847446 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -35,7 +35,6 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Cache ruleset.") - capa.main.install_common_args(parser) capa.main.install_common_args(parser, wanted={"rules"}) parser.add_argument("cache", type=str, help="Path to cache directory") args = parser.parse_args(args=argv) From 3b4d2f47c6c10beecc34519d191f6988e77af990 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:50:24 +0000 Subject: [PATCH 028/200] main: fix tag handling --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 
c36fc6dd5..7fe713dba 100644 --- a/capa/main.py +++ b/capa/main.py @@ -613,7 +613,7 @@ def get_rules_from_cli(args) -> RuleSet: len(list(filter(lambda r: not (r.is_subscope_rule()), rules.rules.values()))), ) - if hasattr(args, "tag"): + if hasattr(args, "tag") and args.tag: rules = rules.filter_rules_by_meta(args.tag) logger.debug("selected %d rules", len(rules)) for i, r in enumerate(rules.rules, 1): From 2b86c6e889d99559b53bb037d11316c1a0412934 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 11:52:00 +0000 Subject: [PATCH 029/200] cache-ruleset: fix cli args --- scripts/cache-ruleset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 5f3847446..41dbc808c 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -35,7 +35,8 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Cache ruleset.") - capa.main.install_common_args(parser, wanted={"rules"}) + capa.main.install_common_args(parser) + parser.add_argument("rules", type=str, help="Path to rules directory") parser.add_argument("cache", type=str, help="Path to cache directory") args = parser.parse_args(args=argv) From e3c80187ea67ea8f2f71809f30921d6e2c7c6d8b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 12:02:45 +0000 Subject: [PATCH 030/200] cache-ruleset: fix special rule cli handling --- scripts/cache-ruleset.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 41dbc808c..0e364622b 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -40,15 +40,23 @@ def main(argv=None): parser.add_argument("cache", type=str, help="Path to cache directory") args = parser.parse_args(args=argv) - try: - capa.main.handle_common_args(args) - except capa.main.ShouldExitError as e: - return e.status_code + # don't use capa.main.handle_common_args 
+ # because it expects a different format for the --rules argument + + if args.quiet: + logging.basicConfig(level=logging.WARNING) + logging.getLogger().setLevel(logging.WARNING) + elif args.debug: + logging.basicConfig(level=logging.DEBUG) + logging.getLogger().setLevel(logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + logging.getLogger().setLevel(logging.INFO) try: cache_dir = Path(args.cache) cache_dir.mkdir(parents=True, exist_ok=True) - rules = capa.rules.get_rules(args.rules, cache_dir) + rules = capa.rules.get_rules([Path(args.rules)], cache_dir) logger.info("successfully loaded %s rules", len(rules)) except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: logger.error("%s", str(e)) From de45f204a2410f50dc0a0b49f46a55ec9efe55bd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:15:44 +0000 Subject: [PATCH 031/200] scripts: fix type bytes --- scripts/capa2yara.py | 26 ++++++++++++++++++++------ scripts/match-function-id.py | 20 +++++++++++++------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index 592e4b0bc..b1adb3625 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -723,16 +723,30 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="Capa to YARA rule converter") - parser.add_argument("rules", type=str, help="Path to rules") - parser.add_argument("--private", "-p", action="store_true", help="Create private rules", default=False) capa.main.install_common_args(parser, wanted={"tag"}) + parser.add_argument("--private", "-p", action="store_true", help="Create private rules", default=False) + parser.add_argument("rules", type=str, help="Path to rules directory") args = parser.parse_args(args=argv) + # don't use capa.main.handle_common_args + # because it expects a different format for the --rules argument + + if args.quiet: + logging.basicConfig(level=logging.WARNING) + 
logging.getLogger().setLevel(logging.WARNING) + elif args.debug: + logging.basicConfig(level=logging.DEBUG) + logging.getLogger().setLevel(logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + logging.getLogger().setLevel(logging.INFO) + try: - capa.main.handle_common_args(args) - rules = capa.main.get_rules_from_cli(args) - except capa.main.ShouldExitError as e: - return e.status_code + rules = capa.rules.get_rules([Path(args.rules)]) + logger.info("successfully loaded %s rules", len(rules)) + except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e: + logger.error("%s", str(e)) + return -1 namespaces = capa.rules.index_rules_by_namespace(list(rules.rules.values())) diff --git a/scripts/match-function-id.py b/scripts/match-function-id.py index 6f1da4258..7896e24b2 100644 --- a/scripts/match-function-id.py +++ b/scripts/match-function-id.py @@ -62,6 +62,7 @@ import capa.helpers import capa.features import capa.features.freeze +from capa.loader import BACKEND_VIV logger = logging.getLogger("capa.match-function-id") @@ -71,7 +72,7 @@ def main(argv=None): argv = sys.argv[1:] parser = argparse.ArgumentParser(description="FLIRT match each function") - capa.main.install_common_args(parser, wanted={"input_file", "signatures"}) + capa.main.install_common_args(parser, wanted={"input_file", "signatures", "format"}) parser.add_argument( "-F", "--function", @@ -84,35 +85,40 @@ def main(argv=None): capa.main.handle_common_args(args) capa.main.ensure_input_exists_from_cli(args) input_format = capa.main.get_input_format_from_cli(args) - backend = capa.main.get_backend_from_cli(args, input_format) - sig_paths = capa.main.get_signatures_from_cli(args, input_format, backend) + sig_paths = capa.main.get_signatures_from_cli(args, input_format, BACKEND_VIV) except capa.main.ShouldExitError as e: return e.status_code analyzers = [] for sigpath in sig_paths: - sigs = viv_utils.flirt.load_flirt_signature(sigpath) + sigs = 
viv_utils.flirt.load_flirt_signature(str(sigpath)) with capa.main.timing("flirt: compiling sigs"): matcher = flirt.compile(sigs) - analyzer = viv_utils.flirt.FlirtFunctionAnalyzer(matcher, sigpath) + analyzer = viv_utils.flirt.FlirtFunctionAnalyzer(matcher, str(sigpath)) logger.debug("registering viv function analyzer: %s", repr(analyzer)) analyzers.append(analyzer) - vw = viv_utils.getWorkspace(args.sample, analyze=True, should_save=False) + vw = viv_utils.getWorkspace(str(args.input_file), analyze=True, should_save=False) functions = vw.getFunctions() if args.function: functions = [args.function] + seen = set() for function in functions: logger.debug("matching function: 0x%04x", function) for analyzer in analyzers: viv_utils.flirt.match_function_flirt_signatures(analyzer.matcher, vw, function) name = viv_utils.get_function_name(vw, function) if name: - print(f"0x{function:04x}: {name}") + key = (function, name) + if key in seen: + continue + else: + print(f"0x{function:04x}: {name}") + seen.add(key) return 0 From 9c61809acd0ff95f7351eec0135d75f9721ba2d6 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:29:40 +0000 Subject: [PATCH 032/200] main: nicely format debug messages --- capa/main.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/capa/main.py b/capa/main.py index 7fe713dba..bb61c4879 100644 --- a/capa/main.py +++ b/capa/main.py @@ -404,8 +404,12 @@ def handle_common_args(args): if args.rules == [RULES_PATH_DEFAULT_STRING]: logger.debug("-" * 80) logger.debug(" Using default embedded rules.") - logger.debug(" To provide your own rules, use the form `capa.exe -r ./path/to/rules/ /path/to/mal.exe`.") + logger.debug(" To provide your own rules, use the form:") + logger.debug("") + logger.debug(" `capa.exe -r ./path/to/rules/ /path/to/mal.exe`.") + logger.debug("") logger.debug(" You can see the current default rule set here:") + logger.debug("") logger.debug(" https://github.com/mandiant/capa-rules") 
logger.debug("-" * 80) @@ -438,9 +442,9 @@ def handle_common_args(args): if args.signatures == SIGNATURES_PATH_DEFAULT_STRING: logger.debug("-" * 80) logger.debug(" Using default embedded signatures.") - logger.debug( - " To provide your own signatures, use the form `capa.exe --signature ./path/to/signatures/ /path/to/mal.exe`." - ) + logger.debug(" To provide your own signatures, use the form:") + logger.debug("") + logger.debug(" capa.exe --signature ./path/to/signatures/ /path/to/mal.exe") logger.debug("-" * 80) sigs_path = get_default_root() / "sigs" From 10c2e879c87fe68f4474203aa3e78087a40b8d65 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:32:48 +0000 Subject: [PATCH 033/200] helpers: ensure log messages aren't very long --- capa/helpers.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/capa/helpers.py b/capa/helpers.py index 985b38b7f..e5cf978e8 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -167,7 +167,8 @@ def log_unsupported_format_error(): logger.error(" Input file does not appear to be a supported file.") logger.error(" ") logger.error(" See all supported file formats via capa's help output (-h).") - logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.error(" If you don't know the input file type,") + logger.error(" you can try using the `file` utility to guess it.") logger.error("-" * 80) @@ -175,10 +176,9 @@ def log_unsupported_cape_report_error(error: str): logger.error("-" * 80) logger.error(" Input file is not a valid CAPE report: %s", error) logger.error(" ") - logger.error(" capa currently only supports analyzing standard CAPE reports in JSON format.") - logger.error( - " Please make sure your report file is in the standard format and contains both the static and dynamic sections." 
- ) + logger.error(" capa currently only analyzes CAPE reports in JSON format.") + logger.error(" Please make sure your report file is in the") + logger.error(" standard format and contains both the static and dynamic sections.") logger.error("-" * 80) @@ -194,9 +194,8 @@ def log_unsupported_os_error(): logger.error("-" * 80) logger.error(" Input file does not appear to target a supported OS.") logger.error(" ") - logger.error( - " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)." - ) + logger.error(" capa currently only analyzes executables for some operating systems") + logger.error(" (including Windows and Linux).") logger.error("-" * 80) @@ -214,9 +213,8 @@ def log_unsupported_runtime_error(): logger.error(" ") logger.error(" capa supports running under Python 3.8 and higher.") logger.error(" ") - logger.error( - " If you're seeing this message on the command line, please ensure you're running a supported Python version." - ) + logger.error(" If you're seeing this message on the command line,") + logger.error(" please ensure you're running a supported Python version.") logger.error("-" * 80) From b97d6c5683390934986a363c55ed6282d7975f24 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:55:09 +0000 Subject: [PATCH 034/200] flake8 config --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 171293854..f502ce73a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -86,7 +86,7 @@ repos: - "--config" - ".github/flake8.ini" - "--extend-exclude" - - "capa/render/proto/capa_pb2.py" + - "capa/render/proto/capa_pb2.py,capa/features/extractors/binexport2/binexport_pb2.py" - "capa/" - "scripts/" - "tests/" From 7573c9458532ed22530bc1b68fe7021ee098c68e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:55:38 +0000 Subject: [PATCH 035/200] binexport2: 
formatting --- capa/features/extractors/binexport2/__init__.py | 13 ++++++++++++- capa/features/extractors/binexport2/basicblock.py | 7 ++----- capa/features/extractors/binexport2/extractor.py | 3 +-- capa/features/extractors/binexport2/file.py | 2 +- capa/features/extractors/binexport2/global_.py | 7 +++---- capa/features/extractors/binexport2/insn.py | 2 +- capa/features/extractors/common.py | 2 +- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 858cea593..5a6575e4b 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -1,3 +1,10 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. """ Proto files generated via protobuf v24.4: @@ -9,7 +16,6 @@ from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 - logger = logging.getLogger(__name__) @@ -20,6 +26,11 @@ def get_binexport2(sample: Path) -> BinExport2: def get_sample_from_binexport2(be2: BinExport2) -> Path: + # also search in same directory as input + # for files with the given sha256, + # starting with files with a similar prefix as given. 
+    # TODO(wb): 1755
+    # $CAPA_SAMPLES_DIR/
-from typing import List, Tuple, Iterator, Any +from typing import Any, List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.binexport2.file @@ -23,7 +23,6 @@ StaticFeatureExtractor, ) - # TODO(wb): 1755 TODOType = Any diff --git a/capa/features/extractors/binexport2/file.py b/capa/features/extractors/binexport2/file.py index 47ddb7654..1b4411470 100644 --- a/capa/features/extractors/binexport2/file.py +++ b/capa/features/extractors/binexport2/file.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Tuple, Iterator, Any +from typing import Any, Tuple, Iterator from capa.features.common import Feature from capa.features.address import Address diff --git a/capa/features/extractors/binexport2/global_.py b/capa/features/extractors/binexport2/global_.py index 48f39d392..bf060898d 100644 --- a/capa/features/extractors/binexport2/global_.py +++ b/capa/features/extractors/binexport2/global_.py @@ -6,13 +6,12 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging -from typing import Tuple, Iterator, Any +from typing import Tuple, Iterator -from capa.features.common import Feature, Arch, ARCH_AARCH64 -from capa.features.address import Address, NO_ADDRESS +from capa.features.common import ARCH_AARCH64, Arch, Feature +from capa.features.address import NO_ADDRESS, Address from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 - logger = logging.getLogger(__name__) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index c7e6f6126..40fcf1f00 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Any, List, Tuple, Iterator, Optional, Any +from typing import Tuple, Iterator from capa.features.common import Feature from capa.features.address import Address diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index bf5a3e7b4..4ad10c9b6 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -75,7 +75,7 @@ def extract_format(buf: bytes) -> Iterator[Tuple[Feature, Address]]: # 1. handling a file format (e.g. macho) # # for (1), this logic will need to be updated as the format is implemented. 
- logger.debug("unsupported file format: %s", binascii.hexlify(buf[:4]).decode("ascii")) + logger.debug("unknown file format: %s", binascii.hexlify(buf[:4]).decode("ascii")) return From f01de854b26987f04a4cacf6409e4da89ac36a1e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 26 Jan 2024 14:56:07 +0000 Subject: [PATCH 036/200] loader: learn to load BinExport2 files --- capa/loader.py | 17 +++++++++++++++++ capa/main.py | 18 ++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index 9f6cfcde8..d71a9dec5 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -48,6 +48,7 @@ FORMAT_SC32, FORMAT_SC64, FORMAT_DOTNET, + FORMAT_BINEXPORT2, ) from capa.features.address import Address from capa.features.extractors.base_extractor import ( @@ -65,6 +66,7 @@ BACKEND_PEFILE = "pefile" BACKEND_CAPE = "cape" BACKEND_FREEZE = "freeze" +BACKEND_BINEXPORT2 = "binexport2" def is_supported_format(sample: Path) -> bool: @@ -269,6 +271,16 @@ def get_extractor( elif backend == BACKEND_FREEZE: return frz.load(input_path.read_bytes()) + elif backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 + import capa.features.extractors.binexport2.extractor + + be2 = capa.features.extractors.binexport2.get_binexport2(input_path) + assert sample_path is not None + buf = sample_path.read_bytes() + + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + else: raise ValueError("unexpected backend: " + backend) @@ -290,6 +302,11 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr report = json.loads(input_file.read_text(encoding="utf-8")) file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) + elif input_format == FORMAT_BINEXPORT2: + # pick pefile/elffile from sample path, after detection + # TODO(wb): 1755 + pass + return file_extractors diff --git a/capa/main.py b/capa/main.py index 
bb61c4879..56b54d28b 100644 --- a/capa/main.py +++ b/capa/main.py @@ -47,7 +47,15 @@ import capa.features.extractors.cape.extractor from capa.rules import RuleSet from capa.engine import MatchResults -from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE +from capa.loader import ( + BACKEND_VIV, + BACKEND_CAPE, + BACKEND_BINJA, + BACKEND_DOTNET, + BACKEND_FREEZE, + BACKEND_PEFILE, + BACKEND_BINEXPORT2, +) from capa.helpers import ( get_file_taste, get_auto_format, @@ -78,6 +86,7 @@ FORMAT_DOTNET, FORMAT_FREEZE, FORMAT_RESULT, + FORMAT_BINEXPORT2, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor @@ -255,6 +264,7 @@ def install_common_args(parser, wanted=None): (BACKEND_PEFILE, "pefile (file features only)"), (BACKEND_BINJA, "Binary Ninja"), (BACKEND_DOTNET, ".NET"), + (BACKEND_BINEXPORT2, "BinExport2"), (BACKEND_FREEZE, "capa freeze"), (BACKEND_CAPE, "CAPE"), ] @@ -532,6 +542,9 @@ def get_backend_from_cli(args, input_format: str) -> str: elif input_format == FORMAT_FREEZE: return BACKEND_FREEZE + elif input_format == FORMAT_BINEXPORT2: + return BACKEND_BINEXPORT2 + else: return BACKEND_VIV @@ -733,9 +746,6 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr os_ = get_os_from_cli(args, backend) sample_path = get_sample_path_from_cli(args, backend) - # TODO(mr-tz): this should be wrapped and refactored as it's tedious to update everywhere - # see same code and show-features above examples - # https://github.com/mandiant/capa/issues/1813 try: return capa.loader.get_extractor( args.input_file, From 40bcb1a69f168db9e7be511d592438e1ea00994f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 13:00:28 +0000 Subject: [PATCH 037/200] main: debug log the format and backend --- capa/main.py | 3 +++ 1 
file changed, 3 insertions(+) diff --git a/capa/main.py b/capa/main.py index 56b54d28b..3be89be45 100644 --- a/capa/main.py +++ b/capa/main.py @@ -746,6 +746,9 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr os_ = get_os_from_cli(args, backend) sample_path = get_sample_path_from_cli(args, backend) + logger.debug("format: %s", input_format) + logger.debug("backend: %s", backend) + try: return capa.loader.get_extractor( args.input_file, From 453094bd6ec35d4c1aa6024df929b0ab72b65b36 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 13:25:25 +0000 Subject: [PATCH 038/200] elf: add more arch constants --- capa/features/extractors/elf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index b969463df..bdafa4572 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -303,6 +303,9 @@ def ei_osabi(self) -> Optional[OS]: 98: "TPC", 99: "SNP1K", 100: "ST200", + # https://www.sco.com/developers/gabi/latest/ch4.eheader.html + 183: "aarch64", + 243: "riscv", } @property From 6cb91756397ee71f05627453cb95ed4c6cf220b0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 13:25:41 +0000 Subject: [PATCH 039/200] binexport: parse global features --- .../extractors/binexport2/extractor.py | 13 ++++---- .../features/extractors/binexport2/global_.py | 32 ------------------- capa/main.py | 4 +++ 3 files changed, 10 insertions(+), 39 deletions(-) delete mode 100644 capa/features/extractors/binexport2/global_.py diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 5ca680400..7689b56a7 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -8,9 +8,9 @@ from typing import Any, List, Tuple, Iterator import capa.features.extractors.elf +import capa.features.extractors.common import 
capa.features.extractors.binexport2.file import capa.features.extractors.binexport2.insn -import capa.features.extractors.binexport2.global_ import capa.features.extractors.binexport2.function import capa.features.extractors.binexport2.basicblock from capa.features.common import Feature @@ -22,20 +22,19 @@ FunctionHandle, StaticFeatureExtractor, ) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -# TODO(wb): 1755 -TODOType = Any class BinExport2FeatureExtractor(StaticFeatureExtractor): - def __init__(self, be2: TODOType, buf: TODOType): + def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) self.be2 = be2 self.buf = buf self.global_features: List[Tuple[Feature, Address]] = [] - self.global_features.extend(capa.features.extractors.binexport2.file.extract_file_format(self.be2, self.buf)) - self.global_features.extend(capa.features.extractors.binexport2.global_.extract_os(self.be2)) - self.global_features.extend(capa.features.extractors.binexport2.global_.extract_arch(self.be2)) + self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) + self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) + self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) def get_base_address(self): # TODO(wb): 1755 diff --git a/capa/features/extractors/binexport2/global_.py b/capa/features/extractors/binexport2/global_.py deleted file mode 100644 index bf060898d..000000000 --- a/capa/features/extractors/binexport2/global_.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. -import logging -from typing import Tuple, Iterator - -from capa.features.common import ARCH_AARCH64, Arch, Feature -from capa.features.address import NO_ADDRESS, Address -from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 - -logger = logging.getLogger(__name__) - - -def extract_os(be2: BinExport2) -> Iterator[Tuple[Feature, Address]]: - # fetch from the buf. - # TODO(wb): 1755 - yield from () - - -def extract_arch(be2: BinExport2) -> Iterator[Tuple[Feature, Address]]: - arch = be2.meta_information.architecture_name - # TODO: where does this come from? is it from the BinExport extractor? is there any schema?? - if arch == "aarch64": - yield Arch(ARCH_AARCH64), NO_ADDRESS - # TODO: x86, etc. 
- else: - logger.debug("unsupported architecture: %s", arch) - return diff --git a/capa/main.py b/capa/main.py index 3be89be45..6cf60d210 100644 --- a/capa/main.py +++ b/capa/main.py @@ -565,6 +565,10 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: """ if backend == BACKEND_CAPE: return None + elif backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 + be2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + return capa.features.extractors.binexport2.get_sample_from_binexport2(be2) else: return args.input_file From dbdf33df2b896cc8676b9347ed42f2db33442b65 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 13:37:27 +0000 Subject: [PATCH 040/200] binexport: extract file features --- .../extractors/binexport2/extractor.py | 3 +- capa/features/extractors/binexport2/file.py | 62 +++++++++++++------ capa/main.py | 1 + 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 7689b56a7..daeb5e60e 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
-from typing import Any, List, Tuple, Iterator +from typing import List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.common @@ -25,7 +25,6 @@ from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 - class BinExport2FeatureExtractor(StaticFeatureExtractor): def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) diff --git a/capa/features/extractors/binexport2/file.py b/capa/features/extractors/binexport2/file.py index 1b4411470..a6ee7ce93 100644 --- a/capa/features/extractors/binexport2/file.py +++ b/capa/features/extractors/binexport2/file.py @@ -5,42 +5,66 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import io +import logging +from typing import Tuple, Iterator -from typing import Any, Tuple, Iterator +import pefile +from elftools.elf.elffile import ELFFile +import capa.features.common +import capa.features.extractors.common +import capa.features.extractors.pefile +import capa.features.extractors.elffile from capa.features.common import Feature from capa.features.address import Address +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -# TODO(wb): 1755 -TODOType = Any +logger = logging.getLogger(__name__) -def extract_file_export_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_file_export_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + if buf.startswith(capa.features.extractors.common.MATCH_PE): + pe = pefile.PE(data=buf) + yield from capa.features.extractors.pefile.extract_file_export_names(pe) + elif buf.startswith(capa.features.extractors.common.MATCH_ELF): + elf = 
ELFFile(io.BytesIO(buf)) + yield from capa.features.extractors.elffile.extract_file_export_names(elf) + else: + logger.warning("unsupported format") -def extract_file_import_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_file_import_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + if buf.startswith(capa.features.extractors.common.MATCH_PE): + pe = pefile.PE(data=buf) + yield from capa.features.extractors.pefile.extract_file_import_names(pe) + elif buf.startswith(capa.features.extractors.common.MATCH_ELF): + elf = ELFFile(io.BytesIO(buf)) + yield from capa.features.extractors.elffile.extract_file_import_names(elf) + else: + logger.warning("unsupported format") -def extract_file_section_names(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_file_section_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + if buf.startswith(capa.features.extractors.common.MATCH_PE): + pe = pefile.PE(data=buf) + yield from capa.features.extractors.pefile.extract_file_section_names(pe) + elif buf.startswith(capa.features.extractors.common.MATCH_ELF): + elf = ELFFile(io.BytesIO(buf)) + yield from capa.features.extractors.elffile.extract_file_section_names(elf) + else: + logger.warning("unsupported format") -def extract_file_strings(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_file_strings(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.common.extract_file_strings(buf) -def extract_file_format(be2: TODOType, buf: bytes) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_file_format(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.common.extract_format(buf) -def extract_features(be2: TODOType, 
buf: bytes) -> Iterator[Tuple[Feature, Address]]: +def extract_features(be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: """extract file features""" for file_handler in FILE_HANDLERS: for feature, addr in file_handler(be2, buf): diff --git a/capa/main.py b/capa/main.py index 6cf60d210..2f130952f 100644 --- a/capa/main.py +++ b/capa/main.py @@ -567,6 +567,7 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: return None elif backend == BACKEND_BINEXPORT2: import capa.features.extractors.binexport2 + be2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) return capa.features.extractors.binexport2.get_sample_from_binexport2(be2) else: From 9681c5375c041d73f52c39dbb0d962305da35e4b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 13:45:29 +0000 Subject: [PATCH 041/200] binexport2: begin to enumerate function/bb/insns --- .../extractors/binexport2/extractor.py | 69 ++++++++++++++++--- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index daeb5e60e..9050e3898 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
-from typing import List, Tuple, Iterator +from typing import List, Tuple, Iterator, Dict import capa.features.extractors.elf import capa.features.extractors.common @@ -30,11 +30,19 @@ def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) self.be2 = be2 self.buf = buf + + self.address_by_instruction_index: List[int] = [] + self.flow_graph_index_by_function_index: Dict[int, int] = {} + self.function_by_address: Dict[int, int] = {} + self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) + self._index_instruction_addresses() + self._index_basic_blocks_by_function() + def get_base_address(self): # TODO(wb): 1755 return AbsoluteVirtualAddress(0x0) @@ -46,22 +54,67 @@ def extract_file_features(self): yield from capa.features.extractors.binexport2.file.extract_features(self.be2, self.buf) def get_functions(self) -> Iterator[FunctionHandle]: - # TODO(wb): 1755 - yield from () + for function_index in self.flow_graph_index_by_function_index.keys(): + vertex = self.be2.call_graph.vertex[function_index] + yield FunctionHandle(AbsoluteVirtualAddress(vertex.address), inner=function_index) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.binexport2.function.extract_features(fh) def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: - # TODO(wb): 1755 - yield from () + flow_graph_index = self.flow_graph_index_by_function_index[fh.inner] + flow_graph = self.be2.flow_graph[flow_graph_index] - def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: - yield from 
capa.features.extractors.binexport2.basicblock.extract_features(fh, bbh) + for basic_block_index in flow_graph.basic_block_index: + bb = self.be2.basic_block[basic_block_index] + yield BBHandle( + address=AbsoluteVirtualAddress( + self.address_by_instruction_index[bb.instruction_index[0].begin_index] + ), + inner=basic_block_index, + ) - def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: + def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: # TODO(wb): 1755 yield from () + def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: + bb: BinExport2.BasicBlock = self.be2.basic_block[bbh.inner] + for i in range(bb.instruction_index[0].begin_index, bb.instruction_index[0].end_index): + yield InsnHandle( + address=AbsoluteVirtualAddress(self.address_by_instruction_index[i]), + inner=i, + ) + def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih) + + def _index_instruction_addresses(self): + address = 0 + next_address = 0 + for instruction in self.be2.instruction: + if instruction.HasField("address"): + address = instruction.address + next_address = address + len(instruction.raw_bytes) + else: + address = next_address + next_address += len(instruction.raw_bytes) + + self.address_by_instruction_index.append(address) + + def _index_basic_blocks_by_function(self): + function_index_from_address = {} + + for index, vertex in enumerate(self.be2.call_graph.vertex): + function_index_from_address[vertex.address] = index + + for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): + basic_block_entry_point = self.be2.basic_block[flow_graph.entry_basic_block_index] + basic_block_address = self.address_by_instruction_index[basic_block_entry_point.instruction_index[0].begin_index] + + if basic_block_address not in function_index_from_address: 
+ continue + + function_index = function_index_from_address[basic_block_address] + + self.flow_graph_index_by_function_index[function_index] = flow_graph_index \ No newline at end of file From d71d087f1b0641f82ff724e453d73af5e19ed7ca Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 29 Jan 2024 21:12:23 +0000 Subject: [PATCH 042/200] binexport: pass context to function/bb/insn extractors --- .../extractors/binexport2/__init__.py | 20 +++++++++++ .../extractors/binexport2/extractor.py | 35 ++++++++++++------- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 5a6575e4b..f3241ff1c 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -13,6 +13,7 @@ import os import logging from pathlib import Path +from dataclasses import dataclass from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -42,3 +43,22 @@ def get_sample_from_binexport2(be2: BinExport2) -> Path: return path else: raise ValueError("cannot find sample") + + +@dataclass +class FunctionContext: + be2: BinExport2 + function_index: int + + +@dataclass +class BasicBlockContext: + be2: BinExport2 + basic_block_index: int + + +@dataclass +class InstructionContext: + be2: BinExport2 + instruction_index: int + diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 9050e3898..df6d75de7 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -23,6 +23,7 @@ StaticFeatureExtractor, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext class BinExport2FeatureExtractor(StaticFeatureExtractor): @@ -31,9 +32,9 @@ def __init__(self, be2: BinExport2, buf: bytes): 
self.be2 = be2 self.buf = buf - self.address_by_instruction_index: List[int] = [] + self.address_by_instruction_index: Dict[int, int] = {} self.flow_graph_index_by_function_index: Dict[int, int] = {} - self.function_by_address: Dict[int, int] = {} + self.function_index_by_address: Dict[int, int] = {} self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) @@ -44,8 +45,11 @@ def __init__(self, be2: BinExport2, buf: bytes): self._index_basic_blocks_by_function() def get_base_address(self): - # TODO(wb): 1755 - return AbsoluteVirtualAddress(0x0) + # TODO: assume the lowest address is the base address. + # this works as long as BinExport doesn't record other + # libraries mapped into memory. + base_address = min(map(lambda s: s.address, self.be2.section)) + return AbsoluteVirtualAddress(base_address) def extract_global_features(self): yield from self.global_features @@ -56,13 +60,17 @@ def extract_file_features(self): def get_functions(self) -> Iterator[FunctionHandle]: for function_index in self.flow_graph_index_by_function_index.keys(): vertex = self.be2.call_graph.vertex[function_index] - yield FunctionHandle(AbsoluteVirtualAddress(vertex.address), inner=function_index) + yield FunctionHandle( + AbsoluteVirtualAddress(vertex.address), + inner=FunctionContext(self.be2, function_index) + ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.binexport2.function.extract_features(fh) def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: - flow_graph_index = self.flow_graph_index_by_function_index[fh.inner] + fhi: FunctionContext = fh.inner + flow_graph_index = self.flow_graph_index_by_function_index[fhi.function_index] flow_graph = self.be2.flow_graph[flow_graph_index] for basic_block_index in flow_graph.basic_block_index: @@ -71,7 +79,7 @@ def get_basic_blocks(self, fh: FunctionHandle) 
-> Iterator[BBHandle]: address=AbsoluteVirtualAddress( self.address_by_instruction_index[bb.instruction_index[0].begin_index] ), - inner=basic_block_index, + inner=BasicBlockContext(self.be2, basic_block_index) ) def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: @@ -79,11 +87,12 @@ def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Ite yield from () def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: - bb: BinExport2.BasicBlock = self.be2.basic_block[bbh.inner] - for i in range(bb.instruction_index[0].begin_index, bb.instruction_index[0].end_index): + bbi: BasicBlockContext = bbh.inner + bb: BinExport2.BasicBlock = self.be2.basic_block[bbi.basic_block_index] + for instruction_index in range(bb.instruction_index[0].begin_index, bb.instruction_index[0].end_index): yield InsnHandle( - address=AbsoluteVirtualAddress(self.address_by_instruction_index[i]), - inner=i, + address=AbsoluteVirtualAddress(self.address_by_instruction_index[instruction_index]), + inner=InstructionContext(self.be2, instruction_index), ) def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): @@ -92,7 +101,7 @@ def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl def _index_instruction_addresses(self): address = 0 next_address = 0 - for instruction in self.be2.instruction: + for instruction_index, instruction in enumerate(self.be2.instruction): if instruction.HasField("address"): address = instruction.address next_address = address + len(instruction.raw_bytes) @@ -100,7 +109,7 @@ def _index_instruction_addresses(self): address = next_address next_address += len(instruction.raw_bytes) - self.address_by_instruction_index.append(address) + self.address_by_instruction_index[instruction_index] = address def _index_basic_blocks_by_function(self): function_index_from_address = {} From a7a6e53e0d9e0a6572505e8c5163056a82ce1f5c Mon 
Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 12:42:31 +0000 Subject: [PATCH 043/200] binexport: linters --- .../extractors/binexport2/extractor.py | 27 +++++++----- capa/features/extractors/binexport2/insn.py | 42 +++++++++++++++++-- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index df6d75de7..95e52aae5 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import List, Tuple, Iterator, Dict +from typing import Dict, List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.common @@ -15,6 +15,7 @@ import capa.features.extractors.binexport2.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, @@ -23,7 +24,6 @@ StaticFeatureExtractor, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext class BinExport2FeatureExtractor(StaticFeatureExtractor): @@ -44,6 +44,12 @@ def __init__(self, be2: BinExport2, buf: bytes): self._index_instruction_addresses() self._index_basic_blocks_by_function() + print("base address", hex(self.get_base_address())) + ba = self.get_base_address() + for v in self.be2.call_graph.vertex: + if v.mangled_name: + 
print(hex(v.address - ba), v.mangled_name) + def get_base_address(self): # TODO: assume the lowest address is the base address. # this works as long as BinExport doesn't record other @@ -61,8 +67,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: for function_index in self.flow_graph_index_by_function_index.keys(): vertex = self.be2.call_graph.vertex[function_index] yield FunctionHandle( - AbsoluteVirtualAddress(vertex.address), - inner=FunctionContext(self.be2, function_index) + AbsoluteVirtualAddress(vertex.address), inner=FunctionContext(self.be2, function_index) ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: @@ -76,10 +81,8 @@ def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: for basic_block_index in flow_graph.basic_block_index: bb = self.be2.basic_block[basic_block_index] yield BBHandle( - address=AbsoluteVirtualAddress( - self.address_by_instruction_index[bb.instruction_index[0].begin_index] - ), - inner=BasicBlockContext(self.be2, basic_block_index) + address=AbsoluteVirtualAddress(self.address_by_instruction_index[bb.instruction_index[0].begin_index]), + inner=BasicBlockContext(self.be2, basic_block_index), ) def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: @@ -97,7 +100,7 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih) - + def _index_instruction_addresses(self): address = 0 next_address = 0 @@ -119,11 +122,13 @@ def _index_basic_blocks_by_function(self): for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): basic_block_entry_point = self.be2.basic_block[flow_graph.entry_basic_block_index] - basic_block_address = self.address_by_instruction_index[basic_block_entry_point.instruction_index[0].begin_index] + 
basic_block_address = self.address_by_instruction_index[ + basic_block_entry_point.instruction_index[0].begin_index + ] if basic_block_address not in function_index_from_address: continue function_index = function_index_from_address[basic_block_address] - self.flow_graph_index_by_function_index[function_index] = flow_graph_index \ No newline at end of file + self.flow_graph_index_by_function_index[function_index] = flow_graph_index diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 40fcf1f00..bdf20f6d7 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,14 +7,48 @@ # See the License for the specific language governing permissions and limitations under the License. from typing import Tuple, Iterator +from capa.features.insn import API from capa.features.common import Feature -from capa.features.address import Address +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle -def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () +def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + + insn = be2.instruction[ii.instruction_index] + mnem = be2.mnemonic[insn.mnemonic_index] + + if mnem.name not in ("call", "jmp", "BL"): + return + + if not insn.call_target: + return + + address = insn.call_target[0] + print(hex(insn.address), "->", hex(address)) + + for vertex in be2.call_graph.vertex: + # TODO: need an index here + if vertex.address != address: + continue + + if not vertex.mangled_name: + continue + + yield 
API(name=vertex.mangled_name), AbsoluteVirtualAddress(address) + + if not vertex.HasField("library_index"): + continue + + library = be2.library[vertex.library_index] + lib_name = library.name.split("\\")[-1].split(".")[0] + + yield API(name=f"{lib_name}.{vertex.mangled_name}"), AbsoluteVirtualAddress(address) def extract_insn_number_features( From 217a9a930f2a9c8966e5cb4e2ef7ac651ee3dc11 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 12:42:40 +0000 Subject: [PATCH 044/200] binexport: linters --- capa/features/extractors/binexport2/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index f3241ff1c..e8a227c74 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -61,4 +61,3 @@ class BasicBlockContext: class InstructionContext: be2: BinExport2 instruction_index: int - From f236ff2aef7c2e3bf212eb2aa4d857fb994ce4be Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 12:42:53 +0000 Subject: [PATCH 045/200] scripts: add script to inspect binexport2 file --- scripts/inspect-binexport2.py | 327 ++++++++++++++++++++++++++++++++++ 1 file changed, 327 insertions(+) create mode 100644 scripts/inspect-binexport2.py diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py new file mode 100644 index 000000000..e3ea052df --- /dev/null +++ b/scripts/inspect-binexport2.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python +""" +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. +""" +import io +import sys +import logging +import argparse +import contextlib +from typing import Dict, List, Iterator +from collections import defaultdict + +import capa.main +import capa.features.extractors.binexport2 +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger("inspect-binexport2") + + +class Renderer: + def __init__(self, o: io.StringIO): + self.o = o + self.indent = 0 + + @contextlib.contextmanager + def indenting(self): + self.indent += 1 + try: + yield + finally: + self.indent -= 1 + + def writeln(self, s): + self.o.write(" " * self.indent) + self.o.write(s) + self.o.write("\n") + + @contextlib.contextmanager + def section(self, name): + self.writeln(name) + with self.indenting(): + try: + yield + finally: + pass + self.writeln("/" + name) + self.writeln("") + + def getvalue(self): + return self.o.getvalue() + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(description="Inspect BinExport2 files") + capa.main.install_common_args(parser, wanted={"input_file"}) + args = parser.parse_args(args=argv) + + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code + + o = Renderer(io.StringIO()) + be2: BinExport2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + + with o.section("meta"): + o.writeln(f"name: {be2.meta_information.executable_name}") + o.writeln(f"sha256: {be2.meta_information.executable_id}") + o.writeln(f"arch: {be2.meta_information.architecture_name}") + o.writeln(f"ts: 
{be2.meta_information.timestamp}") + + with o.section("modules"): + for module in be2.module: + o.writeln(f"- {module.name}") + if not be2.module: + o.writeln("(none)") + + with o.section("sections"): + for section in be2.section: + perms = "" + perms += "r" if section.flag_r else "-" + perms += "w" if section.flag_w else "-" + perms += "x" if section.flag_x else "-" + o.writeln(f"- {hex(section.address)} {perms} {hex(section.size)}") + + with o.section("libraries"): + for library in be2.library: + o.writeln(f"- {library.name:<12s} {'(static)' if library.is_static else ''} at {hex(library.load_address)}") + if not be2.library: + o.writeln("(none)") + + callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + + for edge in be2.call_graph.edge: + if not edge.source_vertex_index: + continue + if not edge.target_vertex_index: + continue + + callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) + callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) + + # note: flow graph != call graph (vertex) + flow_graph_index_by_address: Dict[int, int] = {} + basic_block_index_by_address: Dict[int, int] = {} + basic_block_address_by_index: Dict[int, int] = {} + instruction_index_by_address: Dict[int, int] = {} + instruction_address_by_index: Dict[int, int] = {} + + instruction_address = 0 + for instruction_index, instruction in enumerate(be2.instruction): + if instruction.HasField("address"): + instruction_address = instruction.address + + instruction_index_by_address[instruction_address] = instruction_index + instruction_address_by_index[instruction_index] = instruction_address + + assert instruction.HasField("raw_bytes") + instruction_address += len(instruction.raw_bytes) + + def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: + for index_range in basic_block.instruction_index: + if not index_range.HasField("end_index"): 
+ yield index_range.begin_index + continue + else: + yield from range(index_range.begin_index, index_range.end_index) + + for flow_graph_index, flow_graph in enumerate(be2.flow_graph): + for basic_block_index in flow_graph.basic_block_index: + basic_block = be2.basic_block[basic_block_index] + for instruction_index in instruction_indices(basic_block): + basic_block_address = instruction_address_by_index[instruction_index] + basic_block_index_by_address[basic_block_address] = basic_block_index + basic_block_address_by_index[basic_block_index] = basic_block_address + + entry_basic_block = be2.basic_block[flow_graph.entry_basic_block_index] + entry_instruction_index = next(instruction_indices(entry_basic_block)) + entry_instruction_address = instruction_address_by_index[entry_instruction_index] + function_address = entry_instruction_address + flow_graph_index_by_address[function_address] = flow_graph_index + + # edges that come from the given basic block + source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + # edges that end up at the given basic block + target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + + for flow_graph in be2.flow_graph: + for edge in flow_graph.edge: + if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): + continue + + source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) + target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) + + vertex_index_by_address: Dict[int, int] = {} + + for vertex_index, vertex in enumerate(be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + vertex_address = vertex.address + vertex_index_by_address[vertex_address] = vertex_index + + def get_function_name_by_vertex(be2: BinExport2, vertex_index: int) -> str: + vertex = be2.call_graph.vertex[vertex_index] + name = f"sub_{vertex.address:x}" + if 
vertex.HasField("mangled_name"): + name = vertex.mangled_name + + if vertex.HasField("demangled_name"): + name = vertex.demangled_name + + return name + + def get_function_name_by_address(be2: BinExport2, address: int) -> str: + if address not in vertex_index_by_address: + return "" + + vertex_index = vertex_index_by_address[address] + return get_function_name_by_vertex(be2, vertex_index) + + data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) + + for data_reference_index, data_reference in enumerate(be2.data_reference): + data_reference_index_by_source_instruction_index[data_reference.instruction_index].append(data_reference_index) + data_reference_index_by_target_address[data_reference.address].append(data_reference_index) + + with o.section("functions"): + for vertex_index, vertex in enumerate(be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + with o.section(f"function {get_function_name_by_vertex(be2, vertex_index)} @ {hex(vertex.address)}"): + o.writeln(f"type: {vertex.Type.Name(vertex.type)}") + + if vertex.HasField("mangled_name"): + o.writeln(f"name: {vertex.mangled_name}") + + if vertex.HasField("demangled_name"): + o.writeln(f"demangled: {vertex.demangled_name}") + + if vertex.HasField("library_index"): + # BUG: this seems to be incorrect + library = be2.library[vertex.library_index] + o.writeln(f"library: [{vertex.library_index}] {library.name}") + + if vertex.HasField("module_index"): + module = be2.library[vertex.module_index] + o.writeln(f"module: [{vertex.module_index}] {module.name}") + + if callees_by_vertex_index[vertex_index] or callers_by_vertex_index[vertex_index]: + o.writeln("xrefs:") + + for caller_index in callees_by_vertex_index[vertex_index]: + o.writeln(f" ← {get_function_name_by_vertex(be2, caller_index)}") + + for callee_index in callers_by_vertex_index[vertex_index]: + o.writeln(f" → 
{get_function_name_by_vertex(be2, callee_index)}") + + if vertex.address not in flow_graph_index_by_address: + o.writeln("(no flow graph)") + else: + flow_graph_index = flow_graph_index_by_address[vertex.address] + flow_graph = be2.flow_graph[flow_graph_index] + + o.writeln("") + for basic_block_index in flow_graph.basic_block_index: + basic_block = be2.basic_block[basic_block_index] + basic_block_address = basic_block_address_by_index[basic_block_index] + + with o.section(f"basic block {hex(basic_block_address)}"): + for edge in target_edges_by_basic_block_index[basic_block_index]: + if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: + continue + + source_basic_block_index = edge.source_basic_block_index + source_basic_block_address = basic_block_address_by_index[source_basic_block_index] + + o.writeln( + f"↓ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(source_basic_block_address)}" + ) + + for instruction_index in instruction_indices(basic_block): + instruction = be2.instruction[instruction_index] + instruction_address = instruction_address_by_index[instruction_index] + + mnemonic = be2.mnemonic[instruction.mnemonic_index] + + call_targets = "" + if instruction.call_target: + call_targets = " " + for call_target_address in instruction.call_target: + call_target_name = get_function_name_by_address(be2, call_target_address) + call_targets += f"→ function {call_target_name} @ {hex(call_target_address)} " + + data_references = "" + if instruction_index in data_reference_index_by_source_instruction_index: + data_references = " " + for data_reference_index in data_reference_index_by_source_instruction_index[ + instruction_index + ]: + data_reference = be2.data_reference[data_reference_index] + data_reference_address = data_reference.address + data_references += f"⇥ data {hex(data_reference_address)} " + + comments = "" + if instruction.comment_index: + comments = " " + for comment_index in instruction.comment_index: + comment = 
be2.comment[comment_index] + comment_string = be2.string_table[comment.string_table_index] + comments += f"; {BinExport2.Comment.Type.Name(comment.type)} {comment_string} " + + o.writeln( + f"{hex(instruction_address)} {mnemonic.name:<12s}{call_targets}{data_references}{comments}" + ) + + does_fallthrough = False + for edge in source_edges_by_basic_block_index[basic_block_index]: + if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: + does_fallthrough = True + continue + + back_edge = "" + if edge.HasField("is_back_edge") and edge.is_back_edge: + back_edge = "↑" + + target_basic_block_index = edge.target_basic_block_index + target_basic_block_address = basic_block_address_by_index[target_basic_block_index] + o.writeln( + f"→ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(target_basic_block_address)} {back_edge}" + ) + + if does_fallthrough: + o.writeln("↓ CONDITION_FALSE") + + with o.section("data"): + for data_address in sorted(data_reference_index_by_target_address.keys()): + if data_address in instruction_index_by_address: + # appears to be code + continue + + data_references = "" + for data_reference_index in data_reference_index_by_target_address[data_address]: + data_reference = be2.data_reference[data_reference_index] + instruction_index = data_reference.instruction_index + instruction_address = instruction_address_by_index[instruction_index] + data_references += f"⇤ {hex(instruction_address)} " + o.writeln(f"{hex(data_address)} {data_references}") + + print(o.getvalue()) + + +if __name__ == "__main__": + sys.exit(main()) From 2202dc7e8577170319125c24367cadd682094333 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 13:06:30 +0000 Subject: [PATCH 046/200] inspect-binexport: fix xref symbols --- scripts/inspect-binexport2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index e3ea052df..3d4b907dd 100644 --- 
a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -223,10 +223,10 @@ def get_function_name_by_address(be2: BinExport2, address: int) -> str: if callees_by_vertex_index[vertex_index] or callers_by_vertex_index[vertex_index]: o.writeln("xrefs:") - for caller_index in callees_by_vertex_index[vertex_index]: + for caller_index in callers_by_vertex_index[vertex_index]: o.writeln(f" ← {get_function_name_by_vertex(be2, caller_index)}") - for callee_index in callers_by_vertex_index[vertex_index]: + for callee_index in callees_by_vertex_index[vertex_index]: o.writeln(f" → {get_function_name_by_vertex(be2, callee_index)}") if vertex.address not in flow_graph_index_by_address: From 265ffe17abe2299264f1deb3221f93fc1ee81500 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 14:31:35 +0000 Subject: [PATCH 047/200] inspect-binexport: factor out the index building --- scripts/inspect-binexport2.py | 266 ++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 122 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 3d4b907dd..b57ba0137 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -23,6 +23,126 @@ logger = logging.getLogger("inspect-binexport2") +class BinExport2Index: + def __init__(self, be2: BinExport2): + self.be2 = be2 + + self.callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + self.callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + + # note: flow graph != call graph (vertex) + self.flow_graph_index_by_address: Dict[int, int] = {} + self.basic_block_index_by_address: Dict[int, int] = {} + self.basic_block_address_by_index: Dict[int, int] = {} + self.instruction_index_by_address: Dict[int, int] = {} + self.instruction_address_by_index: Dict[int, int] = {} + + # edges that come from the given basic block + self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + # edges 
that end up at the given basic block + self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + + self.vertex_index_by_address: Dict[int, int] = {} + + self.data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) + + self._index_vertex_edges() + self._index_instruction_addresses() + self._index_flow_graph_nodes() + self._index_flow_graph_edges() + self._index_call_graph_vertices() + self._index_data_references() + + def _index_vertex_edges(self): + for edge in self.be2.call_graph.edge: + if not edge.source_vertex_index: + continue + if not edge.target_vertex_index: + continue + + self.callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) + self.callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) + + def _index_instruction_addresses(self): + instruction_address = 0 + for instruction_index, instruction in enumerate(self.be2.instruction): + if instruction.HasField("address"): + instruction_address = instruction.address + + self.instruction_index_by_address[instruction_address] = instruction_index + self.instruction_address_by_index[instruction_index] = instruction_address + + assert instruction.HasField("raw_bytes") + instruction_address += len(instruction.raw_bytes) + + def _index_flow_graph_nodes(self): + for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): + for basic_block_index in flow_graph.basic_block_index: + basic_block = self.be2.basic_block[basic_block_index] + for instruction_index in self.instruction_indices(basic_block): + basic_block_address = self.instruction_address_by_index[instruction_index] + self.basic_block_index_by_address[basic_block_address] = basic_block_index + self.basic_block_address_by_index[basic_block_index] = basic_block_address + + entry_basic_block = 
self.be2.basic_block[flow_graph.entry_basic_block_index] + entry_instruction_index = next(self.instruction_indices(entry_basic_block)) + entry_instruction_address = self.instruction_address_by_index[entry_instruction_index] + function_address = entry_instruction_address + self.flow_graph_index_by_address[function_address] = flow_graph_index + + def _index_flow_graph_edges(self): + for flow_graph in self.be2.flow_graph: + for edge in flow_graph.edge: + if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): + continue + + self.source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) + self.target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) + + def _index_call_graph_vertices(self): + for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + vertex_address = vertex.address + self.vertex_index_by_address[vertex_address] = vertex_index + + def _index_data_references(self): + for data_reference_index, data_reference in enumerate(self.be2.data_reference): + self.data_reference_index_by_source_instruction_index[data_reference.instruction_index].append( + data_reference_index + ) + self.data_reference_index_by_target_address[data_reference.address].append(data_reference_index) + + @staticmethod + def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: + for index_range in basic_block.instruction_index: + if not index_range.HasField("end_index"): + yield index_range.begin_index + continue + else: + yield from range(index_range.begin_index, index_range.end_index) + + def get_function_name_by_vertex(self, vertex_index: int) -> str: + vertex = self.be2.call_graph.vertex[vertex_index] + name = f"sub_{vertex.address:x}" + if vertex.HasField("mangled_name"): + name = vertex.mangled_name + + if vertex.HasField("demangled_name"): + name = vertex.demangled_name + + return name + + def 
get_function_name_by_address(self, address: int) -> str: + if address not in self.vertex_index_by_address: + return "" + + vertex_index = self.vertex_index_by_address[address] + return self.get_function_name_by_vertex(vertex_index) + + class Renderer: def __init__(self, o: io.StringIO): self.o = o @@ -71,6 +191,7 @@ def main(argv=None): o = Renderer(io.StringIO()) be2: BinExport2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + idx = BinExport2Index(be2) with o.section("meta"): o.writeln(f"name: {be2.meta_information.executable_name}") @@ -98,111 +219,12 @@ def main(argv=None): if not be2.library: o.writeln("(none)") - callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) - callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) - - for edge in be2.call_graph.edge: - if not edge.source_vertex_index: - continue - if not edge.target_vertex_index: - continue - - callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) - callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) - - # note: flow graph != call graph (vertex) - flow_graph_index_by_address: Dict[int, int] = {} - basic_block_index_by_address: Dict[int, int] = {} - basic_block_address_by_index: Dict[int, int] = {} - instruction_index_by_address: Dict[int, int] = {} - instruction_address_by_index: Dict[int, int] = {} - - instruction_address = 0 - for instruction_index, instruction in enumerate(be2.instruction): - if instruction.HasField("address"): - instruction_address = instruction.address - - instruction_index_by_address[instruction_address] = instruction_index - instruction_address_by_index[instruction_index] = instruction_address - - assert instruction.HasField("raw_bytes") - instruction_address += len(instruction.raw_bytes) - - def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: - for index_range in basic_block.instruction_index: - if not index_range.HasField("end_index"): - yield 
index_range.begin_index - continue - else: - yield from range(index_range.begin_index, index_range.end_index) - - for flow_graph_index, flow_graph in enumerate(be2.flow_graph): - for basic_block_index in flow_graph.basic_block_index: - basic_block = be2.basic_block[basic_block_index] - for instruction_index in instruction_indices(basic_block): - basic_block_address = instruction_address_by_index[instruction_index] - basic_block_index_by_address[basic_block_address] = basic_block_index - basic_block_address_by_index[basic_block_index] = basic_block_address - - entry_basic_block = be2.basic_block[flow_graph.entry_basic_block_index] - entry_instruction_index = next(instruction_indices(entry_basic_block)) - entry_instruction_address = instruction_address_by_index[entry_instruction_index] - function_address = entry_instruction_address - flow_graph_index_by_address[function_address] = flow_graph_index - - # edges that come from the given basic block - source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) - # edges that end up at the given basic block - target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) - - for flow_graph in be2.flow_graph: - for edge in flow_graph.edge: - if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): - continue - - source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) - target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) - - vertex_index_by_address: Dict[int, int] = {} - - for vertex_index, vertex in enumerate(be2.call_graph.vertex): - if not vertex.HasField("address"): - continue - - vertex_address = vertex.address - vertex_index_by_address[vertex_address] = vertex_index - - def get_function_name_by_vertex(be2: BinExport2, vertex_index: int) -> str: - vertex = be2.call_graph.vertex[vertex_index] - name = f"sub_{vertex.address:x}" - if 
vertex.HasField("mangled_name"): - name = vertex.mangled_name - - if vertex.HasField("demangled_name"): - name = vertex.demangled_name - - return name - - def get_function_name_by_address(be2: BinExport2, address: int) -> str: - if address not in vertex_index_by_address: - return "" - - vertex_index = vertex_index_by_address[address] - return get_function_name_by_vertex(be2, vertex_index) - - data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) - data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) - - for data_reference_index, data_reference in enumerate(be2.data_reference): - data_reference_index_by_source_instruction_index[data_reference.instruction_index].append(data_reference_index) - data_reference_index_by_target_address[data_reference.address].append(data_reference_index) - with o.section("functions"): for vertex_index, vertex in enumerate(be2.call_graph.vertex): if not vertex.HasField("address"): continue - with o.section(f"function {get_function_name_by_vertex(be2, vertex_index)} @ {hex(vertex.address)}"): + with o.section(f"function {idx.get_function_name_by_vertex(vertex_index)} @ {hex(vertex.address)}"): o.writeln(f"type: {vertex.Type.Name(vertex.type)}") if vertex.HasField("mangled_name"): @@ -217,44 +239,44 @@ def get_function_name_by_address(be2: BinExport2, address: int) -> str: o.writeln(f"library: [{vertex.library_index}] {library.name}") if vertex.HasField("module_index"): - module = be2.library[vertex.module_index] + module = be2.module[vertex.module_index] o.writeln(f"module: [{vertex.module_index}] {module.name}") - if callees_by_vertex_index[vertex_index] or callers_by_vertex_index[vertex_index]: + if idx.callees_by_vertex_index[vertex_index] or idx.callers_by_vertex_index[vertex_index]: o.writeln("xrefs:") - for caller_index in callers_by_vertex_index[vertex_index]: - o.writeln(f" ← {get_function_name_by_vertex(be2, caller_index)}") + for caller_index in 
idx.callers_by_vertex_index[vertex_index]: + o.writeln(f" ← {idx.get_function_name_by_vertex(caller_index)}") - for callee_index in callees_by_vertex_index[vertex_index]: - o.writeln(f" → {get_function_name_by_vertex(be2, callee_index)}") + for callee_index in idx.callees_by_vertex_index[vertex_index]: + o.writeln(f" → {idx.get_function_name_by_vertex(callee_index)}") - if vertex.address not in flow_graph_index_by_address: + if vertex.address not in idx.flow_graph_index_by_address: o.writeln("(no flow graph)") else: - flow_graph_index = flow_graph_index_by_address[vertex.address] + flow_graph_index = idx.flow_graph_index_by_address[vertex.address] flow_graph = be2.flow_graph[flow_graph_index] o.writeln("") for basic_block_index in flow_graph.basic_block_index: basic_block = be2.basic_block[basic_block_index] - basic_block_address = basic_block_address_by_index[basic_block_index] + basic_block_address = idx.basic_block_address_by_index[basic_block_index] with o.section(f"basic block {hex(basic_block_address)}"): - for edge in target_edges_by_basic_block_index[basic_block_index]: + for edge in idx.target_edges_by_basic_block_index[basic_block_index]: if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: continue source_basic_block_index = edge.source_basic_block_index - source_basic_block_address = basic_block_address_by_index[source_basic_block_index] + source_basic_block_address = idx.basic_block_address_by_index[source_basic_block_index] o.writeln( f"↓ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(source_basic_block_address)}" ) - for instruction_index in instruction_indices(basic_block): + for instruction_index in idx.instruction_indices(basic_block): instruction = be2.instruction[instruction_index] - instruction_address = instruction_address_by_index[instruction_index] + instruction_address = idx.instruction_address_by_index[instruction_index] mnemonic = be2.mnemonic[instruction.mnemonic_index] @@ -262,13 +284,13 @@ def 
get_function_name_by_address(be2: BinExport2, address: int) -> str: if instruction.call_target: call_targets = " " for call_target_address in instruction.call_target: - call_target_name = get_function_name_by_address(be2, call_target_address) + call_target_name = idx.get_function_name_by_address(call_target_address) call_targets += f"→ function {call_target_name} @ {hex(call_target_address)} " data_references = "" - if instruction_index in data_reference_index_by_source_instruction_index: + if instruction_index in idx.data_reference_index_by_source_instruction_index: data_references = " " - for data_reference_index in data_reference_index_by_source_instruction_index[ + for data_reference_index in idx.data_reference_index_by_source_instruction_index[ instruction_index ]: data_reference = be2.data_reference[data_reference_index] @@ -288,7 +310,7 @@ def get_function_name_by_address(be2: BinExport2, address: int) -> str: ) does_fallthrough = False - for edge in source_edges_by_basic_block_index[basic_block_index]: + for edge in idx.source_edges_by_basic_block_index[basic_block_index]: if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: does_fallthrough = True continue @@ -298,7 +320,7 @@ def get_function_name_by_address(be2: BinExport2, address: int) -> str: back_edge = "↑" target_basic_block_index = edge.target_basic_block_index - target_basic_block_address = basic_block_address_by_index[target_basic_block_index] + target_basic_block_address = idx.basic_block_address_by_index[target_basic_block_index] o.writeln( f"→ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(target_basic_block_address)} {back_edge}" ) @@ -307,16 +329,16 @@ def get_function_name_by_address(be2: BinExport2, address: int) -> str: o.writeln("↓ CONDITION_FALSE") with o.section("data"): - for data_address in sorted(data_reference_index_by_target_address.keys()): - if data_address in instruction_index_by_address: + for data_address in 
sorted(idx.data_reference_index_by_target_address.keys()): + if data_address in idx.instruction_index_by_address: # appears to be code continue data_references = "" - for data_reference_index in data_reference_index_by_target_address[data_address]: + for data_reference_index in idx.data_reference_index_by_target_address[data_address]: data_reference = be2.data_reference[data_reference_index] instruction_index = data_reference.instruction_index - instruction_address = instruction_address_by_index[instruction_index] + instruction_address = idx.instruction_address_by_index[instruction_index] data_references += f"⇤ {hex(instruction_address)} " o.writeln(f"{hex(data_address)} {data_references}") From 27f60f3317068a1ff09f40ee0587893450d21d32 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 15:48:10 +0000 Subject: [PATCH 048/200] binexport: move index to binexport extractor module --- .../extractors/binexport2/__init__.py | 129 +++++++++++++++++- scripts/inspect-binexport2.py | 124 +---------------- 2 files changed, 127 insertions(+), 126 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index e8a227c74..26ac0179b 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -12,7 +12,9 @@ """ import os import logging +from typing import Any, Dict, List, Iterator from pathlib import Path +from collections import defaultdict from dataclasses import dataclass from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -45,19 +47,140 @@ def get_sample_from_binexport2(be2: BinExport2) -> Path: raise ValueError("cannot find sample") +class BinExport2Index: + def __init__(self, be2: BinExport2): + self.be2 = be2 + + self.callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + self.callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + + # note: flow graph != call graph (vertex) + 
self.flow_graph_index_by_address: Dict[int, int] = {} + self.basic_block_index_by_address: Dict[int, int] = {} + self.basic_block_address_by_index: Dict[int, int] = {} + self.instruction_index_by_address: Dict[int, int] = {} + self.instruction_address_by_index: Dict[int, int] = {} + + # edges that come from the given basic block + self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + # edges that end up at the given basic block + self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + + self.vertex_index_by_address: Dict[int, int] = {} + + self.data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) + + self._index_vertex_edges() + self._index_instruction_addresses() + self._index_flow_graph_nodes() + self._index_flow_graph_edges() + self._index_call_graph_vertices() + self._index_data_references() + + def _index_vertex_edges(self): + for edge in self.be2.call_graph.edge: + if not edge.source_vertex_index: + continue + if not edge.target_vertex_index: + continue + + self.callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) + self.callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) + + def _index_instruction_addresses(self): + instruction_address = 0 + for instruction_index, instruction in enumerate(self.be2.instruction): + if instruction.HasField("address"): + instruction_address = instruction.address + + self.instruction_index_by_address[instruction_address] = instruction_index + self.instruction_address_by_index[instruction_index] = instruction_address + + assert instruction.HasField("raw_bytes") + instruction_address += len(instruction.raw_bytes) + + def _index_flow_graph_nodes(self): + for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): + for basic_block_index in 
flow_graph.basic_block_index: + basic_block = self.be2.basic_block[basic_block_index] + for instruction_index in self.instruction_indices(basic_block): + basic_block_address = self.instruction_address_by_index[instruction_index] + self.basic_block_index_by_address[basic_block_address] = basic_block_index + self.basic_block_address_by_index[basic_block_index] = basic_block_address + + entry_basic_block = self.be2.basic_block[flow_graph.entry_basic_block_index] + entry_instruction_index = next(self.instruction_indices(entry_basic_block)) + entry_instruction_address = self.instruction_address_by_index[entry_instruction_index] + function_address = entry_instruction_address + self.flow_graph_index_by_address[function_address] = flow_graph_index + + def _index_flow_graph_edges(self): + for flow_graph in self.be2.flow_graph: + for edge in flow_graph.edge: + if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): + continue + + self.source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) + self.target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) + + def _index_call_graph_vertices(self): + for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + vertex_address = vertex.address + self.vertex_index_by_address[vertex_address] = vertex_index + + def _index_data_references(self): + for data_reference_index, data_reference in enumerate(self.be2.data_reference): + self.data_reference_index_by_source_instruction_index[data_reference.instruction_index].append( + data_reference_index + ) + self.data_reference_index_by_target_address[data_reference.address].append(data_reference_index) + + @staticmethod + def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: + for index_range in basic_block.instruction_index: + if not index_range.HasField("end_index"): + yield index_range.begin_index + continue + else: + yield 
from range(index_range.begin_index, index_range.end_index) + + def get_function_name_by_vertex(self, vertex_index: int) -> str: + vertex = self.be2.call_graph.vertex[vertex_index] + name = f"sub_{vertex.address:x}" + if vertex.HasField("mangled_name"): + name = vertex.mangled_name + + if vertex.HasField("demangled_name"): + name = vertex.demangled_name + + return name + + def get_function_name_by_address(self, address: int) -> str: + if address not in self.vertex_index_by_address: + return "" + + vertex_index = self.vertex_index_by_address[address] + return self.get_function_name_by_vertex(vertex_index) + + @dataclass class FunctionContext: be2: BinExport2 - function_index: int + idx: BinExport2Index + # TODO: typing + analysis: Any + flow_graph_index: int @dataclass class BasicBlockContext: - be2: BinExport2 basic_block_index: int @dataclass class InstructionContext: - be2: BinExport2 instruction_index: int diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index b57ba0137..e9963584a 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -13,8 +13,6 @@ import logging import argparse import contextlib -from typing import Dict, List, Iterator -from collections import defaultdict import capa.main import capa.features.extractors.binexport2 @@ -23,126 +21,6 @@ logger = logging.getLogger("inspect-binexport2") -class BinExport2Index: - def __init__(self, be2: BinExport2): - self.be2 = be2 - - self.callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) - self.callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) - - # note: flow graph != call graph (vertex) - self.flow_graph_index_by_address: Dict[int, int] = {} - self.basic_block_index_by_address: Dict[int, int] = {} - self.basic_block_address_by_index: Dict[int, int] = {} - self.instruction_index_by_address: Dict[int, int] = {} - self.instruction_address_by_index: Dict[int, int] = {} - - # edges that come from the given basic block - 
self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) - # edges that end up at the given basic block - self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) - - self.vertex_index_by_address: Dict[int, int] = {} - - self.data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) - self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) - - self._index_vertex_edges() - self._index_instruction_addresses() - self._index_flow_graph_nodes() - self._index_flow_graph_edges() - self._index_call_graph_vertices() - self._index_data_references() - - def _index_vertex_edges(self): - for edge in self.be2.call_graph.edge: - if not edge.source_vertex_index: - continue - if not edge.target_vertex_index: - continue - - self.callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) - self.callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) - - def _index_instruction_addresses(self): - instruction_address = 0 - for instruction_index, instruction in enumerate(self.be2.instruction): - if instruction.HasField("address"): - instruction_address = instruction.address - - self.instruction_index_by_address[instruction_address] = instruction_index - self.instruction_address_by_index[instruction_index] = instruction_address - - assert instruction.HasField("raw_bytes") - instruction_address += len(instruction.raw_bytes) - - def _index_flow_graph_nodes(self): - for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): - for basic_block_index in flow_graph.basic_block_index: - basic_block = self.be2.basic_block[basic_block_index] - for instruction_index in self.instruction_indices(basic_block): - basic_block_address = self.instruction_address_by_index[instruction_index] - self.basic_block_index_by_address[basic_block_address] = basic_block_index - 
self.basic_block_address_by_index[basic_block_index] = basic_block_address - - entry_basic_block = self.be2.basic_block[flow_graph.entry_basic_block_index] - entry_instruction_index = next(self.instruction_indices(entry_basic_block)) - entry_instruction_address = self.instruction_address_by_index[entry_instruction_index] - function_address = entry_instruction_address - self.flow_graph_index_by_address[function_address] = flow_graph_index - - def _index_flow_graph_edges(self): - for flow_graph in self.be2.flow_graph: - for edge in flow_graph.edge: - if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): - continue - - self.source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) - self.target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) - - def _index_call_graph_vertices(self): - for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): - if not vertex.HasField("address"): - continue - - vertex_address = vertex.address - self.vertex_index_by_address[vertex_address] = vertex_index - - def _index_data_references(self): - for data_reference_index, data_reference in enumerate(self.be2.data_reference): - self.data_reference_index_by_source_instruction_index[data_reference.instruction_index].append( - data_reference_index - ) - self.data_reference_index_by_target_address[data_reference.address].append(data_reference_index) - - @staticmethod - def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: - for index_range in basic_block.instruction_index: - if not index_range.HasField("end_index"): - yield index_range.begin_index - continue - else: - yield from range(index_range.begin_index, index_range.end_index) - - def get_function_name_by_vertex(self, vertex_index: int) -> str: - vertex = self.be2.call_graph.vertex[vertex_index] - name = f"sub_{vertex.address:x}" - if vertex.HasField("mangled_name"): - name = vertex.mangled_name - - if 
vertex.HasField("demangled_name"): - name = vertex.demangled_name - - return name - - def get_function_name_by_address(self, address: int) -> str: - if address not in self.vertex_index_by_address: - return "" - - vertex_index = self.vertex_index_by_address[address] - return self.get_function_name_by_vertex(vertex_index) - - class Renderer: def __init__(self, o: io.StringIO): self.o = o @@ -191,7 +69,7 @@ def main(argv=None): o = Renderer(io.StringIO()) be2: BinExport2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) - idx = BinExport2Index(be2) + idx = capa.features.extractors.binexport2.BinExport2Index(be2) with o.section("meta"): o.writeln(f"name: {be2.meta_information.executable_name}") From 5d510c1d0fc3c0ab185951f3ae8f76287073f394 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 15:48:27 +0000 Subject: [PATCH 049/200] binexport: implement ELF/aarch64 GOT/thunk analyzer --- .../extractors/binexport2/extractor.py | 141 +++++++++++------- 1 file changed, 83 insertions(+), 58 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 95e52aae5..3ddc5d2ec 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import logging from typing import Dict, List, Tuple, Iterator import capa.features.extractors.elf @@ -15,7 +16,7 @@ import capa.features.extractors.binexport2.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, @@ -25,30 +26,84 @@ ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +logger = logging.getLogger(__name__) + + +class BinExport2Analysis: + def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): + self.be2 = be2 + self.idx = idx + self.buf = buf + + # from virtual address to import name + self.thunks: Dict[int, str] = {} + + def _find_got_thunks(self): + if self.be2.meta_information.architecture_name != "aarch64": + logger.debug("skipping GOT thunk analysis on non-aarch64") + return + + if not self.buf.startswith(capa.features.extractors.common.MATCH_ELF): + logger.debug("skipping GOT thunk analysis on non-ELF") + return + + for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + if not vertex.HasField("mangled_name"): + continue + + if BinExport2.CallGraph.Vertex.Type.IMPORTED != vertex.type: + continue + + if len(self.idx.callers_by_vertex_index[vertex_index]) != 1: + # find imports with a single caller, + # which should be the thunk + continue + + maybe_thunk_vertex_index = self.idx.callers_by_vertex_index[vertex_index][0] + maybe_thunk_vertex = self.be2.call_graph.vertex[maybe_thunk_vertex_index] + maybe_thunk_address = maybe_thunk_vertex.address + + maybe_thunk_flow_graph_index = self.idx.flow_graph_index_by_address[maybe_thunk_address] + maybe_thunk_flow_graph = 
self.be2.flow_graph[maybe_thunk_flow_graph_index] + + if len(maybe_thunk_flow_graph.basic_block_index) != 1: + # should have a single basic block + continue + + maybe_thunk_basic_block = self.be2.basic_block[maybe_thunk_flow_graph.entry_basic_block_index] + if len(list(self.idx.instruction_indices(maybe_thunk_basic_block))) != 4: + # fstat: + # 000008b0 adrp x16, 0x11000 + # 000008b4 ldr x17, [x16, #0xf88] {fstat} + # 000008b8 add x16, x16, #0xf88 {fstat} + # 000008bc br x17 + continue + + thunk_address = maybe_thunk_address + thunk_name = vertex.mangled_name + logger.debug("found GOT thunk: 0x%x -> %s", thunk_address, thunk_name) + + self.thunks[thunk_address] = thunk_name + class BinExport2FeatureExtractor(StaticFeatureExtractor): def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) self.be2 = be2 self.buf = buf - - self.address_by_instruction_index: Dict[int, int] = {} - self.flow_graph_index_by_function_index: Dict[int, int] = {} - self.function_index_by_address: Dict[int, int] = {} + self.idx = BinExport2Index(self.be2) + self.analysis = BinExport2Analysis(self.be2, self.idx, self.buf) self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) - self._index_instruction_addresses() - self._index_basic_blocks_by_function() - - print("base address", hex(self.get_base_address())) - ba = self.get_base_address() - for v in self.be2.call_graph.vertex: - if v.mangled_name: - print(hex(v.address - ba), v.mangled_name) + # TODO: assert supported file formats, arches + # and gradually relax restrictions as they're tested. def get_base_address(self): # TODO: assume the lowest address is the base address. 
@@ -64,10 +119,12 @@ def extract_file_features(self): yield from capa.features.extractors.binexport2.file.extract_features(self.be2, self.buf) def get_functions(self) -> Iterator[FunctionHandle]: - for function_index in self.flow_graph_index_by_function_index.keys(): - vertex = self.be2.call_graph.vertex[function_index] + for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): + entry_basic_block_index = flow_graph.entry_basic_block_index + flow_graph_address = self.idx.basic_block_address_by_index[entry_basic_block_index] yield FunctionHandle( - AbsoluteVirtualAddress(vertex.address), inner=FunctionContext(self.be2, function_index) + AbsoluteVirtualAddress(flow_graph_address), + inner=FunctionContext(self.be2, self.idx, self.analysis, flow_graph_index), ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: @@ -75,60 +132,28 @@ def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Featur def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: fhi: FunctionContext = fh.inner - flow_graph_index = self.flow_graph_index_by_function_index[fhi.function_index] + flow_graph_index = fhi.flow_graph_index flow_graph = self.be2.flow_graph[flow_graph_index] for basic_block_index in flow_graph.basic_block_index: - bb = self.be2.basic_block[basic_block_index] + basic_block_address = self.idx.basic_block_address_by_index[basic_block_index] yield BBHandle( - address=AbsoluteVirtualAddress(self.address_by_instruction_index[bb.instruction_index[0].begin_index]), - inner=BasicBlockContext(self.be2, basic_block_index), + address=AbsoluteVirtualAddress(basic_block_address), + inner=BasicBlockContext(basic_block_index), ) def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + yield from capa.features.extractors.binexport2.basicblock.extract_features(fh, bbh) def get_instructions(self, fh: FunctionHandle, bbh: 
BBHandle) -> Iterator[InsnHandle]: bbi: BasicBlockContext = bbh.inner - bb: BinExport2.BasicBlock = self.be2.basic_block[bbi.basic_block_index] - for instruction_index in range(bb.instruction_index[0].begin_index, bb.instruction_index[0].end_index): + basic_block: BinExport2.BasicBlock = self.be2.basic_block[bbi.basic_block_index] + for instruction_index in self.idx.instruction_indices(basic_block): + instruction_address = self.idx.instruction_address_by_index[instruction_index] yield InsnHandle( - address=AbsoluteVirtualAddress(self.address_by_instruction_index[instruction_index]), - inner=InstructionContext(self.be2, instruction_index), + address=AbsoluteVirtualAddress(instruction_address), + inner=InstructionContext(instruction_index), ) def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih) - - def _index_instruction_addresses(self): - address = 0 - next_address = 0 - for instruction_index, instruction in enumerate(self.be2.instruction): - if instruction.HasField("address"): - address = instruction.address - next_address = address + len(instruction.raw_bytes) - else: - address = next_address - next_address += len(instruction.raw_bytes) - - self.address_by_instruction_index[instruction_index] = address - - def _index_basic_blocks_by_function(self): - function_index_from_address = {} - - for index, vertex in enumerate(self.be2.call_graph.vertex): - function_index_from_address[vertex.address] = index - - for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): - basic_block_entry_point = self.be2.basic_block[flow_graph.entry_basic_block_index] - basic_block_address = self.address_by_instruction_index[ - basic_block_entry_point.instruction_index[0].begin_index - ] - - if basic_block_address not in function_index_from_address: - continue - - function_index = function_index_from_address[basic_block_address] - - 
self.flow_graph_index_by_function_index[function_index] = flow_graph_index From 162a0e81d73958cddccb1c45636d86c2d0bbc8e5 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 15:48:44 +0000 Subject: [PATCH 050/200] binexport: implement API features --- capa/features/extractors/binexport2/insn.py | 42 +++++++++++---------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index bdf20f6d7..2d15f0476 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -15,40 +15,42 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + from capa.features.extractors.binexport2.extractor import BinExport2Analysis + fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner be2 = fhi.be2 + idx = fhi.idx + analysis: BinExport2Analysis = fhi.analysis - insn = be2.instruction[ii.instruction_index] - mnem = be2.mnemonic[insn.mnemonic_index] - - if mnem.name not in ("call", "jmp", "BL"): - return + instruction = be2.instruction[ii.instruction_index] - if not insn.call_target: + if not instruction.call_target: return - address = insn.call_target[0] - print(hex(insn.address), "->", hex(address)) + for call_target_address in instruction.call_target: + if call_target_address in analysis.thunks: + call_target_name = analysis.thunks[call_target_address] + yield API(call_target_name), AbsoluteVirtualAddress(call_target_address) - for vertex in be2.call_graph.vertex: - # TODO: need an index here - if vertex.address != address: + if call_target_address not in idx.vertex_index_by_address: continue - if not vertex.mangled_name: - continue - - yield API(name=vertex.mangled_name), AbsoluteVirtualAddress(address) - - if not vertex.HasField("library_index"): + vertex_index = idx.vertex_index_by_address[call_target_address] + vertex = 
be2.call_graph.vertex[vertex_index] + if not vertex.HasField("mangled_name"): continue - library = be2.library[vertex.library_index] - lib_name = library.name.split("\\")[-1].split(".")[0] + yield API(vertex.mangled_name), AbsoluteVirtualAddress(call_target_address) - yield API(name=f"{lib_name}.{vertex.mangled_name}"), AbsoluteVirtualAddress(address) + if vertex.HasField("library_index"): + # BUG: this seems to be incorrect + library = be2.library[vertex.library_index] + library_name = library.name + if library_name.endswith(".so"): + library_name = library_name.rpartition(".so")[0] + yield API(f"{library_name}.{vertex.mangled_name}"), AbsoluteVirtualAddress(call_target_address) def extract_insn_number_features( From 3a943bf40f1a084beb1efce9a112653730e82f03 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 16:00:43 +0000 Subject: [PATCH 051/200] binexport: record the full vertex for a thunk --- capa/features/extractors/binexport2/extractor.py | 9 ++++++--- capa/features/extractors/binexport2/insn.py | 9 ++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 3ddc5d2ec..452fb2df4 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -35,8 +35,8 @@ def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): self.idx = idx self.buf = buf - # from virtual address to import name - self.thunks: Dict[int, str] = {} + # from virtual address to call graph vertex representing the import + self.thunks: Dict[int, int] = {} def _find_got_thunks(self): if self.be2.meta_information.architecture_name != "aarch64": @@ -75,18 +75,21 @@ def _find_got_thunks(self): maybe_thunk_basic_block = self.be2.basic_block[maybe_thunk_flow_graph.entry_basic_block_index] if len(list(self.idx.instruction_indices(maybe_thunk_basic_block))) != 4: + # thunk should look like these 
four instructions. # fstat: # 000008b0 adrp x16, 0x11000 # 000008b4 ldr x17, [x16, #0xf88] {fstat} # 000008b8 add x16, x16, #0xf88 {fstat} # 000008bc br x17 + # which relies on the disassembler to recognize the target of the call/br + # to go to the GOT/external symbol. continue thunk_address = maybe_thunk_address thunk_name = vertex.mangled_name logger.debug("found GOT thunk: 0x%x -> %s", thunk_address, thunk_name) - self.thunks[thunk_address] = thunk_name + self.thunks[thunk_address] = vertex_index class BinExport2FeatureExtractor(StaticFeatureExtractor): diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 2d15f0476..baad209ba 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -31,13 +31,12 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle for call_target_address in instruction.call_target: if call_target_address in analysis.thunks: - call_target_name = analysis.thunks[call_target_address] - yield API(call_target_name), AbsoluteVirtualAddress(call_target_address) - - if call_target_address not in idx.vertex_index_by_address: + vertex_index = analysis.thunks[call_target_address] + elif call_target_address not in idx.vertex_index_by_address: continue + else: + vertex_index = idx.vertex_index_by_address[call_target_address] - vertex_index = idx.vertex_index_by_address[call_target_address] vertex = be2.call_graph.vertex[vertex_index] if not vertex.HasField("mangled_name"): continue From f318129586b9951bc241f47867598c4eba0c4bba Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 16:16:27 +0000 Subject: [PATCH 052/200] binexport: learn to extract numbers --- capa/features/extractors/binexport2/insn.py | 47 ++++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index baad209ba..97406f25d 
100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,11 +7,12 @@ # See the License for the specific language governing permissions and limitations under the License. from typing import Tuple, Iterator -from capa.features.insn import API +from capa.features.insn import API, Number from capa.features.common import Feature -from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.address import Address from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: @@ -41,7 +42,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle if not vertex.HasField("mangled_name"): continue - yield API(vertex.mangled_name), AbsoluteVirtualAddress(call_target_address) + yield API(vertex.mangled_name), ih.address if vertex.HasField("library_index"): # BUG: this seems to be incorrect @@ -49,14 +50,46 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle library_name = library.name if library_name.endswith(".so"): library_name = library_name.rpartition(".so")[0] - yield API(f"{library_name}.{vertex.mangled_name}"), AbsoluteVirtualAddress(call_target_address) + yield API(f"{library_name}.{vertex.mangled_name}"), ih.address def extract_insn_number_features( - fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + + instruction = be2.instruction[ii.instruction_index] + + for operand_index in instruction.operand_index: + operand = 
be2.operand[operand_index] + + if len(operand.expression_index) != 2: + # we only care about immediate constants, + # which have a two expression node: + # + # - type: SIZE_PREFIX + # symbol: "b8" + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + continue + + expression0 = be2.expression[operand.expression_index[0]] + expression1 = be2.expression[operand.expression_index[1]] + + if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type: + continue + + if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type: + continue + + value = expression1.immediate + # TODO: skip small numbers + # TODO: skip mapped pointers + yield Number(value), ih.address def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: From afbff1b5acdea6ad107ed83ff182bcdc1e0c3927 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 30 Jan 2024 20:19:48 +0000 Subject: [PATCH 053/200] binexport: number: skipped mapped numbers --- capa/features/extractors/binexport2/insn.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 97406f25d..e1f62c014 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -53,6 +53,14 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle yield API(f"{library_name}.{vertex.mangled_name}"), ih.address +def probe_memory(be2: BinExport2, address: int) -> bool: + """return True if the given address is mapped""" + for section in be2.section: + if section.address <= address < section.address + section.size: + return True + return False + + def extract_insn_number_features( fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -88,7 +96,10 @@ def extract_insn_number_features( value = expression1.immediate # TODO: skip small numbers - 
# TODO: skip mapped pointers + + if probe_memory(be2, value): + continue + yield Number(value), ih.address From eb72d41f95f6b5882fbf4356dc8c34a668cee90a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 10:57:46 +0000 Subject: [PATCH 054/200] binexport: fix basic block address indexing --- capa/features/extractors/binexport2/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 26ac0179b..597ad547c 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -104,10 +104,10 @@ def _index_flow_graph_nodes(self): for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): for basic_block_index in flow_graph.basic_block_index: basic_block = self.be2.basic_block[basic_block_index] - for instruction_index in self.instruction_indices(basic_block): - basic_block_address = self.instruction_address_by_index[instruction_index] - self.basic_block_index_by_address[basic_block_address] = basic_block_index - self.basic_block_address_by_index[basic_block_index] = basic_block_address + first_instruction_index = next(self.instruction_indices(basic_block)) + basic_block_address = self.instruction_address_by_index[first_instruction_index] + self.basic_block_index_by_address[basic_block_address] = basic_block_index + self.basic_block_address_by_index[basic_block_index] = basic_block_address entry_basic_block = self.be2.basic_block[flow_graph.entry_basic_block_index] entry_instruction_index = next(self.instruction_indices(entry_basic_block)) From 24ebea8bcc684c5623f7a2bb67d8deb1861ac185 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 10:58:44 +0000 Subject: [PATCH 055/200] binexport: rename function --- capa/features/extractors/binexport2/insn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index e1f62c014..2485ce609 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -53,7 +53,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle yield API(f"{library_name}.{vertex.mangled_name}"), ih.address -def probe_memory(be2: BinExport2, address: int) -> bool: +def is_address_mapped(be2: BinExport2, address: int) -> bool: """return True if the given address is mapped""" for section in be2.section: if section.address <= address < section.address + section.size: @@ -95,9 +95,10 @@ def extract_insn_number_features( continue value = expression1.immediate - # TODO: skip small numbers - if probe_memory(be2, value): + # TODO: skip small numbers? + + if is_address_mapped(be2, value): continue yield Number(value), ih.address From e9e93dac48d76ddabfd1476fa0d65735f6b19dbd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 10:59:29 +0000 Subject: [PATCH 056/200] binexport: extract operand numbers --- capa/features/extractors/binexport2/insn.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 2485ce609..4fac93f8a 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,9 +7,9 @@ # See the License for the specific language governing permissions and limitations under the License. 
from typing import Tuple, Iterator -from capa.features.insn import API, Number from capa.features.common import Feature from capa.features.address import Address +from capa.features.insn import API, Number, OperandNumber from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -71,7 +71,7 @@ def extract_insn_number_features( instruction = be2.instruction[ii.instruction_index] - for operand_index in instruction.operand_index: + for i, operand_index in enumerate(instruction.operand_index): operand = be2.operand[operand_index] if len(operand.expression_index) != 2: @@ -102,6 +102,7 @@ def extract_insn_number_features( continue yield Number(value), ih.address + yield OperandNumber(i, value), ih.address def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: From a405d4c6ec9c4c61116fa62482ba80c6afc747c4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 10:59:58 +0000 Subject: [PATCH 057/200] binexport: learn to extract calls from characteristics --- capa/features/extractors/binexport2/insn.py | 24 ++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 4fac93f8a..b6b0445a4 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,9 +7,9 @@ # See the License for the specific language governing permissions and limitations under the License. 
from typing import Tuple, Iterator -from capa.features.common import Feature -from capa.features.address import Address from capa.features.insn import API, Number, OperandNumber +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -141,10 +141,24 @@ def extract_insn_mnemonic_features( def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: """extract functions calls from features - most relevant at the function scope, however, its most efficient to extract at the instruction scope + most relevant at the function scope; + however, its most efficient to extract at the instruction scope. """ - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + + instruction = be2.instruction[ii.instruction_index] + if not instruction.call_target: + return + + for call_target_address in instruction.call_target: + addr = AbsoluteVirtualAddress(call_target_address) + yield Characteristic("calls from"), addr + + if fh.address == addr: + yield Characteristic("recursive call"), addr def extract_function_indirect_call_characteristic_features( From 874fa89e7da2e9129da1b347ebdd70ea2e7c91e0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 11:06:51 +0000 Subject: [PATCH 058/200] binexport: learn to extract mnemonics --- capa/features/extractors/binexport2/insn.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index b6b0445a4..6543f9831 100644 --- a/capa/features/extractors/binexport2/insn.py +++ 
b/capa/features/extractors/binexport2/insn.py @@ -7,7 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. from typing import Tuple, Iterator -from capa.features.insn import API, Number, OperandNumber +from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext, InstructionContext @@ -134,8 +134,15 @@ def extract_insn_nzxor_characteristic_features( def extract_insn_mnemonic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + + instruction = be2.instruction[ii.instruction_index] + mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name = mnemonic.name.lower() + yield Mnemonic(mnemonic_name), ih.address def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: From 159a796f4e2ca1018e9b4cc2e2790b04b1e868ff Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 11:25:28 +0000 Subject: [PATCH 059/200] pre-commit: skip protobuf file --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f502ce73a..9f299d311 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -86,7 +86,7 @@ repos: - "--config" - ".github/flake8.ini" - "--extend-exclude" - - "capa/render/proto/capa_pb2.py,capa/features/extractors/binexport2/binexport_pb2.py" + - "capa/render/proto/capa_pb2.py,capa/features/extractors/binexport2/binexport2_pb2.py" - "capa/" - "scripts/" - "tests/" From a1ad2d03a750f47d25036880ec2711f37f336323 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 
2024 11:25:53 +0000 Subject: [PATCH 060/200] binexport: better search for sample file --- .../extractors/binexport2/__init__.py | 53 ++++++++++++++----- capa/main.py | 2 +- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 597ad547c..c9131752f 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -11,6 +11,7 @@ protoc --python_out=. --mypy_out=. binexport2.proto """ import os +import hashlib import logging from typing import Any, Dict, List, Iterator from pathlib import Path @@ -28,23 +29,47 @@ def get_binexport2(sample: Path) -> BinExport2: return be2 -def get_sample_from_binexport2(be2: BinExport2) -> Path: - # also search in same directory as input - # for files with the given sha256, - # starting with files with a similar prefix as given. - # TODO(wb): 1755 +def compute_common_prefix_length(m: str, n: str) -> int: + # ensure #m < #n + if len(n) < len(m): + m, n = n, m - # $CAPA_SAMPLE_DIR/ - base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) + for i, c in enumerate(m): + if n[i] != c: + return i + + return len(m) + + +def get_sample_from_binexport2(input_file: Path, be2: BinExport2) -> Path: + """attempt to find the sample file, given a BinExport2 file. + + searches in the same directory as the BinExport2 file, and then + in $CAPA_SAMPLES_DIR. + """ - sha256 = be2.meta_information.executable_id.lower() + def filename_similarity_key(p: Path): + # note closure over input_file. 
+ # sort first by length of common prefix, then by name (for stability) + return (compute_common_prefix_length(p.name, input_file.name), p.name) + + wanted_sha256 = be2.meta_information.executable_id.lower() + + input_directory = input_file.parent + siblings = [p for p in input_directory.iterdir() if p.is_file()] + siblings.sort(key=filename_similarity_key, reverse=True) + for sibling in siblings: + if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: + return sibling + + base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) + candidates = [p for p in base.iterdir() if p.is_file()] + candidates.sort(key=filename_similarity_key, reverse=True) + for candidate in candidates: + if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: + return candidate - logger.debug("searching for sample in: %s", base) - path = base / sha256 - if path.exists(): - return path - else: - raise ValueError("cannot find sample") + raise ValueError("cannot find sample") class BinExport2Index: diff --git a/capa/main.py b/capa/main.py index 2f130952f..b113043a1 100644 --- a/capa/main.py +++ b/capa/main.py @@ -569,7 +569,7 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: import capa.features.extractors.binexport2 be2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) - return capa.features.extractors.binexport2.get_sample_from_binexport2(be2) + return capa.features.extractors.binexport2.get_sample_from_binexport2(args.input_file, be2) else: return args.input_file From 673048f7d03e7fc85752cba0de7734e394b4662e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 11:43:06 +0000 Subject: [PATCH 061/200] loader: add file extractors for BinExport2 --- capa/loader.py | 24 +++++++++++++++++++++--- scripts/inspect-binexport2.py | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index d71a9dec5..6d026ec3b 100644 --- a/capa/loader.py +++ 
b/capa/loader.py @@ -285,6 +285,26 @@ def get_extractor( raise ValueError("unexpected backend: " + backend) +def _get_binexport2_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: + # I'm not sure this is where this logic should live, but it works for now. + # we'll keep this a "private" routine until we're sure. + import capa.features.extractors.binexport2 + + be2 = capa.features.extractors.binexport2.get_binexport2(input_file) + sample_path = capa.features.extractors.binexport2.get_sample_from_binexport2(input_file, be2) + + with sample_path.open("rb") as f: + taste = f.read() + + if taste.startswith(capa.features.extractors.common.MATCH_PE): + return get_file_extractors(sample_path, FORMAT_PE) + elif taste.startswith(capa.features.extractors.common.MATCH_ELF): + return get_file_extractors(sample_path, FORMAT_ELF) + else: + logger.warning("unsupported format") + return [] + + def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = [] @@ -303,9 +323,7 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) elif input_format == FORMAT_BINEXPORT2: - # pick pefile/elffile from sample path, after detection - # TODO(wb): 1755 - pass + file_extractors = _get_binexport2_file_extractors(input_file, input_format) return file_extractors diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index e9963584a..12c05a14b 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -112,7 +112,7 @@ def main(argv=None): o.writeln(f"demangled: {vertex.demangled_name}") if vertex.HasField("library_index"): - # BUG: this seems to be incorrect + # TODO(williballenthin): this seems to be incorrect library = be2.library[vertex.library_index] o.writeln(f"library: [{vertex.library_index}] {library.name}") From 
0f5d47c7617f43b032e9fdcc82dd905810cf94a4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 12:48:19 +0000 Subject: [PATCH 062/200] binexport: remove extra parameter --- capa/features/extractors/binexport2/insn.py | 30 ++++++++++++++++++--- capa/loader.py | 4 +-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 6543f9831..35e312d68 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,7 +7,8 @@ # See the License for the specific language governing permissions and limitations under the License. from typing import Tuple, Iterator -from capa.features.insn import API, Number, Mnemonic, OperandNumber +import capa.features.extractors.helpers +from capa.features.insn import API, Number, Mnemonic, OperandNumber, Bytes from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext, InstructionContext @@ -106,8 +107,31 @@ def extract_insn_number_features( def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + idx = fhi.idx + + instruction_index = ii.instruction_index + + if instruction_index in idx.data_reference_index_by_source_instruction_index: + for data_reference_index in idx.data_reference_index_by_source_instruction_index[ + instruction_index + ]: + data_reference = be2.data_reference[data_reference_index] + data_reference_address = data_reference.address + + # TODO: read data + buf = b"" + + if capa.features.extractors.helpers.all_zeros(buf): + continue + + if is_probably_string(buf): + pass + else: + yield Bytes(buf), ih.address def extract_insn_string_features( diff 
--git a/capa/loader.py b/capa/loader.py index 6d026ec3b..a412aaef2 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -285,7 +285,7 @@ def get_extractor( raise ValueError("unexpected backend: " + backend) -def _get_binexport2_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: +def _get_binexport2_file_extractors(input_file: Path) -> List[FeatureExtractor]: # I'm not sure this is where this logic should live, but it works for now. # we'll keep this a "private" routine until we're sure. import capa.features.extractors.binexport2 @@ -323,7 +323,7 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) elif input_format == FORMAT_BINEXPORT2: - file_extractors = _get_binexport2_file_extractors(input_file, input_format) + file_extractors = _get_binexport2_file_extractors(input_file) return file_extractors From ffce03beaa6c30da82da44d7d96c7578c7a16777 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 14:18:41 +0000 Subject: [PATCH 063/200] new black config --- capa/features/address.py | 3 +-- capa/features/freeze/__init__.py | 1 + capa/ida/plugin/form.py | 6 +++--- capa/ida/plugin/view.py | 8 +++++--- capa/loader.py | 4 ++-- capa/render/result_document.py | 9 ++++----- capa/render/verbose.py | 1 + scripts/cache-ruleset.py | 1 + scripts/capafmt.py | 1 + scripts/import-to-ida.py | 1 + scripts/lint.py | 1 + 11 files changed, 21 insertions(+), 15 deletions(-) diff --git a/capa/features/address.py b/capa/features/address.py index 800cefcd3..0edf4cec2 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -10,8 +10,7 @@ class Address(abc.ABC): @abc.abstractmethod - def __eq__(self, other): - ... + def __eq__(self, other): ... 
@abc.abstractmethod def __lt__(self, other): diff --git a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index 2dac7f48e..258ba07a8 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -9,6 +9,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ + import json import zlib import logging diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index ddd4c4e0d..301ead723 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -932,9 +932,9 @@ def get_ask_use_persistent_cache(self, analyze): update_wait_box("verifying cached results") try: - results: Optional[ - capa.render.result_document.ResultDocument - ] = capa.ida.helpers.load_and_verify_cached_results() + results: Optional[capa.render.result_document.ResultDocument] = ( + capa.ida.helpers.load_and_verify_cached_results() + ) except Exception as e: capa.ida.helpers.inform_user_ida_ui("Failed to verify cached results, reanalyzing program") logger.exception("Failed to verify cached results (error: %s)", e) diff --git a/capa/ida/plugin/view.py b/capa/ida/plugin/view.py index 0225e453c..bbb8287a2 100644 --- a/capa/ida/plugin/view.py +++ b/capa/ida/plugin/view.py @@ -200,9 +200,11 @@ def load_preview_meta(self, ea, author, scope): " references:", " - ", " examples:", - f" - {capa.ida.helpers.get_file_md5().upper()}:{hex(ea)}" - if ea - else f" - {capa.ida.helpers.get_file_md5().upper()}", + ( + f" - {capa.ida.helpers.get_file_md5().upper()}:{hex(ea)}" + if ea + else f" - {capa.ida.helpers.get_file_md5().upper()}" + ), " features:", ] self.setText("\n".join(metadata_default)) diff --git a/capa/loader.py b/capa/loader.py index a412aaef2..660d3ad1a 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -517,7 +517,7 @@ def result_rec(result: capa.features.common.Result): ) for t in threads 
if t in matched_threads - ) # this object is open to extension in the future, + ), # this object is open to extension in the future, # such as with the function name, etc. ) for p, threads in threads_by_process.items() @@ -559,7 +559,7 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap address=frz.Address.from_capa(f), matched_basic_blocks=tuple( rdoc.BasicBlockLayout(address=frz.Address.from_capa(bb)) for bb in bbs if bb in matched_bbs - ) # this object is open to extension in the future, + ), # this object is open to extension in the future, # such as with the function name, etc. ) for f, bbs in bbs_by_function.items() diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 2ef85185e..ce95245e0 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -160,8 +160,7 @@ class CompoundStatementType: OPTIONAL = "optional" -class StatementModel(FrozenModel): - ... +class StatementModel(FrozenModel): ... class CompoundStatement(StatementModel): @@ -650,9 +649,9 @@ def from_capa(cls, meta: Metadata, rules: RuleSet, capabilities: MatchResults) - return ResultDocument(meta=meta, rules=rule_matches) def to_capa(self) -> Tuple[Metadata, Dict]: - capabilities: Dict[ - str, List[Tuple[capa.features.address.Address, capa.features.common.Result]] - ] = collections.defaultdict(list) + capabilities: Dict[str, List[Tuple[capa.features.address.Address, capa.features.common.Result]]] = ( + collections.defaultdict(list) + ) # this doesn't quite work because we don't have the rule source for rules that aren't matched. rules_by_name = { diff --git a/capa/render/verbose.py b/capa/render/verbose.py index f6f566dec..44024acf4 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -22,6 +22,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. """ + from typing import cast import tabulate diff --git a/scripts/cache-ruleset.py b/scripts/cache-ruleset.py index 0e364622b..8a10cf504 100644 --- a/scripts/cache-ruleset.py +++ b/scripts/cache-ruleset.py @@ -15,6 +15,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ + import sys import logging import argparse diff --git a/scripts/capafmt.py b/scripts/capafmt.py index de4171ea8..be46b2ade 100644 --- a/scripts/capafmt.py +++ b/scripts/capafmt.py @@ -14,6 +14,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ + import sys import logging import argparse diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py index e52a029d2..3c6533047 100644 --- a/scripts/import-to-ida.py +++ b/scripts/import-to-ida.py @@ -28,6 +28,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ + import logging import binascii from pathlib import Path diff --git a/scripts/lint.py b/scripts/lint.py index 93440395d..49ff70e6a 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -13,6 +13,7 @@ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
""" + import gc import os import re From 249398ae8f64a28f7e80994ec92cca1b6d783b3c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 14:19:05 +0000 Subject: [PATCH 064/200] binexport: index string xrefs --- capa/features/extractors/binexport2/__init__.py | 8 ++++++++ scripts/inspect-binexport2.py | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index c9131752f..eda274a41 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -95,6 +95,7 @@ def __init__(self, be2: BinExport2): self.data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) + self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self._index_vertex_edges() self._index_instruction_addresses() @@ -102,6 +103,7 @@ def __init__(self, be2: BinExport2): self._index_flow_graph_edges() self._index_call_graph_vertices() self._index_data_references() + self._index_string_references() def _index_vertex_edges(self): for edge in self.be2.call_graph.edge: @@ -164,6 +166,12 @@ def _index_data_references(self): ) self.data_reference_index_by_target_address[data_reference.address].append(data_reference_index) + def _index_string_references(self): + for string_reference_index, string_reference in enumerate(self.be2.string_reference): + self.string_reference_index_by_source_instruction_index[string_reference.instruction_index].append( + string_reference_index + ) + @staticmethod def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: for index_range in basic_block.instruction_index: diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 12c05a14b..3dfb9c804 100644 --- a/scripts/inspect-binexport2.py +++ 
b/scripts/inspect-binexport2.py @@ -175,6 +175,17 @@ def main(argv=None): data_reference_address = data_reference.address data_references += f"⇥ data {hex(data_reference_address)} " + string_references = "" + if instruction_index in idx.string_reference_index_by_source_instruction_index: + string_references = " " + for ( + string_reference_index + ) in idx.string_reference_index_by_source_instruction_index[instruction_index]: + string_reference = be2.string_reference[string_reference_index] + string_index = string_reference.string_table_index + string = be2.string_table[string_index] + string_references += f'⇥ string "{string}" ' + comments = "" if instruction.comment_index: comments = " " @@ -184,7 +195,7 @@ def main(argv=None): comments += f"; {BinExport2.Comment.Type.Name(comment.type)} {comment_string} " o.writeln( - f"{hex(instruction_address)} {mnemonic.name:<12s}{call_targets}{data_references}{comments}" + f"{hex(instruction_address)} {mnemonic.name:<12s}{call_targets}{data_references}{string_references}{comments}" ) does_fallthrough = False From 5e9b308dfd7c6b7ff1f5891c5bbff3511616f376 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 14:19:25 +0000 Subject: [PATCH 065/200] binexport: learn to extract bytes and strings --- .../extractors/binexport2/__init__.py | 1 + .../extractors/binexport2/extractor.py | 2 +- capa/features/extractors/binexport2/insn.py | 111 +++++++++++++++--- 3 files changed, 99 insertions(+), 15 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index eda274a41..5f26a1dd2 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -202,6 +202,7 @@ def get_function_name_by_address(self, address: int) -> str: @dataclass class FunctionContext: + sample_bytes: bytes be2: BinExport2 idx: BinExport2Index # TODO: typing diff --git a/capa/features/extractors/binexport2/extractor.py 
b/capa/features/extractors/binexport2/extractor.py index 452fb2df4..09a4406cb 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -127,7 +127,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: flow_graph_address = self.idx.basic_block_address_by_index[entry_basic_block_index] yield FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), - inner=FunctionContext(self.be2, self.idx, self.analysis, flow_graph_index), + inner=FunctionContext(self.buf, self.be2, self.idx, self.analysis, flow_graph_index), ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 35e312d68..8f232a2b3 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -5,16 +5,24 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import io +import logging from typing import Tuple, Iterator +import pefile +from elftools.elf.elffile import ELFFile + import capa.features.extractors.helpers -from capa.features.insn import API, Number, Mnemonic, OperandNumber, Bytes -from capa.features.common import Feature, Characteristic +import capa.features.extractors.strings +from capa.features.insn import API, Number, Mnemonic, OperandNumber +from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +logger = logging.getLogger(__name__) + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: from capa.features.extractors.binexport2.extractor import BinExport2Analysis @@ -106,39 +114,114 @@ def extract_insn_number_features( yield OperandNumber(i, value), ih.address -def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: +class ReadMemoryError(ValueError): ... + + +def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int) -> bytes: + base_address = min(map(lambda s: s.address, be2.section)) + rva = address - base_address + + try: + # TODO: cache the parsed file + + if sample_bytes.startswith(capa.features.extractors.common.MATCH_PE): + pe = pefile.PE(data=sample_bytes) + return pe.get_data(rva, size) + elif sample_bytes.startswith(capa.features.extractors.common.MATCH_ELF): + elf = ELFFile(io.BytesIO(sample_bytes)) + + # ELF segments are for runtime data, + # ELF sections are for link-time data. + for segment in elf.iter_segments(): + # assume p_align is consistent with addresses here. + # otherwise, should harden this loader. 
+ segment_rva = segment.header.p_vaddr + segment_size = segment.header.p_memsz + if segment_rva <= rva < segment_rva + segment_size: + segment_data = segment.data() + + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. + if len(segment_data) < segment_size: + segment_data += b"\x00" * (segment_size - len(segment_data)) + + segment_offset = rva - segment_rva + return segment_data[segment_offset : segment_offset + size] + else: + logger.warning("unsupported format") + raise ReadMemoryError("unsupported file format") + except Exception as e: + # TODO: remove logging message here + logger.warning("failed to read memory: %s", e, exc_info=True) + + raise ReadMemoryError("failed to read memory: " + str(e)) from e + + +def extract_insn_bytes_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner be2 = fhi.be2 + sample_bytes = fhi.sample_bytes idx = fhi.idx instruction_index = ii.instruction_index if instruction_index in idx.data_reference_index_by_source_instruction_index: - for data_reference_index in idx.data_reference_index_by_source_instruction_index[ - instruction_index - ]: + for data_reference_index in idx.data_reference_index_by_source_instruction_index[instruction_index]: data_reference = be2.data_reference[data_reference_index] data_reference_address = data_reference.address - # TODO: read data - buf = b"" + # at end of segment then there might be an overrun here. 
+ buf = read_memory(be2, sample_bytes, data_reference_address, 0x100) if capa.features.extractors.helpers.all_zeros(buf): continue - if is_probably_string(buf): - pass - else: + is_string = False + + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + if s.offset != 0: + break + + yield String(s.s), ih.address + is_string = True + break + + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + if s.offset != 0: + break + + yield String(s.s), ih.address + is_string = True + break + + if not is_string: yield Bytes(buf), ih.address def extract_insn_string_features( - fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.be2 + idx = fhi.idx + + instruction_index = ii.instruction_index + + if instruction_index in idx.string_reference_index_by_source_instruction_index: + for string_reference_index in idx.string_reference_index_by_source_instruction_index[instruction_index]: + string_reference = be2.string_reference[string_reference_index] + string_index = string_reference.string_table_index + string = be2.string_table[string_index] + yield String(string), ih.address def extract_insn_offset_features( From 46453637ad1ff4e3b9c9189458914f340eab06a8 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 14:24:23 +0000 Subject: [PATCH 066/200] binexport: cache parsed PE/ELF --- capa/features/extractors/binexport2/insn.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 8f232a2b3..b9e1f3330 100644 --- a/capa/features/extractors/binexport2/insn.py +++ 
b/capa/features/extractors/binexport2/insn.py @@ -117,18 +117,22 @@ def extract_insn_number_features( class ReadMemoryError(ValueError): ... -def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int) -> bytes: +def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int, cache={}) -> bytes: base_address = min(map(lambda s: s.address, be2.section)) rva = address - base_address try: - # TODO: cache the parsed file - if sample_bytes.startswith(capa.features.extractors.common.MATCH_PE): - pe = pefile.PE(data=sample_bytes) + pe = cache.get("pe") + if not pe: + pe = pefile.PE(data=sample_bytes) + cache["pe"] = pe return pe.get_data(rva, size) elif sample_bytes.startswith(capa.features.extractors.common.MATCH_ELF): - elf = ELFFile(io.BytesIO(sample_bytes)) + elf = cache.get("elf") + if not elf: + elf = ELFFile(io.BytesIO(sample_bytes)) + cache["elf"] = elf # ELF segments are for runtime data, # ELF sections are for link-time data. @@ -148,6 +152,8 @@ def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int) - segment_offset = rva - segment_rva return segment_data[segment_offset : segment_offset + size] + + raise ReadMemoryError("address not mapped") else: logger.warning("unsupported format") raise ReadMemoryError("unsupported file format") @@ -176,7 +182,7 @@ def extract_insn_bytes_features( data_reference_address = data_reference.address # at end of segment then there might be an overrun here. 
- buf = read_memory(be2, sample_bytes, data_reference_address, 0x100) + buf = read_memory(be2, sample_bytes, data_reference_address, 0x100, cache=fh.ctx) if capa.features.extractors.helpers.all_zeros(buf): continue From d2c744aa1feae1b3a4187dc6e44fc74593bc809d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 31 Jan 2024 15:42:00 +0000 Subject: [PATCH 067/200] binexport: handle Ghidra SYMBOL numbers --- capa/features/extractors/binexport2/insn.py | 75 ++++++++++++++++----- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index b9e1f3330..437d86cde 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -83,35 +83,78 @@ def extract_insn_number_features( for i, operand_index in enumerate(instruction.operand_index): operand = be2.operand[operand_index] - if len(operand.expression_index) != 2: - # we only care about immediate constants, - # which have a two expression node: + if len(operand.expression_index) == 1: + # Ghidra extracts everything as a SYMBOL today, + # which is very wrong. + # + # temporarily, we'll have to try to guess at the interpretation. + # TODO: report this bug. + expression0 = be2.expression[operand.expression_index[0]] + + if BinExport2.Expression.Type.SYMBOL != expression0.type: + continue + + if expression0.symbol.startswith("#0x"): + # like: + # - type: SYMBOL + # symbol: "#0xffffffff" + try: + value = int(expression0.symbol[len("#") :], 0x10) + except ValueError: + # failed to parse as integer + continue + + elif expression0.symbol.startswith("0x"): + # like: + # - type: SYMBOL + # symbol: "0x1000" + try: + value = int(expression0.symbol, 0x10) + except ValueError: + # failed to parse as integer + continue + + else: + continue + + # TODO: maybe if the base address is 0, disable this check. + # Otherwise we miss numbers smaller than the image size. 
+ if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + elif len(operand.expression_index) == 2: + # from BinDetego, + # we get the following pattern for immediate constants: # # - type: SIZE_PREFIX # symbol: "b8" # - type: IMMEDIATE_INT # immediate: 20588728364 # parent_index: 0 - continue - expression0 = be2.expression[operand.expression_index[0]] - expression1 = be2.expression[operand.expression_index[1]] + expression0 = be2.expression[operand.expression_index[0]] + expression1 = be2.expression[operand.expression_index[1]] - if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type: - continue + if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type: + continue - if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type: - continue + if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type: + continue - value = expression1.immediate + value = expression1.immediate - # TODO: skip small numbers? + # TODO: skip small numbers? - if is_address_mapped(be2, value): - continue + if is_address_mapped(be2, value): + continue - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + else: + continue class ReadMemoryError(ValueError): ... 
From 18355875f0405603d9ea03b62a2ae1950925eb9b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 1 Feb 2024 11:09:34 +0000 Subject: [PATCH 068/200] binexport2: handle binexport#78 (Ghidra only uses SYMBOL expressions) --- .../extractors/binexport2/__init__.py | 87 ++++++- .../extractors/binexport2/extractor.py | 83 +------ capa/features/extractors/binexport2/insn.py | 233 +++++++++++++----- 3 files changed, 260 insertions(+), 143 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 5f26a1dd2..f115858af 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -13,11 +13,12 @@ import os import hashlib import logging -from typing import Any, Dict, List, Iterator +from typing import Dict, List, Iterator from pathlib import Path from collections import defaultdict from dataclasses import dataclass +import capa.features.extractors.common from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) @@ -200,13 +201,91 @@ def get_function_name_by_address(self, address: int) -> str: return self.get_function_name_by_vertex(vertex_index) +class BinExport2Analysis: + def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): + self.be2 = be2 + self.idx = idx + self.buf = buf + + # from virtual address to call graph vertex representing the import + self.thunks: Dict[int, int] = {} + self.base_address: int = 0 + + self._find_got_thunks() + self._find_base_address() + + def _find_got_thunks(self): + if self.be2.meta_information.architecture_name != "aarch64": + logger.debug("skipping GOT thunk analysis on non-aarch64") + return + + if not self.buf.startswith(capa.features.extractors.common.MATCH_ELF): + logger.debug("skipping GOT thunk analysis on non-ELF") + return + + for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): + if not vertex.HasField("address"): + 
continue + + if not vertex.HasField("mangled_name"): + continue + + if BinExport2.CallGraph.Vertex.Type.IMPORTED != vertex.type: + continue + + if len(self.idx.callers_by_vertex_index[vertex_index]) != 1: + # find imports with a single caller, + # which should be the thunk + continue + + maybe_thunk_vertex_index = self.idx.callers_by_vertex_index[vertex_index][0] + maybe_thunk_vertex = self.be2.call_graph.vertex[maybe_thunk_vertex_index] + maybe_thunk_address = maybe_thunk_vertex.address + + maybe_thunk_flow_graph_index = self.idx.flow_graph_index_by_address[maybe_thunk_address] + maybe_thunk_flow_graph = self.be2.flow_graph[maybe_thunk_flow_graph_index] + + if len(maybe_thunk_flow_graph.basic_block_index) != 1: + # should have a single basic block + continue + + maybe_thunk_basic_block = self.be2.basic_block[maybe_thunk_flow_graph.entry_basic_block_index] + if len(list(self.idx.instruction_indices(maybe_thunk_basic_block))) != 4: + # thunk should look like these four instructions. + # fstat: + # 000008b0 adrp x16, 0x11000 + # 000008b4 ldr x17, [x16, #0xf88] {fstat} + # 000008b8 add x16, x16, #0xf88 {fstat} + # 000008bc br x17 + # which relies on the disassembler to recognize the target of the call/br + # to go to the GOT/external symbol. + continue + + thunk_address = maybe_thunk_address + thunk_name = vertex.mangled_name + logger.debug("found GOT thunk: 0x%x -> %s", thunk_address, thunk_name) + + self.thunks[thunk_address] = vertex_index + + def _find_base_address(self): + sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section) + # assume the lowest address is the base address. + # this works as long as BinExport doesn't record other + # libraries mapped into memory. 
+ self.base_address = min(s.address for s in sections_with_perms) + + @dataclass -class FunctionContext: +class AnalysisContext: sample_bytes: bytes be2: BinExport2 idx: BinExport2Index - # TODO: typing - analysis: Any + analysis: BinExport2Analysis + + +@dataclass +class FunctionContext: + ctx: AnalysisContext flow_graph_index: int diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 09a4406cb..776d60ef6 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import Dict, List, Tuple, Iterator +from typing import List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.common @@ -16,7 +16,14 @@ import capa.features.extractors.binexport2.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.binexport2 import ( + AnalysisContext, + BinExport2Index, + FunctionContext, + BasicBlockContext, + BinExport2Analysis, + InstructionContext, +) from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, @@ -29,69 +36,6 @@ logger = logging.getLogger(__name__) -class BinExport2Analysis: - def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): - self.be2 = be2 - self.idx = idx - self.buf = buf - - # from virtual address to call graph vertex representing the import - self.thunks: Dict[int, int] = {} - - def _find_got_thunks(self): - if self.be2.meta_information.architecture_name != "aarch64": - logger.debug("skipping GOT thunk 
analysis on non-aarch64") - return - - if not self.buf.startswith(capa.features.extractors.common.MATCH_ELF): - logger.debug("skipping GOT thunk analysis on non-ELF") - return - - for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): - if not vertex.HasField("address"): - continue - - if not vertex.HasField("mangled_name"): - continue - - if BinExport2.CallGraph.Vertex.Type.IMPORTED != vertex.type: - continue - - if len(self.idx.callers_by_vertex_index[vertex_index]) != 1: - # find imports with a single caller, - # which should be the thunk - continue - - maybe_thunk_vertex_index = self.idx.callers_by_vertex_index[vertex_index][0] - maybe_thunk_vertex = self.be2.call_graph.vertex[maybe_thunk_vertex_index] - maybe_thunk_address = maybe_thunk_vertex.address - - maybe_thunk_flow_graph_index = self.idx.flow_graph_index_by_address[maybe_thunk_address] - maybe_thunk_flow_graph = self.be2.flow_graph[maybe_thunk_flow_graph_index] - - if len(maybe_thunk_flow_graph.basic_block_index) != 1: - # should have a single basic block - continue - - maybe_thunk_basic_block = self.be2.basic_block[maybe_thunk_flow_graph.entry_basic_block_index] - if len(list(self.idx.instruction_indices(maybe_thunk_basic_block))) != 4: - # thunk should look like these four instructions. - # fstat: - # 000008b0 adrp x16, 0x11000 - # 000008b4 ldr x17, [x16, #0xf88] {fstat} - # 000008b8 add x16, x16, #0xf88 {fstat} - # 000008bc br x17 - # which relies on the disassembler to recognize the target of the call/br - # to go to the GOT/external symbol. 
- continue - - thunk_address = maybe_thunk_address - thunk_name = vertex.mangled_name - logger.debug("found GOT thunk: 0x%x -> %s", thunk_address, thunk_name) - - self.thunks[thunk_address] = vertex_index - - class BinExport2FeatureExtractor(StaticFeatureExtractor): def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) @@ -99,6 +43,7 @@ def __init__(self, be2: BinExport2, buf: bytes): self.buf = buf self.idx = BinExport2Index(self.be2) self.analysis = BinExport2Analysis(self.be2, self.idx, self.buf) + self.ctx = AnalysisContext(self.buf, self.be2, self.idx, self.analysis) self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) @@ -109,11 +54,7 @@ def __init__(self, be2: BinExport2, buf: bytes): # and gradually relax restrictions as they're tested. def get_base_address(self): - # TODO: assume the lowest address is the base address. - # this works as long as BinExport doesn't record other - # libraries mapped into memory. 
- base_address = min(map(lambda s: s.address, self.be2.section)) - return AbsoluteVirtualAddress(base_address) + return AbsoluteVirtualAddress(self.analysis.base_address) def extract_global_features(self): yield from self.global_features @@ -127,7 +68,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: flow_graph_address = self.idx.basic_block_address_by_index[entry_basic_block_index] yield FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), - inner=FunctionContext(self.buf, self.be2, self.idx, self.analysis, flow_graph_index), + inner=FunctionContext(self.ctx, flow_graph_index), ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 437d86cde..c296bd4a3 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -7,7 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
import io import logging -from typing import Tuple, Iterator +from typing import List, Tuple, Iterator import pefile from elftools.elf.elffile import ELFFile @@ -17,7 +17,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, InstructionContext +from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -30,9 +30,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 - idx = fhi.idx - analysis: BinExport2Analysis = fhi.analysis + be2 = fhi.ctx.be2 + idx = fhi.ctx.idx + analysis: BinExport2Analysis = fhi.ctx.analysis instruction = be2.instruction[ii.instruction_index] @@ -64,10 +64,50 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle def is_address_mapped(be2: BinExport2, address: int) -> bool: """return True if the given address is mapped""" - for section in be2.section: - if section.address <= address < section.address + section.size: - return True - return False + sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) + return any(section.address <= address < section.address + section.size for section in sections_with_perms) + + +############################################################################### +# +# begin Ghidra symbol madness ("gsm"). +# +# This is a "temporary" section of code to deal with +# https://github.com/google/binexport/issues/78 +# because Ghidra exports all operands as a single SYMBOL expression node. 
+# +# Use references to `_is_ghidra_symbol_madness` to remove all this up later. + + +def _is_ghidra_symbol_madness(be2: BinExport2, instruction_index: int) -> bool: + instruction = be2.instruction[instruction_index] + for operand_index in instruction.operand_index: + operand = be2.operand[operand_index] + + if len(operand.expression_index) != 1: + return False + + expression0 = be2.expression[operand.expression_index[0]] + + if BinExport2.Expression.Type.SYMBOL != expression0.type: + return False + + return True + + +def _gsm_get_instruction_operand(be2: BinExport2, instruction_index: int, operand_index: int) -> str: + """since Ghidra represents all operands as a single string, just fetch that.""" + instruction = be2.instruction[instruction_index] + operand = be2.operand[instruction.operand_index[operand_index]] + assert len(operand.expression_index) == 1 + expression = be2.expression[operand.expression_index[0]] + assert expression.type == BinExport2.Expression.Type.SYMBOL + return expression.symbol + + +# end Ghidra symbol madness. +# +############################################################################### def extract_insn_number_features( @@ -76,55 +116,48 @@ def extract_insn_number_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 + be2 = fhi.ctx.be2 + analysis = fhi.ctx.analysis - instruction = be2.instruction[ii.instruction_index] + instruction_index = ii.instruction_index + instruction = be2.instruction[instruction_index] + + _is_gsm = _is_ghidra_symbol_madness(be2, instruction_index) for i, operand_index in enumerate(instruction.operand_index): operand = be2.operand[operand_index] - if len(operand.expression_index) == 1: - # Ghidra extracts everything as a SYMBOL today, - # which is very wrong. - # + if len(operand.expression_index) == 1 and _is_gsm: # temporarily, we'll have to try to guess at the interpretation. - # TODO: report this bug. 
- expression0 = be2.expression[operand.expression_index[0]] - - if BinExport2.Expression.Type.SYMBOL != expression0.type: - continue + symbol = _gsm_get_instruction_operand(be2, instruction_index, i) - if expression0.symbol.startswith("#0x"): + if symbol.startswith("#0x"): # like: # - type: SYMBOL # symbol: "#0xffffffff" try: - value = int(expression0.symbol[len("#") :], 0x10) + value = int(symbol[len("#") :], 0x10) except ValueError: # failed to parse as integer continue - elif expression0.symbol.startswith("0x"): + # handling continues below at label: has a value + + elif symbol.startswith("0x"): # like: # - type: SYMBOL # symbol: "0x1000" try: - value = int(expression0.symbol, 0x10) + value = int(symbol, 0x10) except ValueError: # failed to parse as integer continue - else: - continue + # handling continues below at label: has a value - # TODO: maybe if the base address is 0, disable this check. - # Otherwise we miss numbers smaller than the image size. - if is_address_mapped(be2, value): + else: continue - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - elif len(operand.expression_index) == 2: # from BinDetego, # we get the following pattern for immediate constants: @@ -146,23 +179,32 @@ def extract_insn_number_features( value = expression1.immediate - # TODO: skip small numbers? + # handling continues below at label: has a value + + else: + continue + + # label: has a value + if analysis.base_address != 0x0: + # When the image is mapped at 0x0, + # then its hard to tell if numbers are pointers or numbers. + # So be a little less conservative here. if is_address_mapped(be2, value): continue - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - else: + if is_address_mapped(be2, value): continue + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + class ReadMemoryError(ValueError): ... 
-def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int, cache={}) -> bytes: - base_address = min(map(lambda s: s.address, be2.section)) - rva = address - base_address +def read_memory(ctx: AnalysisContext, sample_bytes: bytes, address: int, size: int, cache) -> bytes: + rva = address - ctx.analysis.base_address try: if sample_bytes.startswith(capa.features.extractors.common.MATCH_PE): @@ -207,51 +249,106 @@ def read_memory(be2: BinExport2, sample_bytes: bytes, address: int, size: int, c raise ReadMemoryError("failed to read memory: " + str(e)) from e -def extract_insn_bytes_features( - fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle -) -> Iterator[Tuple[Feature, Address]]: +def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner + bbi: BasicBlockContext = bbh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 - sample_bytes = fhi.sample_bytes - idx = fhi.idx + ctx = fhi.ctx + be2 = fhi.ctx.be2 + sample_bytes = fhi.ctx.sample_bytes + idx = fhi.ctx.idx + basic_block_index = bbi.basic_block_index instruction_index = ii.instruction_index + reference_addresses: List[int] = [] + if instruction_index in idx.data_reference_index_by_source_instruction_index: for data_reference_index in idx.data_reference_index_by_source_instruction_index[instruction_index]: data_reference = be2.data_reference[data_reference_index] data_reference_address = data_reference.address - # at end of segment then there might be an overrun here. 
- buf = read_memory(be2, sample_bytes, data_reference_address, 0x100, cache=fh.ctx) + reference_addresses.append(data_reference_address) - if capa.features.extractors.helpers.all_zeros(buf): - continue + if (not reference_addresses) and _is_ghidra_symbol_madness(be2, instruction_index): + instruction = be2.instruction[ii.instruction_index] + mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name = mnemonic.name.lower() - is_string = False + if mnemonic_name == "adrp": + # Look for sequence like: + # + # adrp x2, 0x1000 ; fetch global anchor address, relocatable + # add x2, x2, #0x3c ; offset into global data of string + # + # to resolve 0x103c at the address of the adrp instruction. + # Ideally, the underlying disassembler would do this (IDA, Ghidra, etc.) + # and use a data reference. + # However, the Ghidra exporter doesn't do this today. + + # get the first operand register name and then second operand number, + # then find the next add instruction that references the register, + # fetching the third operand number. 
+ + assert len(instruction.operand_index) == 2 + register_name = _gsm_get_instruction_operand(be2, instruction_index, 0) + page_address = int(_gsm_get_instruction_operand(be2, instruction_index, 1), 0x10) + + basic_block = be2.basic_block[basic_block_index] + + scanning_active = False + for scanning_instruction_index in idx.instruction_indices(basic_block): + if not scanning_active: + # the given instruction not encountered yet + if scanning_instruction_index == instruction_index: + scanning_active = True + else: + scanning_instruction = be2.instruction[scanning_instruction_index] + scanning_mnemonic = be2.mnemonic[scanning_instruction.mnemonic_index] + scanning_mnemonic_name = scanning_mnemonic.name.lower() + if scanning_mnemonic_name != "add": + continue + + if _gsm_get_instruction_operand(be2, scanning_instruction_index, 0) != register_name: + continue + + if _gsm_get_instruction_operand(be2, scanning_instruction_index, 1) != register_name: + continue + + page_offset = int(_gsm_get_instruction_operand(be2, scanning_instruction_index, 2).strip("#"), 0x10) + reference_address = page_address + page_offset + reference_addresses.append(reference_address) + + for reference_address in reference_addresses: + # at end of segment then there might be an overrun here. 
+ buf = read_memory(ctx, sample_bytes, reference_address, 0x100, fh.ctx) + + if capa.features.extractors.helpers.all_zeros(buf): + continue - # note: we *always* break after the first iteration - for s in capa.features.extractors.strings.extract_ascii_strings(buf): - if s.offset != 0: - break + is_string = False - yield String(s.s), ih.address - is_string = True + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + if s.offset != 0: break - # note: we *always* break after the first iteration - for s in capa.features.extractors.strings.extract_unicode_strings(buf): - if s.offset != 0: - break + yield String(s.s), ih.address + is_string = True + break - yield String(s.s), ih.address - is_string = True + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + if s.offset != 0: break - if not is_string: - yield Bytes(buf), ih.address + yield String(s.s), ih.address + is_string = True + break + + if not is_string: + yield Bytes(buf), ih.address def extract_insn_string_features( @@ -260,8 +357,8 @@ def extract_insn_string_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 - idx = fhi.idx + be2 = fhi.ctx.be2 + idx = fhi.ctx.idx instruction_index = ii.instruction_index @@ -293,7 +390,7 @@ def extract_insn_mnemonic_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 + be2 = fhi.ctx.be2 instruction = be2.instruction[ii.instruction_index] mnemonic = be2.mnemonic[instruction.mnemonic_index] @@ -310,7 +407,7 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.be2 + be2 = fhi.ctx.be2 instruction = be2.instruction[ii.instruction_index] if not instruction.call_target: From 933c9b176f4ae24cf59c4604b093f4a21decdd92 Mon Sep 17 00:00:00 2001 From: Willi 
Ballenthin Date: Thu, 1 Feb 2024 11:47:08 +0000 Subject: [PATCH 069/200] main: write error output to stderr, not stdout --- capa/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index 961743b0b..3d99dca85 100644 --- a/capa/main.py +++ b/capa/main.py @@ -178,12 +178,13 @@ def simple_message_exception_handler(exctype, value: BaseException, traceback: T """ if exctype is KeyboardInterrupt: - print("KeyboardInterrupt detected, program terminated") + print("KeyboardInterrupt detected, program terminated", file=sys.stderr) else: print( f"Unexpected exception raised: {exctype}. Please run capa in debug mode (-d/--debug) " + "to see the stack trace. Please also report your issue on the capa GitHub page so we " - + "can improve the code! (https://github.com/mandiant/capa/issues)" + + "can improve the code! (https://github.com/mandiant/capa/issues)", + file=sys.stderr, ) From f067f77d709945512650da953b4ea12bb3501684 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 1 Feb 2024 11:47:20 +0000 Subject: [PATCH 070/200] scripts: add example detect-binexport2-capabilities.py --- scripts/detect-binexport2-capabilities.py | 106 ++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 scripts/detect-binexport2-capabilities.py diff --git a/scripts/detect-binexport2-capabilities.py b/scripts/detect-binexport2-capabilities.py new file mode 100644 index 000000000..7718d5023 --- /dev/null +++ b/scripts/detect-binexport2-capabilities.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python2 +""" +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. + +detect-binexport2-capabilities.py + +Detect capabilities in a BinExport2 file and write the results into the protobuf format. + +Example: + + $ python detect-binexport2-capabilities.py suspicious.BinExport2 | xxd | head + ┌────────┬─────────────────────────┬─────────────────────────┬────────┬────────┐ + │00000000│ 0a d4 05 0a 1a 32 30 32 ┊ 33 2d 30 32 2d 31 30 20 │_.•_•202┊3-02-10 │ + │00000010│ 31 31 3a 34 39 3a 35 32 ┊ 2e 36 39 33 34 30 30 12 │11:49:52┊.693400•│ + │00000020│ 05 35 2e 30 2e 30 1a 34 ┊ 74 65 73 74 73 2f 64 61 │•5.0.0•4┊tests/da│ + │00000030│ 74 61 2f 50 72 61 63 74 ┊ 69 63 61 6c 20 4d 61 6c │ta/Pract┊ical Mal│ + │00000040│ 77 61 72 65 20 41 6e 61 ┊ 6c 79 73 69 73 20 4c 61 │ware Ana┊lysis La│ + │00000050│ 62 20 30 31 2d 30 31 2e ┊ 64 6c 6c 5f 1a 02 2d 6a │b 01-01.┊dll_••-j│ + │00000060│ 22 c4 01 0a 20 32 39 30 ┊ 39 33 34 63 36 31 64 65 │".•_ 290┊934c61de│ + │00000070│ 39 31 37 36 61 64 36 38 ┊ 32 66 66 64 64 36 35 66 │9176ad68┊2ffdd65f│ + │00000080│ 30 61 36 36 39 12 28 61 ┊ 34 62 33 35 64 65 37 31 │0a669•(a┊4b35de71│ +""" +import sys +import logging +import argparse + +import capa.main +import capa.rules +import capa.engine +import capa.loader +import capa.helpers +import capa.features +import capa.exceptions +import capa.render.proto +import capa.render.verbose +import capa.features.freeze +import capa.capabilities.common +import capa.render.result_document as rd +from capa.loader import FORMAT_BINEXPORT2, BACKEND_BINEXPORT2 + +logger = logging.getLogger("capa.detect-binexport2-capabilities") + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = 
argparse.ArgumentParser(description="detect capabilities in programs.") + capa.main.install_common_args( + parser, + wanted={"format", "os", "backend", "input_file", "signatures", "rules", "tag"}, + ) + args = parser.parse_args(args=argv) + + try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_cli(args) + + input_format = capa.main.get_input_format_from_cli(args) + assert input_format == FORMAT_BINEXPORT2 + + backend = capa.main.get_backend_from_cli(args, input_format) + assert backend == BACKEND_BINEXPORT2 + + sample_path = capa.main.get_sample_path_from_cli(args, backend) + assert sample_path is not None + os_ = capa.loader.get_os(sample_path) + + rules = capa.main.get_rules_from_cli(args) + + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) + # alternatively, if you have all this handy in your library code: + # + # extractor = capa.loader.get_extractor( + # args.input_file, + # FORMAT_BINEXPORT2, + # os_, + # BACKEND_BINEXPORT2, + # sig_paths=[], + # sample_path=sample_path, + # ) + except capa.main.ShouldExitError as e: + return e.status_code + + capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor) + + meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) + + doc = rd.ResultDocument.from_capa(meta, rules, capabilities) + pb = capa.render.proto.doc_to_pb2(doc) + + sys.stdout.buffer.write(pb.SerializeToString(deterministic=True)) + sys.stdout.flush() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From fead3a6c7a3225d0ae2fceae10f28166b2e891be Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 1 Feb 2024 11:49:23 +0000 Subject: [PATCH 071/200] detect-binexport2-capabilities: more documentation/examples --- scripts/detect-binexport2-capabilities.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/scripts/detect-binexport2-capabilities.py b/scripts/detect-binexport2-capabilities.py index 7718d5023..3c914de2c 100644 --- a/scripts/detect-binexport2-capabilities.py +++ b/scripts/detect-binexport2-capabilities.py @@ -85,6 +85,12 @@ def main(argv=None): # sig_paths=[], # sample_path=sample_path, # ) + # + # or even more concisely: + # + # be2 = capa.features.extractors.binexport2.get_binexport2(input_path) + # buf = sample_path.read_bytes() + # extractor = capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) except capa.main.ShouldExitError as e: return e.status_code From 8387be5222318de411f07030664648631b93fe6e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 11:45:56 +0000 Subject: [PATCH 072/200] elffile: recognize more architectures --- capa/features/extractors/elffile.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py index 5881c0358..7b040c760 100644 --- a/capa/features/extractors/elffile.py +++ b/capa/features/extractors/elffile.py @@ -122,6 +122,10 @@ def extract_file_arch(elf: ELFFile, **kwargs): yield Arch("i386"), NO_ADDRESS elif arch == "x64": yield Arch("amd64"), NO_ADDRESS + elif arch == "ARM": + yield Arch("arm"), NO_ADDRESS + elif arch == "AArch64": + yield Arch("aarch64"), NO_ADDRESS else: logger.warning("unsupported architecture: %s", arch) From 457df8a8d3592b4ca124348bd8691d4c1adcc319 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 11:46:06 +0000 Subject: [PATCH 073/200] binexport: handle read_memory errors --- capa/features/extractors/binexport2/insn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index c296bd4a3..cc566a09e 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -321,8 +321,11 @@ def 
extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl reference_addresses.append(reference_address) for reference_address in reference_addresses: - # at end of segment then there might be an overrun here. - buf = read_memory(ctx, sample_bytes, reference_address, 0x100, fh.ctx) + try: + # at end of segment then there might be an overrun here. + buf = read_memory(ctx, sample_bytes, reference_address, 0x100, fh.ctx) + except ReadMemoryError: + continue if capa.features.extractors.helpers.all_zeros(buf): continue From 03c51304bb32423f44ff77424ae5ac634baa2668 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 12:00:40 +0000 Subject: [PATCH 074/200] binexport: index flow graphs by address --- capa/features/extractors/binexport2/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index f115858af..f09cc03ff 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -82,6 +82,7 @@ def __init__(self, be2: BinExport2): # note: flow graph != call graph (vertex) self.flow_graph_index_by_address: Dict[int, int] = {} + self.flow_graph_address_by_index: Dict[int, int] = {} self.basic_block_index_by_address: Dict[int, int] = {} self.basic_block_address_by_index: Dict[int, int] = {} self.instruction_index_by_address: Dict[int, int] = {} @@ -142,6 +143,7 @@ def _index_flow_graph_nodes(self): entry_instruction_address = self.instruction_address_by_index[entry_instruction_index] function_address = entry_instruction_address self.flow_graph_index_by_address[function_address] = flow_graph_index + self.flow_graph_address_by_index[flow_graph_index] = function_address def _index_flow_graph_edges(self): for flow_graph in self.be2.flow_graph: From 8283e3670b67a56a13a0f3c5a028cadcd902a69a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 12:01:05 +0000 Subject: [PATCH 
075/200] binexport: cleanup logging --- capa/features/extractors/binexport2/insn.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index cc566a09e..9a09089a3 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -25,14 +25,12 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: - from capa.features.extractors.binexport2.extractor import BinExport2Analysis - fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner be2 = fhi.ctx.be2 idx = fhi.ctx.idx - analysis: BinExport2Analysis = fhi.ctx.analysis + analysis = fhi.ctx.analysis instruction = be2.instruction[ii.instruction_index] @@ -243,9 +241,6 @@ def read_memory(ctx: AnalysisContext, sample_bytes: bytes, address: int, size: i logger.warning("unsupported format") raise ReadMemoryError("unsupported file format") except Exception as e: - # TODO: remove logging message here - logger.warning("failed to read memory: %s", e, exc_info=True) - raise ReadMemoryError("failed to read memory: " + str(e)) from e From 5ea8826f2b85f2ba1deb565fac3f1dba50019762 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 12:01:19 +0000 Subject: [PATCH 076/200] binexport: learn to extract function names --- .../extractors/binexport2/function.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index 53a0088b7..89972bd1e 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -7,8 +7,10 @@ # See the License for the specific language governing permissions and limitations under the License. 
from typing import Tuple, Iterator +from capa.features.file import FunctionName from capa.features.common import Feature from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext from capa.features.extractors.base_extractor import FunctionHandle @@ -28,8 +30,25 @@ def extract_recursive_call(fh: FunctionHandle): def extract_function_name(fh: FunctionHandle): - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + + be2 = fhi.ctx.be2 + idx = fhi.ctx.idx + analysis = fhi.ctx.analysis + + flow_graph_index = fhi.flow_graph_index + + flow_graph_address = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index = idx.vertex_index_by_address[flow_graph_address] + vertex = be2.call_graph.vertex[vertex_index] + + if vertex.HasField("mangled_name"): + yield FunctionName(vertex.mangled_name), fh.address + elif flow_graph_address in analysis.thunks: + thunk_vertex_index = analysis.thunks[flow_graph_address] + thunk_vertex = be2.call_graph.vertex[thunk_vertex_index] + if thunk_vertex.HasField("mangled_name"): + yield FunctionName(thunk_vertex.mangled_name), fh.address def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: From 6a54e0681da269087517f2c993b75411ed9286f9 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 12:10:06 +0000 Subject: [PATCH 077/200] binexport: learn to extract all function features --- .../extractors/binexport2/function.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index 89972bd1e..33878e563 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -8,25 +8,40 @@ from typing import Tuple, Iterator from capa.features.file import FunctionName -from capa.features.common import Feature -from capa.features.address import Address +from 
capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext from capa.features.extractors.base_extractor import FunctionHandle def extract_function_calls_to(fh: FunctionHandle): - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + + be2 = fhi.ctx.be2 + idx = fhi.ctx.idx + + flow_graph_index = fhi.flow_graph_index + flow_graph_address = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index = idx.vertex_index_by_address[flow_graph_address] + + for caller_index in idx.callers_by_vertex_index[vertex_index]: + caller = be2.call_graph.vertex[caller_index] + caller_address = caller.address + yield Characteristic("calls to"), AbsoluteVirtualAddress(caller_address) def extract_function_loop(fh: FunctionHandle): - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + be2 = fhi.ctx.be2 + + flow_graph_index = fhi.flow_graph_index + flow_graph = be2.flow_graph[flow_graph_index] -def extract_recursive_call(fh: FunctionHandle): - # TODO(wb): 1755 - yield from () + for edge in flow_graph.edge: + if edge.is_back_edge: + yield Characteristic("loop"), fh.address + break def extract_function_name(fh: FunctionHandle): @@ -57,4 +72,4 @@ def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: yield feature, addr -FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name) +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_function_name) From 4b451566b2f782c18707f608d79324d36ba1e97a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 2 Feb 2024 12:17:37 +0000 Subject: [PATCH 078/200] binexport: learn to extract bb tight loops --- .../extractors/binexport2/basicblock.py | 21 ++++++++++++------- capa/features/extractors/binexport2/insn.py | 3 --- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git 
a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 13bd9e6d7..5ffb9b11a 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -6,20 +6,25 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Any, Tuple, Iterator +from typing import Tuple, Iterator -from capa.features.common import Feature -from capa.features.address import Address +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.basicblock import BasicBlock +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle -# TODO(wb): 1755 -TODOType = Any - def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + fhi: FunctionContext = fh.inner + bbi: BasicBlockContext = bbh.inner + + idx = fhi.ctx.idx + + basic_block_index = bbi.basic_block_index + if basic_block_index in idx.target_edges_by_basic_block_index[basic_block_index]: + basic_block_address = idx.basic_block_address_by_index[basic_block_index] + yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 9a09089a3..81a7539af 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -408,9 +408,6 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl be2 = fhi.ctx.be2 instruction = 
be2.instruction[ii.instruction_index] - if not instruction.call_target: - return - for call_target_address in instruction.call_target: addr = AbsoluteVirtualAddress(call_target_address) yield Characteristic("calls from"), addr From fdf5305e5d52870bd3ade25eb06ee7d8d8a0fcbf Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:12:06 +0000 Subject: [PATCH 079/200] elf: don't require vivisect just for type annotations --- capa/features/extractors/elf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index b969463df..1e50ca2f7 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -10,10 +10,11 @@ import itertools import collections from enum import Enum -from typing import Set, Dict, List, Tuple, BinaryIO, Iterator, Optional +from typing import TYPE_CHECKING, Set, Dict, List, Tuple, BinaryIO, Iterator, Optional from dataclasses import dataclass -import Elf # from vivisect +if TYPE_CHECKING: + import Elf # from vivisect logger = logging.getLogger(__name__) @@ -724,7 +725,7 @@ def get_symbols(self) -> Iterator[Symbol]: yield from self.symbols @classmethod - def from_viv(cls, elf: Elf.Elf) -> Optional["SymTab"]: + def from_viv(cls, elf: "Elf.Elf") -> Optional["SymTab"]: endian = "<" if elf.getEndian() == 0 else ">" bitness = elf.bits From a95e46ca4c02bee510b1139de7722db8165114af Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:12:23 +0000 Subject: [PATCH 080/200] main: remove unused imports --- capa/main.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/capa/main.py b/capa/main.py index 9d0b1af59..16d6d3cba 100644 --- a/capa/main.py +++ b/capa/main.py @@ -40,11 +40,6 @@ import capa.render.result_document import capa.render.result_document as rdoc import capa.features.extractors.common -import capa.features.extractors.pefile -import capa.features.extractors.elffile -import 
capa.features.extractors.dotnetfile -import capa.features.extractors.base_extractor -import capa.features.extractors.cape.extractor from capa.rules import RuleSet from capa.engine import MatchResults from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE From 181309140909dddfeaa7bc4ff13ad1f9f72a8328 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:12:34 +0000 Subject: [PATCH 081/200] rules: don't eagerly import ruamel until needed --- capa/rules/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index d9e43dfc5..8ce3433da 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -31,7 +31,6 @@ import yaml import pydantic -import ruamel.yaml import yaml.parser import capa.perf @@ -1053,6 +1052,11 @@ def _get_yaml_loader(): @staticmethod def _get_ruamel_yaml_parser(): + # we use lazy importing here to avoid eagerly loading dependencies + # that some specialized environments may not have, + # e.g., those that run capa without ruamel. + import ruamel.yaml + # use ruamel to enable nice formatting # we use the ruamel.yaml parser because it supports roundtripping of documents with comments. 
From 365b712119337cce0b23f36512fe425dfc13649a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:12:46 +0000 Subject: [PATCH 082/200] loader: avoid eager imports of some backend-related code --- capa/loader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index a8ffccf0f..f05dbd383 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -31,9 +31,6 @@ import capa.render.result_document import capa.render.result_document as rdoc import capa.features.extractors.common -import capa.features.extractors.pefile -import capa.features.extractors.elffile -import capa.features.extractors.dotnetfile import capa.features.extractors.base_extractor import capa.features.extractors.cape.extractor from capa.rules import RuleSet @@ -276,17 +273,26 @@ def get_extractor( def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = [] + # we use lazy importing here to avoid eagerly loading dependencies + # that some specialized environments may not have, + # e.g., those that run capa without vivisect. 
+ if input_format == FORMAT_PE: + import capa.features.extractors.pefile file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) elif input_format == FORMAT_DOTNET: + import capa.features.extractors.pefile + import capa.features.extractors.dotnetfile file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) elif input_format == FORMAT_ELF: + import capa.features.extractors.elffile file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) elif input_format == FORMAT_CAPE: + import capa.features.extractors.cape.extractor report = json.loads(input_file.read_text(encoding="utf-8")) file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) From a20fef52876c73d75b088b1742a47bf8fdceadf7 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:19:13 +0000 Subject: [PATCH 083/200] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57e1a60e6..685a1bbec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ ### Bug Fixes +- do some imports closer to where they are used #1810 @williballenthin + ### capa explorer IDA Pro plugin From 44b3d8503705db499b835ddac06f99f0d2e25364 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 14 Feb 2024 14:20:25 +0000 Subject: [PATCH 084/200] fmt --- capa/loader.py | 4 ++++ capa/rules/__init__.py | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/capa/loader.py b/capa/loader.py index f05dbd383..e4f0a5c92 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -279,20 +279,24 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr if input_format == FORMAT_PE: import capa.features.extractors.pefile + 
file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) elif input_format == FORMAT_DOTNET: import capa.features.extractors.pefile import capa.features.extractors.dotnetfile + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(input_file)) file_extractors.append(capa.features.extractors.dotnetfile.DotnetFileFeatureExtractor(input_file)) elif input_format == FORMAT_ELF: import capa.features.extractors.elffile + file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(input_file)) elif input_format == FORMAT_CAPE: import capa.features.extractors.cape.extractor + report = json.loads(input_file.read_text(encoding="utf-8")) file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 8ce3433da..530c8424c 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1058,7 +1058,6 @@ def _get_ruamel_yaml_parser(): import ruamel.yaml # use ruamel to enable nice formatting - # we use the ruamel.yaml parser because it supports roundtripping of documents with comments. 
y = ruamel.yaml.YAML(typ="rt") From 5c417cc83a23c722db9c217cf1b08277578299db Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 18 Mar 2024 11:48:50 +0100 Subject: [PATCH 085/200] binexport: better render optional fields --- capa/features/extractors/binexport2/__init__.py | 5 +++++ scripts/inspect-binexport2.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index f09cc03ff..f8f0d7341 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -193,6 +193,11 @@ def get_function_name_by_vertex(self, vertex_index: int) -> str: if vertex.HasField("demangled_name"): name = vertex.demangled_name + if vertex.HasField("library_index"): + library = self.be2.library[vertex.library_index] + if library.HasField("name"): + name = f"{library.name}!{name}" + return name def get_function_name_by_address(self, address: int) -> str: diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 3dfb9c804..5b33f1d3d 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -93,7 +93,7 @@ def main(argv=None): with o.section("libraries"): for library in be2.library: - o.writeln(f"- {library.name:<12s} {'(static)' if library.is_static else ''} at {hex(library.load_address)}") + o.writeln(f"- {library.name:<12s} {'(static)' if library.is_static else ''}{(' at ' + hex(library.load_address)) if library.HasField('load_address') else ''}") if not be2.library: o.writeln("(none)") From 0d9d4c6ce325c5775027863459fd8b6b3bc9429a Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 22 Mar 2024 14:18:59 -0600 Subject: [PATCH 086/200] fix merge conflicts --- CHANGELOG.md | 3 --- capa/loader.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 873e1fc43..c6de93bc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,11 +24,8 @@ ### Bug 
Fixes - do some imports closer to where they are used #1810 @williballenthin -<<<<<<< HEAD -======= - binja: fix and simplify stack string detection code after binja 4.0 @xusheng6 - binja: add support for forwarded export #1646 @xusheng6 ->>>>>>> master ### capa explorer IDA Pro plugin diff --git a/capa/loader.py b/capa/loader.py index 951b7cc17..50ff71b9f 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -328,11 +328,7 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr elif input_format == FORMAT_CAPE: import capa.features.extractors.cape.extractor -<<<<<<< HEAD - report = json.loads(input_file.read_text(encoding="utf-8")) -======= report = capa.helpers.load_json_from_path(input_file) ->>>>>>> master file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report)) elif input_format == FORMAT_BINEXPORT2: From 9c66b3a0ad963f80ee7a44dc37709386472fb1fb Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 22 Mar 2024 14:31:53 -0600 Subject: [PATCH 087/200] fix formatting --- scripts/inspect-binexport2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 5b33f1d3d..954df7355 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -93,7 +93,9 @@ def main(argv=None): with o.section("libraries"): for library in be2.library: - o.writeln(f"- {library.name:<12s} {'(static)' if library.is_static else ''}{(' at ' + hex(library.load_address)) if library.HasField('load_address') else ''}") + o.writeln( + f"- {library.name:<12s} {'(static)' if library.is_static else ''}{(' at ' + hex(library.load_address)) if library.HasField('load_address') else ''}" + ) if not be2.library: o.writeln("(none)") From 59775b27e4c8bd7e43a42000cc11da4d1b0767bc Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 22 Mar 2024 16:08:12 -0600 Subject: [PATCH 088/200] remove Ghidra data reference madness --- 
capa/features/extractors/binexport2/insn.py | 49 --------------------- 1 file changed, 49 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 81a7539af..61ca2963d 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -266,55 +266,6 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl reference_addresses.append(data_reference_address) - if (not reference_addresses) and _is_ghidra_symbol_madness(be2, instruction_index): - instruction = be2.instruction[ii.instruction_index] - mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name = mnemonic.name.lower() - - if mnemonic_name == "adrp": - # Look for sequence like: - # - # adrp x2, 0x1000 ; fetch global anchor address, relocatable - # add x2, x2, #0x3c ; offset into global data of string - # - # to resolve 0x103c at the address of the adrp instruction. - # Ideally, the underlying disassembler would do this (IDA, Ghidra, etc.) - # and use a data reference. - # However, the Ghidra exporter doesn't do this today. - - # get the first operand register name and then second operand number, - # then find the next add instruction that references the register, - # fetching the third operand number. 
- - assert len(instruction.operand_index) == 2 - register_name = _gsm_get_instruction_operand(be2, instruction_index, 0) - page_address = int(_gsm_get_instruction_operand(be2, instruction_index, 1), 0x10) - - basic_block = be2.basic_block[basic_block_index] - - scanning_active = False - for scanning_instruction_index in idx.instruction_indices(basic_block): - if not scanning_active: - # the given instruction not encountered yet - if scanning_instruction_index == instruction_index: - scanning_active = True - else: - scanning_instruction = be2.instruction[scanning_instruction_index] - scanning_mnemonic = be2.mnemonic[scanning_instruction.mnemonic_index] - scanning_mnemonic_name = scanning_mnemonic.name.lower() - if scanning_mnemonic_name != "add": - continue - - if _gsm_get_instruction_operand(be2, scanning_instruction_index, 0) != register_name: - continue - - if _gsm_get_instruction_operand(be2, scanning_instruction_index, 1) != register_name: - continue - - page_offset = int(_gsm_get_instruction_operand(be2, scanning_instruction_index, 2).strip("#"), 0x10) - reference_address = page_address + page_offset - reference_addresses.append(reference_address) - for reference_address in reference_addresses: try: # at end of segment then there might be an overrun here. 
From d39358ee81c6a6d171b1fbeae60eda2d5f983144 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 3 Apr 2024 07:58:41 +0200 Subject: [PATCH 089/200] handle PermissionError when searching sample file for BinExport2 file --- capa/features/extractors/binexport2/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index f8f0d7341..0b02b0ef5 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -60,8 +60,12 @@ def filename_similarity_key(p: Path): siblings = [p for p in input_directory.iterdir() if p.is_file()] siblings.sort(key=filename_similarity_key, reverse=True) for sibling in siblings: - if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: - return sibling + try: + if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: + return sibling + except PermissionError: + # e.g. 
with open IDA files in the same directory on Windows + pass base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) candidates = [p for p in base.iterdir() if p.is_file()] From c1243cdb912c0329d20ea4c48c822330939d4cce Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 3 Apr 2024 07:58:41 +0200 Subject: [PATCH 090/200] handle PermissionError when searching sample file for BinExport2 file --- capa/features/extractors/binexport2/__init__.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index f8f0d7341..c65201024 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -60,15 +60,23 @@ def filename_similarity_key(p: Path): siblings = [p for p in input_directory.iterdir() if p.is_file()] siblings.sort(key=filename_similarity_key, reverse=True) for sibling in siblings: - if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: - return sibling + try: + if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: + return sibling + except PermissionError: + # e.g. with open IDA files in the same directory on Windows + pass base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) candidates = [p for p in base.iterdir() if p.is_file()] candidates.sort(key=filename_similarity_key, reverse=True) for candidate in candidates: - if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: - return candidate + try: + if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: + return candidate + except PermissionError: + # e.g. 
with open IDA files in the same directory on Windows + pass raise ValueError("cannot find sample") From b1d9554dd3a9755bc94af526b770d205001fec5e Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 4 Apr 2024 08:55:28 +0200 Subject: [PATCH 091/200] add Android as valid OS --- capa/features/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/features/common.py b/capa/features/common.py index f076bf859..39b65bb09 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -422,10 +422,11 @@ def __init__(self, value: str, description=None): OS_WINDOWS = "windows" OS_LINUX = "linux" OS_MACOS = "macos" +OS_ANDROID = "android" # dotnet OS_ANY = "any" VALID_OS = {os.value for os in capa.features.extractors.elf.OS} -VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY}) +VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY, OS_ANDROID}) # internal only, not to be used in rules OS_AUTO = "auto" From 14ff1892cfcafdee83f66625e9477f1108b75cf5 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 8 Apr 2024 12:56:01 +0200 Subject: [PATCH 092/200] inspect-binexport: strip strings --- scripts/inspect-binexport2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 954df7355..eb46f14f2 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -186,7 +186,7 @@ def main(argv=None): string_reference = be2.string_reference[string_reference_index] string_index = string_reference.string_table_index string = be2.string_table[string_index] - string_references += f'⇥ string "{string}" ' + string_references += f'⇥ string "{string.rstrip()}" ' comments = "" if instruction.comment_index: From 10291e71429018e7f807fd3a5b52040261ea293f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Apr 2024 11:47:19 +0200 Subject: [PATCH 093/200] inspect-binexport: render operands --- scripts/inspect-binexport2.py | 147 
+++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 1 deletion(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index eb46f14f2..7c2a78f56 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -13,6 +13,7 @@ import logging import argparse import contextlib +from typing import List import capa.main import capa.features.extractors.binexport2 @@ -34,6 +35,9 @@ def indenting(self): finally: self.indent -= 1 + def write(self, s): + self.o.write(s) + def writeln(self, s): self.o.write(" " * self.indent) self.o.write(s) @@ -54,6 +58,142 @@ def getvalue(self): return self.o.getvalue() +# internal to `render_operand` +def _render_expression_tree( + be2: BinExport2, + instruction: BinExport2.Instruction, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, + o: io.StringIO): + + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.REGISTER: + o.write(expression.symbol) + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SYMBOL: + o.write(expression.symbol) + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.IMMEDIATE_INT: + o.write(f"0x{expression.immediate:X}") + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SIZE_PREFIX: + # like: b4 + # + # We might want to use this occasionally, such as to disambiguate the + # size of MOVs into/out of memory. But I'm not sure when/where we need that yet. + # + # IDA spams this size prefix hint *everywhere*, so we can't rely on the exporter + # to provide it only when necessary. 
+ assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + return + + elif expression.type == BinExport2.Expression.OPERATOR: + + if len(children_tree_indexes) == 1: + # prefix operator, like "ds:" + o.write(expression.symbol) + child_index = children_tree_indexes[0] + _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + return + + elif len(children_tree_indexes) == 2: + # infix operator: like "+" in "ebp+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + _render_expression_tree(be2, instruction, operand, expression_tree, child_a, o) + o.write(expression.symbol) + _render_expression_tree(be2, instruction, operand, expression_tree, child_b, o) + return + + elif len(children_tree_indexes) == 3: + # infix operator: like "+" in "ebp+ecx+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + child_c = children_tree_indexes[2] + _render_expression_tree(be2, instruction, operand, expression_tree, child_a, o) + o.write(expression.symbol) + _render_expression_tree(be2, instruction, operand, expression_tree, child_b, o) + o.write(expression.symbol) + _render_expression_tree(be2, instruction, operand, expression_tree, child_c, o) + return + + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.DEREFERENCE: + o.write("[") + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + o.write("]") + return + + elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT: + raise NotImplementedError(expression.type) + + else: + raise NotImplementedError(expression.type) + + +def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand) -> str: + o = io.StringIO() + + # TODO + # 
for expression_index in operand.expression_index: + # expression = be2.expression[expression_index] + # print(f"{expression.parent_index if expression.HasField('parent_index') else 'null'} -> {expression_index} '{expression.symbol or hex(expression.immediate)}'") + + # The reconstructed expression tree layout, linking parent nodes to their children. + # + # There is one list of integers for each expression in the operand. + # These integers are indexes of other expressions in the same operand, + # which are the children of that expression. + # + # So: + # + # [ [1, 3], [2], [], [4], [5], []] + # + # means the first expression has two children, at index 1 and 3, + # and the tree looks like: + # + # 0 + # / \ + # 1 3 + # | | + # 2 4 + # | + # 5 + # + # Remember, these are the indices into the entries in operand.expression_index. + tree: List[List[int]] = [] + for i, expression_index in enumerate(operand.expression_index): + children = [] + + # scan all subsequent expressions, looking for those that have parent_index == current.expression_index + for j, candidate_index in enumerate(operand.expression_index[i + 1:]): + candidate = be2.expression[candidate_index] + + if candidate.parent_index == expression_index: + children.append(i + j + 1) + + tree.append(children) + + _render_expression_tree(be2, instruction, operand, tree, 0, o) + return o.getvalue() + + def main(argv=None): if argv is None: argv = sys.argv[1:] @@ -160,6 +300,11 @@ def main(argv=None): mnemonic = be2.mnemonic[instruction.mnemonic_index] + operands = [] + for operand_index in instruction.operand_index: + operand = be2.operand[operand_index] + operands.append(render_operand(be2, instruction, operand)) + call_targets = "" if instruction.call_target: call_targets = " " @@ -197,7 +342,7 @@ def main(argv=None): comments += f"; {BinExport2.Comment.Type.Name(comment.type)} {comment_string} " o.writeln( - f"{hex(instruction_address)} 
{mnemonic.name:<12s}{call_targets}{data_references}{string_references}{comments}" + f"{hex(instruction_address)} {mnemonic.name:<12s}{', '.join(operands)}{call_targets}{data_references}{string_references}{comments}" ) does_fallthrough = False From 2783b1094e2973d4cc10e942e10ffcd35535c79e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Apr 2024 11:53:37 +0200 Subject: [PATCH 094/200] fix lints --- .../extractors/binexport2/__init__.py | 12 ++++------ capa/features/extractors/binexport2/insn.py | 4 +--- scripts/inspect-binexport2.py | 22 ++++++++----------- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index c65201024..845214fdc 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -13,6 +13,7 @@ import os import hashlib import logging +import contextlib from typing import Dict, List, Iterator from pathlib import Path from collections import defaultdict @@ -60,23 +61,18 @@ def filename_similarity_key(p: Path): siblings = [p for p in input_directory.iterdir() if p.is_file()] siblings.sort(key=filename_similarity_key, reverse=True) for sibling in siblings: - try: + # e.g. with open IDA files in the same directory on Windows + with contextlib.suppress(PermissionError): if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: return sibling - except PermissionError: - # e.g. with open IDA files in the same directory on Windows - pass base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) candidates = [p for p in base.iterdir() if p.is_file()] candidates.sort(key=filename_similarity_key, reverse=True) for candidate in candidates: - try: + with contextlib.suppress(PermissionError): if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: return candidate - except PermissionError: - # e.g. 
with open IDA files in the same directory on Windows - pass raise ValueError("cannot find sample") diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 61ca2963d..24ce6a97e 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -52,7 +52,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle yield API(vertex.mangled_name), ih.address if vertex.HasField("library_index"): - # BUG: this seems to be incorrect + # TODO: this seems to be incorrect for Ghidra extractor library = be2.library[vertex.library_index] library_name = library.name if library_name.endswith(".so"): @@ -246,7 +246,6 @@ def read_memory(ctx: AnalysisContext, sample_bytes: bytes, address: int, size: i def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - bbi: BasicBlockContext = bbh.inner ii: InstructionContext = ih.inner ctx = fhi.ctx @@ -254,7 +253,6 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl sample_bytes = fhi.ctx.sample_bytes idx = fhi.ctx.idx - basic_block_index = bbi.basic_block_index instruction_index = ii.instruction_index reference_addresses: List[int] = [] diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 7c2a78f56..ff7341fd1 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -60,12 +60,13 @@ def getvalue(self): # internal to `render_operand` def _render_expression_tree( - be2: BinExport2, - instruction: BinExport2.Instruction, - operand: BinExport2.Operand, - expression_tree: List[List[int]], - tree_index: int, - o: io.StringIO): + be2: BinExport2, + instruction: BinExport2.Instruction, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, + o: io.StringIO, +): expression_index = operand.expression_index[tree_index] 
expression = be2.expression[expression_index] @@ -150,11 +151,6 @@ def _render_expression_tree( def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand) -> str: o = io.StringIO() - # TODO - # for expression_index in operand.expression_index: - # expression = be2.expression[expression_index] - # print(f"{expression.parent_index if expression.HasField('parent_index') else 'null'} -> {expression_index} '{expression.symbol or hex(expression.immediate)}'") - # The reconstructed expression tree layout, linking parent nodes to their children. # # There is one list of integers for each expression in the operand. @@ -182,7 +178,7 @@ def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand children = [] # scan all subsequent expressions, looking for those that have parent_index == current.expression_index - for j, candidate_index in enumerate(operand.expression_index[i + 1:]): + for j, candidate_index in enumerate(operand.expression_index[i + 1 :]): candidate = be2.expression[candidate_index] if candidate.parent_index == expression_index: @@ -254,7 +250,7 @@ def main(argv=None): o.writeln(f"demangled: {vertex.demangled_name}") if vertex.HasField("library_index"): - # TODO(williballenthin): this seems to be incorrect + # TODO(williballenthin): this seems to be incorrect for Ghidra exporter library = be2.library[vertex.library_index] o.writeln(f"library: [{vertex.library_index}] {library.name}") From a7447eddbccc2597a6e24537f0a57b897c1b2fe2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Apr 2024 11:55:50 +0200 Subject: [PATCH 095/200] ruff: update config layout --- .github/ruff.toml | 29 +++++++++++---------- capa/features/extractors/binexport2/insn.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ruff.toml b/.github/ruff.toml index 306cc891e..c1938b903 100644 --- a/.github/ruff.toml +++ b/.github/ruff.toml @@ -1,17 +1,3 @@ -# Enable the pycodestyle (`E`) and 
Pyflakes (`F`) rules by default. -# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or -# McCabe complexity (`C901`) by default. -select = ["E", "F"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = ["ALL"] -unfixable = [] - -# E402 module level import not at top of file -# E722 do not use bare 'except' -# E501 line too long -ignore = ["E402", "E722", "E501"] - line-length = 120 exclude = [ @@ -41,3 +27,18 @@ exclude = [ "*_pb2.py", "*_pb2.pyi" ] + +[lint] +# Enable the pycodestyle (`E`) and Pyflakes (`F`) rules by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E", "F"] + +# Allow autofix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# E402 module level import not at top of file +# E722 do not use bare 'except' +# E501 line too long +ignore = ["E402", "E722", "E501"] diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 24ce6a97e..8453d8069 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -17,7 +17,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 From 82dd3d7cbae70385a9e10b08f4aa26580377ff0a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 9 Apr 2024 12:01:26 +0200 Subject: [PATCH 096/200] inspect-binexport: better align comments/xrefs 
--- scripts/inspect-binexport2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index ff7341fd1..5a42b9810 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -338,7 +338,7 @@ def main(argv=None): comments += f"; {BinExport2.Comment.Type.Name(comment.type)} {comment_string} " o.writeln( - f"{hex(instruction_address)} {mnemonic.name:<12s}{', '.join(operands)}{call_targets}{data_references}{string_references}{comments}" + f"{hex(instruction_address)} {mnemonic.name:<12s}{', '.join(operands):<14s}{call_targets}{data_references}{string_references}{comments}" ) does_fallthrough = False From 221eaa208ee8bf99d3f3599f140644a553311272 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 9 Apr 2024 13:42:41 +0200 Subject: [PATCH 097/200] use explicit search paths to get sample for BinExport file --- .../extractors/binexport2/__init__.py | 20 +++++++++---------- capa/loader.py | 5 ++++- capa/main.py | 4 +++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 845214fdc..a29b65ed3 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -10,7 +10,6 @@ protoc --python_out=. --mypy_out=. binexport2.proto """ -import os import hashlib import logging import contextlib @@ -43,11 +42,10 @@ def compute_common_prefix_length(m: str, n: str) -> int: return len(m) -def get_sample_from_binexport2(input_file: Path, be2: BinExport2) -> Path: +def get_sample_from_binexport2(input_file: Path, be2: BinExport2, search_paths: List[Path]) -> Path: """attempt to find the sample file, given a BinExport2 file. - searches in the same directory as the BinExport2 file, and then - in $CAPA_SAMPLES_DIR. + searches in the same directory as the BinExport2 file, and then in search_paths. 
""" def filename_similarity_key(p: Path): @@ -66,13 +64,13 @@ def filename_similarity_key(p: Path): if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: return sibling - base = Path(os.environ.get("CAPA_SAMPLES_DIR", ".")) - candidates = [p for p in base.iterdir() if p.is_file()] - candidates.sort(key=filename_similarity_key, reverse=True) - for candidate in candidates: - with contextlib.suppress(PermissionError): - if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: - return candidate + for search_path in search_paths: + candidates = [p for p in search_path.iterdir() if p.is_file()] + candidates.sort(key=filename_similarity_key, reverse=True) + for candidate in candidates: + with contextlib.suppress(PermissionError): + if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: + return candidate raise ValueError("cannot find sample") diff --git a/capa/loader.py b/capa/loader.py index 50ff71b9f..41899e2af 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import os import sys import logging import datetime @@ -287,7 +288,9 @@ def _get_binexport2_file_extractors(input_file: Path) -> List[FeatureExtractor]: import capa.features.extractors.binexport2 be2 = capa.features.extractors.binexport2.get_binexport2(input_file) - sample_path = capa.features.extractors.binexport2.get_sample_from_binexport2(input_file, be2) + sample_path = capa.features.extractors.binexport2.get_sample_from_binexport2( + input_file, be2, [Path(os.environ.get("CAPA_SAMPLES_DIR", "."))] + ) with sample_path.open("rb") as f: taste = f.read() diff --git a/capa/main.py b/capa/main.py index 7d0c109b6..c7607aaea 100644 --- a/capa/main.py +++ b/capa/main.py @@ -553,7 +553,9 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: import capa.features.extractors.binexport2 be2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) - return capa.features.extractors.binexport2.get_sample_from_binexport2(args.input_file, be2) + return capa.features.extractors.binexport2.get_sample_from_binexport2( + args.input_file, be2, [Path(os.environ.get("CAPA_SAMPLES_DIR", "."))] + ) else: return args.input_file From 85f72ecdcf16ad2572c74a93f76dd0186e489fd5 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 9 Apr 2024 14:39:09 +0200 Subject: [PATCH 098/200] add initial BinExport tests --- tests/fixtures.py | 20 +++ tests/test_binexport_features.py | 278 +++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 tests/test_binexport_features.py diff --git a/tests/fixtures.py b/tests/fixtures.py index 531043861..17bb24ac4 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -209,6 +209,19 @@ def get_ghidra_extractor(path: Path): return extractor +@lru_cache(maxsize=1) +def get_binexport_extractor(path): + import capa.features.extractors.binexport2 + import capa.features.extractors.binexport2.extractor + + be2 = capa.features.extractors.binexport2.get_binexport2(path) + search_paths = [CD / "data", CD / "data" / 
"aarch64"] + path = capa.features.extractors.binexport2.get_sample_from_binexport2(path, be2, search_paths) + buf = path.read_bytes() + + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -393,6 +406,13 @@ def get_data_path_by_name(name) -> Path: return CD / "data" / "dotnet" / "dd9098ff91717f4906afe9dafdfa2f52.exe_" elif name.startswith("nested_typeref"): return CD / "data" / "dotnet" / "2c7d60f77812607dec5085973ff76cea.dll_" + elif name.startswith("687e79.be2"): + return ( + CD + / "data" + / "binexport2" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ida.BinExport" + ) else: raise ValueError(f"unexpected sample fixture: {name}") diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py new file mode 100644 index 000000000..225cb16ab --- /dev/null +++ b/tests/test_binexport_features.py @@ -0,0 +1,278 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from pathlib import Path + +import pytest + +import fixtures + +import capa.features.file +import capa.features.insn +import capa.features.common +import capa.features.basicblock +from capa.features.common import ( + OS, + OS_LINUX, + ARCH_I386, + FORMAT_PE, + ARCH_AMD64, + FORMAT_ELF, + OS_ANDROID, + OS_WINDOWS, + ARCH_AARCH64, + Arch, + Format, +) + +FEATURE_PRESENCE_TESTS_BE2_ELF_AARCH64 = sorted( + [ + # file/string + ("687e79.be2", "file", capa.features.common.String("AppDataService start"), True), + ("687e79.be2", "file", capa.features.common.String("nope"), False), + # file/sections + ("687e79.be2", "file", capa.features.file.Section(".text"), True), + ("687e79.be2", "file", capa.features.file.Section(".nope"), False), + # file/exports + # TODO? ("687e79.be2", "file", capa.features.file.Export("android::clearDir"), True), + ("687e79.be2", "file", capa.features.file.Export("nope"), False), + # file/imports + ("687e79.be2", "file", capa.features.file.Import("fopen"), True), + ("687e79.be2", "file", capa.features.file.Import("exit"), True), + ("687e79.be2", "file", capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), True), + ("687e79.be2", "file", capa.features.file.Import("nope"), False), + # function/characteristic(loop) + ("687e79.be2", "function=0x56c0", capa.features.common.Characteristic("loop"), True), + ("687e79.be2", "function=0x4c40", capa.features.common.Characteristic("loop"), False), + # bb/characteristic(tight loop) + ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), + # bb/characteristic(stack string) + ( + "687e79.be2", + "function=0x", + capa.features.common.Characteristic("stack string"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x", + capa.features.common.Characteristic("stack string"), + "xfail: not implemented yet", + ), + # 
bb/characteristic(tight loop) + ( + "687e79.be2", + "function=0x0,bb=0x0", + capa.features.common.Characteristic("tight loop"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0,bb=0x0", + capa.features.common.Characteristic("tight loop"), + "xfail: not implemented yet", + ), + # insn/mnemonic + ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("stp"), True), + ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("adrl"), True), + ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("bl"), True), + ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("in"), False), + ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("out"), False), + # insn/operand.number + ("687e79.be2", "function=0x5128,bb=0x51e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True), + ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(1, 0x3), True), + ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(1, 0x10), True), + ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(3, 0x10), True), + # insn/operand.offset + ("687e79.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(0, 4), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(1, 4), "xfail: not implemented yet"), + # insn/number + ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x3), True), + ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x10), True), + ("687e79.be2", "function=0x5C88", capa.features.insn.Number(0xF000), True), + # insn/number: stack adjustments + # ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xC), False), + # ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x10), False), + # insn/number: negative + # ("mimikatz", "function=0x401553", capa.features.insn.Number(0xFFFFFFFF), True), + # ("mimikatz", "function=0x43e543", capa.features.insn.Number(0xFFFFFFF0), 
True), + # insn/offset + ("mimikatz", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), + ("mimikatz", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet"), + ("mimikatz", "function=0x0", capa.features.insn.Offset(0xC), "xfail: not implemented yet"), + # insn/offset, issue #276 + # ("64d9f", "function=0x10001510,bb=0x100015B0", capa.features.insn.Offset(0x4000), True), + # insn/offset: stack references + # ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), + # ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), + # insn/offset: negative + # ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), + # ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), + # + # insn/offset from mnemonic: add + # + # should not be considered, too big for an offset: + # .text:00401D85 81 C1 00 00 00 80 add ecx, 80000000h + # ("mimikatz", "function=0x401D64,bb=0x401D73,insn=0x401D85", capa.features.insn.Offset(0x80000000), False), + # should not be considered, relative to stack: + # .text:00401CF6 83 C4 10 add esp, 10h + # ("mimikatz", "function=0x401CC7,bb=0x401CDE,insn=0x401CF6", capa.features.insn.Offset(0x10), False), + # yes, this is also a offset (imagine eax is a pointer): + # .text:0040223C 83 C0 04 add eax, 4 + # ("mimikatz", "function=0x402203,bb=0x402221,insn=0x40223C", capa.features.insn.Offset(0x4), True), + # + # insn/number from mnemonic: lea + # + # should not be considered, lea operand invalid encoding + # .text:00471EE6 8D 1C 81 lea ebx, [ecx+eax*4] + # ("mimikatz", "function=0x471EAB,bb=0x471ED8,insn=0x471EE6", capa.features.insn.Number(0x4), False), + # should not be considered, lea operand invalid encoding + # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] + # ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), + # yes, this is also a number (imagine edx is zero): + # 
.text:004018C0 8D 4B 02 lea ecx, [ebx+2] + # ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), + # insn/api + # not extracting dll name + ("687e79.be2", "function=0x5c88", capa.features.insn.API("memset"), True), + ("687e79.be2", "function=0x5c88", capa.features.insn.API(".memset"), True), + ("687e79.be2", "function=0x5c88", capa.features.insn.API("Nope"), False), + # insn/string + ("687e79.be2", "function=0x7588", capa.features.common.String("AppDataService start"), True), + ("687e79.be2", "function=0x75c0", capa.features.common.String("AppDataService"), True), + ("687e79.be2", "function=0x7588", capa.features.common.String("nope"), False), + ("687e79.be2", "function=0x6d58", capa.features.common.String("/data/misc/wpa_supplicant"), True), + # insn/regex + ("687e79.be2", "function=0x5c88", capa.features.common.Regex("innerRename"), True), + ("687e79.be2", "function=0x6d58", capa.features.common.Regex("/data/misc"), True), + ("687e79.be2", "function=0x6d58", capa.features.common.Substring("/data/misc"), True), + # # insn/string, pointer to string + # ("mimikatz", "function=0x44EDEF", capa.features.common.String("INPUTEVENT"), True), + # # insn/string, direct memory reference + # ("mimikatz", "function=0x46D6CE", capa.features.common.String("(null)"), True), + # # insn/bytes + # ("mimikatz", "function=0x401517", capa.features.common.Bytes(binascii.unhexlify("CA3B0E000000F8AF47")), True), + # ("mimikatz", "function=0x404414", capa.features.common.Bytes(binascii.unhexlify("0180000040EA4700")), True), + # # don't extract byte features for obvious strings + # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardControl".encode("utf-16le")), False), + # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardTransmit".encode("utf-16le")), False), + # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("ACR > ".encode("utf-16le")), False), + # ("mimikatz", "function=0x40105D", 
capa.features.common.Bytes("nope".encode("ascii")), False), + # # push offset aAcsAcr1220 ; "ACS..." -> where ACS == 41 00 43 00 == valid pointer to middle of instruction + # ("mimikatz", "function=0x401000", capa.features.common.Bytes(binascii.unhexlify("FDFF59F647")), False), + # # IDA features included byte sequences read from invalid memory, fixed in #409 + # ("mimikatz", "function=0x44570F", capa.features.common.Bytes(binascii.unhexlify("FF" * 256)), False), + # # insn/bytes, pointer to string bytes + # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), + # # insn/characteristic(nzxor) + # ("mimikatz", "function=0x410DFC", capa.features.common.Characteristic("nzxor"), True), + # ("mimikatz", "function=0x40105D", capa.features.common.Characteristic("nzxor"), False), + # # insn/characteristic(nzxor): no security cookies + # ("mimikatz", "function=0x46D534", capa.features.common.Characteristic("nzxor"), False), + # # insn/characteristic(nzxor): xorps + # # viv needs fixup to recognize function, see above + # ("mimikatz", "function=0x410dfc", capa.features.common.Characteristic("nzxor"), True), + # # insn/characteristic(peb access) + # ("kernel32-64", "function=0x1800017D0", capa.features.common.Characteristic("peb access"), True), + # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("peb access"), False), + # # insn/characteristic(gs access) + # ("kernel32-64", "function=0x180001068", capa.features.common.Characteristic("gs access"), True), + # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("gs access"), False), + # # insn/characteristic(cross section flow) + # ("a1982...", "function=0x4014D0", capa.features.common.Characteristic("cross section flow"), True), + # # insn/characteristic(cross section flow): imports don't count + # ("kernel32-64", "function=0x180001068", capa.features.common.Characteristic("cross section flow"), False), + # ("mimikatz", 
"function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), + # # insn/characteristic(recursive call) + # ("mimikatz", "function=0x40640e", capa.features.common.Characteristic("recursive call"), True), + # # before this we used ambiguous (0x4556E5, False), which has a data reference / indirect recursive call, see #386 + # ("mimikatz", "function=0x4175FF", capa.features.common.Characteristic("recursive call"), False), + # # insn/characteristic(indirect call) + # ("mimikatz", "function=0x4175FF", capa.features.common.Characteristic("indirect call"), True), + # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("indirect call"), False), + # # insn/characteristic(calls from) + # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("calls from"), True), + # ("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), False), + # # function/characteristic(calls to) + # ("mimikatz", "function=0x40105D", capa.features.common.Characteristic("calls to"), True), + # # function/characteristic(forwarded export) + # ("ea2876", "file", capa.features.common.Characteristic("forwarded export"), True), + # # before this we used ambiguous (0x4556E5, False), which has a data reference / indirect recursive call, see #386 + # ("mimikatz", "function=0x456BB9", capa.features.common.Characteristic("calls to"), False), + # file/function-name + ("687e79.be2", "file", capa.features.file.FunctionName(".__libc_init"), True), + # os & format & arch + ("687e79.be2", "file", OS(OS_ANDROID), True), + ("687e79.be2", "file", OS(OS_LINUX), False), + ("687e79.be2", "file", OS(OS_WINDOWS), False), + # os & format & arch are also global features + ("687e79.be2", "function=0x7588", OS(OS_ANDROID), True), + ("687e79.be2", "function=0x75c0,bb=0x76c0", OS(OS_ANDROID), True), + ("687e79.be2", "file", Arch(ARCH_I386), False), + ("687e79.be2", "file", Arch(ARCH_AMD64), False), + ("687e79.be2", "file", Arch(ARCH_AARCH64), 
True), + ("687e79.be2", "function=0x7588", Arch(ARCH_AARCH64), True), + ("687e79.be2", "function=0x75c0,bb=0x76c0", Arch(ARCH_AARCH64), True), + ("687e79.be2", "file", Format(FORMAT_ELF), True), + ("687e79.be2", "file", Format(FORMAT_PE), False), + ("687e79.be2", "function=0x7588", Format(FORMAT_ELF), True), + ("687e79.be2", "function=0x7588", Format(FORMAT_PE), False), + ( + "687e79.be2", + "function=0x10002385,bb=0x10002385", + capa.features.common.Characteristic("call $+5"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x10001510,bb=0x100015c0", + capa.features.common.Characteristic("call $+5"), + "xfail: not implemented yet", + ), + ], + # order tests by (file, item) + # so that our LRU cache is most effective. + key=lambda t: (t[0], t[1]), +) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + FEATURE_PRESENCE_TESTS_BE2_ELF_AARCH64, + indirect=["sample", "scope"], +) +def test_binexport_features_elf_aarch64(sample, scope, feature, expected): + if not isinstance(expected, bool): + pytest.xfail(expected) + fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_PRESENCE_TESTS, + indirect=["sample", "scope"], +) +def test_binexport_features_pe_x86(sample, scope, feature, expected): + if "mimikatz.exe_" not in sample.name: + pytest.skip("for now only testing mimikatz.exe_ IDA BinExport file") + sample = sample.parent / "binexport2" / (sample.name + ".ida.BinExport") + assert sample.exists() + fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_COUNT_TESTS, + indirect=["sample", "scope"], +) +def test_binexport_feature_counts(sample, scope, feature, expected): + if "mimikatz.exe_" not in sample.name: + pytest.skip("for now only testing mimikatz.exe_ IDA BinExport file") + 
sample = sample.parent / "binexport2" / (sample.name + ".ida.BinExport") + assert sample.exists() + fixtures.do_test_feature_count(fixtures.get_binexport_extractor, sample, scope, feature, expected) From 172b66dc16687cd50c41d5baec7acfd9171bc332 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 9 Apr 2024 18:51:21 +0200 Subject: [PATCH 099/200] add/update BinExport tests and minor fixes --- capa/features/extractors/binexport2/insn.py | 12 +-- capa/features/insn.py | 5 +- tests/test_binexport_features.py | 112 +++++++++----------- 3 files changed, 62 insertions(+), 67 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 8453d8069..209ae4b18 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -184,15 +184,15 @@ def extract_insn_number_features( # label: has a value - if analysis.base_address != 0x0: + if analysis.base_address == 0x0: # When the image is mapped at 0x0, # then its hard to tell if numbers are pointers or numbers. - # So be a little less conservative here. - if is_address_mapped(be2, value): - continue + # TODO(mr): 1755 be a little less conservative otherwise? - if is_address_mapped(be2, value): - continue + # TODO(mr): 1755 this removes a lot of valid numbers, could check alignment and use additional heuristics + # if is_address_mapped(be2, value): + # continue + pass yield Number(value), ih.address yield OperandNumber(i, value), ih.address diff --git a/capa/features/insn.py b/capa/features/insn.py index f4be23c87..47f18dfc3 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -100,9 +100,10 @@ def __init__(self, value: str, description=None): # max number of operands to consider for a given instruction. -# since we only support Intel and .NET, we can assume this is 3 +# for Intel and .NET, this is 3 # which covers cases up to e.g. 
"vinserti128 ymm0,ymm0,ymm5,1" -MAX_OPERAND_COUNT = 4 +# for ARM/aarch64, we assume 4 +MAX_OPERAND_COUNT = 5 MAX_OPERAND_INDEX = MAX_OPERAND_COUNT - 1 diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 225cb16ab..8e9885ecd 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -5,7 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from pathlib import Path +import binascii import pytest @@ -38,7 +38,7 @@ ("687e79.be2", "file", capa.features.file.Section(".text"), True), ("687e79.be2", "file", capa.features.file.Section(".nope"), False), # file/exports - # TODO? ("687e79.be2", "file", capa.features.file.Export("android::clearDir"), True), + ("687e79.be2", "file", capa.features.file.Export("android::clearDir"), "xfail: not implemented yet?!"), ("687e79.be2", "file", capa.features.file.Export("nope"), False), # file/imports ("687e79.be2", "file", capa.features.file.Import("fopen"), True), @@ -54,13 +54,13 @@ # bb/characteristic(stack string) ( "687e79.be2", - "function=0x", + "function=0x0", capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), ( "687e79.be2", - "function=0x", + "function=0x0", capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), @@ -86,34 +86,35 @@ # insn/operand.number ("687e79.be2", "function=0x5128,bb=0x51e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True), ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(1, 0x3), True), - ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(1, 0x10), True), + ("687e79.be2", "function=0x7588,bb=0x7588,insn=0x7598", capa.features.insn.OperandNumber(1, 0x3), 
True), ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(3, 0x10), True), # insn/operand.offset - ("687e79.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(0, 4), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(1, 4), "xfail: not implemented yet"), + ( + "687e79.be2", + "function=0x0,bb=0x0", + capa.features.insn.OperandOffset(1, 100), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0,bb=0x0", + capa.features.insn.OperandOffset(3, 100), + "xfail: not implemented yet", + ), # insn/number ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x3), True), - ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x10), True), + ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?"), ("687e79.be2", "function=0x5C88", capa.features.insn.Number(0xF000), True), - # insn/number: stack adjustments - # ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xC), False), - # ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x10), False), # insn/number: negative - # ("mimikatz", "function=0x401553", capa.features.insn.Number(0xFFFFFFFF), True), - # ("mimikatz", "function=0x43e543", capa.features.insn.Number(0xFFFFFFF0), True), + ("687e79.be2", "function=0x57f8,bb=0x57f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), True), + ("687e79.be2", "function=0x66e0,bb=0x68c4", capa.features.insn.Number(0xFFFFFFFF), True), # insn/offset - ("mimikatz", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), - ("mimikatz", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet"), - ("mimikatz", "function=0x0", capa.features.insn.Offset(0xC), "xfail: not implemented yet"), - # insn/offset, issue #276 - # ("64d9f", "function=0x10001510,bb=0x100015B0", capa.features.insn.Offset(0x4000), True), - # insn/offset: stack references - # ("mimikatz", 
"function=0x40105D", capa.features.insn.Offset(0x8), False), - # ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), + ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0", capa.features.insn.Offset(0xC), "xfail: not implemented yet"), # insn/offset: negative - # ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), - # ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), - # + ("687e79.be2", "function=0x0", capa.features.insn.Offset(-0x1), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0", capa.features.insn.Offset(-0x2), "xfail: not implemented yet"), # insn/offset from mnemonic: add # # should not be considered, too big for an offset: @@ -139,7 +140,7 @@ # ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api # not extracting dll name - ("687e79.be2", "function=0x5c88", capa.features.insn.API("memset"), True), + ("687e79.be2", "function=0x5c88", capa.features.insn.API("memset"), "xfail: not working yet"), ("687e79.be2", "function=0x5c88", capa.features.insn.API(".memset"), True), ("687e79.be2", "function=0x5c88", capa.features.insn.API("Nope"), False), # insn/string @@ -155,9 +156,19 @@ # ("mimikatz", "function=0x44EDEF", capa.features.common.String("INPUTEVENT"), True), # # insn/string, direct memory reference # ("mimikatz", "function=0x46D6CE", capa.features.common.String("(null)"), True), - # # insn/bytes - # ("mimikatz", "function=0x401517", capa.features.common.Bytes(binascii.unhexlify("CA3B0E000000F8AF47")), True), - # ("mimikatz", "function=0x404414", capa.features.common.Bytes(binascii.unhexlify("0180000040EA4700")), True), + # insn/bytes + ( + "687e79.be2", + "function=0x0", + capa.features.common.Bytes(binascii.unhexlify("00")), + "xfail: not implemented 
yet, may need other test sample", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Bytes(binascii.unhexlify("00")), + "xfail: not implemented yet, may need other test sample", + ), # # don't extract byte features for obvious strings # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardControl".encode("utf-16le")), False), # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardTransmit".encode("utf-16le")), False), @@ -169,41 +180,24 @@ # ("mimikatz", "function=0x44570F", capa.features.common.Bytes(binascii.unhexlify("FF" * 256)), False), # # insn/bytes, pointer to string bytes # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), - # # insn/characteristic(nzxor) - # ("mimikatz", "function=0x410DFC", capa.features.common.Characteristic("nzxor"), True), - # ("mimikatz", "function=0x40105D", capa.features.common.Characteristic("nzxor"), False), - # # insn/characteristic(nzxor): no security cookies - # ("mimikatz", "function=0x46D534", capa.features.common.Characteristic("nzxor"), False), - # # insn/characteristic(nzxor): xorps - # # viv needs fixup to recognize function, see above - # ("mimikatz", "function=0x410dfc", capa.features.common.Characteristic("nzxor"), True), - # # insn/characteristic(peb access) - # ("kernel32-64", "function=0x1800017D0", capa.features.common.Characteristic("peb access"), True), - # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("peb access"), False), - # # insn/characteristic(gs access) - # ("kernel32-64", "function=0x180001068", capa.features.common.Characteristic("gs access"), True), - # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("gs access"), False), + # insn/characteristic(nzxor) + ("687e79.be2", "function=0x0", capa.features.common.Characteristic("nzxor"), "xfail: not implemented yet, may need other test sample"), + ("687e79.be2", "function=0x0", 
capa.features.common.Characteristic("nzxor"), "xfail: not implemented yet, may need other test sample"), # # insn/characteristic(cross section flow) # ("a1982...", "function=0x4014D0", capa.features.common.Characteristic("cross section flow"), True), # # insn/characteristic(cross section flow): imports don't count - # ("kernel32-64", "function=0x180001068", capa.features.common.Characteristic("cross section flow"), False), # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), - # # insn/characteristic(recursive call) - # ("mimikatz", "function=0x40640e", capa.features.common.Characteristic("recursive call"), True), - # # before this we used ambiguous (0x4556E5, False), which has a data reference / indirect recursive call, see #386 - # ("mimikatz", "function=0x4175FF", capa.features.common.Characteristic("recursive call"), False), - # # insn/characteristic(indirect call) - # ("mimikatz", "function=0x4175FF", capa.features.common.Characteristic("indirect call"), True), - # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("indirect call"), False), - # # insn/characteristic(calls from) - # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("calls from"), True), - # ("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), False), - # # function/characteristic(calls to) - # ("mimikatz", "function=0x40105D", capa.features.common.Characteristic("calls to"), True), - # # function/characteristic(forwarded export) - # ("ea2876", "file", capa.features.common.Characteristic("forwarded export"), True), - # # before this we used ambiguous (0x4556E5, False), which has a data reference / indirect recursive call, see #386 - # ("mimikatz", "function=0x456BB9", capa.features.common.Characteristic("calls to"), False), + # insn/characteristic(recursive call) + ("687e79.be2", "function=0x5b38", capa.features.common.Characteristic("recursive call"), True), + ("687e79.be2", 
"function=0x6530", capa.features.common.Characteristic("recursive call"), True), + # insn/characteristic(indirect call) + ("687e79.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not implemented yet"), + ("687e79.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not implemented yet"), + # insn/characteristic(calls from) + ("687e79.be2", "function=0x5080", capa.features.common.Characteristic("calls from"), True), + ("687e79.be2", "function=0x4d20", capa.features.common.Characteristic("calls from"), False), + # function/characteristic(calls to) + ("687e79.be2", "function=0x4b90", capa.features.common.Characteristic("calls to"), True), # file/function-name ("687e79.be2", "file", capa.features.file.FunctionName(".__libc_init"), True), # os & format & arch From b07b498e6fc30151d654ca6db783b2a23929f53f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 12 Apr 2024 09:06:45 +0200 Subject: [PATCH 100/200] inspect-binexport: add perf tracking --- scripts/inspect-binexport2.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 5a42b9810..3b1549c8c 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -10,10 +10,11 @@ """ import io import sys +import time import logging import argparse import contextlib -from typing import List +from typing import List, Dict, Optional import capa.main import capa.features.extractors.binexport2 @@ -22,6 +23,14 @@ logger = logging.getLogger("inspect-binexport2") +@contextlib.contextmanager +def timing(msg: str): + t0 = time.time() + yield + t1 = time.time() + logger.debug("perf: %s: %0.2fs", msg, t1 - t0) + + class Renderer: def __init__(self, o: io.StringIO): self.o = o @@ -204,8 +213,13 @@ def main(argv=None): return e.status_code o = Renderer(io.StringIO()) - be2: BinExport2 = 
capa.features.extractors.binexport2.get_binexport2(args.input_file) - idx = capa.features.extractors.binexport2.BinExport2Index(be2) + with timing("loading BinExport2"): + be2: BinExport2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + + with timing("indexing BinExport2"): + idx = capa.features.extractors.binexport2.BinExport2Index(be2) + + t0 = time.time() with o.section("meta"): o.writeln(f"name: {be2.meta_information.executable_name}") @@ -374,7 +388,11 @@ def main(argv=None): data_references += f"⇤ {hex(instruction_address)} " o.writeln(f"{hex(data_address)} {data_references}") - print(o.getvalue()) + t1 = time.time() + logger.debug("perf: rendering BinExport2: %0.2fs", msg, t1 - t0) + + with timing("writing BinExport2"): + print(o.getvalue()) if __name__ == "__main__": From 14116f7c24e516bffe2de389b68581d1701ccb56 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 12 Apr 2024 09:06:57 +0200 Subject: [PATCH 101/200] inspect-binexport: cache rendered operands --- scripts/inspect-binexport2.py | 38 ++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 3b1549c8c..10af75e4f 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -157,7 +157,33 @@ def _render_expression_tree( raise NotImplementedError(expression.type) -def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand) -> str: +_OPERAND_CACHE: Dict[int, str] = {} +def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand, index: Optional[int]=None) -> str: + # For the mimikatz example file, there are 138k distinct operands. + # Of those, only 11k are unique, which is less than 10% of the total. + # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. 
+ # In other words, the most common five operands account for 100k instances, + # which is around 75% of operand instances. + # Therefore, we expect caching to be fruitful, trading memory for CPU time. + # + # No caching: 6.045 s ± 0.164 s [User: 5.916 s, System: 0.129 s] + # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] + # + # So we can save 30% of CPU time by caching operand rendering. + # + # Other measurements: + # + # perf: loading BinExport2: 0.06s + # perf: indexing BinExport2: 0.34s + # perf: rendering BinExport2: 1.96s + # perf: writing BinExport2: 1.13s + # ________________________________________________________ + # Executed in 4.40 secs fish external + # usr time 4.22 secs 0.00 micros 4.22 secs + # sys time 0.18 secs 842.00 micros 0.18 secs + if index and index in _OPERAND_CACHE: + return _OPERAND_CACHE[index] + o = io.StringIO() # The reconstructed expression tree layout, linking parent nodes to their children. @@ -196,10 +222,16 @@ def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand tree.append(children) _render_expression_tree(be2, instruction, operand, tree, 0, o) - return o.getvalue() + s = o.getvalue() + + if index: + _OPERAND_CACHE[index] = s + + return s def main(argv=None): + if argv is None: argv = sys.argv[1:] @@ -313,7 +345,7 @@ def main(argv=None): operands = [] for operand_index in instruction.operand_index: operand = be2.operand[operand_index] - operands.append(render_operand(be2, instruction, operand)) + operands.append(render_operand(be2, instruction, operand, index=operand_index)) call_targets = "" if instruction.call_target: From 26d4badd5a9aecf2169518113c004c9d55675169 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 12 Apr 2024 09:11:24 +0200 Subject: [PATCH 102/200] lints --- scripts/inspect-binexport2.py | 10 +++++++--- tests/test_binexport_features.py | 29 ++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git 
a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 10af75e4f..c68b573a2 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -14,7 +14,7 @@ import logging import argparse import contextlib -from typing import List, Dict, Optional +from typing import Dict, List, Optional import capa.main import capa.features.extractors.binexport2 @@ -158,7 +158,11 @@ def _render_expression_tree( _OPERAND_CACHE: Dict[int, str] = {} -def render_operand(be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand, index: Optional[int]=None) -> str: + + +def render_operand( + be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand, index: Optional[int] = None +) -> str: # For the mimikatz example file, there are 138k distinct operands. # Of those, only 11k are unique, which is less than 10% of the total. # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. @@ -421,7 +425,7 @@ def main(argv=None): o.writeln(f"{hex(data_address)} {data_references}") t1 = time.time() - logger.debug("perf: rendering BinExport2: %0.2fs", msg, t1 - t0) + logger.debug("perf: rendering BinExport2: %0.2fs", t1 - t0) with timing("writing BinExport2"): print(o.getvalue()) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 8e9885ecd..12cae0b2b 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -8,7 +8,6 @@ import binascii import pytest - import fixtures import capa.features.file @@ -181,8 +180,18 @@ # # insn/bytes, pointer to string bytes # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), # insn/characteristic(nzxor) - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("nzxor"), "xfail: not implemented yet, may need other test sample"), - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("nzxor"), "xfail: not implemented yet, may need other test 
sample"), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Characteristic("nzxor"), + "xfail: not implemented yet, may need other test sample", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Characteristic("nzxor"), + "xfail: not implemented yet, may need other test sample", + ), # # insn/characteristic(cross section flow) # ("a1982...", "function=0x4014D0", capa.features.common.Characteristic("cross section flow"), True), # # insn/characteristic(cross section flow): imports don't count @@ -191,8 +200,18 @@ ("687e79.be2", "function=0x5b38", capa.features.common.Characteristic("recursive call"), True), ("687e79.be2", "function=0x6530", capa.features.common.Characteristic("recursive call"), True), # insn/characteristic(indirect call) - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not implemented yet"), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Characteristic("indirect call"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Characteristic("indirect call"), + "xfail: not implemented yet", + ), # insn/characteristic(calls from) ("687e79.be2", "function=0x5080", capa.features.common.Characteristic("calls from"), True), ("687e79.be2", "function=0x4d20", capa.features.common.Characteristic("calls from"), False), From 29c2cbd367b7b322f94c05040dc55fadde6bd230 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Mon, 22 Apr 2024 12:53:55 +0200 Subject: [PATCH 103/200] do not extract number features for ret instructions --- capa/features/extractors/binexport2/insn.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 209ae4b18..9ce2d6362 100644 --- a/capa/features/extractors/binexport2/insn.py +++ 
b/capa/features/extractors/binexport2/insn.py @@ -120,6 +120,13 @@ def extract_insn_number_features( instruction_index = ii.instruction_index instruction = be2.instruction[instruction_index] + # x86 / amd64 + mnemonic = be2.mnemonic[instruction.mnemonic_index] + if mnemonic.name.lower().startswith("ret"): + # skip things like: + # .text:0042250E retn 8 + return + _is_gsm = _is_ghidra_symbol_madness(be2, instruction_index) for i, operand_index in enumerate(instruction.operand_index): From 9543e468f20db220d37a76fa51aa67ef8a28914d Mon Sep 17 00:00:00 2001 From: Lin Chen Date: Mon, 22 Apr 2024 13:27:56 +0000 Subject: [PATCH 104/200] Fix BinExport's "tight loop" feature extraction. `idx.target_edges_by_basic_block_index[basic_block_index]` is of type `List[Edges]`. The index `basic_block_index` was definitely not an element. --- capa/features/extractors/binexport2/basicblock.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 5ffb9b11a..c3f9e6d51 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -22,7 +22,8 @@ def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[F idx = fhi.ctx.idx basic_block_index = bbi.basic_block_index - if basic_block_index in idx.target_edges_by_basic_block_index[basic_block_index]: + target_edges = idx.target_edges_by_basic_block_index[basic_block_index] + if basic_block_index in (e.target_basic_block_index for e in target_edges): basic_block_address = idx.basic_block_address_by_index[basic_block_index] yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) From 498ff72df493cfb801957f684f59e93a30aadf04 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 10:36:39 +0200 Subject: [PATCH 105/200] inspect-binexport: better render data section --- scripts/inspect-binexport2.py | 13 
++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index c68b573a2..022c65104 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -416,14 +416,21 @@ def main(argv=None): # appears to be code continue - data_references = "" + data_references = [] for data_reference_index in idx.data_reference_index_by_target_address[data_address]: data_reference = be2.data_reference[data_reference_index] instruction_index = data_reference.instruction_index instruction_address = idx.instruction_address_by_index[instruction_index] - data_references += f"⇤ {hex(instruction_address)} " - o.writeln(f"{hex(data_address)} {data_references}") + data_references.append(instruction_address) + if not data_references: + continue + + o.writeln(f"{hex(data_address)} ⇤ {hex(data_references[0])}") + for data_reference in data_references[1:]: + o.writeln(f"{' ' * len(hex(data_address))} ↖ {hex(data_reference)}") + + t1 = time.time() logger.debug("perf: rendering BinExport2: %0.2fs", t1 - t0) From f83da383526c2e749d5e051aa6e66cb768e1c34d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 10:46:02 +0200 Subject: [PATCH 106/200] linters --- scripts/inspect-binexport2.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 022c65104..28774f6bc 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -416,25 +416,24 @@ def main(argv=None): # appears to be code continue - data_references = [] + data_xrefs = [] for data_reference_index in idx.data_reference_index_by_target_address[data_address]: data_reference = be2.data_reference[data_reference_index] instruction_index = data_reference.instruction_index instruction_address = idx.instruction_address_by_index[instruction_index] - data_references.append(instruction_address) + 
data_xrefs.append(instruction_address) - if not data_references: + if not data_xrefs: continue - o.writeln(f"{hex(data_address)} ⇤ {hex(data_references[0])}") - for data_reference in data_references[1:]: - o.writeln(f"{' ' * len(hex(data_address))} ↖ {hex(data_reference)}") + o.writeln(f"{hex(data_address)} ⇤ {hex(data_xrefs[0])}") + for data_xref in data_xrefs[1:]: + o.writeln(f"{' ' * len(hex(data_address))} ↖ {hex(data_xref)}") - t1 = time.time() logger.debug("perf: rendering BinExport2: %0.2fs", t1 - t0) - with timing("writing BinExport2"): + with timing("writing to STDOUT"): print(o.getvalue()) From 992049dd2f6786b32ebef8fbb770e423a1270cc2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 11:07:31 +0200 Subject: [PATCH 107/200] main: accept --format=binexport2 --- capa/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/main.py b/capa/main.py index c7607aaea..b744aeda2 100644 --- a/capa/main.py +++ b/capa/main.py @@ -243,6 +243,7 @@ def install_common_args(parser, wanted=None): (FORMAT_SC64, "64-bit shellcode"), (FORMAT_CAPE, "CAPE sandbox report"), (FORMAT_FREEZE, "features previously frozen by capa"), + (FORMAT_BINEXPORT2, "BinExport2"), ] format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats]) From 0dea7a351b8531ef33fc23e1739644c6d1db7c1a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 11:19:06 +0200 Subject: [PATCH 108/200] binexport: insn: add support for parsing bare immediate int operands --- capa/features/extractors/binexport2/insn.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 9ce2d6362..6ac01a4f2 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -163,8 +163,22 @@ def extract_insn_number_features( else: continue + elif len(operand.expression_index) == 1: + # - type: IMMEDIATE_INT + # immediate: 
20588728364 + # parent_index: 0 + + expression0 = be2.expression[operand.expression_index[0]] + + if BinExport2.Expression.Type.IMMEDIATE_INT != expression0.type: + continue + + value = expression0.immediate + + # handling continues below at label: has a value + elif len(operand.expression_index) == 2: - # from BinDetego, + # from IDA, which provides a size hint for every operand, # we get the following pattern for immediate constants: # # - type: SIZE_PREFIX From 966e62d95b55f4f60b6fb47f74039e10bfa041ab Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 11:33:59 +0200 Subject: [PATCH 109/200] binexport2: bb: fix tight loop detection ref #2050 --- capa/features/extractors/binexport2/basicblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index c3f9e6d51..8aab079a2 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -23,7 +23,7 @@ def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[F basic_block_index = bbi.basic_block_index target_edges = idx.target_edges_by_basic_block_index[basic_block_index] - if basic_block_index in (e.target_basic_block_index for e in target_edges): + if basic_block_index in (e.source_basic_block_index for e in target_edges): basic_block_address = idx.basic_block_address_by_index[basic_block_index] yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) From dc8c7e8861b6d4d6eeef9c03f62b7e1728600de6 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 13:18:04 +0200 Subject: [PATCH 110/200] binexport: api: generate variations of Win32 APIs --- capa/features/extractors/binexport2/insn.py | 13 +++++++------ capa/features/extractors/helpers.py | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py 
b/capa/features/extractors/binexport2/insn.py index 6ac01a4f2..f39fcfa1b 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -49,15 +49,16 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle if not vertex.HasField("mangled_name"): continue - yield API(vertex.mangled_name), ih.address - + function_name = vertex.mangled_name if vertex.HasField("library_index"): # TODO: this seems to be incorrect for Ghidra extractor library = be2.library[vertex.library_index] library_name = library.name - if library_name.endswith(".so"): - library_name = library_name.rpartition(".so")[0] - yield API(f"{library_name}.{vertex.mangled_name}"), ih.address + + for name in capa.features.extractors.helpers.generate_symbols(library_name, function_name): + yield API(name), ih.address + else: + yield API(function_name), ih.address def is_address_mapped(be2: BinExport2, address: int) -> bool: @@ -176,7 +177,7 @@ def extract_insn_number_features( value = expression0.immediate # handling continues below at label: has a value - + elif len(operand.expression_index) == 2: # from IDA, which provides a size hint for every operand, # we get the following pattern for immediate constants: diff --git a/capa/features/extractors/helpers.py b/capa/features/extractors/helpers.py index 541a6eae5..09f76f589 100644 --- a/capa/features/extractors/helpers.py +++ b/capa/features/extractors/helpers.py @@ -63,6 +63,7 @@ def generate_symbols(dll: str, symbol: str, include_dll=False) -> Iterator[str]: # trim extensions observed in dynamic traces dll = dll[0:-4] if dll.endswith(".dll") else dll dll = dll[0:-4] if dll.endswith(".drv") else dll + dll = dll[0:-3] if dll.endswith(".so") else dll if include_dll or is_ordinal(symbol): # ws2_32.#1 From f37dd70e7cbf43b8aa2865188211ed39bc7c5f10 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 23 Apr 2024 13:18:59 +0200 Subject: [PATCH 111/200] lints --- 
capa/features/extractors/binexport2/insn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index f39fcfa1b..205eb2880 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -54,7 +54,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle # TODO: this seems to be incorrect for Ghidra extractor library = be2.library[vertex.library_index] library_name = library.name - + for name in capa.features.extractors.helpers.generate_symbols(library_name, function_name): yield API(name), ih.address else: @@ -177,7 +177,7 @@ def extract_insn_number_features( value = expression0.immediate # handling continues below at label: has a value - + elif len(operand.expression_index) == 2: # from IDA, which provides a size hint for every operand, # we get the following pattern for immediate constants: From b4558df9d938bb1115571ca70a78d5cb6be165ec Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 24 Apr 2024 12:00:57 +0200 Subject: [PATCH 112/200] binexport: index: don't assume instruction index is 1:1 with address --- .../extractors/binexport2/__init__.py | 59 ++++++++++--------- .../extractors/binexport2/basicblock.py | 2 +- .../extractors/binexport2/extractor.py | 7 +-- scripts/inspect-binexport2.py | 27 +++++---- 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index a29b65ed3..bfb9b2f06 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -13,7 +13,7 @@ import hashlib import logging import contextlib -from typing import Dict, List, Iterator +from typing import Dict, List, Tuple, Iterator from pathlib import Path from collections import defaultdict from dataclasses import dataclass @@ -85,10 +85,6 @@ def 
__init__(self, be2: BinExport2): # note: flow graph != call graph (vertex) self.flow_graph_index_by_address: Dict[int, int] = {} self.flow_graph_address_by_index: Dict[int, int] = {} - self.basic_block_index_by_address: Dict[int, int] = {} - self.basic_block_address_by_index: Dict[int, int] = {} - self.instruction_index_by_address: Dict[int, int] = {} - self.instruction_address_by_index: Dict[int, int] = {} # edges that come from the given basic block self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) @@ -102,13 +98,19 @@ def __init__(self, be2: BinExport2): self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self._index_vertex_edges() - self._index_instruction_addresses() self._index_flow_graph_nodes() self._index_flow_graph_edges() self._index_call_graph_vertices() self._index_data_references() self._index_string_references() + def get_basic_block_address(self, basic_block_index: int) -> int: + basic_block = self.be2.basic_block[basic_block_index] + first_instruction_index = next(self.instruction_indices(basic_block)) + insn = self.be2.instruction[first_instruction_index] + assert insn.HasField("address"), "first insn in a basic block must have an explicit address" + return insn.address + def _index_vertex_edges(self): for edge in self.be2.call_graph.edge: if not edge.source_vertex_index: @@ -119,31 +121,9 @@ def _index_vertex_edges(self): self.callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) self.callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) - def _index_instruction_addresses(self): - instruction_address = 0 - for instruction_index, instruction in enumerate(self.be2.instruction): - if instruction.HasField("address"): - instruction_address = instruction.address - - self.instruction_index_by_address[instruction_address] = instruction_index - self.instruction_address_by_index[instruction_index] = 
instruction_address - - assert instruction.HasField("raw_bytes") - instruction_address += len(instruction.raw_bytes) - def _index_flow_graph_nodes(self): for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): - for basic_block_index in flow_graph.basic_block_index: - basic_block = self.be2.basic_block[basic_block_index] - first_instruction_index = next(self.instruction_indices(basic_block)) - basic_block_address = self.instruction_address_by_index[first_instruction_index] - self.basic_block_index_by_address[basic_block_address] = basic_block_index - self.basic_block_address_by_index[basic_block_index] = basic_block_address - - entry_basic_block = self.be2.basic_block[flow_graph.entry_basic_block_index] - entry_instruction_index = next(self.instruction_indices(entry_basic_block)) - entry_instruction_address = self.instruction_address_by_index[entry_instruction_index] - function_address = entry_instruction_address + function_address = self.get_basic_block_address(flow_graph.entry_basic_block_index) self.flow_graph_index_by_address[function_address] = flow_graph_index self.flow_graph_address_by_index[flow_graph_index] = function_address @@ -179,6 +159,9 @@ def _index_string_references(self): @staticmethod def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: + """ + For a given basic block, enumerate the instruction indices. + """ for index_range in basic_block.instruction_index: if not index_range.HasField("end_index"): yield index_range.begin_index @@ -186,6 +169,24 @@ def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: else: yield from range(index_range.begin_index, index_range.end_index) + def basic_block_instructions( + self, basic_block: BinExport2.BasicBlock + ) -> Iterator[Tuple[int, BinExport2.Instruction, int]]: + """ + For a given basic block, enumerate the instruction indices, + the instruction instances, and their addresses. 
+ """ + instruction_address = 0 + for instruction_index in self.instruction_indices(basic_block): + instruction = self.be2.instruction[instruction_index] + if instruction.HasField("address"): + instruction_address = instruction.address + + yield instruction_index, instruction, instruction_address + + assert instruction.HasField("raw_bytes") + instruction_address += len(instruction.raw_bytes) + def get_function_name_by_vertex(self, vertex_index: int) -> str: vertex = self.be2.call_graph.vertex[vertex_index] name = f"sub_{vertex.address:x}" diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 8aab079a2..4674791f7 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -24,7 +24,7 @@ def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[F basic_block_index = bbi.basic_block_index target_edges = idx.target_edges_by_basic_block_index[basic_block_index] if basic_block_index in (e.source_basic_block_index for e in target_edges): - basic_block_address = idx.basic_block_address_by_index[basic_block_index] + basic_block_address = idx.get_basic_block_address(basic_block_index) yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 776d60ef6..780fc7592 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -65,7 +65,7 @@ def extract_file_features(self): def get_functions(self) -> Iterator[FunctionHandle]: for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): entry_basic_block_index = flow_graph.entry_basic_block_index - flow_graph_address = self.idx.basic_block_address_by_index[entry_basic_block_index] + flow_graph_address = self.idx.get_basic_block_address(entry_basic_block_index) yield 
FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), inner=FunctionContext(self.ctx, flow_graph_index), @@ -80,7 +80,7 @@ def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: flow_graph = self.be2.flow_graph[flow_graph_index] for basic_block_index in flow_graph.basic_block_index: - basic_block_address = self.idx.basic_block_address_by_index[basic_block_index] + basic_block_address = self.idx.get_basic_block_address(basic_block_index) yield BBHandle( address=AbsoluteVirtualAddress(basic_block_address), inner=BasicBlockContext(basic_block_index), @@ -92,8 +92,7 @@ def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Ite def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: bbi: BasicBlockContext = bbh.inner basic_block: BinExport2.BasicBlock = self.be2.basic_block[bbi.basic_block_index] - for instruction_index in self.idx.instruction_indices(basic_block): - instruction_address = self.idx.instruction_address_by_index[instruction_index] + for instruction_index, _, instruction_address in self.idx.basic_block_instructions(basic_block): yield InsnHandle( address=AbsoluteVirtualAddress(instruction_address), inner=InstructionContext(instruction_index), diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 28774f6bc..65c9e1094 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -326,7 +326,7 @@ def main(argv=None): o.writeln("") for basic_block_index in flow_graph.basic_block_index: basic_block = be2.basic_block[basic_block_index] - basic_block_address = idx.basic_block_address_by_index[basic_block_index] + basic_block_address = idx.get_basic_block_address(basic_block_index) with o.section(f"basic block {hex(basic_block_address)}"): for edge in idx.target_edges_by_basic_block_index[basic_block_index]: @@ -334,16 +334,15 @@ def main(argv=None): continue source_basic_block_index = edge.source_basic_block_index - source_basic_block_address = 
idx.basic_block_address_by_index[source_basic_block_index] + source_basic_block_address = idx.get_basic_block_address(source_basic_block_index) o.writeln( f"↓ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(source_basic_block_address)}" ) - for instruction_index in idx.instruction_indices(basic_block): - instruction = be2.instruction[instruction_index] - instruction_address = idx.instruction_address_by_index[instruction_index] - + for instruction_index, instruction, instruction_address in idx.basic_block_instructions( + basic_block + ): mnemonic = be2.mnemonic[instruction.mnemonic_index] operands = [] @@ -402,7 +401,7 @@ def main(argv=None): back_edge = "↑" target_basic_block_index = edge.target_basic_block_index - target_basic_block_address = idx.basic_block_address_by_index[target_basic_block_index] + target_basic_block_address = idx.get_basic_block_address(target_basic_block_index) o.writeln( f"→ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(target_basic_block_address)} {back_edge}" ) @@ -412,16 +411,18 @@ def main(argv=None): with o.section("data"): for data_address in sorted(idx.data_reference_index_by_target_address.keys()): - if data_address in idx.instruction_index_by_address: - # appears to be code - continue + # TODO(wb): re-enable this + # if data_address in idx.instruction_index_by_address: + # # appears to be code + # continue - data_xrefs = [] + data_xrefs: List[int] = [] for data_reference_index in idx.data_reference_index_by_target_address[data_address]: data_reference = be2.data_reference[data_reference_index] instruction_index = data_reference.instruction_index - instruction_address = idx.instruction_address_by_index[instruction_index] - data_xrefs.append(instruction_address) + # TODO(wb): uh-oh, how to reconstruct address? 
+ # instruction_address = idx.instruction_address_by_index[instruction_index] + # data_xrefs.append(instruction_address) if not data_xrefs: continue From 9c99af9908773a090f6f2fa52651343cebb3a03d Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 30 Apr 2024 12:40:20 -0600 Subject: [PATCH 113/200] be2: index instruction addresses --- .../extractors/binexport2/__init__.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index bfb9b2f06..b0bf597a3 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -97,6 +97,10 @@ def __init__(self, be2: BinExport2): self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + self.insn_address_by_index: Dict[int, int] = {} + + # must index instructions first + self._index_insn_addresses() self._index_vertex_edges() self._index_flow_graph_nodes() self._index_flow_graph_edges() @@ -104,12 +108,15 @@ def __init__(self, be2: BinExport2): self._index_data_references() self._index_string_references() + def get_insn_address(self, insn_index: int) -> int: + assert insn_index in self.insn_address_by_index, f"insn must be indexed, missing {insn_index}" + return self.insn_address_by_index[insn_index] + def get_basic_block_address(self, basic_block_index: int) -> int: basic_block = self.be2.basic_block[basic_block_index] first_instruction_index = next(self.instruction_indices(basic_block)) - insn = self.be2.instruction[first_instruction_index] - assert insn.HasField("address"), "first insn in a basic block must have an explicit address" - return insn.address + + return self.get_insn_address(first_instruction_index) def _index_vertex_edges(self): for edge in self.be2.call_graph.edge: @@ -157,6 +164,24 @@ def 
_index_string_references(self): string_reference_index ) + def _index_insn_addresses(self): + # see https://github.com/google/binexport/blob/39f6445c232bb5caf5c4a2a996de91dfa20c48e8/binexport.cc#L45 + if len(self.be2.instruction) == 0: + return + + assert self.be2.instruction[0].HasField("address"), "first insn must have explicit address" + + addr = 0 + next_addr = 0 + for idx, insn in enumerate(self.be2.instruction): + if insn.HasField("address"): + addr = insn.address + next_addr = addr + len(insn.raw_bytes) + else: + addr = next_addr + next_addr += len(insn.raw_bytes) + self.insn_address_by_index[idx] = addr + @staticmethod def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: """ @@ -176,17 +201,12 @@ def basic_block_instructions( For a given basic block, enumerate the instruction indices, the instruction instances, and their addresses. """ - instruction_address = 0 for instruction_index in self.instruction_indices(basic_block): instruction = self.be2.instruction[instruction_index] - if instruction.HasField("address"): - instruction_address = instruction.address + instruction_address = self.get_insn_address(instruction_index) yield instruction_index, instruction, instruction_address - assert instruction.HasField("raw_bytes") - instruction_address += len(instruction.raw_bytes) - def get_function_name_by_vertex(self, vertex_index: int) -> str: vertex = self.be2.call_graph.vertex[vertex_index] name = f"sub_{vertex.address:x}" From 1fea6ab5cee4edf21b7250e3aaa25b902a263498 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 30 Apr 2024 12:49:20 -0600 Subject: [PATCH 114/200] be2: temp remove bytes feature processing --- capa/features/extractors/binexport2/insn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 205eb2880..5ee5f22a8 100644 --- a/capa/features/extractors/binexport2/insn.py +++ 
b/capa/features/extractors/binexport2/insn.py @@ -404,7 +404,7 @@ def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iter INSTRUCTION_HANDLERS = ( extract_insn_api_features, extract_insn_number_features, - extract_insn_bytes_features, + #extract_insn_bytes_features, extract_insn_string_features, extract_insn_offset_features, extract_insn_nzxor_characteristic_features, From 87604811cbc3eea41a6aa94c85607b6a4ff3c135 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 3 May 2024 10:03:08 +0200 Subject: [PATCH 115/200] binexport: read memory from an address space extracted from PE/ELF closes #2061 --- .../extractors/binexport2/__init__.py | 95 +++++++++++++++++++ .../extractors/binexport2/extractor.py | 4 +- capa/features/extractors/binexport2/insn.py | 65 ++----------- 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index b0bf597a3..e6bfc5aab 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -10,6 +10,7 @@ protoc --python_out=. --mypy_out=. 
binexport2.proto """ +import io import hashlib import logging import contextlib @@ -18,6 +19,9 @@ from collections import defaultdict from dataclasses import dataclass +from pefile import PE +from elftools.elf.elffile import ELFFile + import capa.features.extractors.common from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -305,12 +309,103 @@ def _find_base_address(self): self.base_address = min(s.address for s in sections_with_perms) +@dataclass +class MemoryRegion: + # location of the bytes, potentially relative to a base address + address: int + buf: bytes + + @property + def end(self) -> int: + return self.address + len(self.buf) + + def contains(self, address: int) -> bool: + # note: address must be relative to any base address + return self.address <= address < self.end + + +class ReadMemoryError(ValueError): ... + + +class AddressNotMappedError(ReadMemoryError): ... + + +@dataclass +class AddressSpace: + base_address: int + memory_regions: Tuple[MemoryRegion, ...] + + def read_memory(self, address: int, length: int) -> bytes: + rva = address - self.base_address + for region in self.memory_regions: + if region.contains(rva): + offset = rva - region.address + return region.buf[offset : offset + length] + + raise AddressNotMappedError(address) + + @classmethod + def from_pe(cls, pe: PE): + base_address = pe.OPTIONAL_HEADER.ImageBase + + regions = [] + for section in pe.sections: + address = section.VirtualAddress + size = section.Misc_VirtualSize + buf = section.get_data() + + if len(buf) != size: + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. + buf += b"\x00" * (size - len(buf)) + + regions.append(MemoryRegion(address, buf)) + + return cls(base_address, tuple(regions)) + + @classmethod + def from_elf(cls, elf: ELFFile): + regions = [] + + # ELF segments are for runtime data, + # ELF sections are for link-time data. 
+ for segment in elf.iter_segments(): + # assume p_align is consistent with addresses here. + # otherwise, should harden this loader. + segment_rva = segment.header.p_vaddr + segment_size = segment.header.p_memsz + segment_data = segment.data() + + if len(segment_data) < segment_size: + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. + segment_data += b"\x00" * (segment_size - len(segment_data)) + + regions.append(MemoryRegion(segment_rva, segment_data)) + + return cls(0, tuple(regions)) + + @classmethod + def from_buf(cls, buf: bytes): + if buf.startswith(capa.features.extractors.common.MATCH_PE): + pe = PE(data=buf) + return cls.from_pe(pe) + elif buf.startswith(capa.features.extractors.common.MATCH_ELF): + elf = ELFFile(io.BytesIO(buf)) + return cls.from_elf(elf) + else: + raise NotImplementedError("file format address space") + + @dataclass class AnalysisContext: sample_bytes: bytes be2: BinExport2 idx: BinExport2Index analysis: BinExport2Analysis + address_space: AddressSpace @dataclass diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 780fc7592..ac54996d2 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -17,6 +17,7 @@ from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( + AddressSpace, AnalysisContext, BinExport2Index, FunctionContext, @@ -43,7 +44,8 @@ def __init__(self, be2: BinExport2, buf: bytes): self.buf = buf self.idx = BinExport2Index(self.be2) self.analysis = BinExport2Analysis(self.be2, self.idx, self.buf) - self.ctx = AnalysisContext(self.buf, self.be2, self.idx, self.analysis) + address_space = AddressSpace.from_buf(buf) + self.ctx = AnalysisContext(self.buf, self.be2, self.idx, self.analysis, address_space) self.global_features: 
List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 5ee5f22a8..b1eead657 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -5,19 +5,15 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -import io import logging from typing import List, Tuple, Iterator -import pefile -from elftools.elf.elffile import ELFFile - import capa.features.extractors.helpers import capa.features.extractors.strings from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, InstructionContext +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -220,60 +216,14 @@ def extract_insn_number_features( yield OperandNumber(i, value), ih.address -class ReadMemoryError(ValueError): ... 
- - -def read_memory(ctx: AnalysisContext, sample_bytes: bytes, address: int, size: int, cache) -> bytes: - rva = address - ctx.analysis.base_address - - try: - if sample_bytes.startswith(capa.features.extractors.common.MATCH_PE): - pe = cache.get("pe") - if not pe: - pe = pefile.PE(data=sample_bytes) - cache["pe"] = pe - return pe.get_data(rva, size) - elif sample_bytes.startswith(capa.features.extractors.common.MATCH_ELF): - elf = cache.get("elf") - if not elf: - elf = ELFFile(io.BytesIO(sample_bytes)) - cache["elf"] = elf - - # ELF segments are for runtime data, - # ELF sections are for link-time data. - for segment in elf.iter_segments(): - # assume p_align is consistent with addresses here. - # otherwise, should harden this loader. - segment_rva = segment.header.p_vaddr - segment_size = segment.header.p_memsz - if segment_rva <= rva < segment_rva + segment_size: - segment_data = segment.data() - - # pad the section with NULLs - # assume page alignment is already handled. - # might need more hardening here. 
- if len(segment_data) < segment_size: - segment_data += b"\x00" * (segment_size - len(segment_data)) - - segment_offset = rva - segment_rva - return segment_data[segment_offset : segment_offset + size] - - raise ReadMemoryError("address not mapped") - else: - logger.warning("unsupported format") - raise ReadMemoryError("unsupported file format") - except Exception as e: - raise ReadMemoryError("failed to read memory: " + str(e)) from e - - def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner ctx = fhi.ctx - be2 = fhi.ctx.be2 - sample_bytes = fhi.ctx.sample_bytes - idx = fhi.ctx.idx + be2 = ctx.be2 + idx = ctx.idx + address_space = ctx.address_space instruction_index = ii.instruction_index @@ -288,9 +238,10 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl for reference_address in reference_addresses: try: - # at end of segment then there might be an overrun here. - buf = read_memory(ctx, sample_bytes, reference_address, 0x100, fh.ctx) + # if at end of segment then there might be an overrun here. 
+ buf = address_space.read_memory(reference_address, 0x100) except ReadMemoryError: + logger.debug("failed to read memory: 0x%x", reference_address) continue if capa.features.extractors.helpers.all_zeros(buf): @@ -404,7 +355,7 @@ def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iter INSTRUCTION_HANDLERS = ( extract_insn_api_features, extract_insn_number_features, - #extract_insn_bytes_features, + extract_insn_bytes_features, extract_insn_string_features, extract_insn_offset_features, extract_insn_nzxor_characteristic_features, From 37aca874a809820d498398f0ac7745330651c3b9 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 3 May 2024 10:24:51 -0600 Subject: [PATCH 116/200] be2: resolve thunks to imported functions --- .../extractors/binexport2/__init__.py | 57 ----------------- .../extractors/binexport2/function.py | 7 --- .../features/extractors/binexport2/helpers.py | 11 ++++ capa/features/extractors/binexport2/insn.py | 62 ++++++++++++++----- 4 files changed, 58 insertions(+), 79 deletions(-) create mode 100644 capa/features/extractors/binexport2/helpers.py diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index b0bf597a3..a4e43602b 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -236,67 +236,10 @@ def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): self.be2 = be2 self.idx = idx self.buf = buf - - # from virtual address to call graph vertex representing the import - self.thunks: Dict[int, int] = {} self.base_address: int = 0 - self._find_got_thunks() self._find_base_address() - def _find_got_thunks(self): - if self.be2.meta_information.architecture_name != "aarch64": - logger.debug("skipping GOT thunk analysis on non-aarch64") - return - - if not self.buf.startswith(capa.features.extractors.common.MATCH_ELF): - logger.debug("skipping GOT thunk analysis on non-ELF") - return - - for 
vertex_index, vertex in enumerate(self.be2.call_graph.vertex): - if not vertex.HasField("address"): - continue - - if not vertex.HasField("mangled_name"): - continue - - if BinExport2.CallGraph.Vertex.Type.IMPORTED != vertex.type: - continue - - if len(self.idx.callers_by_vertex_index[vertex_index]) != 1: - # find imports with a single caller, - # which should be the thunk - continue - - maybe_thunk_vertex_index = self.idx.callers_by_vertex_index[vertex_index][0] - maybe_thunk_vertex = self.be2.call_graph.vertex[maybe_thunk_vertex_index] - maybe_thunk_address = maybe_thunk_vertex.address - - maybe_thunk_flow_graph_index = self.idx.flow_graph_index_by_address[maybe_thunk_address] - maybe_thunk_flow_graph = self.be2.flow_graph[maybe_thunk_flow_graph_index] - - if len(maybe_thunk_flow_graph.basic_block_index) != 1: - # should have a single basic block - continue - - maybe_thunk_basic_block = self.be2.basic_block[maybe_thunk_flow_graph.entry_basic_block_index] - if len(list(self.idx.instruction_indices(maybe_thunk_basic_block))) != 4: - # thunk should look like these four instructions. - # fstat: - # 000008b0 adrp x16, 0x11000 - # 000008b4 ldr x17, [x16, #0xf88] {fstat} - # 000008b8 add x16, x16, #0xf88 {fstat} - # 000008bc br x17 - # which relies on the disassembler to recognize the target of the call/br - # to go to the GOT/external symbol. - continue - - thunk_address = maybe_thunk_address - thunk_name = vertex.mangled_name - logger.debug("found GOT thunk: 0x%x -> %s", thunk_address, thunk_name) - - self.thunks[thunk_address] = vertex_index - def _find_base_address(self): sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section) # assume the lowest address is the base address. 
diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index 33878e563..e437c0dc7 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -49,8 +49,6 @@ def extract_function_name(fh: FunctionHandle): be2 = fhi.ctx.be2 idx = fhi.ctx.idx - analysis = fhi.ctx.analysis - flow_graph_index = fhi.flow_graph_index flow_graph_address = idx.flow_graph_address_by_index[flow_graph_index] @@ -59,11 +57,6 @@ def extract_function_name(fh: FunctionHandle): if vertex.HasField("mangled_name"): yield FunctionName(vertex.mangled_name), fh.address - elif flow_graph_address in analysis.thunks: - thunk_vertex_index = analysis.thunks[flow_graph_address] - thunk_vertex = be2.call_graph.vertex[thunk_vertex_index] - if thunk_vertex.HasField("mangled_name"): - yield FunctionName(thunk_vertex.mangled_name), fh.address def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py new file mode 100644 index 000000000..e4698da6b --- /dev/null +++ b/capa/features/extractors/binexport2/helpers.py @@ -0,0 +1,11 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ + +def is_vertex_type(vertex, type_): + return vertex.HasField("type") and vertex.type == type_ diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 5ee5f22a8..5892061b7 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,8 +14,9 @@ import capa.features.extractors.helpers import capa.features.extractors.strings +import capa.features.extractors.binexport2.helpers from capa.features.insn import API, Number, Mnemonic, OperandNumber -from capa.features.common import Bytes, String, Feature, Characteristic +from capa.features.common import THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle @@ -24,31 +25,61 @@ logger = logging.getLogger(__name__) +def resolve_vertex_thunk_by_index(vertex_idx, ctx, thunk_depth=THUNK_CHAIN_DEPTH_DELTA): + curr_idx = vertex_idx + for _ in range(thunk_depth): + thunked_idx = ctx.idx.callees_by_vertex_index[curr_idx][0] + thunked_be2_vertex = ctx.be2.call_graph.vertex[thunked_idx] + + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + thunked_be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + return thunked_idx + + curr_idx = thunked_idx + return vertex_idx + + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner be2 = fhi.ctx.be2 - idx = fhi.ctx.idx - analysis = fhi.ctx.analysis - - instruction = be2.instruction[ii.instruction_index] + be2_idx = fhi.ctx.idx + be2_insn = be2.instruction[ii.instruction_index] - if not instruction.call_target: + if not be2_insn.call_target: return - for call_target_address in 
instruction.call_target: - if call_target_address in analysis.thunks: - vertex_index = analysis.thunks[call_target_address] - elif call_target_address not in idx.vertex_index_by_address: + for addr in be2_insn.call_target: + if addr not in be2_idx.vertex_index_by_address: + # disassembler did not define function at address + logger.debug("0x%x is not a vertex", addr) + continue + + vertex_idx = be2_idx.vertex_index_by_address[addr] + be2_vertex = be2.call_graph.vertex[vertex_idx] + + if capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + vertex_idx = resolve_vertex_thunk_by_index(vertex_idx, fhi.ctx) + be2_vertex = be2.call_graph.vertex[vertex_idx] + + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + ): continue - else: - vertex_index = idx.vertex_index_by_address[call_target_address] - vertex = be2.call_graph.vertex[vertex_index] - if not vertex.HasField("mangled_name"): + if not be2_vertex.HasField("mangled_name"): + logger.debug("vertex %d does not have mangled_name", vertex_idx) continue + api_name = be2_vertex.mangled_name + yield API(api_name), ih.address + + """ + # TODO: re-enable pending https://github.com/google/binexport/issues/126#issuecomment-2074402906 function_name = vertex.mangled_name if vertex.HasField("library_index"): # TODO: this seems to be incorrect for Ghidra extractor @@ -59,6 +90,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle yield API(name), ih.address else: yield API(function_name), ih.address + """ def is_address_mapped(be2: BinExport2, address: int) -> bool: @@ -404,7 +436,7 @@ def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iter INSTRUCTION_HANDLERS = ( extract_insn_api_features, extract_insn_number_features, - #extract_insn_bytes_features, + # extract_insn_bytes_features, extract_insn_string_features, 
extract_insn_offset_features, extract_insn_nzxor_characteristic_features, From 89c9126dc670606324bff9bbf5d6a1c8203ec3fb Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 3 May 2024 10:59:00 -0600 Subject: [PATCH 117/200] be2: check for be2 string reference before bytes/string extraction overhead --- capa/features/extractors/binexport2/insn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index f4e86dc44..ccb27c3ea 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -259,6 +259,10 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl instruction_index = ii.instruction_index + if instruction_index in idx.string_reference_index_by_source_instruction_index: + # disassembler already identified string reference from instruction + return + reference_addresses: List[int] = [] if instruction_index in idx.data_reference_index_by_source_instruction_index: From bf33db8850c459004236a5f8412c4cbb957a5677 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 3 May 2024 11:05:23 -0600 Subject: [PATCH 118/200] be2: remove unneeded check --- capa/features/extractors/binexport2/insn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index ccb27c3ea..ff2892f00 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -44,9 +44,6 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle be2_idx = fhi.ctx.idx be2_insn = be2.instruction[ii.instruction_index] - if not be2_insn.call_target: - return - for addr in be2_insn.call_target: if addr not in be2_idx.vertex_index_by_address: # disassembler did not define function at address From 8050a2f2e58dca4130d204dbbadee1a379f88a84 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 3 
May 2024 13:38:36 -0600 Subject: [PATCH 119/200] be2: do not process thunks --- capa/features/extractors/binexport2/extractor.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index ac54996d2..8373f1f4a 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -12,6 +12,7 @@ import capa.features.extractors.common import capa.features.extractors.binexport2.file import capa.features.extractors.binexport2.insn +import capa.features.extractors.binexport2.helpers import capa.features.extractors.binexport2.function import capa.features.extractors.binexport2.basicblock from capa.features.common import Feature @@ -68,6 +69,16 @@ def get_functions(self) -> Iterator[FunctionHandle]: for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): entry_basic_block_index = flow_graph.entry_basic_block_index flow_graph_address = self.idx.get_basic_block_address(entry_basic_block_index) + + vertex_idx = self.idx.vertex_index_by_address[flow_graph_address] + be2_vertex = self.be2.call_graph.vertex[vertex_idx] + + # skip thunks + if capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + continue + yield FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), inner=FunctionContext(self.ctx, flow_graph_index), From 5390e1a0e801d2f6e8c5bcb7926ec453479601bf Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 7 May 2024 15:47:38 +0200 Subject: [PATCH 120/200] be2: insn: polish thunk handling a bit --- capa/features/extractors/binexport2/insn.py | 31 +++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index ff2892f00..9d05ca845 100644 --- a/capa/features/extractors/binexport2/insn.py +++ 
b/capa/features/extractors/binexport2/insn.py @@ -14,21 +14,35 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) -def resolve_vertex_thunk_by_index(vertex_idx, ctx, thunk_depth=THUNK_CHAIN_DEPTH_DELTA): +def resolve_vertex_thunk_by_index(ctx: AnalysisContext, vertex_idx: int, thunk_depth=THUNK_CHAIN_DEPTH_DELTA): + vertex = ctx.be2.call_graph.vertex[vertex_idx] + assert capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ) + curr_idx = vertex_idx for _ in range(thunk_depth): - thunked_idx = ctx.idx.callees_by_vertex_index[curr_idx][0] - thunked_be2_vertex = ctx.be2.call_graph.vertex[thunked_idx] + # follow the chain of thunks one link + thunk_callees = ctx.idx.callees_by_vertex_index[curr_idx] + + # if this doesn't hold, then it doesn't seem like this is a thunk, + # because either, len is: + # 0 and the thunk doesn't point to anything, or + # >1 and the thunk may end up at many functions. 
+ assert len(thunk_callees) == 1 + + thunked_idx = thunk_callees[0] + thunked_vertex = ctx.be2.call_graph.vertex[thunked_idx] if not capa.features.extractors.binexport2.helpers.is_vertex_type( - thunked_be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK ): return thunked_idx @@ -40,8 +54,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 - be2_idx = fhi.ctx.idx + ctx = fhi.ctx + be2 = ctx.be2 + be2_idx = ctx.idx be2_insn = be2.instruction[ii.instruction_index] for addr in be2_insn.call_target: @@ -56,7 +71,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle if capa.features.extractors.binexport2.helpers.is_vertex_type( be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK ): - vertex_idx = resolve_vertex_thunk_by_index(vertex_idx, fhi.ctx) + vertex_idx = resolve_vertex_thunk_by_index(ctx, vertex_idx) be2_vertex = be2.call_graph.vertex[vertex_idx] if not capa.features.extractors.binexport2.helpers.is_vertex_type( From 9a9d5a2e33ea2363ba59f030688ce03c434da598 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 7 May 2024 14:55:50 -0600 Subject: [PATCH 121/200] be2: pre-compute thunk targets --- .../extractors/binexport2/__init__.py | 34 ++++++++++ capa/features/extractors/binexport2/insn.py | 62 +++++-------------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 08179db40..2a953b880 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -22,7 +22,9 @@ from pefile import PE from elftools.elf.elffile import ELFFile +import capa.features.common import capa.features.extractors.common +import capa.features.extractors.binexport2.helpers from 
capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) @@ -241,8 +243,10 @@ def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): self.idx = idx self.buf = buf self.base_address: int = 0 + self.thunks: Dict[int, int] = {} self._find_base_address() + self._compute_thunks() def _find_base_address(self): sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section) @@ -251,6 +255,36 @@ def _find_base_address(self): # libraries mapped into memory. self.base_address = min(s.address for s in sections_with_perms) + def _compute_thunks(self): + for addr, idx in self.idx.vertex_index_by_address.items(): + vertex = self.be2.call_graph.vertex[idx] + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + continue + + curr_idx = idx + for _ in range(capa.features.common.THUNK_CHAIN_DEPTH_DELTA): + thunk_callees = self.idx.callees_by_vertex_index[curr_idx] + # if this doesn't hold, then it doesn't seem like this is a thunk, + # because either, len is: + # 0 and the thunk doesn't point to anything, or + # >1 and the thunk may end up at many functions. 
+ assert len(thunk_callees) == 1 + + thunked_idx = thunk_callees[0] + thunked_vertex = self.be2.call_graph.vertex[thunked_idx] + + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + assert thunked_vertex.HasField("address") + + self.thunks[addr] = thunked_vertex.address + break + + curr_idx = thunked_idx + @dataclass class MemoryRegion: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 9d05ca845..9b027ec88 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -12,7 +12,7 @@ import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers from capa.features.insn import API, Number, Mnemonic, OperandNumber -from capa.features.common import THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic +from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle @@ -21,69 +21,37 @@ logger = logging.getLogger(__name__) -def resolve_vertex_thunk_by_index(ctx: AnalysisContext, vertex_idx: int, thunk_depth=THUNK_CHAIN_DEPTH_DELTA): - vertex = ctx.be2.call_graph.vertex[vertex_idx] - assert capa.features.extractors.binexport2.helpers.is_vertex_type( - vertex, BinExport2.CallGraph.Vertex.Type.THUNK - ) - - curr_idx = vertex_idx - for _ in range(thunk_depth): - # follow the chain of thunks one link - thunk_callees = ctx.idx.callees_by_vertex_index[curr_idx] - - # if this doesn't hold, then it doesn't seem like this is a thunk, - # because either, len is: - # 0 and the thunk doesn't point to anything, or - # >1 and the thunk may end up at many functions. 
- assert len(thunk_callees) == 1 - - thunked_idx = thunk_callees[0] - thunked_vertex = ctx.be2.call_graph.vertex[thunked_idx] - - if not capa.features.extractors.binexport2.helpers.is_vertex_type( - thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK - ): - return thunked_idx - - curr_idx = thunked_idx - return vertex_idx - - def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - ctx = fhi.ctx - be2 = ctx.be2 - be2_idx = ctx.idx - be2_insn = be2.instruction[ii.instruction_index] + be2 = fhi.ctx.be2 + be2_index = fhi.ctx.idx + be2_analysis = fhi.ctx.analysis + insn = be2.instruction[ii.instruction_index] - for addr in be2_insn.call_target: - if addr not in be2_idx.vertex_index_by_address: + for addr in insn.call_target: + if addr in be2_analysis.thunks: + addr = be2_analysis.thunks[addr] + + if addr not in be2_index.vertex_index_by_address: # disassembler did not define function at address logger.debug("0x%x is not a vertex", addr) continue - vertex_idx = be2_idx.vertex_index_by_address[addr] - be2_vertex = be2.call_graph.vertex[vertex_idx] - - if capa.features.extractors.binexport2.helpers.is_vertex_type( - be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK - ): - vertex_idx = resolve_vertex_thunk_by_index(ctx, vertex_idx) - be2_vertex = be2.call_graph.vertex[vertex_idx] + vertex_idx = be2_index.vertex_index_by_address[addr] + vertex = be2.call_graph.vertex[vertex_idx] if not capa.features.extractors.binexport2.helpers.is_vertex_type( - be2_vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED ): continue - if not be2_vertex.HasField("mangled_name"): + if not vertex.HasField("mangled_name"): logger.debug("vertex %d does not have mangled_name", vertex_idx) continue - api_name = be2_vertex.mangled_name + api_name = vertex.mangled_name yield API(api_name), ih.address """ From 
8a3b267a0593853fbeef4906dfa8659d54039bc1 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 23 May 2024 14:10:53 +0200 Subject: [PATCH 122/200] parse negative numbers --- capa/features/extractors/binexport2/insn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 9b027ec88..ba1a25f27 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -145,10 +145,12 @@ def extract_insn_number_features( # temporarily, we'll have to try to guess at the interpretation. symbol = _gsm_get_instruction_operand(be2, instruction_index, i) - if symbol.startswith("#0x"): + if symbol.startswith(("#0x", "#-0x")): # like: # - type: SYMBOL # symbol: "#0xffffffff" + # - type: SYMBOL + # symbol: "#-0x1" try: value = int(symbol[len("#") :], 0x10) except ValueError: From 0ad7aeaa426ab6f831bc2f603a6b4af13b05018a Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 23 May 2024 14:14:17 +0200 Subject: [PATCH 123/200] update tests to use Ghidra-generated BinExport file --- tests/fixtures.py | 2 +- tests/test_binexport_features.py | 86 ++++++++++++++++---------------- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 17bb24ac4..6a01c1d4e 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -411,7 +411,7 @@ def get_data_path_by_name(name) -> Path: CD / "data" / "binexport2" - / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ida.BinExport" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" ) else: raise ValueError(f"unexpected sample fixture: {name}") diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 12cae0b2b..ce5d5bb04 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -45,11 +45,12 @@ ("687e79.be2", "file", 
capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), True), ("687e79.be2", "file", capa.features.file.Import("nope"), False), # function/characteristic(loop) - ("687e79.be2", "function=0x56c0", capa.features.common.Characteristic("loop"), True), - ("687e79.be2", "function=0x4c40", capa.features.common.Characteristic("loop"), False), + ("687e79.be2", "function=0x1056c0", capa.features.common.Characteristic("loop"), True), # TODO + ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("loop"), False), # bb/characteristic(tight loop) ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), + ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("tight loop"), False), # bb/characteristic(stack string) ( "687e79.be2", @@ -77,16 +78,16 @@ "xfail: not implemented yet", ), # insn/mnemonic - ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("stp"), True), - ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("adrl"), True), - ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("bl"), True), - ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("in"), False), - ("687e79.be2", "function=0x7588", capa.features.insn.Mnemonic("out"), False), + ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("stp"), True), + ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), + ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("bl"), True), + ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), + ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), # insn/operand.number - ("687e79.be2", "function=0x5128,bb=0x51e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True), - ("687e79.be2", "function=0x7588,bb=0x7588", 
capa.features.insn.OperandNumber(1, 0x3), True), - ("687e79.be2", "function=0x7588,bb=0x7588,insn=0x7598", capa.features.insn.OperandNumber(1, 0x3), True), - ("687e79.be2", "function=0x7588,bb=0x7588", capa.features.insn.OperandNumber(3, 0x10), True), + ("687e79.be2", "function=0x105128,bb=0x1051e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True), + ("687e79.be2", "function=0x107588,bb=0x107588", capa.features.insn.OperandNumber(1, 0x8), True), + ("687e79.be2", "function=0x107588,bb=0x107588,insn=0x1075a4", capa.features.insn.OperandNumber(1, 0x8), True), + ("687e79.be2", "function=0x107588,bb=0x107588", capa.features.insn.OperandNumber(3, 0x10), True), # insn/operand.offset ( "687e79.be2", @@ -101,12 +102,13 @@ "xfail: not implemented yet", ), # insn/number - ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x3), True), - ("687e79.be2", "function=0x7588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?"), - ("687e79.be2", "function=0x5C88", capa.features.insn.Number(0xF000), True), + ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x3), True), + ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?"), + ("687e79.be2", "function=0x105C88", capa.features.insn.Number(0xF000), True), # insn/number: negative - ("687e79.be2", "function=0x57f8,bb=0x57f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), True), - ("687e79.be2", "function=0x66e0,bb=0x68c4", capa.features.insn.Number(0xFFFFFFFF), True), + ("687e79.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(-1), True), # TODO this should be unsigned / use two's complement + ("687e79.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), "xfail: not implemented yet"), + ("687e79.be2", "function=0x1066e0,bb=0x1068c4", capa.features.insn.Number(0xFFFFFFFF), True), # insn/offset ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), 
("687e79.be2", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet"), @@ -139,18 +141,17 @@ # ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api # not extracting dll name - ("687e79.be2", "function=0x5c88", capa.features.insn.API("memset"), "xfail: not working yet"), - ("687e79.be2", "function=0x5c88", capa.features.insn.API(".memset"), True), - ("687e79.be2", "function=0x5c88", capa.features.insn.API("Nope"), False), + ("687e79.be2", "function=0x105c88", capa.features.insn.API("memset"), True), + ("687e79.be2", "function=0x105c88", capa.features.insn.API("Nope"), False), # insn/string - ("687e79.be2", "function=0x7588", capa.features.common.String("AppDataService start"), True), - ("687e79.be2", "function=0x75c0", capa.features.common.String("AppDataService"), True), - ("687e79.be2", "function=0x7588", capa.features.common.String("nope"), False), - ("687e79.be2", "function=0x6d58", capa.features.common.String("/data/misc/wpa_supplicant"), True), + ("687e79.be2", "function=0x107588", capa.features.common.String("AppDataService start"), True), + ("687e79.be2", "function=0x1075c0", capa.features.common.String("AppDataService"), True), + ("687e79.be2", "function=0x107588", capa.features.common.String("nope"), False), + ("687e79.be2", "function=0x106d58", capa.features.common.String("/data/misc/wpa_supplicant"), True), # insn/regex - ("687e79.be2", "function=0x5c88", capa.features.common.Regex("innerRename"), True), - ("687e79.be2", "function=0x6d58", capa.features.common.Regex("/data/misc"), True), - ("687e79.be2", "function=0x6d58", capa.features.common.Substring("/data/misc"), True), + ("687e79.be2", "function=0x105c88", capa.features.common.Regex("innerRename"), True), + ("687e79.be2", "function=0x106d58", capa.features.common.Regex("/data/misc"), True), + ("687e79.be2", "function=0x106d58", capa.features.common.Substring("/data/misc"), True), # # insn/string, pointer to string # 
("mimikatz", "function=0x44EDEF", capa.features.common.String("INPUTEVENT"), True), # # insn/string, direct memory reference @@ -197,8 +198,8 @@ # # insn/characteristic(cross section flow): imports don't count # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), # insn/characteristic(recursive call) - ("687e79.be2", "function=0x5b38", capa.features.common.Characteristic("recursive call"), True), - ("687e79.be2", "function=0x6530", capa.features.common.Characteristic("recursive call"), True), + ("687e79.be2", "function=0x105b38", capa.features.common.Characteristic("recursive call"), True), + ("687e79.be2", "function=0x106530", capa.features.common.Characteristic("recursive call"), True), # insn/characteristic(indirect call) ( "687e79.be2", @@ -213,37 +214,37 @@ "xfail: not implemented yet", ), # insn/characteristic(calls from) - ("687e79.be2", "function=0x5080", capa.features.common.Characteristic("calls from"), True), - ("687e79.be2", "function=0x4d20", capa.features.common.Characteristic("calls from"), False), + ("687e79.be2", "function=0x105080", capa.features.common.Characteristic("calls from"), True), + ("687e79.be2", "function=0x1070e8", capa.features.common.Characteristic("calls from"), False), # function/characteristic(calls to) - ("687e79.be2", "function=0x4b90", capa.features.common.Characteristic("calls to"), True), + ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("calls to"), True), # file/function-name - ("687e79.be2", "file", capa.features.file.FunctionName(".__libc_init"), True), + ("687e79.be2", "file", capa.features.file.FunctionName("__libc_init"), "xfail: TODO should this be a function-name?"), # os & format & arch ("687e79.be2", "file", OS(OS_ANDROID), True), ("687e79.be2", "file", OS(OS_LINUX), False), ("687e79.be2", "file", OS(OS_WINDOWS), False), # os & format & arch are also global features - ("687e79.be2", "function=0x7588", OS(OS_ANDROID), True), - ("687e79.be2", 
"function=0x75c0,bb=0x76c0", OS(OS_ANDROID), True), + ("687e79.be2", "function=0x107588", OS(OS_ANDROID), True), + ("687e79.be2", "function=0x1075c0,bb=0x1076c0", OS(OS_ANDROID), True), ("687e79.be2", "file", Arch(ARCH_I386), False), ("687e79.be2", "file", Arch(ARCH_AMD64), False), ("687e79.be2", "file", Arch(ARCH_AARCH64), True), - ("687e79.be2", "function=0x7588", Arch(ARCH_AARCH64), True), - ("687e79.be2", "function=0x75c0,bb=0x76c0", Arch(ARCH_AARCH64), True), + ("687e79.be2", "function=0x107588", Arch(ARCH_AARCH64), True), + ("687e79.be2", "function=0x1075c0,bb=0x1076c0", Arch(ARCH_AARCH64), True), ("687e79.be2", "file", Format(FORMAT_ELF), True), ("687e79.be2", "file", Format(FORMAT_PE), False), - ("687e79.be2", "function=0x7588", Format(FORMAT_ELF), True), - ("687e79.be2", "function=0x7588", Format(FORMAT_PE), False), + ("687e79.be2", "function=0x107588", Format(FORMAT_ELF), True), + ("687e79.be2", "function=0x107588", Format(FORMAT_PE), False), ( "687e79.be2", - "function=0x10002385,bb=0x10002385", + "function=0x0,bb=0x0", capa.features.common.Characteristic("call $+5"), "xfail: not implemented yet", ), ( "687e79.be2", - "function=0x10001510,bb=0x100015c0", + "function=0x0,bb=0x0", capa.features.common.Characteristic("call $+5"), "xfail: not implemented yet", ), @@ -261,6 +262,7 @@ ) def test_binexport_features_elf_aarch64(sample, scope, feature, expected): if not isinstance(expected, bool): + # (for now) xfails indicates using string like: "xfail: not implemented yet" pytest.xfail(expected) fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) @@ -272,8 +274,8 @@ def test_binexport_features_elf_aarch64(sample, scope, feature, expected): ) def test_binexport_features_pe_x86(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: - pytest.skip("for now only testing mimikatz.exe_ IDA BinExport file") - sample = sample.parent / "binexport2" / (sample.name + ".ida.BinExport") + pytest.skip("for 
now only testing mimikatz.exe_ Ghidra BinExport file") + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") assert sample.exists() fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) From b364485ba96603fbc633e6be8a038aaa90179e22 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 23 May 2024 14:23:37 +0200 Subject: [PATCH 124/200] remove unused import --- capa/features/extractors/binexport2/insn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index ba1a25f27..c173aae02 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,7 +14,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 From 674a89b7bb0d233645aaa22082e1f67515348dc4 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 23 May 2024 14:35:13 +0200 Subject: [PATCH 125/200] black reformat --- tests/test_binexport_features.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index ce5d5bb04..432ce6ee2 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -106,8 +106,18 @@ ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?"), ("687e79.be2", "function=0x105C88", 
capa.features.insn.Number(0xF000), True), # insn/number: negative - ("687e79.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(-1), True), # TODO this should be unsigned / use two's complement - ("687e79.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), "xfail: not implemented yet"), + ( + "687e79.be2", + "function=0x1057f8,bb=0x1057f8", + capa.features.insn.Number(-1), + True, + ), # TODO this should be unsigned / use two's complement + ( + "687e79.be2", + "function=0x1057f8,bb=0x1057f8", + capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), + "xfail: not implemented yet", + ), ("687e79.be2", "function=0x1066e0,bb=0x1068c4", capa.features.insn.Number(0xFFFFFFFF), True), # insn/offset ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), @@ -219,7 +229,12 @@ # function/characteristic(calls to) ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("calls to"), True), # file/function-name - ("687e79.be2", "file", capa.features.file.FunctionName("__libc_init"), "xfail: TODO should this be a function-name?"), + ( + "687e79.be2", + "file", + capa.features.file.FunctionName("__libc_init"), + "xfail: TODO should this be a function-name?", + ), # os & format & arch ("687e79.be2", "file", OS(OS_ANDROID), True), ("687e79.be2", "file", OS(OS_LINUX), False), From 2b0cc2c0e6a737d287f6fdc400661bef65a9ad44 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 23 May 2024 14:53:36 +0200 Subject: [PATCH 126/200] run tests always (for now) --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9a96fbcd7..7a88b6853 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -64,6 +64,7 @@ jobs: run: python scripts/lint.py rules/ tests: + if: always() # TODO remove once code_style passes name: Tests in ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} 
needs: [code_style, rule_linter] From 51578ca96d929248abc7b9786342e2be4b6de47d Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 24 May 2024 13:38:56 -0600 Subject: [PATCH 127/200] binexport: tests: fix test case --- tests/test_binexport_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 432ce6ee2..b39fa2487 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -157,7 +157,7 @@ ("687e79.be2", "function=0x107588", capa.features.common.String("AppDataService start"), True), ("687e79.be2", "function=0x1075c0", capa.features.common.String("AppDataService"), True), ("687e79.be2", "function=0x107588", capa.features.common.String("nope"), False), - ("687e79.be2", "function=0x106d58", capa.features.common.String("/data/misc/wpa_supplicant"), True), + ("687e79.be2", "function=0x106d58", capa.features.common.String("/data/misc/wpa_supplicant.conf"), True), # insn/regex ("687e79.be2", "function=0x105c88", capa.features.common.Regex("innerRename"), True), ("687e79.be2", "function=0x106d58", capa.features.common.Regex("/data/misc"), True), From a80bcc73d7eff539ec6a02bbeb87ce23e878cd8f Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 24 May 2024 13:39:54 -0600 Subject: [PATCH 128/200] binexport: extractor: fix insn lint --- capa/features/extractors/binexport2/insn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index c173aae02..826a507e0 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -31,8 +31,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle insn = be2.instruction[ii.instruction_index] for addr in insn.call_target: - if addr in be2_analysis.thunks: - addr = be2_analysis.thunks[addr] + addr = be2_analysis.thunks.get(addr, addr) 
if addr not in be2_index.vertex_index_by_address: # disassembler did not define function at address From 510aed20649a4b05381c2aa47cfdd698bb17c327 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 24 May 2024 15:12:43 -0600 Subject: [PATCH 129/200] binexport: addressspace: use base address recovered from binexport file --- capa/features/extractors/binexport2/__init__.py | 16 ++++++++-------- capa/features/extractors/binexport2/extractor.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 2a953b880..29b55ee95 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -255,6 +255,8 @@ def _find_base_address(self): # libraries mapped into memory. self.base_address = min(s.address for s in sections_with_perms) + logger.debug("found base address: %x", self.base_address) + def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): vertex = self.be2.call_graph.vertex[idx] @@ -322,9 +324,7 @@ def read_memory(self, address: int, length: int) -> bytes: raise AddressNotMappedError(address) @classmethod - def from_pe(cls, pe: PE): - base_address = pe.OPTIONAL_HEADER.ImageBase - + def from_pe(cls, pe: PE, base_address: int): regions = [] for section in pe.sections: address = section.VirtualAddress @@ -342,7 +342,7 @@ def from_pe(cls, pe: PE): return cls(base_address, tuple(regions)) @classmethod - def from_elf(cls, elf: ELFFile): + def from_elf(cls, elf: ELFFile, base_address: int): regions = [] # ELF segments are for runtime data, @@ -362,16 +362,16 @@ def from_elf(cls, elf: ELFFile): regions.append(MemoryRegion(segment_rva, segment_data)) - return cls(0, tuple(regions)) + return cls(base_address, tuple(regions)) @classmethod - def from_buf(cls, buf: bytes): + def from_buf(cls, buf: bytes, base_address: int): if 
buf.startswith(capa.features.extractors.common.MATCH_PE): pe = PE(data=buf) - return cls.from_pe(pe) + return cls.from_pe(pe, base_address) elif buf.startswith(capa.features.extractors.common.MATCH_ELF): elf = ELFFile(io.BytesIO(buf)) - return cls.from_elf(elf) + return cls.from_elf(elf, base_address) else: raise NotImplementedError("file format address space") diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 8373f1f4a..57e693359 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -45,7 +45,7 @@ def __init__(self, be2: BinExport2, buf: bytes): self.buf = buf self.idx = BinExport2Index(self.be2) self.analysis = BinExport2Analysis(self.be2, self.idx, self.buf) - address_space = AddressSpace.from_buf(buf) + address_space = AddressSpace.from_buf(buf, self.analysis.base_address) self.ctx = AnalysisContext(self.buf, self.be2, self.idx, self.analysis, address_space) self.global_features: List[Tuple[Feature, Address]] = [] From 9066a21198e667d89780ccb95075e6917a069d63 Mon Sep 17 00:00:00 2001 From: Lin Chen Date: Wed, 8 May 2024 14:19:26 +0000 Subject: [PATCH 130/200] Add nzxor charecteristic in BinExport extractor. by referencing vivisect implementation. 
--- capa/features/extractors/binexport2/insn.py | 88 ++++++++++++++++++++- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 826a507e0..bce34ce5f 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,12 +14,20 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress +<<<<<<< HEAD from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext +======= +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext, BasicBlockContext +>>>>>>> d60e4bcb (Add nzxor charecteristic in BinExport extractor.) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA = 0x40 + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner @@ -314,11 +322,87 @@ def extract_insn_offset_features( yield from () +def is_security_cookie( + fhi: FunctionContext, + bbi: BasicBlockContext, + instruction: BinExport2.Instruction, +) -> bool: + """ + check if an instruction is related to security cookie checks. 
+ """ + be2 = fhi.ctx.be2 + + # security cookie check should use SP or BP + op1 = be2.operand[instruction.operand_index[1]] + op1_exprs = [be2.expression[expr_i] for expr_i in op1.expression_index] + if all( + expr.type != BinExport2.Expression.Type.REGISTER or + expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") + for expr in op1_exprs + ): + return False + + # check_nzxor_security_cookie_delta + # if insn falls at the start of first entry block of the parent function. + flow_graph = be2.flow_graph[fhi.flow_graph_index] + basic_block_index = bbi.basic_block_index + bb = be2.basic_block[basic_block_index] + if flow_graph.entry_basic_block_index == basic_block_index: + first_addr = min(( + be2.instruction[ir.begin_index].address + for ir in bb.instruction_index)) + if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: + return True + # or insn falls at the end before return in a terminal basic block. + if basic_block_index not in ( + e.source_basic_block_index for e in flow_graph.edge): + last_addr = max(( + be2.instruction[ir.end_index - 1].address + for ir in bb.instruction_index)) + if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: + return True + return False + + def extract_insn_nzxor_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 - yield from () + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. 
+ """ + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2 = fhi.ctx.be2 + + instruction = be2.instruction[ii.instruction_index] + mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name = mnemonic.name.lower() + if mnemonic_name not in ( + "xor", "xorpd", "xorps", "pxor", # x86 / amd64 + "eor", # arm / aarch64 + ): + return + + operands = [ + be2.operand[operand_index] + for operand_index in instruction.operand_index] + + # check whether operands are same for x86 / amd64 + if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): + if operands[0] == operands[1]: + return + if is_security_cookie(fhi, bbh.inner, instruction): + return + + # check whether 2nd/3rd operands are same for arm / aarch64 + if mnemonic_name == "eor": + assert len(operands) == 3 + if operands[1] == operands[2]: + return + + yield Characteristic("nzxor"), ih.address def extract_insn_mnemonic_features( From bba29f471bf240d8889a60d25277e4e8a896e055 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 28 May 2024 13:51:43 +0000 Subject: [PATCH 131/200] add tests, fix stack cookie detection --- capa/features/extractors/binexport2/__init__.py | 8 +++++++- capa/features/extractors/binexport2/insn.py | 6 +----- tests/fixtures.py | 7 +++++++ tests/test_binexport_features.py | 12 ++++++------ 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 29b55ee95..df503ee06 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -78,7 +78,7 @@ def filename_similarity_key(p: Path): if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: return candidate - raise ValueError("cannot find sample") + raise ValueError("cannot find sample, you may specify the path using the CAPA_SAMPLES_DIR environment variable") class BinExport2Index: @@ -272,6 +272,12 @@ def _compute_thunks(self): # because 
either, len is: # 0 and the thunk doesn't point to anything, or # >1 and the thunk may end up at many functions. + + # TODO (mr-tz): fails on d1e6506964edbfffb08c0dd32e1486b11fbced7a4bd870ffe79f110298f0efb8:0x113AE0 + if len(thunk_callees) != 1: + logger.error("callees: %s, addr: 0x%x, idx: %d", thunk_callees, addr, idx) + continue + assert len(thunk_callees) == 1 thunked_idx = thunk_callees[0] diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index bce34ce5f..5f23b6a11 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,11 +14,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -<<<<<<< HEAD -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext -======= -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext, BasicBlockContext ->>>>>>> d60e4bcb (Add nzxor charecteristic in BinExport extractor.) 
+from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 diff --git a/tests/fixtures.py b/tests/fixtures.py index 6a01c1d4e..190975f95 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -413,6 +413,13 @@ def get_data_path_by_name(name) -> Path: / "binexport2" / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" ) + elif name.startswith("d1e650.be2"): + return ( + CD + / "data" + / "binexport2" + / "d1e6506964edbfffb08c0dd32e1486b11fbced7a4bd870ffe79f110298f0efb8.elf_.ghidra.BinExport" + ) else: raise ValueError(f"unexpected sample fixture: {name}") diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index b39fa2487..967cabb30 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -192,16 +192,16 @@ # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), # insn/characteristic(nzxor) ( - "687e79.be2", - "function=0x0", + "d1e650.be2", + "function=0x114af4", capa.features.common.Characteristic("nzxor"), - "xfail: not implemented yet, may need other test sample", + True, ), ( - "687e79.be2", - "function=0x0", + "d1e650.be2", + "function=0x117988", capa.features.common.Characteristic("nzxor"), - "xfail: not implemented yet, may need other test sample", + True, ), # # insn/characteristic(cross section flow) # ("a1982...", "function=0x4014D0", capa.features.common.Characteristic("cross section flow"), True), From 58a81180a9c4b8eccc0b118a09787726a36ef670 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 28 May 2024 14:18:32 +0000 Subject: [PATCH 132/200] test BinExport feature PRs --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7a88b6853..80256778d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ master, feat/1755 ] permissions: read-all From b92eba79530791bf52fc2988c7b0450e4623cecd Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 28 May 2024 14:38:08 +0000 Subject: [PATCH 133/200] reformat and fix --- .../extractors/binexport2/__init__.py | 4 +-- capa/features/extractors/binexport2/insn.py | 36 ++++++++----------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index df503ee06..e3c9add5e 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -272,12 +272,12 @@ def _compute_thunks(self): # because either, len is: # 0 and the thunk doesn't point to anything, or # >1 and the thunk may end up at many functions. 
- + # TODO (mr-tz): fails on d1e6506964edbfffb08c0dd32e1486b11fbced7a4bd870ffe79f110298f0efb8:0x113AE0 if len(thunk_callees) != 1: logger.error("callees: %s, addr: 0x%x, idx: %d", thunk_callees, addr, idx) continue - + assert len(thunk_callees) == 1 thunked_idx = thunk_callees[0] diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 5f23b6a11..2cf21fb1c 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,7 +14,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, BasicBlockContext +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -331,11 +331,7 @@ def is_security_cookie( # security cookie check should use SP or BP op1 = be2.operand[instruction.operand_index[1]] op1_exprs = [be2.expression[expr_i] for expr_i in op1.expression_index] - if all( - expr.type != BinExport2.Expression.Type.REGISTER or - expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") - for expr in op1_exprs - ): + if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): return False # check_nzxor_security_cookie_delta @@ -344,17 +340,12 @@ def is_security_cookie( basic_block_index = bbi.basic_block_index bb = be2.basic_block[basic_block_index] if flow_graph.entry_basic_block_index == basic_block_index: - first_addr = min(( - be2.instruction[ir.begin_index].address - for ir in bb.instruction_index)) + first_addr = 
min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: return True # or insn falls at the end before return in a terminal basic block. - if basic_block_index not in ( - e.source_basic_block_index for e in flow_graph.edge): - last_addr = max(( - be2.instruction[ir.end_index - 1].address - for ir in bb.instruction_index)) + if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): + last_addr = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: return True return False @@ -376,21 +367,22 @@ def extract_insn_nzxor_characteristic_features( mnemonic = be2.mnemonic[instruction.mnemonic_index] mnemonic_name = mnemonic.name.lower() if mnemonic_name not in ( - "xor", "xorpd", "xorps", "pxor", # x86 / amd64 + "xor", + "xorpd", + "xorps", + "pxor", # x86 / amd64 "eor", # arm / aarch64 ): return - operands = [ - be2.operand[operand_index] - for operand_index in instruction.operand_index] + operands = [be2.operand[operand_index] for operand_index in instruction.operand_index] # check whether operands are same for x86 / amd64 if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): - if operands[0] == operands[1]: - return - if is_security_cookie(fhi, bbh.inner, instruction): - return + if operands[0] == operands[1]: + return + if is_security_cookie(fhi, bbh.inner, instruction): + return # check whether 2nd/3rd operands are same for arm / aarch64 if mnemonic_name == "eor": From acbbca2cdcdc7422e51241e7665b79f07c783106 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 28 May 2024 15:10:44 +0000 Subject: [PATCH 134/200] complete TODO descriptions --- capa/features/extractors/binexport2/extractor.py | 3 ++- capa/features/extractors/binexport2/insn.py | 12 ++++++++---- scripts/inspect-binexport2.py | 3 +++ tests/test_binexport_features.py | 9 +++++++-- 4 files changed, 20 
insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 57e693359..fad1d6927 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -53,8 +53,9 @@ def __init__(self, be2: BinExport2, buf: bytes): self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) - # TODO: assert supported file formats, arches + # TODO(mr): assert supported file formats, arches # and gradually relax restrictions as they're tested. + # https://github.com/mandiant/capa/issues/1755 def get_base_address(self): return AbsoluteVirtualAddress(self.analysis.base_address) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 2cf21fb1c..2f4145d8f 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -222,9 +222,11 @@ def extract_insn_number_features( if analysis.base_address == 0x0: # When the image is mapped at 0x0, # then its hard to tell if numbers are pointers or numbers. - # TODO(mr): 1755 be a little less conservative otherwise? + # TODO(mr): be a little less conservative otherwise? 
+ # https://github.com/mandiant/capa/issues/1755 - # TODO(mr): 1755 this removes a lot of valid numbers, could check alignment and use additional heuristics + # TODO(mr): this removes a lot of valid numbers, could check alignment and use additional heuristics + # https://github.com/mandiant/capa/issues/1755 # if is_address_mapped(be2, value): # continue pass @@ -314,7 +316,8 @@ def extract_insn_string_features( def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 + # TODO(wb): complete + # https://github.com/mandiant/capa/issues/1755 yield from () @@ -430,7 +433,8 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): 1755 + # TODO(wb): complete + # https://github.com/mandiant/capa/issues/1755 yield from () diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 65c9e1094..998741474 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -301,6 +301,7 @@ def main(argv=None): if vertex.HasField("library_index"): # TODO(williballenthin): this seems to be incorrect for Ghidra exporter + # https://github.com/mandiant/capa/issues/1755 library = be2.library[vertex.library_index] o.writeln(f"library: [{vertex.library_index}] {library.name}") @@ -415,6 +416,7 @@ def main(argv=None): # if data_address in idx.instruction_index_by_address: # # appears to be code # continue + # https://github.com/mandiant/capa/issues/1755 data_xrefs: List[int] = [] for data_reference_index in idx.data_reference_index_by_target_address[data_address]: @@ -423,6 +425,7 @@ def main(argv=None): # TODO(wb): uh-oh, how to reconstruct address? 
# instruction_address = idx.instruction_address_by_index[instruction_index] # data_xrefs.append(instruction_address) + # https://github.com/mandiant/capa/issues/1755 if not data_xrefs: continue diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 967cabb30..3fe5bfd69 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -45,7 +45,12 @@ ("687e79.be2", "file", capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), True), ("687e79.be2", "file", capa.features.file.Import("nope"), False), # function/characteristic(loop) - ("687e79.be2", "function=0x1056c0", capa.features.common.Characteristic("loop"), True), # TODO + ( + "687e79.be2", + "function=0x1056c0", + capa.features.common.Characteristic("loop"), + True, + ), # TODO(mr): https://github.com/mandiant/capa/issues/1755 ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("loop"), False), # bb/characteristic(tight loop) ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), @@ -111,7 +116,7 @@ "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(-1), True, - ), # TODO this should be unsigned / use two's complement + ), # TODO(mr): this should be unsigned / use two's complement, https://github.com/mandiant/capa/issues/1755 ( "687e79.be2", "function=0x1057f8,bb=0x1057f8", From 70891e4a69755f28ae5311795d86b601e611d1cb Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 29 May 2024 11:55:59 +0000 Subject: [PATCH 135/200] wip tests --- tests/test_binexport_features.py | 207 ++++++++++++++++++++++++++----- 1 file changed, 176 insertions(+), 31 deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 3fe5bfd69..6de429419 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -31,31 +31,66 @@ FEATURE_PRESENCE_TESTS_BE2_ELF_AARCH64 = sorted( [ # file/string - ("687e79.be2", "file", 
capa.features.common.String("AppDataService start"), True), + ( + "687e79.be2", + "file", + capa.features.common.String("AppDataService start"), + True, + ), ("687e79.be2", "file", capa.features.common.String("nope"), False), # file/sections ("687e79.be2", "file", capa.features.file.Section(".text"), True), ("687e79.be2", "file", capa.features.file.Section(".nope"), False), # file/exports - ("687e79.be2", "file", capa.features.file.Export("android::clearDir"), "xfail: not implemented yet?!"), + ( + "687e79.be2", + "file", + capa.features.file.Export("android::clearDir"), + "xfail: not implemented yet?!", + ), ("687e79.be2", "file", capa.features.file.Export("nope"), False), # file/imports ("687e79.be2", "file", capa.features.file.Import("fopen"), True), ("687e79.be2", "file", capa.features.file.Import("exit"), True), - ("687e79.be2", "file", capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), True), + ( + "687e79.be2", + "file", + capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), + True, + ), ("687e79.be2", "file", capa.features.file.Import("nope"), False), # function/characteristic(loop) ( "687e79.be2", "function=0x1056c0", capa.features.common.Characteristic("loop"), - True, - ), # TODO(mr): https://github.com/mandiant/capa/issues/1755 - ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("loop"), False), + "xfail: not implemented yet", + ), # TODO(mr): https://github.com/mandiant/capa/issues/2101 + ( + "687e79.be2", + "function=0x1075c0", + capa.features.common.Characteristic("loop"), + False, + ), # bb/characteristic(tight loop) - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet"), - ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("tight loop"), False), + ( + "687e79.be2", + "function=0x0", + 
capa.features.common.Characteristic("tight loop"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.common.Characteristic("tight loop"), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x1075c0", + capa.features.common.Characteristic("tight loop"), + False, + ), # bb/characteristic(stack string) ( "687e79.be2", @@ -89,10 +124,30 @@ ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), # insn/operand.number - ("687e79.be2", "function=0x105128,bb=0x1051e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True), - ("687e79.be2", "function=0x107588,bb=0x107588", capa.features.insn.OperandNumber(1, 0x8), True), - ("687e79.be2", "function=0x107588,bb=0x107588,insn=0x1075a4", capa.features.insn.OperandNumber(1, 0x8), True), - ("687e79.be2", "function=0x107588,bb=0x107588", capa.features.insn.OperandNumber(3, 0x10), True), + ( + "687e79.be2", + "function=0x105128,bb=0x1051e4", + capa.features.insn.OperandNumber(1, 0xFFFFFFFF), + True, + ), + ( + "687e79.be2", + "function=0x107588,bb=0x107588", + capa.features.insn.OperandNumber(1, 0x8), + True, + ), + ( + "687e79.be2", + "function=0x107588,bb=0x107588,insn=0x1075a4", + capa.features.insn.OperandNumber(1, 0x8), + True, + ), + ( + "687e79.be2", + "function=0x107588,bb=0x107588,insn=0x1075b8", + capa.features.insn.OperandNumber(3, 0x10), + "xfail: GSM?", + ), # TODO(mr): https://github.com/mandiant/capa/issues/2102 # insn/operand.offset ( "687e79.be2", @@ -108,7 +163,12 @@ ), # insn/number ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x3), True), - ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?"), + ( + "687e79.be2", + "function=0x107588", + capa.features.insn.Number(0x10), + "xfail: do we want this for ldp?", + ), ("687e79.be2", "function=0x105C88", capa.features.insn.Number(0xF000), 
True), # insn/number: negative ( @@ -123,14 +183,44 @@ capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), "xfail: not implemented yet", ), - ("687e79.be2", "function=0x1066e0,bb=0x1068c4", capa.features.insn.Number(0xFFFFFFFF), True), + ( + "687e79.be2", + "function=0x1066e0,bb=0x1068c4", + capa.features.insn.Number(0xFFFFFFFF), + True, + ), # insn/offset - ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0", capa.features.insn.Offset(0xC), "xfail: not implemented yet"), + ( + "687e79.be2", + "function=0x0", + capa.features.insn.Offset(0x0), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.insn.Offset(0x4), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.insn.Offset(0xC), + "xfail: not implemented yet", + ), # insn/offset: negative - ("687e79.be2", "function=0x0", capa.features.insn.Offset(-0x1), "xfail: not implemented yet"), - ("687e79.be2", "function=0x0", capa.features.insn.Offset(-0x2), "xfail: not implemented yet"), + ( + "687e79.be2", + "function=0x0", + capa.features.insn.Offset(-0x1), + "xfail: not implemented yet", + ), + ( + "687e79.be2", + "function=0x0", + capa.features.insn.Offset(-0x2), + "xfail: not implemented yet", + ), # insn/offset from mnemonic: add # # should not be considered, too big for an offset: @@ -159,14 +249,44 @@ ("687e79.be2", "function=0x105c88", capa.features.insn.API("memset"), True), ("687e79.be2", "function=0x105c88", capa.features.insn.API("Nope"), False), # insn/string - ("687e79.be2", "function=0x107588", capa.features.common.String("AppDataService start"), True), - ("687e79.be2", "function=0x1075c0", capa.features.common.String("AppDataService"), True), + ( + "687e79.be2", + "function=0x107588", + capa.features.common.String("AppDataService start"), + True, + ), + ( + 
"687e79.be2", + "function=0x1075c0", + capa.features.common.String("AppDataService"), + True, + ), ("687e79.be2", "function=0x107588", capa.features.common.String("nope"), False), - ("687e79.be2", "function=0x106d58", capa.features.common.String("/data/misc/wpa_supplicant.conf"), True), + ( + "687e79.be2", + "function=0x106d58", + capa.features.common.String("/data/misc/wifi/wpa_supplicant.conf"), + True, + ), # insn/regex - ("687e79.be2", "function=0x105c88", capa.features.common.Regex("innerRename"), True), - ("687e79.be2", "function=0x106d58", capa.features.common.Regex("/data/misc"), True), - ("687e79.be2", "function=0x106d58", capa.features.common.Substring("/data/misc"), True), + ( + "687e79.be2", + "function=0x105c88", + capa.features.common.Regex("innerRename"), + True, + ), + ( + "687e79.be2", + "function=0x106d58", + capa.features.common.Regex("/data/misc"), + True, + ), + ( + "687e79.be2", + "function=0x106d58", + capa.features.common.Substring("/data/misc"), + True, + ), # # insn/string, pointer to string # ("mimikatz", "function=0x44EDEF", capa.features.common.String("INPUTEVENT"), True), # # insn/string, direct memory reference @@ -213,8 +333,18 @@ # # insn/characteristic(cross section flow): imports don't count # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), # insn/characteristic(recursive call) - ("687e79.be2", "function=0x105b38", capa.features.common.Characteristic("recursive call"), True), - ("687e79.be2", "function=0x106530", capa.features.common.Characteristic("recursive call"), True), + ( + "687e79.be2", + "function=0x105b38", + capa.features.common.Characteristic("recursive call"), + True, + ), + ( + "687e79.be2", + "function=0x106530", + capa.features.common.Characteristic("recursive call"), + True, + ), # insn/characteristic(indirect call) ( "687e79.be2", @@ -229,10 +359,25 @@ "xfail: not implemented yet", ), # insn/characteristic(calls from) - ("687e79.be2", "function=0x105080", 
capa.features.common.Characteristic("calls from"), True), - ("687e79.be2", "function=0x1070e8", capa.features.common.Characteristic("calls from"), False), + ( + "687e79.be2", + "function=0x105080", + capa.features.common.Characteristic("calls from"), + True, + ), + ( + "687e79.be2", + "function=0x1070e8", + capa.features.common.Characteristic("calls from"), + False, + ), # function/characteristic(calls to) - ("687e79.be2", "function=0x1075c0", capa.features.common.Characteristic("calls to"), True), + ( + "687e79.be2", + "function=0x1075c0", + capa.features.common.Characteristic("calls to"), + True, + ), # file/function-name ( "687e79.be2", From cbe83ddad4ff2887830802a3a9d99b30171bce3f Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 31 May 2024 14:49:20 -0400 Subject: [PATCH 136/200] binexport: add typing where applicable (#2106) --- .../extractors/binexport2/__init__.py | 94 +++++++-------- .../extractors/binexport2/basicblock.py | 9 +- .../extractors/binexport2/extractor.py | 41 +++---- capa/features/extractors/binexport2/file.py | 12 +- .../extractors/binexport2/function.py | 42 +++---- .../features/extractors/binexport2/helpers.py | 3 +- capa/features/extractors/binexport2/insn.py | 112 ++++++++++-------- 7 files changed, 163 insertions(+), 150 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index e3c9add5e..d6fe87ae9 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -26,12 +26,13 @@ import capa.features.extractors.common import capa.features.extractors.binexport2.helpers from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph, FlowGraph logger = logging.getLogger(__name__) def get_binexport2(sample: Path) -> BinExport2: - be2 = BinExport2() + be2: BinExport2 = BinExport2() 
be2.ParseFromString(sample.read_bytes()) return be2 @@ -54,15 +55,15 @@ def get_sample_from_binexport2(input_file: Path, be2: BinExport2, search_paths: searches in the same directory as the BinExport2 file, and then in search_paths. """ - def filename_similarity_key(p: Path): + def filename_similarity_key(p: Path) -> Tuple[int, str]: # note closure over input_file. # sort first by length of common prefix, then by name (for stability) return (compute_common_prefix_length(p.name, input_file.name), p.name) - wanted_sha256 = be2.meta_information.executable_id.lower() + wanted_sha256: str = be2.meta_information.executable_id.lower() - input_directory = input_file.parent - siblings = [p for p in input_directory.iterdir() if p.is_file()] + input_directory: Path = input_file.parent + siblings: List[Path] = [p for p in input_directory.iterdir() if p.is_file()] siblings.sort(key=filename_similarity_key, reverse=True) for sibling in siblings: # e.g. with open IDA files in the same directory on Windows @@ -71,7 +72,7 @@ def filename_similarity_key(p: Path): return sibling for search_path in search_paths: - candidates = [p for p in search_path.iterdir() if p.is_file()] + candidates: List[Path] = [p for p in search_path.iterdir() if p.is_file()] candidates.sort(key=filename_similarity_key, reverse=True) for candidate in candidates: with contextlib.suppress(PermissionError): @@ -83,7 +84,7 @@ def filename_similarity_key(p: Path): class BinExport2Index: def __init__(self, be2: BinExport2): - self.be2 = be2 + self.be2: BinExport2 = be2 self.callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) self.callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) @@ -93,9 +94,9 @@ def __init__(self, be2: BinExport2): self.flow_graph_address_by_index: Dict[int, int] = {} # edges that come from the given basic block - self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + self.source_edges_by_basic_block_index: Dict[int, 
List[FlowGraph.Edge]] = defaultdict(list) # edges that end up at the given basic block - self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + self.target_edges_by_basic_block_index: Dict[int, List[FlowGraph.Edge]] = defaultdict(list) self.vertex_index_by_address: Dict[int, int] = {} @@ -119,9 +120,8 @@ def get_insn_address(self, insn_index: int) -> int: return self.insn_address_by_index[insn_index] def get_basic_block_address(self, basic_block_index: int) -> int: - basic_block = self.be2.basic_block[basic_block_index] - first_instruction_index = next(self.instruction_indices(basic_block)) - + basic_block: BinExport2.BasicBlock = self.be2.basic_block[basic_block_index] + first_instruction_index: int = next(self.instruction_indices(basic_block)) return self.get_insn_address(first_instruction_index) def _index_vertex_edges(self): @@ -136,7 +136,7 @@ def _index_vertex_edges(self): def _index_flow_graph_nodes(self): for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): - function_address = self.get_basic_block_address(flow_graph.entry_basic_block_index) + function_address: int = self.get_basic_block_address(flow_graph.entry_basic_block_index) self.flow_graph_index_by_address[function_address] = flow_graph_index self.flow_graph_address_by_index[flow_graph_index] = function_address @@ -154,7 +154,7 @@ def _index_call_graph_vertices(self): if not vertex.HasField("address"): continue - vertex_address = vertex.address + vertex_address: int = vertex.address self.vertex_index_by_address[vertex_address] = vertex_index def _index_data_references(self): @@ -177,8 +177,8 @@ def _index_insn_addresses(self): assert self.be2.instruction[0].HasField("address"), "first insn must have explicit address" - addr = 0 - next_addr = 0 + addr: int = 0 + next_addr: int = 0 for idx, insn in enumerate(self.be2.instruction): if insn.HasField("address"): addr = insn.address @@ -208,14 +208,14 @@ def basic_block_instructions( the 
instruction instances, and their addresses. """ for instruction_index in self.instruction_indices(basic_block): - instruction = self.be2.instruction[instruction_index] - instruction_address = self.get_insn_address(instruction_index) + instruction: BinExport2.Instruction = self.be2.instruction[instruction_index] + instruction_address: int = self.get_insn_address(instruction_index) yield instruction_index, instruction, instruction_address def get_function_name_by_vertex(self, vertex_index: int) -> str: - vertex = self.be2.call_graph.vertex[vertex_index] - name = f"sub_{vertex.address:x}" + vertex: CallGraph.Vertex = self.be2.call_graph.vertex[vertex_index] + name: str = f"sub_{vertex.address:x}" if vertex.HasField("mangled_name"): name = vertex.mangled_name @@ -223,7 +223,7 @@ def get_function_name_by_vertex(self, vertex_index: int) -> str: name = vertex.demangled_name if vertex.HasField("library_index"): - library = self.be2.library[vertex.library_index] + library: BinExport2.Library = self.be2.library[vertex.library_index] if library.HasField("name"): name = f"{library.name}!{name}" @@ -233,15 +233,15 @@ def get_function_name_by_address(self, address: int) -> str: if address not in self.vertex_index_by_address: return "" - vertex_index = self.vertex_index_by_address[address] + vertex_index: int = self.vertex_index_by_address[address] return self.get_function_name_by_vertex(vertex_index) class BinExport2Analysis: def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): - self.be2 = be2 - self.idx = idx - self.buf = buf + self.be2: BinExport2 = be2 + self.idx: BinExport2Index = idx + self.buf: bytes = buf self.base_address: int = 0 self.thunks: Dict[int, int] = {} @@ -249,7 +249,9 @@ def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): self._compute_thunks() def _find_base_address(self): - sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section) + sections_with_perms: Iterator[BinExport2.Section] = 
filter( + lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section + ) # assume the lowest address is the base address. # this works as long as BinExport doesn't record other # libraries mapped into memory. @@ -259,15 +261,13 @@ def _find_base_address(self): def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): - vertex = self.be2.call_graph.vertex[idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type( - vertex, BinExport2.CallGraph.Vertex.Type.THUNK - ): + vertex: CallGraph.Vertex = self.be2.call_graph.vertex[idx] + if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, CallGraph.Vertex.Type.THUNK): continue - curr_idx = idx + curr_idx: int = idx for _ in range(capa.features.common.THUNK_CHAIN_DEPTH_DELTA): - thunk_callees = self.idx.callees_by_vertex_index[curr_idx] + thunk_callees: List[int] = self.idx.callees_by_vertex_index[curr_idx] # if this doesn't hold, then it doesn't seem like this is a thunk, # because either, len is: # 0 and the thunk doesn't point to anything, or @@ -280,11 +280,11 @@ def _compute_thunks(self): assert len(thunk_callees) == 1 - thunked_idx = thunk_callees[0] - thunked_vertex = self.be2.call_graph.vertex[thunked_idx] + thunked_idx: int = thunk_callees[0] + thunked_vertex: CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx] if not capa.features.extractors.binexport2.helpers.is_vertex_type( - thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + thunked_vertex, CallGraph.Vertex.Type.THUNK ): assert thunked_vertex.HasField("address") @@ -321,21 +321,21 @@ class AddressSpace: memory_regions: Tuple[MemoryRegion, ...] 
def read_memory(self, address: int, length: int) -> bytes: - rva = address - self.base_address + rva: int = address - self.base_address for region in self.memory_regions: if region.contains(rva): - offset = rva - region.address + offset: int = rva - region.address return region.buf[offset : offset + length] raise AddressNotMappedError(address) @classmethod def from_pe(cls, pe: PE, base_address: int): - regions = [] + regions: List[MemoryRegion] = [] for section in pe.sections: - address = section.VirtualAddress - size = section.Misc_VirtualSize - buf = section.get_data() + address: int = section.VirtualAddress + size: int = section.Misc_VirtualSize + buf: bytes = section.get_data() if len(buf) != size: # pad the section with NULLs @@ -349,16 +349,16 @@ def from_pe(cls, pe: PE, base_address: int): @classmethod def from_elf(cls, elf: ELFFile, base_address: int): - regions = [] + regions: List[MemoryRegion] = [] # ELF segments are for runtime data, # ELF sections are for link-time data. for segment in elf.iter_segments(): # assume p_align is consistent with addresses here. # otherwise, should harden this loader. 
- segment_rva = segment.header.p_vaddr - segment_size = segment.header.p_memsz - segment_data = segment.data() + segment_rva: int = segment.header.p_vaddr + segment_size: int = segment.header.p_memsz + segment_data: bytes = segment.data() if len(segment_data) < segment_size: # pad the section with NULLs @@ -373,10 +373,10 @@ def from_elf(cls, elf: ELFFile, base_address: int): @classmethod def from_buf(cls, buf: bytes, base_address: int): if buf.startswith(capa.features.extractors.common.MATCH_PE): - pe = PE(data=buf) + pe: PE = PE(data=buf) return cls.from_pe(pe, base_address) elif buf.startswith(capa.features.extractors.common.MATCH_ELF): - elf = ELFFile(io.BytesIO(buf)) + elf: ELFFile = ELFFile(io.BytesIO(buf)) return cls.from_elf(elf, base_address) else: raise NotImplementedError("file format address space") diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 4674791f7..f44754f58 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -6,13 +6,14 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
-from typing import Tuple, Iterator +from typing import List, Tuple, Iterator from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.basicblock import BasicBlock from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import FlowGraph def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: @@ -21,10 +22,10 @@ def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[F idx = fhi.ctx.idx - basic_block_index = bbi.basic_block_index - target_edges = idx.target_edges_by_basic_block_index[basic_block_index] + basic_block_index: int = bbi.basic_block_index + target_edges: List[FlowGraph.Edge] = idx.target_edges_by_basic_block_index[basic_block_index] if basic_block_index in (e.source_basic_block_index for e in target_edges): - basic_block_address = idx.get_basic_block_address(basic_block_index) + basic_block_address: int = idx.get_basic_block_address(basic_block_index) yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index fad1d6927..665d0bda1 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -34,6 +34,7 @@ StaticFeatureExtractor, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph logger = logging.getLogger(__name__) @@ -41,12 +42,12 @@ class BinExport2FeatureExtractor(StaticFeatureExtractor): def __init__(self, be2: BinExport2, buf: bytes): super().__init__(hashes=SampleHashes.from_bytes(buf)) - self.be2 = be2 - self.buf = buf - 
self.idx = BinExport2Index(self.be2) - self.analysis = BinExport2Analysis(self.be2, self.idx, self.buf) - address_space = AddressSpace.from_buf(buf, self.analysis.base_address) - self.ctx = AnalysisContext(self.buf, self.be2, self.idx, self.analysis, address_space) + self.be2: BinExport2 = be2 + self.buf: bytes = buf + self.idx: BinExport2Index = BinExport2Index(self.be2) + self.analysis: BinExport2Analysis = BinExport2Analysis(self.be2, self.idx, self.buf) + address_space: AddressSpace = AddressSpace.from_buf(buf, self.analysis.base_address) + self.ctx: AnalysisContext = AnalysisContext(self.buf, self.be2, self.idx, self.analysis, address_space) self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(list(capa.features.extractors.common.extract_format(self.buf))) @@ -57,27 +58,25 @@ def __init__(self, be2: BinExport2, buf: bytes): # and gradually relax restrictions as they're tested. # https://github.com/mandiant/capa/issues/1755 - def get_base_address(self): + def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(self.analysis.base_address) - def extract_global_features(self): + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features - def extract_file_features(self): + def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.binexport2.file.extract_features(self.be2, self.buf) def get_functions(self) -> Iterator[FunctionHandle]: for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): - entry_basic_block_index = flow_graph.entry_basic_block_index - flow_graph_address = self.idx.get_basic_block_address(entry_basic_block_index) + entry_basic_block_index: int = flow_graph.entry_basic_block_index + flow_graph_address: int = self.idx.get_basic_block_address(entry_basic_block_index) - vertex_idx = self.idx.vertex_index_by_address[flow_graph_address] - be2_vertex = self.be2.call_graph.vertex[vertex_idx] + 
vertex_idx: int = self.idx.vertex_index_by_address[flow_graph_address] + be2_vertex: CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx] # skip thunks - if capa.features.extractors.binexport2.helpers.is_vertex_type( - be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK - ): + if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, CallGraph.Vertex.Type.THUNK): continue yield FunctionHandle( @@ -90,11 +89,11 @@ def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Featur def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: fhi: FunctionContext = fh.inner - flow_graph_index = fhi.flow_graph_index - flow_graph = self.be2.flow_graph[flow_graph_index] + flow_graph_index: int = fhi.flow_graph_index + flow_graph: BinExport2.FlowGraph = self.be2.flow_graph[flow_graph_index] for basic_block_index in flow_graph.basic_block_index: - basic_block_address = self.idx.get_basic_block_address(basic_block_index) + basic_block_address: int = self.idx.get_basic_block_address(basic_block_index) yield BBHandle( address=AbsoluteVirtualAddress(basic_block_address), inner=BasicBlockContext(basic_block_index), @@ -112,5 +111,7 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa inner=InstructionContext(instruction_index), ) - def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): + def extract_insn_features( + self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle + ) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih) diff --git a/capa/features/extractors/binexport2/file.py b/capa/features/extractors/binexport2/file.py index a6ee7ce93..9d9872bc2 100644 --- a/capa/features/extractors/binexport2/file.py +++ b/capa/features/extractors/binexport2/file.py @@ -25,10 +25,10 @@ def extract_file_export_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: if 
buf.startswith(capa.features.extractors.common.MATCH_PE): - pe = pefile.PE(data=buf) + pe: pefile.PE = pefile.PE(data=buf) yield from capa.features.extractors.pefile.extract_file_export_names(pe) elif buf.startswith(capa.features.extractors.common.MATCH_ELF): - elf = ELFFile(io.BytesIO(buf)) + elf: ELFFile = ELFFile(io.BytesIO(buf)) yield from capa.features.extractors.elffile.extract_file_export_names(elf) else: logger.warning("unsupported format") @@ -36,10 +36,10 @@ def extract_file_export_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Fe def extract_file_import_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: if buf.startswith(capa.features.extractors.common.MATCH_PE): - pe = pefile.PE(data=buf) + pe: pefile.PE = pefile.PE(data=buf) yield from capa.features.extractors.pefile.extract_file_import_names(pe) elif buf.startswith(capa.features.extractors.common.MATCH_ELF): - elf = ELFFile(io.BytesIO(buf)) + elf: ELFFile = ELFFile(io.BytesIO(buf)) yield from capa.features.extractors.elffile.extract_file_import_names(elf) else: logger.warning("unsupported format") @@ -47,10 +47,10 @@ def extract_file_import_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Fe def extract_file_section_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]: if buf.startswith(capa.features.extractors.common.MATCH_PE): - pe = pefile.PE(data=buf) + pe: pefile.PE = pefile.PE(data=buf) yield from capa.features.extractors.pefile.extract_file_section_names(pe) elif buf.startswith(capa.features.extractors.common.MATCH_ELF): - elf = ELFFile(io.BytesIO(buf)) + elf: ELFFile = ELFFile(io.BytesIO(buf)) yield from capa.features.extractors.elffile.extract_file_section_names(elf) else: logger.warning("unsupported format") diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index e437c0dc7..685b3ab08 100644 --- a/capa/features/extractors/binexport2/function.py +++ 
b/capa/features/extractors/binexport2/function.py @@ -10,33 +10,35 @@ from capa.features.file import FunctionName from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext from capa.features.extractors.base_extractor import FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph -def extract_function_calls_to(fh: FunctionHandle): +def extract_function_calls_to(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - be2 = fhi.ctx.be2 - idx = fhi.ctx.idx + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx - flow_graph_index = fhi.flow_graph_index - flow_graph_address = idx.flow_graph_address_by_index[flow_graph_index] - vertex_index = idx.vertex_index_by_address[flow_graph_address] + flow_graph_index: int = fhi.flow_graph_index + flow_graph_address: int = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index: int = idx.vertex_index_by_address[flow_graph_address] for caller_index in idx.callers_by_vertex_index[vertex_index]: - caller = be2.call_graph.vertex[caller_index] - caller_address = caller.address + caller: CallGraph.Vertex = be2.call_graph.vertex[caller_index] + caller_address: int = caller.address yield Characteristic("calls to"), AbsoluteVirtualAddress(caller_address) -def extract_function_loop(fh: FunctionHandle): +def extract_function_loop(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - be2 = fhi.ctx.be2 + be2: BinExport2 = fhi.ctx.be2 - flow_graph_index = fhi.flow_graph_index - flow_graph = be2.flow_graph[flow_graph_index] + flow_graph_index: int = fhi.flow_graph_index + flow_graph: BinExport2.FlowGraph = 
be2.flow_graph[flow_graph_index] for edge in flow_graph.edge: if edge.is_back_edge: @@ -44,16 +46,16 @@ def extract_function_loop(fh: FunctionHandle): break -def extract_function_name(fh: FunctionHandle): +def extract_function_name(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - be2 = fhi.ctx.be2 - idx = fhi.ctx.idx - flow_graph_index = fhi.flow_graph_index + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + flow_graph_index: int = fhi.flow_graph_index - flow_graph_address = idx.flow_graph_address_by_index[flow_graph_index] - vertex_index = idx.vertex_index_by_address[flow_graph_address] - vertex = be2.call_graph.vertex[vertex_index] + flow_graph_address: int = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index: int = idx.vertex_index_by_address[flow_graph_address] + vertex: CallGraph.Vertex = be2.call_graph.vertex[vertex_index] if vertex.HasField("mangled_name"): yield FunctionName(vertex.mangled_name), fh.address diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index e4698da6b..6c1f17de8 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,7 +5,8 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph -def is_vertex_type(vertex, type_): +def is_vertex_type(vertex: CallGraph.Vertex, type_: CallGraph.Vertex.Type.ValueType) -> bool: return vertex.HasField("type") and vertex.type == type_ diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 2f4145d8f..cee4f053a 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,25 +14,35 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, BasicBlockContext, InstructionContext +from capa.features.extractors.binexport2 import ( + AddressSpace, + AnalysisContext, + BinExport2Index, + FunctionContext, + ReadMemoryError, + BasicBlockContext, + BinExport2Analysis, + InstructionContext, +) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph logger = logging.getLogger(__name__) # security cookie checks may perform non-zeroing XORs, these are expected within a certain # byte range within the first and returning basic blocks, this helps to reduce FP features -SECURITY_COOKIE_BYTES_DELTA = 0x40 +SECURITY_COOKIE_BYTES_DELTA: int = 0x40 def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 - be2_index = fhi.ctx.idx - be2_analysis = fhi.ctx.analysis - insn = be2.instruction[ii.instruction_index] + be2: BinExport2 = fhi.ctx.be2 + be2_index: BinExport2Index = fhi.ctx.idx 
+ be2_analysis: BinExport2Analysis = fhi.ctx.analysis + insn: BinExport2.Instruction = be2.instruction[ii.instruction_index] for addr in insn.call_target: addr = be2_analysis.thunks.get(addr, addr) @@ -42,19 +52,17 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle logger.debug("0x%x is not a vertex", addr) continue - vertex_idx = be2_index.vertex_index_by_address[addr] - vertex = be2.call_graph.vertex[vertex_idx] + vertex_idx: int = be2_index.vertex_index_by_address[addr] + vertex: CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type( - vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED - ): + if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, CallGraph.Vertex.Type.IMPORTED): continue if not vertex.HasField("mangled_name"): logger.debug("vertex %d does not have mangled_name", vertex_idx) continue - api_name = vertex.mangled_name + api_name: str = vertex.mangled_name yield API(api_name), ih.address """ @@ -74,7 +82,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle def is_address_mapped(be2: BinExport2, address: int) -> bool: """return True if the given address is mapped""" - sections_with_perms = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) + sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) return any(section.address <= address < section.address + section.size for section in sections_with_perms) @@ -126,11 +134,11 @@ def extract_insn_number_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 - analysis = fhi.ctx.analysis + be2: BinExport2 = fhi.ctx.be2 + analysis: BinExport2Analysis = fhi.ctx.analysis - instruction_index = ii.instruction_index - instruction = be2.instruction[instruction_index] + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = 
be2.instruction[instruction_index] # x86 / amd64 mnemonic = be2.mnemonic[instruction.mnemonic_index] @@ -239,12 +247,12 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - ctx = fhi.ctx - be2 = ctx.be2 - idx = ctx.idx - address_space = ctx.address_space + ctx: AnalysisContext = fhi.ctx + be2: BinExport2 = ctx.be2 + idx: BinExport2Index = ctx.idx + address_space: AddressSpace = ctx.address_space - instruction_index = ii.instruction_index + instruction_index: int = ii.instruction_index if instruction_index in idx.string_reference_index_by_source_instruction_index: # disassembler already identified string reference from instruction @@ -254,15 +262,15 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl if instruction_index in idx.data_reference_index_by_source_instruction_index: for data_reference_index in idx.data_reference_index_by_source_instruction_index[instruction_index]: - data_reference = be2.data_reference[data_reference_index] - data_reference_address = data_reference.address + data_reference: BinExport2.DataReference = be2.data_reference[data_reference_index] + data_reference_address: int = data_reference.address reference_addresses.append(data_reference_address) for reference_address in reference_addresses: try: # if at end of segment then there might be an overrun here. 
- buf = address_space.read_memory(reference_address, 0x100) + buf: bytes = address_space.read_memory(reference_address, 0x100) except ReadMemoryError: logger.debug("failed to read memory: 0x%x", reference_address) continue @@ -270,7 +278,7 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl if capa.features.extractors.helpers.all_zeros(buf): continue - is_string = False + is_string: bool = False # note: we *always* break after the first iteration for s in capa.features.extractors.strings.extract_ascii_strings(buf): @@ -300,16 +308,16 @@ def extract_insn_string_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 - idx = fhi.ctx.idx + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx - instruction_index = ii.instruction_index + instruction_index: int = ii.instruction_index if instruction_index in idx.string_reference_index_by_source_instruction_index: for string_reference_index in idx.string_reference_index_by_source_instruction_index[instruction_index]: - string_reference = be2.string_reference[string_reference_index] - string_index = string_reference.string_table_index - string = be2.string_table[string_index] + string_reference: BinExport2.Reference = be2.string_reference[string_reference_index] + string_index: int = string_reference.string_table_index + string: str = be2.string_table[string_index] yield String(string), ih.address @@ -329,26 +337,26 @@ def is_security_cookie( """ check if an instruction is related to security cookie checks. 
""" - be2 = fhi.ctx.be2 + be2: BinExport2 = fhi.ctx.be2 # security cookie check should use SP or BP - op1 = be2.operand[instruction.operand_index[1]] - op1_exprs = [be2.expression[expr_i] for expr_i in op1.expression_index] + op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] + op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): return False # check_nzxor_security_cookie_delta # if insn falls at the start of first entry block of the parent function. - flow_graph = be2.flow_graph[fhi.flow_graph_index] - basic_block_index = bbi.basic_block_index - bb = be2.basic_block[basic_block_index] + flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] + basic_block_index: int = bbi.basic_block_index + bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] if flow_graph.entry_basic_block_index == basic_block_index: - first_addr = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) + first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: return True # or insn falls at the end before return in a terminal basic block. 
if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): - last_addr = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) + last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: return True return False @@ -364,11 +372,11 @@ def extract_insn_nzxor_characteristic_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 + be2: BinExport2 = fhi.ctx.be2 - instruction = be2.instruction[ii.instruction_index] - mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name = mnemonic.name.lower() + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name: str = mnemonic.name.lower() if mnemonic_name not in ( "xor", "xorpd", @@ -378,7 +386,7 @@ def extract_insn_nzxor_characteristic_features( ): return - operands = [be2.operand[operand_index] for operand_index in instruction.operand_index] + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] # check whether operands are same for x86 / amd64 if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): @@ -402,11 +410,11 @@ def extract_insn_mnemonic_features( fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 + be2: BinExport2 = fhi.ctx.be2 - instruction = be2.instruction[ii.instruction_index] - mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name = mnemonic.name.lower() + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name: str = mnemonic.name.lower() yield Mnemonic(mnemonic_name), ih.address @@ -419,11 +427,11 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl 
fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2 = fhi.ctx.be2 + be2: BinExport2 = fhi.ctx.be2 - instruction = be2.instruction[ii.instruction_index] + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] for call_target_address in instruction.call_target: - addr = AbsoluteVirtualAddress(call_target_address) + addr: AbsoluteVirtualAddress = AbsoluteVirtualAddress(call_target_address) yield Characteristic("calls from"), addr if fh.address == addr: From bb4e892fc739e33fd7881e716c1ea6d58e4a37d0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 02:52:58 -0600 Subject: [PATCH 137/200] binexport2: revert import names from BinExport2 proto binexport2_pb.BinExport2 isnt a package so we can't import it like: from ...binexport2_pb.BinExport2 import CallGraph --- capa/features/extractors/binexport2/__init__.py | 15 +++++++-------- capa/features/extractors/binexport2/basicblock.py | 3 +-- capa/features/extractors/binexport2/extractor.py | 5 ++--- capa/features/extractors/binexport2/function.py | 5 ++--- capa/features/extractors/binexport2/helpers.py | 4 ++-- capa/features/extractors/binexport2/insn.py | 5 ++--- 6 files changed, 16 insertions(+), 21 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index d6fe87ae9..ba488cd86 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -26,7 +26,6 @@ import capa.features.extractors.common import capa.features.extractors.binexport2.helpers from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph, FlowGraph logger = logging.getLogger(__name__) @@ -94,9 +93,9 @@ def __init__(self, be2: BinExport2): self.flow_graph_address_by_index: Dict[int, int] = {} # edges that come from the given basic block - self.source_edges_by_basic_block_index: 
Dict[int, List[FlowGraph.Edge]] = defaultdict(list) + self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) # edges that end up at the given basic block - self.target_edges_by_basic_block_index: Dict[int, List[FlowGraph.Edge]] = defaultdict(list) + self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) self.vertex_index_by_address: Dict[int, int] = {} @@ -214,7 +213,7 @@ def basic_block_instructions( yield instruction_index, instruction, instruction_address def get_function_name_by_vertex(self, vertex_index: int) -> str: - vertex: CallGraph.Vertex = self.be2.call_graph.vertex[vertex_index] + vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_index] name: str = f"sub_{vertex.address:x}" if vertex.HasField("mangled_name"): name = vertex.mangled_name @@ -261,8 +260,8 @@ def _find_base_address(self): def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): - vertex: CallGraph.Vertex = self.be2.call_graph.vertex[idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, CallGraph.Vertex.Type.THUNK): + vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx] + if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK): continue curr_idx: int = idx @@ -281,10 +280,10 @@ def _compute_thunks(self): assert len(thunk_callees) == 1 thunked_idx: int = thunk_callees[0] - thunked_vertex: CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx] + thunked_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx] if not capa.features.extractors.binexport2.helpers.is_vertex_type( - thunked_vertex, CallGraph.Vertex.Type.THUNK + thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK ): assert thunked_vertex.HasField("address") diff --git a/capa/features/extractors/binexport2/basicblock.py 
b/capa/features/extractors/binexport2/basicblock.py index f44754f58..5d7398aa1 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -13,7 +13,6 @@ from capa.features.basicblock import BasicBlock from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import FlowGraph def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: @@ -23,7 +22,7 @@ def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[F idx = fhi.ctx.idx basic_block_index: int = bbi.basic_block_index - target_edges: List[FlowGraph.Edge] = idx.target_edges_by_basic_block_index[basic_block_index] + target_edges: List[BinExport2.FlowGraph.Edge] = idx.target_edges_by_basic_block_index[basic_block_index] if basic_block_index in (e.source_basic_block_index for e in target_edges): basic_block_address: int = idx.get_basic_block_address(basic_block_index) yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 665d0bda1..6e1e4c633 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -34,7 +34,6 @@ StaticFeatureExtractor, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph logger = logging.getLogger(__name__) @@ -73,10 +72,10 @@ def get_functions(self) -> Iterator[FunctionHandle]: flow_graph_address: int = self.idx.get_basic_block_address(entry_basic_block_index) vertex_idx: int = self.idx.vertex_index_by_address[flow_graph_address] - be2_vertex: CallGraph.Vertex = 
self.be2.call_graph.vertex[vertex_idx] + be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx] # skip thunks - if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, CallGraph.Vertex.Type.THUNK): + if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK): continue yield FunctionHandle( diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index 685b3ab08..396545256 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -13,7 +13,6 @@ from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext from capa.features.extractors.base_extractor import FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph def extract_function_calls_to(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: @@ -27,7 +26,7 @@ def extract_function_calls_to(fh: FunctionHandle) -> Iterator[Tuple[Feature, Add vertex_index: int = idx.vertex_index_by_address[flow_graph_address] for caller_index in idx.callers_by_vertex_index[vertex_index]: - caller: CallGraph.Vertex = be2.call_graph.vertex[caller_index] + caller: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[caller_index] caller_address: int = caller.address yield Characteristic("calls to"), AbsoluteVirtualAddress(caller_address) @@ -55,7 +54,7 @@ def extract_function_name(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address flow_graph_address: int = idx.flow_graph_address_by_index[flow_graph_index] vertex_index: int = idx.vertex_index_by_address[flow_graph_address] - vertex: CallGraph.Vertex = be2.call_graph.vertex[vertex_index] + vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_index] if vertex.HasField("mangled_name"): yield FunctionName(vertex.mangled_name), 
fh.address diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 6c1f17de8..d06b5f93d 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,8 +5,8 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -def is_vertex_type(vertex: CallGraph.Vertex, type_: CallGraph.Vertex.Type.ValueType) -> bool: +def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: return vertex.HasField("type") and vertex.type == type_ diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index cee4f053a..13b067497 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -26,7 +26,6 @@ ) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2.binexport2_pb2.BinExport2 import CallGraph logger = logging.getLogger(__name__) @@ -53,9 +52,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle continue vertex_idx: int = be2_index.vertex_index_by_address[addr] - vertex: CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] + vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, CallGraph.Vertex.Type.IMPORTED): + if not 
capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED): continue if not vertex.HasField("mangled_name"): From 8deb2808c7cd5b51cb95c52a9bc68680d8ef8310 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Mon, 3 Jun 2024 13:24:42 +0000 Subject: [PATCH 138/200] fix stack offset numbers and disable offset tests --- capa/features/extractors/binexport2/insn.py | 8 ++++++++ tests/test_binexport_features.py | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 13b067497..a8a680a2b 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -155,6 +155,14 @@ def extract_insn_number_features( # temporarily, we'll have to try to guess at the interpretation. symbol = _gsm_get_instruction_operand(be2, instruction_index, i) + # x86 / amd64 + if mnemonic.name.lower() == "add" and symbol.lower() == "esp": + # skip things like: + # + # .text:00401140 call sub_407E2B + # .text:00401145 add esp, 0Ch + return + if symbol.startswith(("#0x", "#-0x")): # like: # - type: SYMBOL diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 6de429419..cbea6c066 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -440,6 +440,10 @@ def test_binexport_features_elf_aarch64(sample, scope, feature, expected): def test_binexport_features_pe_x86(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") + + if isinstance(feature, capa.features.insn.Offset): + pytest.xfail("Offset features not supported yet") + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") assert sample.exists() fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) From a2dc8554d5150deca38eb47acff94b72e4467301 Mon Sep 17 
00:00:00 2001 From: mr-tz Date: Mon, 3 Jun 2024 13:48:42 +0000 Subject: [PATCH 139/200] xfail OperandOffset --- tests/test_binexport_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index cbea6c066..3b0154726 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -441,7 +441,7 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") - if isinstance(feature, capa.features.insn.Offset): + if isinstance(feature, (capa.features.insn.Offset, capa.features.insn.OperandOffset)): pytest.xfail("Offset features not supported yet") sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") From 36cb7d99871d6954637b3e8016fac1e07dae90e7 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Mon, 3 Jun 2024 13:49:48 +0000 Subject: [PATCH 140/200] generate symbol variants --- capa/features/extractors/binexport2/insn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index a8a680a2b..59aeb3d27 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -62,7 +62,8 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle continue api_name: str = vertex.mangled_name - yield API(api_name), ih.address + for name in capa.features.extractors.helpers.generate_symbols("", api_name): + yield API(name), ih.address """ # TODO: re-enable pending https://github.com/google/binexport/issues/126#issuecomment-2074402906 From f98465abc644b1c81bcdbcb7e94e73c3d56285a8 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Mon, 3 Jun 2024 13:50:23 +0000 Subject: [PATCH 141/200] wip: read negative numbers --- capa/features/extractors/binexport2/insn.py | 4 +++- 1 file 
changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 59aeb3d27..1a907eb7b 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -178,10 +178,12 @@ def extract_insn_number_features( # handling continues below at label: has a value - elif symbol.startswith("0x"): + elif symbol.startswith(("0x", "-0x")): # like: # - type: SYMBOL # symbol: "0x1000" + # - type: SYMBOL + # symbol: "-0x1" try: value = int(symbol, 0x10) except ValueError: From fe2e80fb90c8807af6acf16e65f14c2ec995d871 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 4 Jun 2024 10:44:17 +0000 Subject: [PATCH 142/200] update tight loop tests --- tests/test_binexport_features.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 3b0154726..38670268c 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -74,20 +74,20 @@ ), # bb/characteristic(tight loop) ( - "687e79.be2", - "function=0x0", + "d1e650.be2", + "function=0x114af4", capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", + True, ), ( - "687e79.be2", - "function=0x0", + "d1e650.be2", + "function=0x118F1C", capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", + True, ), ( - "687e79.be2", - "function=0x1075c0", + "d1e650.be2", + "function=0x11464c", capa.features.common.Characteristic("tight loop"), False, ), From 798894a475f80694f1fee06750dd26f8a3b57406 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 4 Jun 2024 14:35:54 -0600 Subject: [PATCH 143/200] binexport: fix function loop feature detection --- capa/features/extractors/binexport2/function.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/binexport2/function.py 
b/capa/features/extractors/binexport2/function.py index 396545256..282e4b5f2 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -5,11 +5,12 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Tuple, Iterator +from typing import List, Tuple, Iterator from capa.features.file import FunctionName from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors import loops from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext from capa.features.extractors.base_extractor import FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -39,10 +40,13 @@ def extract_function_loop(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address flow_graph_index: int = fhi.flow_graph_index flow_graph: BinExport2.FlowGraph = be2.flow_graph[flow_graph_index] + edges: List[Tuple[int, int]] = [] for edge in flow_graph.edge: - if edge.is_back_edge: - yield Characteristic("loop"), fh.address - break + # TODO (meh): use Edge.is_back_edge pending https://github.com/mandiant/capa/issues/2101 + edges.append((edge.source_basic_block_index, edge.target_basic_block_index)) + + if loops.has_loop(edges): + yield Characteristic("loop"), fh.address def extract_function_name(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: From 9e949872e35b8c518c67a5f29bca101afa73b608 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 4 Jun 2024 14:36:29 -0600 Subject: [PATCH 144/200] binexport: update binexport function loop tests --- tests/test_binexport_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 38670268c..98b4971b8 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -64,8 +64,8 @@ "687e79.be2", "function=0x1056c0", capa.features.common.Characteristic("loop"), - "xfail: not implemented yet", - ), # TODO(mr): https://github.com/mandiant/capa/issues/2101 + True, + ), ( "687e79.be2", "function=0x1075c0", From 45b7b59899328005686521488fba38959318032b Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 4 Jun 2024 14:36:59 -0600 Subject: [PATCH 145/200] binexport: fix lints and imports --- capa/features/extractors/binexport2/__init__.py | 4 +++- capa/features/extractors/binexport2/basicblock.py | 1 + capa/features/extractors/binexport2/extractor.py | 4 +++- capa/features/extractors/binexport2/insn.py | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index ba488cd86..a09ca2a52 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -261,7 +261,9 @@ def _find_base_address(self): def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue curr_idx: int = idx diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 5d7398aa1..bcb7977b4 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -13,6 +13,7 @@ from capa.features.basicblock import BasicBlock from 
capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 6e1e4c633..1c3c4d393 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -75,7 +75,9 @@ def get_functions(self) -> Iterator[FunctionHandle]: be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx] # skip thunks - if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue yield FunctionHandle( diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 1a907eb7b..92ef311ee 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -54,7 +54,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle vertex_idx: int = be2_index.vertex_index_by_address[addr] vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + ): continue if not vertex.HasField("mangled_name"): From e7e786c0926165b4a45953ec12ab571c69246876 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 4 Jun 2024 15:29:59 -0600 Subject: [PATCH 146/200] binexport: add back assert statement to thunk calculation --- 
capa/features/extractors/binexport2/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index a09ca2a52..7fc32be71 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -273,12 +273,6 @@ def _compute_thunks(self): # because either, len is: # 0 and the thunk doesn't point to anything, or # >1 and the thunk may end up at many functions. - - # TODO (mr-tz): fails on d1e6506964edbfffb08c0dd32e1486b11fbced7a4bd870ffe79f110298f0efb8:0x113AE0 - if len(thunk_callees) != 1: - logger.error("callees: %s, addr: 0x%x, idx: %d", thunk_callees, addr, idx) - continue - assert len(thunk_callees) == 1 thunked_idx: int = thunk_callees[0] From 427aad4e9ac194e04494b1654cffba6435e66553 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 4 Jun 2024 15:30:29 -0600 Subject: [PATCH 147/200] binexport: update tests to use Ghidra binexport file --- tests/test_binexport_features.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 98b4971b8..abec60139 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -456,7 +456,7 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected): ) def test_binexport_feature_counts(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: - pytest.skip("for now only testing mimikatz.exe_ IDA BinExport file") - sample = sample.parent / "binexport2" / (sample.name + ".ida.BinExport") + pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") assert sample.exists() fixtures.do_test_feature_count(fixtures.get_binexport_extractor, sample, scope, feature, expected) From 6efb46e1d699abd030f221200da8a129723424d7 Mon Sep 17 00:00:00 2001 From: Mike 
Hunhoff Date: Wed, 5 Jun 2024 12:16:34 -0600 Subject: [PATCH 148/200] binexport: add additional debug info to thunk calculation assert --- capa/features/extractors/binexport2/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 7fc32be71..da7bbcaf3 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -273,7 +273,7 @@ def _compute_thunks(self): # because either, len is: # 0 and the thunk doesn't point to anything, or # >1 and the thunk may end up at many functions. - assert len(thunk_callees) == 1 + assert len(thunk_callees) == 1, f"thunk @ {hex(addr)} failed" thunked_idx: int = thunk_callees[0] thunked_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx] From 869b2f670715fe4c0075d4e916357ed4a65cf816 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 6 Jun 2024 10:46:50 -0600 Subject: [PATCH 149/200] binexport: update unit tests to focus on Ghidra --- tests/fixtures.py | 8 +- tests/test_binexport_features.py | 162 +++++++++++++++---------------- 2 files changed, 85 insertions(+), 85 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 190975f95..a5a2261fd 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -406,14 +406,14 @@ def get_data_path_by_name(name) -> Path: return CD / "data" / "dotnet" / "dd9098ff91717f4906afe9dafdfa2f52.exe_" elif name.startswith("nested_typeref"): return CD / "data" / "dotnet" / "2c7d60f77812607dec5085973ff76cea.dll_" - elif name.startswith("687e79.be2"): + elif name.startswith("687e79.ghidra.be2"): return ( CD / "data" / "binexport2" / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" ) - elif name.startswith("d1e650.be2"): + elif name.startswith("d1e650.ghidra.be2"): return ( CD / "data" @@ -1438,9 +1438,9 @@ def parametrize(params, values, **kwargs): 
FEATURE_COUNT_TESTS_GHIDRA = [ # Ghidra may render functions as labels, as well as provide differing amounts of call references - # (Colton) TODO: Add more test cases ("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), 0), - ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("calls to"), 0), + ("mimikatz", "function=0x401000", capa.features.common.Characteristic("calls to"), 0), + ("mimikatz", "function=0x401000", capa.features.basicblock.BasicBlock(), 3), ] diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index abec60139..0d1fea392 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -32,191 +32,191 @@ [ # file/string ( - "687e79.be2", + "687e79.ghidra.be2", "file", capa.features.common.String("AppDataService start"), True, ), - ("687e79.be2", "file", capa.features.common.String("nope"), False), + ("687e79.ghidra.be2", "file", capa.features.common.String("nope"), False), # file/sections - ("687e79.be2", "file", capa.features.file.Section(".text"), True), - ("687e79.be2", "file", capa.features.file.Section(".nope"), False), + ("687e79.ghidra.be2", "file", capa.features.file.Section(".text"), True), + ("687e79.ghidra.be2", "file", capa.features.file.Section(".nope"), False), # file/exports ( - "687e79.be2", + "687e79.ghidra.be2", "file", capa.features.file.Export("android::clearDir"), "xfail: not implemented yet?!", ), - ("687e79.be2", "file", capa.features.file.Export("nope"), False), + ("687e79.ghidra.be2", "file", capa.features.file.Export("nope"), False), # file/imports - ("687e79.be2", "file", capa.features.file.Import("fopen"), True), - ("687e79.be2", "file", capa.features.file.Import("exit"), True), + ("687e79.ghidra.be2", "file", capa.features.file.Import("fopen"), True), + ("687e79.ghidra.be2", "file", capa.features.file.Import("exit"), True), ( - "687e79.be2", + "687e79.ghidra.be2", "file", 
capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), True, ), - ("687e79.be2", "file", capa.features.file.Import("nope"), False), + ("687e79.ghidra.be2", "file", capa.features.file.Import("nope"), False), # function/characteristic(loop) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1056c0", capa.features.common.Characteristic("loop"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1075c0", capa.features.common.Characteristic("loop"), False, ), # bb/characteristic(tight loop) ( - "d1e650.be2", + "d1e650.ghidra.be2", "function=0x114af4", capa.features.common.Characteristic("tight loop"), True, ), ( - "d1e650.be2", + "d1e650.ghidra.be2", "function=0x118F1C", capa.features.common.Characteristic("tight loop"), True, ), ( - "d1e650.be2", + "d1e650.ghidra.be2", "function=0x11464c", capa.features.common.Characteristic("tight loop"), False, ), # bb/characteristic(stack string) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), # bb/characteristic(tight loop) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.common.Characteristic("tight loop"), "xfail: not implemented yet", ), # insn/mnemonic - ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("stp"), True), - ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), - ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("bl"), True), - ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), - ("687e79.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), + ("687e79.ghidra.be2", "function=0x107588", 
capa.features.insn.Mnemonic("stp"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("bl"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), # insn/operand.number ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x105128,bb=0x1051e4", capa.features.insn.OperandNumber(1, 0xFFFFFFFF), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x107588,bb=0x107588", capa.features.insn.OperandNumber(1, 0x8), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x107588,bb=0x107588,insn=0x1075a4", capa.features.insn.OperandNumber(1, 0x8), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x107588,bb=0x107588,insn=0x1075b8", capa.features.insn.OperandNumber(3, 0x10), "xfail: GSM?", ), # TODO(mr): https://github.com/mandiant/capa/issues/2102 # insn/operand.offset ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(1, 100), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.insn.OperandOffset(3, 100), "xfail: not implemented yet", ), # insn/number - ("687e79.be2", "function=0x107588", capa.features.insn.Number(0x3), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Number(0x3), True), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x107588", capa.features.insn.Number(0x10), "xfail: do we want this for ldp?", ), - ("687e79.be2", "function=0x105C88", capa.features.insn.Number(0xF000), True), + ("687e79.ghidra.be2", "function=0x105C88", capa.features.insn.Number(0xF000), True), # insn/number: negative ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(-1), True, ), # TODO(mr): this should be unsigned / use two's complement, 
https://github.com/mandiant/capa/issues/1755 ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1066e0,bb=0x1068c4", capa.features.insn.Number(0xFFFFFFFF), True, ), # insn/offset ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.insn.Offset(0x0), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.insn.Offset(0x4), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.insn.Offset(0xC), "xfail: not implemented yet", ), # insn/offset: negative ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.insn.Offset(-0x1), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.insn.Offset(-0x2), "xfail: not implemented yet", @@ -246,43 +246,43 @@ # ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api # not extracting dll name - ("687e79.be2", "function=0x105c88", capa.features.insn.API("memset"), True), - ("687e79.be2", "function=0x105c88", capa.features.insn.API("Nope"), False), + ("687e79.ghidra.be2", "function=0x105c88", capa.features.insn.API("memset"), True), + ("687e79.ghidra.be2", "function=0x105c88", capa.features.insn.API("Nope"), False), # insn/string ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x107588", capa.features.common.String("AppDataService start"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1075c0", capa.features.common.String("AppDataService"), True, ), - ("687e79.be2", "function=0x107588", capa.features.common.String("nope"), False), + ("687e79.ghidra.be2", "function=0x107588", capa.features.common.String("nope"), False), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x106d58", 
capa.features.common.String("/data/misc/wifi/wpa_supplicant.conf"), True, ), # insn/regex ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x105c88", capa.features.common.Regex("innerRename"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x106d58", capa.features.common.Regex("/data/misc"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x106d58", capa.features.common.Substring("/data/misc"), True, @@ -293,13 +293,13 @@ # ("mimikatz", "function=0x46D6CE", capa.features.common.String("(null)"), True), # insn/bytes ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Bytes(binascii.unhexlify("00")), "xfail: not implemented yet, may need other test sample", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Bytes(binascii.unhexlify("00")), "xfail: not implemented yet, may need other test sample", @@ -317,13 +317,13 @@ # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), # insn/characteristic(nzxor) ( - "d1e650.be2", + "d1e650.ghidra.be2", "function=0x114af4", capa.features.common.Characteristic("nzxor"), True, ), ( - "d1e650.be2", + "d1e650.ghidra.be2", "function=0x117988", capa.features.common.Characteristic("nzxor"), True, @@ -334,81 +334,81 @@ # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), # insn/characteristic(recursive call) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x105b38", capa.features.common.Characteristic("recursive call"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x106530", capa.features.common.Characteristic("recursive call"), True, ), # insn/characteristic(indirect call) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0", capa.features.common.Characteristic("indirect call"), "xfail: not 
implemented yet", ), # insn/characteristic(calls from) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x105080", capa.features.common.Characteristic("calls from"), True, ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1070e8", capa.features.common.Characteristic("calls from"), False, ), # function/characteristic(calls to) ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x1075c0", capa.features.common.Characteristic("calls to"), True, ), # file/function-name ( - "687e79.be2", + "687e79.ghidra.be2", "file", capa.features.file.FunctionName("__libc_init"), "xfail: TODO should this be a function-name?", ), # os & format & arch - ("687e79.be2", "file", OS(OS_ANDROID), True), - ("687e79.be2", "file", OS(OS_LINUX), False), - ("687e79.be2", "file", OS(OS_WINDOWS), False), + ("687e79.ghidra.be2", "file", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "file", OS(OS_LINUX), False), + ("687e79.ghidra.be2", "file", OS(OS_WINDOWS), False), # os & format & arch are also global features - ("687e79.be2", "function=0x107588", OS(OS_ANDROID), True), - ("687e79.be2", "function=0x1075c0,bb=0x1076c0", OS(OS_ANDROID), True), - ("687e79.be2", "file", Arch(ARCH_I386), False), - ("687e79.be2", "file", Arch(ARCH_AMD64), False), - ("687e79.be2", "file", Arch(ARCH_AARCH64), True), - ("687e79.be2", "function=0x107588", Arch(ARCH_AARCH64), True), - ("687e79.be2", "function=0x1075c0,bb=0x1076c0", Arch(ARCH_AARCH64), True), - ("687e79.be2", "file", Format(FORMAT_ELF), True), - ("687e79.be2", "file", Format(FORMAT_PE), False), - ("687e79.be2", "function=0x107588", Format(FORMAT_ELF), True), - ("687e79.be2", "function=0x107588", Format(FORMAT_PE), False), - ( - "687e79.be2", + ("687e79.ghidra.be2", "function=0x107588", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "function=0x1075c0,bb=0x1076c0", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "file", Arch(ARCH_I386), False), + ("687e79.ghidra.be2", "file", Arch(ARCH_AMD64), False), + ("687e79.ghidra.be2", "file", Arch(ARCH_AARCH64), 
True), + ("687e79.ghidra.be2", "function=0x107588", Arch(ARCH_AARCH64), True), + ("687e79.ghidra.be2", "function=0x1075c0,bb=0x1076c0", Arch(ARCH_AARCH64), True), + ("687e79.ghidra.be2", "file", Format(FORMAT_ELF), True), + ("687e79.ghidra.be2", "file", Format(FORMAT_PE), False), + ("687e79.ghidra.be2", "function=0x107588", Format(FORMAT_ELF), True), + ("687e79.ghidra.be2", "function=0x107588", Format(FORMAT_PE), False), + ( + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.common.Characteristic("call $+5"), "xfail: not implemented yet", ), ( - "687e79.be2", + "687e79.ghidra.be2", "function=0x0,bb=0x0", capa.features.common.Characteristic("call $+5"), "xfail: not implemented yet", @@ -451,10 +451,10 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected): @fixtures.parametrize( "sample,scope,feature,expected", - fixtures.FEATURE_COUNT_TESTS, + fixtures.FEATURE_COUNT_TESTS_GHIDRA, indirect=["sample", "scope"], ) -def test_binexport_feature_counts(sample, scope, feature, expected): +def test_binexport_feature_counts_ghidra(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") From f8b0f5025b23fd84748b5dcb5981628238721a28 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 11:31:37 -0600 Subject: [PATCH 150/200] binexport: fix lints --- .github/ruff.toml | 15 --------------- capa/helpers.py | 1 + capa/main.py | 5 ----- 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/.github/ruff.toml b/.github/ruff.toml index 741f504f9..c3a1de6d9 100644 --- a/.github/ruff.toml +++ b/.github/ruff.toml @@ -41,18 +41,3 @@ exclude = [ "*_pb2.py", "*_pb2.pyi" ] - -[lint] -# Enable the pycodestyle (`E`) and Pyflakes (`F`) rules by default. -# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or -# McCabe complexity (`C901`) by default. 
-select = ["E", "F"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = ["ALL"] -unfixable = [] - -# E402 module level import not at top of file -# E722 do not use bare 'except' -# E501 line too long -ignore = ["E402", "E722", "E501"] diff --git a/capa/helpers.py b/capa/helpers.py index c8d27d492..5b601b263 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -36,6 +36,7 @@ # CAPE extensions: .json, .json_, .json.gz # DRAKVUF Sandbox extensions: .log, .log.gz EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz") +EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2") EXTENSIONS_ELF = "elf_" EXTENSIONS_FREEZE = "frz" diff --git a/capa/main.py b/capa/main.py index 75349f29b..7b9a48947 100644 --- a/capa/main.py +++ b/capa/main.py @@ -44,15 +44,10 @@ from capa.engine import MatchResults from capa.loader import ( BACKEND_VIV, - BACKEND_CAPE, - BACKEND_BINJA, - BACKEND_DOTNET, - BACKEND_FREEZE, - BACKEND_PEFILE, BACKEND_DRAKVUF, BACKEND_BINEXPORT2, From 227fdeb06f8845b628c6b77cabd65f049e094e42 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 12:52:02 -0600 Subject: [PATCH 151/200] binexport: remove Ghidra symbol madness and fix x86/amd64 stack offset number tests --- capa/features/extractors/binexport2/insn.py | 110 ++++---------------- 1 file changed, 22 insertions(+), 88 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 92ef311ee..970306756 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging -from typing import List, Tuple, Iterator +from typing import List, Tuple, Iterator, Optional import capa.features.extractors.helpers import capa.features.extractors.strings @@ -34,6 +34,15 @@ SECURITY_COOKIE_BYTES_DELTA: int = 0x40 +def get_operand_expression_register(op_index: int, be2: BinExport2) -> Optional[str]: + op: BinExport2.Operand = be2.operand[op_index] + if len(op.expression_index) == 1: + exp: BinExport2.Expression = be2.expression[op.expression_index[0]] + if exp.type == BinExport2.Expression.Type.REGISTER: + return exp.symbol.lower() + return None + + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner @@ -88,48 +97,6 @@ def is_address_mapped(be2: BinExport2, address: int) -> bool: return any(section.address <= address < section.address + section.size for section in sections_with_perms) -############################################################################### -# -# begin Ghidra symbol madness ("gsm"). -# -# This is a "temporary" section of code to deal with -# https://github.com/google/binexport/issues/78 -# because Ghidra exports all operands as a single SYMBOL expression node. -# -# Use references to `_is_ghidra_symbol_madness` to remove all this up later. 
- - -def _is_ghidra_symbol_madness(be2: BinExport2, instruction_index: int) -> bool: - instruction = be2.instruction[instruction_index] - for operand_index in instruction.operand_index: - operand = be2.operand[operand_index] - - if len(operand.expression_index) != 1: - return False - - expression0 = be2.expression[operand.expression_index[0]] - - if BinExport2.Expression.Type.SYMBOL != expression0.type: - return False - - return True - - -def _gsm_get_instruction_operand(be2: BinExport2, instruction_index: int, operand_index: int) -> str: - """since Ghidra represents all operands as a single string, just fetch that.""" - instruction = be2.instruction[instruction_index] - operand = be2.operand[instruction.operand_index[operand_index]] - assert len(operand.expression_index) == 1 - expression = be2.expression[operand.expression_index[0]] - assert expression.type == BinExport2.Expression.Type.SYMBOL - return expression.symbol - - -# end Ghidra symbol madness. -# -############################################################################### - - def extract_insn_number_features( fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -142,6 +109,11 @@ def extract_insn_number_features( instruction_index: int = ii.instruction_index instruction: BinExport2.Instruction = be2.instruction[instruction_index] + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + # x86 / amd64 mnemonic = be2.mnemonic[instruction.mnemonic_index] if mnemonic.name.lower().startswith("ret"): @@ -149,55 +121,17 @@ def extract_insn_number_features( # .text:0042250E retn 8 return - _is_gsm = _is_ghidra_symbol_madness(be2, instruction_index) + register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], be2) + if register is not None: + # x86 / amd64 + if mnemonic.name.lower().startswith(("add", "sub")): + if register.endswith(("sp", "bp")): + return for i, operand_index in 
enumerate(instruction.operand_index): operand = be2.operand[operand_index] - if len(operand.expression_index) == 1 and _is_gsm: - # temporarily, we'll have to try to guess at the interpretation. - symbol = _gsm_get_instruction_operand(be2, instruction_index, i) - - # x86 / amd64 - if mnemonic.name.lower() == "add" and symbol.lower() == "esp": - # skip things like: - # - # .text:00401140 call sub_407E2B - # .text:00401145 add esp, 0Ch - return - - if symbol.startswith(("#0x", "#-0x")): - # like: - # - type: SYMBOL - # symbol: "#0xffffffff" - # - type: SYMBOL - # symbol: "#-0x1" - try: - value = int(symbol[len("#") :], 0x10) - except ValueError: - # failed to parse as integer - continue - - # handling continues below at label: has a value - - elif symbol.startswith(("0x", "-0x")): - # like: - # - type: SYMBOL - # symbol: "0x1000" - # - type: SYMBOL - # symbol: "-0x1" - try: - value = int(symbol, 0x10) - except ValueError: - # failed to parse as integer - continue - - # handling continues below at label: has a value - - else: - continue - - elif len(operand.expression_index) == 1: + if len(operand.expression_index) == 1: # - type: IMMEDIATE_INT # immediate: 20588728364 # parent_index: 0 From 446a500f18506472af68b18d277f0d2ebfdf558e Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:13:57 -0600 Subject: [PATCH 152/200] binexport: use masking for Number features --- .../extractors/binexport2/__init__.py | 5 +- .../extractors/binexport2/extractor.py | 20 ++- capa/features/extractors/binexport2/insn.py | 120 +++++++++--------- tests/test_binexport_features.py | 4 +- 4 files changed, 86 insertions(+), 63 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index da7bbcaf3..76731e8ac 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -14,7 +14,7 @@ import hashlib import logging import contextlib -from typing import Dict, 
List, Tuple, Iterator +from typing import Set, Dict, List, Tuple, Iterator from pathlib import Path from collections import defaultdict from dataclasses import dataclass @@ -390,6 +390,9 @@ class AnalysisContext: class FunctionContext: ctx: AnalysisContext flow_graph_index: int + format: Set[str] + os: Set[str] + arch: Set[str] @dataclass diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 1c3c4d393..3ed3b5d07 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import List, Tuple, Iterator +from typing import Set, List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.common @@ -15,7 +15,7 @@ import capa.features.extractors.binexport2.helpers import capa.features.extractors.binexport2.function import capa.features.extractors.binexport2.basicblock -from capa.features.common import Feature +from capa.features.common import OS, Arch, Format, Feature from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -53,6 +53,20 @@ def __init__(self, be2: BinExport2, buf: bytes): self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) + self.format: Set[str] = set() + self.os: Set[str] = set() + self.arch: Set[str] = set() + + for feature, _ in self.global_features: + assert isinstance(feature.value, str) + + if isinstance(feature, Format): + self.format.add(feature.value) + elif isinstance(feature, OS): + self.os.add(feature.value) + elif isinstance(feature, Arch): + 
self.arch.add(feature.value) + # TODO(mr): assert supported file formats, arches # and gradually relax restrictions as they're tested. # https://github.com/mandiant/capa/issues/1755 @@ -82,7 +96,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: yield FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), - inner=FunctionContext(self.ctx, flow_graph_index), + inner=FunctionContext(self.ctx, flow_graph_index, self.format, self.os, self.arch), ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 970306756..a8b84a930 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -12,7 +12,7 @@ import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers from capa.features.insn import API, Number, Mnemonic, OperandNumber -from capa.features.common import Bytes, String, Feature, Characteristic +from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -34,15 +34,61 @@ SECURITY_COOKIE_BYTES_DELTA: int = 0x40 -def get_operand_expression_register(op_index: int, be2: BinExport2) -> Optional[str]: - op: BinExport2.Operand = be2.operand[op_index] +HAS_ARCH32 = {ARCH_I386} +HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} + +HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} +HAS_ARCH_ARM = {ARCH_AARCH64} + + +def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Optional[str]: + op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] if len(op.expression_index) == 1: - exp: BinExport2.Expression = be2.expression[op.expression_index[0]] + exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] if exp.type == 
BinExport2.Expression.Type.REGISTER: return exp.symbol.lower() return None +def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Optional[int]: + op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] + immediate: Optional[int] = None + + if len(op.expression_index) == 1: + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + + exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] + if BinExport2.Expression.Type.IMMEDIATE_INT == exp.type: + immediate = exp.immediate + + elif len(op.expression_index) == 2: + # from IDA, which provides a size hint for every operand, + # we get the following pattern for immediate constants: + # + # - type: SIZE_PREFIX + # symbol: "b8" + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + + exp0: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] + exp1: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[1]] + + if BinExport2.Expression.Type.SIZE_PREFIX == exp0.type: + if BinExport2.Expression.Type.IMMEDIATE_INT == exp1.type: + immediate = exp1.immediate + + if immediate is not None: + if fhi.arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif fhi.arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + + return immediate + + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner @@ -114,65 +160,25 @@ def extract_insn_number_features( # .text:0040116e leave return - # x86 / amd64 - mnemonic = be2.mnemonic[instruction.mnemonic_index] - if mnemonic.name.lower().startswith("ret"): - # skip things like: - # .text:0042250E retn 8 - return + if fhi.arch & HAS_ARCH_INTEL: + mnemonic = be2.mnemonic[instruction.mnemonic_index] + if mnemonic.name.lower().startswith("ret"): + # skip things like: + # .text:0042250E retn 8 + return - register: Optional[str] = 
get_operand_expression_register(instruction.operand_index[0], be2) - if register is not None: - # x86 / amd64 + if fhi.arch & HAS_ARCH_INTEL: if mnemonic.name.lower().startswith(("add", "sub")): - if register.endswith(("sp", "bp")): - return + register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi) + if register is not None: + if register.endswith(("sp", "bp")): + return for i, operand_index in enumerate(instruction.operand_index): - operand = be2.operand[operand_index] - - if len(operand.expression_index) == 1: - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - - expression0 = be2.expression[operand.expression_index[0]] - - if BinExport2.Expression.Type.IMMEDIATE_INT != expression0.type: - continue - - value = expression0.immediate - - # handling continues below at label: has a value - - elif len(operand.expression_index) == 2: - # from IDA, which provides a size hint for every operand, - # we get the following pattern for immediate constants: - # - # - type: SIZE_PREFIX - # symbol: "b8" - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - - expression0 = be2.expression[operand.expression_index[0]] - expression1 = be2.expression[operand.expression_index[1]] - - if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type: - continue - - if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type: - continue - - value = expression1.immediate - - # handling continues below at label: has a value - - else: + value: Optional[int] = get_operand_expression_immediate(operand_index, fhi) + if value is None: continue - # label: has a value - if analysis.base_address == 0x0: # When the image is mapped at 0x0, # then its hard to tell if numbers are pointers or numbers. 
diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 0d1fea392..4180baaf2 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -174,9 +174,9 @@ ( "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", - capa.features.insn.Number(-1), + capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), True, - ), # TODO(mr): this should be unsigned / use two's complement, https://github.com/mandiant/capa/issues/1755 + ), ( "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", From 5836b369ec40c48694b27828ce29722a13fc26be Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:24:25 -0600 Subject: [PATCH 153/200] binexport: ignore call/jmp immediates for intel architecture --- capa/features/extractors/binexport2/insn.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index a8b84a930..241b0a9ca 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -161,17 +161,24 @@ def extract_insn_number_features( return if fhi.arch & HAS_ARCH_INTEL: + # short-circut checks for intel architecture mnemonic = be2.mnemonic[instruction.mnemonic_index] if mnemonic.name.lower().startswith("ret"): # skip things like: # .text:0042250E retn 8 return - if fhi.arch & HAS_ARCH_INTEL: + if mnemonic.name.lower().startswith(("call, j")): + # skip things like: + # 0x415b81 JNZ 0x415B85 + return + if mnemonic.name.lower().startswith(("add", "sub")): register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi) if register is not None: if register.endswith(("sp", "bp")): + # skip things like: + # 0x415bbc ADD ESP, 0xC return for i, operand_index in enumerate(instruction.operand_index): From dfda0de942383aef6f71a414c096446e6d18a50b Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:38:32 -0600 Subject: [PATCH 154/200] 
binexport: check if immediate is a mapped address --- capa/features/extractors/binexport2/insn.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 241b0a9ca..94eb2c1d9 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -168,11 +168,6 @@ def extract_insn_number_features( # .text:0042250E retn 8 return - if mnemonic.name.lower().startswith(("call, j")): - # skip things like: - # 0x415b81 JNZ 0x415B85 - return - if mnemonic.name.lower().startswith(("add", "sub")): register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi) if register is not None: @@ -198,8 +193,9 @@ def extract_insn_number_features( # continue pass - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address + if not is_address_mapped(be2, value): + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: From 7260c29ef9a3aaf61240fa146a0801f4562cf546 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:49:18 -0600 Subject: [PATCH 155/200] binexport: emit offset features for immediates likely structure offsets --- capa/features/extractors/binexport2/insn.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 94eb2c1d9..cd7be03b9 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -11,7 +11,7 @@ import capa.features.extractors.helpers import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers -from capa.features.insn import API, Number, Mnemonic, OperandNumber +from capa.features.insn import API, 
MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( @@ -160,9 +160,10 @@ def extract_insn_number_features( # .text:0040116e leave return + mnemonic = be2.mnemonic[instruction.mnemonic_index] + if fhi.arch & HAS_ARCH_INTEL: # short-circut checks for intel architecture - mnemonic = be2.mnemonic[instruction.mnemonic_index] if mnemonic.name.lower().startswith("ret"): # skip things like: # .text:0042250E retn 8 @@ -193,9 +194,17 @@ def extract_insn_number_features( # continue pass - if not is_address_mapped(be2, value): - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + if fhi.arch & HAS_ARCH_INTEL: + if mnemonic.name.lower().startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: From fc3be313c5562691e48b39bffe313868cc16df4c Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:57:06 -0600 Subject: [PATCH 156/200] binexport: add twos complement wrapper insn.py --- capa/features/extractors/binexport2/insn.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index cd7be03b9..8fbfa5705 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -89,6 +89,14 @@ def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Opt return immediate +def get_immediate_twos_complement(immediate: int, fhi: 
FunctionContext) -> int: + if fhi.arch & HAS_ARCH64: + return capa.features.extractors.helpers.twos_complement(immediate, 64) + elif fhi.arch & HAS_ARCH32: + return capa.features.extractors.helpers.twos_complement(immediate, 32) + return immediate + + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner From 21d2b99e9f8ba1bb0acb1d4dcccf0570dcef46c8 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 6 Aug 2024 10:24:31 -0600 Subject: [PATCH 157/200] binexport: add support for x86 offset features --- .../features/extractors/binexport2/helpers.py | 265 ++++++++++++++++++ capa/features/extractors/binexport2/insn.py | 142 ++++++---- scripts/inspect-binexport2.py | 19 +- tests/test_binexport_features.py | 3 - 4 files changed, 369 insertions(+), 60 deletions(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index d06b5f93d..708dc446a 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,8 +5,273 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+from typing import List, Optional +from dataclasses import dataclass + from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +@dataclass +class ExpressionPhraseInfo: + scale: Optional[BinExport2.Expression] = None + index: Optional[BinExport2.Expression] = None + base: Optional[BinExport2.Expression] = None + displacement: Optional[BinExport2.Expression] = None + + def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: return vertex.HasField("type") and vertex.type == type_ + + +def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[ExpressionPhraseInfo]: + # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): + # + # Scale: A 2-bit constant factor + # Index: Any general purpose register + # Base: Any general purpose register + # Displacement: An integral offset + + # skip first expression, assume BinExport2.Expression.DEREFERENCE + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + + for i, expression in enumerate(expressions): + if expression.type == BinExport2.Expression.DEREFERENCE: + expressions = expressions[i + 1 :] + break + + expression0: BinExport2.Expression + expression1: BinExport2.Expression + expression2: BinExport2.Expression + expression3: BinExport2.Expression + expression4: BinExport2.Expression + + if len(expressions) == 1: + expression0 = expressions[0] + + assert ( + expression0.type == BinExport2.Expression.IMMEDIATE_INT + or expression0.type == BinExport2.Expression.REGISTER + ) + + if expression0.type == BinExport2.Expression.IMMEDIATE_INT: + # Displacement + return ExpressionPhraseInfo(displacement=expression0) + elif expression0.type == BinExport2.Expression.REGISTER: + # Base + return ExpressionPhraseInfo(base=expression0) + + elif len(expressions) == 3: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = 
expressions[2] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.IMMEDIATE_INT + or expression2.type == BinExport2.Expression.REGISTER + ) + + if expression2.type == BinExport2.Expression.REGISTER: + # Base + Index + return ExpressionPhraseInfo(base=expression0, index=expression2) + elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: + # Base + Displacement + return ExpressionPhraseInfo(base=expression0, displacement=expression2) + + elif len(expressions) == 5: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.REGISTER + or expression2.type == BinExport2.Expression.IMMEDIATE_INT + ) + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + + if expression1.symbol == "+" and expression3.symbol == "+": + # Base + Index + Displacement + return ExpressionPhraseInfo(base=expression0, index=expression2, displacement=expression4) + elif expression1.symbol == "+" and expression3.symbol == "*": + # Base + (Index * Scale) + return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression3) + elif expression1.symbol == "*" and expression3.symbol == "+": + # (Index * Scale) + Displacement + return ExpressionPhraseInfo(index=expression0, scale=expression2, displacement=expression3) + else: + raise NotImplementedError(expression1.symbol, expression3.symbol) + + elif len(expressions) == 7: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + expression5 = expressions[5] + 
expression6 = expressions[6] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert expression2.type == BinExport2.Expression.REGISTER + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + assert expression5.type == BinExport2.Expression.OPERATOR + assert expression6.type == BinExport2.Expression.IMMEDIATE_INT + + # Base + (Index * Scale) + Displacement + return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) + + else: + raise NotImplementedError(len(expressions)) + + return None + + +def _get_operand_expression_list( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, + expression_list: List[BinExport2.Expression], +): + exp_index = operand.expression_index[tree_index] + expression = be2.expression[exp_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.REGISTER: + expression_list.append(expression) + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SYMBOL: + expression_list.append(expression) + assert len(children_tree_indexes) <= 1 + + if len(children_tree_indexes) == 0: + return + elif len(children_tree_indexes) == 1: + # like: v + # from: mov v0.D[0x1], x9 + # | + # 0 + # . 
+ # | + # D + child_index = children_tree_indexes[0] + _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.IMMEDIATE_INT: + expression_list.append(expression) + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SIZE_PREFIX: + # like: b4 + # + # We might want to use this occasionally, such as to disambiguate the + # size of MOVs into/out of memory. But I'm not sure when/where we need that yet. + # + # IDA spams this size prefix hint *everywhere*, so we can't rely on the exporter + # to provide it only when necessary. + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif expression.type == BinExport2.Expression.OPERATOR: + + if len(children_tree_indexes) == 1: + # prefix operator, like "ds:" + expression_list.append(expression) + child_index = children_tree_indexes[0] + _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif len(children_tree_indexes) == 2: + # infix operator: like "+" in "ebp+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + expression_list.append(expression) + _get_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) + return + + elif len(children_tree_indexes) == 3: + # infix operator: like "+" in "ebp+ecx+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + child_c = children_tree_indexes[2] + _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + expression_list.append(expression) + _get_operand_expression_list(be2, operand, expression_tree, child_b, 
expression_list) + expression_list.append(expression) + _get_operand_expression_list(be2, operand, expression_tree, child_c, expression_list) + return + + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.DEREFERENCE: + expression_list.append(expression) + + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT: + raise NotImplementedError(expression.type) + + else: + raise NotImplementedError(expression.type) + + +def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[BinExport2.Expression]: + # The reconstructed expression tree layout, linking parent nodes to their children. + # + # There is one list of integers for each expression in the operand. + # These integers are indexes of other expressions in the same operand, + # which are the children of that expression. + # + # So: + # + # [ [1, 3], [2], [], [4], [5], []] + # + # means the first expression has two children, at index 1 and 3, + # and the tree looks like: + # + # 0 + # / \ + # 1 3 + # | | + # 2 4 + # | + # 5 + # + # Remember, these are the indices into the entries in operand.expression_index. 
+ exp_tree: List[List[int]] = [] + for i, exp_index in enumerate(op.expression_index): + children = [] + + # scan all subsequent expressions, looking for those that have parent_index == current.expression_index + for j, candidate_index in enumerate(op.expression_index[i + 1 :]): + candidate = be2.expression[candidate_index] + + if candidate.parent_index == exp_index: + children.append(i + j + 1) + + exp_tree.append(children) + + exp_list: List[BinExport2.Expression] = [] + _get_operand_expression_list(be2, op, exp_tree, 0, exp_list) + + return exp_list diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 8fbfa5705..6bf15700d 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -25,6 +25,7 @@ InstructionContext, ) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ExpressionPhraseInfo from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) @@ -41,7 +42,15 @@ HAS_ARCH_ARM = {ARCH_AARCH64} -def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Optional[str]: +def mask_immediate(fhi: FunctionContext, immediate: int) -> int: + if fhi.arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif fhi.arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + return immediate + + +def get_operand_register(op_index: int, fhi: FunctionContext) -> Optional[str]: op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] if len(op.expression_index) == 1: exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] @@ -50,20 +59,16 @@ def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Opti return None -def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Optional[int]: - op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] - immediate: Optional[int] = 
None - - if len(op.expression_index) == 1: +def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: + if len(operand.expression_index) == 1: # - type: IMMEDIATE_INT # immediate: 20588728364 # parent_index: 0 + expression: BinExport2.Expression = be2.expression[operand.expression_index[0]] + if expression.type == BinExport2.Expression.IMMEDIATE_INT: + return expression - exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] - if BinExport2.Expression.Type.IMMEDIATE_INT == exp.type: - immediate = exp.immediate - - elif len(op.expression_index) == 2: + elif len(operand.expression_index) == 2: # from IDA, which provides a size hint for every operand, # we get the following pattern for immediate constants: # @@ -72,29 +77,14 @@ def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Opt # - type: IMMEDIATE_INT # immediate: 20588728364 # parent_index: 0 + expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] + expression1: BinExport2.Expression = be2.expression[operand.expression_index[1]] - exp0: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] - exp1: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[1]] + if expression0.type == BinExport2.Expression.SIZE_PREFIX: + if expression1.type == BinExport2.Expression.IMMEDIATE_INT: + return expression1 - if BinExport2.Expression.Type.SIZE_PREFIX == exp0.type: - if BinExport2.Expression.Type.IMMEDIATE_INT == exp1.type: - immediate = exp1.immediate - - if immediate is not None: - if fhi.arch & HAS_ARCH64: - immediate &= 0xFFFFFFFFFFFFFFFF - elif fhi.arch & HAS_ARCH32: - immediate &= 0xFFFFFFFF - - return immediate - - -def get_immediate_twos_complement(immediate: int, fhi: FunctionContext) -> int: - if fhi.arch & HAS_ARCH64: - return capa.features.extractors.helpers.twos_complement(immediate, 64) - elif fhi.arch & HAS_ARCH32: - return 
capa.features.extractors.helpers.twos_complement(immediate, 32) - return immediate + return None def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: @@ -158,7 +148,6 @@ def extract_insn_number_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 - analysis: BinExport2Analysis = fhi.ctx.analysis instruction_index: int = ii.instruction_index instruction: BinExport2.Instruction = be2.instruction[instruction_index] @@ -168,17 +157,17 @@ def extract_insn_number_features( # .text:0040116e leave return - mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() if fhi.arch & HAS_ARCH_INTEL: # short-circut checks for intel architecture - if mnemonic.name.lower().startswith("ret"): + if mnemonic.startswith("ret"): # skip things like: # .text:0042250E retn 8 return - if mnemonic.name.lower().startswith(("add", "sub")): - register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi) + if mnemonic.startswith(("add", "sub")): + register: Optional[str] = get_operand_register(instruction.operand_index[0], fhi) if register is not None: if register.endswith(("sp", "bp")): # skip things like: @@ -186,22 +175,13 @@ def extract_insn_number_features( return for i, operand_index in enumerate(instruction.operand_index): - value: Optional[int] = get_operand_expression_immediate(operand_index, fhi) - if value is None: - continue - - if analysis.base_address == 0x0: - # When the image is mapped at 0x0, - # then its hard to tell if numbers are pointers or numbers. - # TODO(mr): be a little less conservative otherwise? 
- # https://github.com/mandiant/capa/issues/1755 + operand: BinExport2.Operand = be2.operand[operand_index] - # TODO(mr): this removes a lot of valid numbers, could check alignment and use additional heuristics - # https://github.com/mandiant/capa/issues/1755 - # if is_address_mapped(be2, value): - # continue - pass + expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if expression is None: + continue + value: int = mask_immediate(fhi, expression.immediate) if is_address_mapped(be2, value): continue @@ -209,7 +189,7 @@ def extract_insn_number_features( yield OperandNumber(i, value), ih.address if fhi.arch & HAS_ARCH_INTEL: - if mnemonic.name.lower().startswith("add"): + if mnemonic.startswith("add"): if 0 < value < MAX_STRUCTURE_SIZE: yield Offset(value), ih.address yield OperandOffset(i, value), ih.address @@ -296,9 +276,61 @@ def extract_insn_string_features( def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): complete - # https://github.com/mandiant/capa/issues/1755 - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if not is_dereference: + continue + + if fhi.arch & HAS_ARCH_INTEL: + phrase_info: Optional[ExpressionPhraseInfo] = ( + 
capa.features.extractors.binexport2.helpers.get_operand_expression_phrase_info(be2, operand) + ) + if not phrase_info: + continue + + if phrase_info.displacement: + if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): + # skips things like: + # 00401068 MOV dword ptr [EBP + local_8],EAX + continue + + value: int = mask_immediate(fhi, phrase_info.displacement.immediate) + if not is_address_mapped(be2, value): + value = capa.features.extractors.helpers.twos_complement(value, 32) + + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + if mnemonic == "lea" and i == 1: + if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): + # like: + # 00401062 MOVZX EAX,word ptr [EDI] + yield Offset(0), ih.address + yield OperandOffset(i, 0), ih.address def is_security_cookie( diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 998741474..1a0abaffb 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -88,8 +88,23 @@ def _render_expression_tree( elif expression.type == BinExport2.Expression.SYMBOL: o.write(expression.symbol) - assert len(children_tree_indexes) == 0 - return + assert len(children_tree_indexes) <= 1 + + if len(children_tree_indexes) == 0: + return + elif len(children_tree_indexes) == 1: + # like: v + # from: mov v0.D[0x1], x9 + # | + # 0 + # . 
+ # | + # D + child_index = children_tree_indexes[0] + _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + return + else: + raise NotImplementedError(len(children_tree_indexes)) elif expression.type == BinExport2.Expression.IMMEDIATE_INT: o.write(f"0x{expression.immediate:X}") diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 4180baaf2..1cf8686f4 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -441,9 +441,6 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") - if isinstance(feature, (capa.features.insn.Offset, capa.features.insn.OperandOffset)): - pytest.xfail("Offset features not supported yet") - sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") assert sample.exists() fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) From 210f127819e9a6b417c31bc1bc025495939c320c Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 6 Aug 2024 10:43:30 -0600 Subject: [PATCH 158/200] binexport: code refactor --- .../features/extractors/binexport2/helpers.py | 59 +++++++++++++--- capa/features/extractors/binexport2/insn.py | 67 +++++-------------- 2 files changed, 65 insertions(+), 61 deletions(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 708dc446a..28f95cac6 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -12,7 +12,7 @@ @dataclass -class ExpressionPhraseInfo: +class OperandPhraseInfo: scale: Optional[BinExport2.Expression] = None index: Optional[BinExport2.Expression] = None base: Optional[BinExport2.Expression] = None @@ -23,7 +23,7 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGr return 
vertex.HasField("type") and vertex.type == type_ -def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[ExpressionPhraseInfo]: +def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]: # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): # # Scale: A 2-bit constant factor @@ -31,9 +31,10 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper # Base: Any general purpose register # Displacement: An integral offset - # skip first expression, assume BinExport2.Expression.DEREFERENCE expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller + # has checked for BinExport2.Expression.DEREFERENCE for i, expression in enumerate(expressions): if expression.type == BinExport2.Expression.DEREFERENCE: expressions = expressions[i + 1 :] @@ -55,10 +56,10 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper if expression0.type == BinExport2.Expression.IMMEDIATE_INT: # Displacement - return ExpressionPhraseInfo(displacement=expression0) + return OperandPhraseInfo(displacement=expression0) elif expression0.type == BinExport2.Expression.REGISTER: # Base - return ExpressionPhraseInfo(base=expression0) + return OperandPhraseInfo(base=expression0) elif len(expressions) == 3: expression0 = expressions[0] @@ -74,10 +75,10 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper if expression2.type == BinExport2.Expression.REGISTER: # Base + Index - return ExpressionPhraseInfo(base=expression0, index=expression2) + return OperandPhraseInfo(base=expression0, index=expression2) elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: # Base + Displacement - return ExpressionPhraseInfo(base=expression0, displacement=expression2) + return 
OperandPhraseInfo(base=expression0, displacement=expression2) elif len(expressions) == 5: expression0 = expressions[0] @@ -97,13 +98,13 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper if expression1.symbol == "+" and expression3.symbol == "+": # Base + Index + Displacement - return ExpressionPhraseInfo(base=expression0, index=expression2, displacement=expression4) + return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4) elif expression1.symbol == "+" and expression3.symbol == "*": # Base + (Index * Scale) - return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression3) + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3) elif expression1.symbol == "*" and expression3.symbol == "+": # (Index * Scale) + Displacement - return ExpressionPhraseInfo(index=expression0, scale=expression2, displacement=expression3) + return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3) else: raise NotImplementedError(expression1.symbol, expression3.symbol) @@ -125,7 +126,7 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper assert expression6.type == BinExport2.Expression.IMMEDIATE_INT # Base + (Index * Scale) + Displacement - return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) else: raise NotImplementedError(len(expressions)) @@ -275,3 +276,39 @@ def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[Bin _get_operand_expression_list(be2, op, exp_tree, 0, exp_list) return exp_list + + +def get_operand_register_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: + if len(operand.expression_index) == 1: + expression: BinExport2.Expression = 
be2.expression[operand.expression_index[0]] + if expression.type == BinExport2.Expression.REGISTER: + return expression + return None + + +def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: + if len(operand.expression_index) == 1: + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + expression: BinExport2.Expression = be2.expression[operand.expression_index[0]] + if expression.type == BinExport2.Expression.IMMEDIATE_INT: + return expression + + elif len(operand.expression_index) == 2: + # from IDA, which provides a size hint for every operand, + # we get the following pattern for immediate constants: + # + # - type: SIZE_PREFIX + # symbol: "b8" + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] + expression1: BinExport2.Expression = be2.expression[operand.expression_index[1]] + + if expression0.type == BinExport2.Expression.SIZE_PREFIX: + if expression1.type == BinExport2.Expression.IMMEDIATE_INT: + return expression1 + + return None diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 6bf15700d..50488e146 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -25,7 +25,12 @@ InstructionContext, ) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle -from capa.features.extractors.binexport2.helpers import ExpressionPhraseInfo +from capa.features.extractors.binexport2.helpers import ( + OperandPhraseInfo, + get_operand_phrase_info, + get_operand_register_expression, + get_operand_immediate_expression, +) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) @@ -50,43 +55,6 @@ def mask_immediate(fhi: FunctionContext, immediate: int) -> int: return immediate -def 
get_operand_register(op_index: int, fhi: FunctionContext) -> Optional[str]: - op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] - if len(op.expression_index) == 1: - exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] - if exp.type == BinExport2.Expression.Type.REGISTER: - return exp.symbol.lower() - return None - - -def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: - if len(operand.expression_index) == 1: - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - expression: BinExport2.Expression = be2.expression[operand.expression_index[0]] - if expression.type == BinExport2.Expression.IMMEDIATE_INT: - return expression - - elif len(operand.expression_index) == 2: - # from IDA, which provides a size hint for every operand, - # we get the following pattern for immediate constants: - # - # - type: SIZE_PREFIX - # symbol: "b8" - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] - expression1: BinExport2.Expression = be2.expression[operand.expression_index[1]] - - if expression0.type == BinExport2.Expression.SIZE_PREFIX: - if expression1.type == BinExport2.Expression.IMMEDIATE_INT: - return expression1 - - return None - - def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner @@ -167,21 +135,22 @@ def extract_insn_number_features( return if mnemonic.startswith(("add", "sub")): - register: Optional[str] = get_operand_register(instruction.operand_index[0], fhi) - if register is not None: - if register.endswith(("sp", "bp")): - # skip things like: - # 0x415bbc ADD ESP, 0xC - return + register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( + be2, be2.operand[instruction.operand_index[0]] + ) + if 
register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): + # skip things like: + # 0x415bbc ADD ESP, 0xC + return for i, operand_index in enumerate(instruction.operand_index): operand: BinExport2.Operand = be2.operand[operand_index] - expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) - if expression is None: + immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: continue - value: int = mask_immediate(fhi, expression.immediate) + value: int = mask_immediate(fhi, immediate_expression.immediate) if is_address_mapped(be2, value): continue @@ -302,9 +271,7 @@ def extract_insn_offset_features( continue if fhi.arch & HAS_ARCH_INTEL: - phrase_info: Optional[ExpressionPhraseInfo] = ( - capa.features.extractors.binexport2.helpers.get_operand_expression_phrase_info(be2, operand) - ) + phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) if not phrase_info: continue From 877134e86c1a503dbe5982128e935ba2f6d158ef Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 6 Aug 2024 16:24:15 -0600 Subject: [PATCH 159/200] binexport: init refactor for multi-arch instruction feature parsing --- .../extractors/binexport2/arch/__init__.py | 0 .../binexport2/arch/arm/__init__.py | 0 .../extractors/binexport2/arch/arm/insn.py | 131 ++++++++++ .../binexport2/arch/intel/__init__.py | 0 .../binexport2/arch/intel/helpers.py | 135 ++++++++++ .../extractors/binexport2/arch/intel/insn.py | 214 ++++++++++++++++ .../features/extractors/binexport2/helpers.py | 146 +++-------- capa/features/extractors/binexport2/insn.py | 237 +++--------------- tests/test_binexport_features.py | 77 ++---- 9 files changed, 567 insertions(+), 373 deletions(-) create mode 100644 capa/features/extractors/binexport2/arch/__init__.py create mode 100644 capa/features/extractors/binexport2/arch/arm/__init__.py create mode 100644 
capa/features/extractors/binexport2/arch/arm/insn.py create mode 100644 capa/features/extractors/binexport2/arch/intel/__init__.py create mode 100644 capa/features/extractors/binexport2/arch/intel/helpers.py create mode 100644 capa/features/extractors/binexport2/arch/intel/insn.py diff --git a/capa/features/extractors/binexport2/arch/__init__.py b/capa/features/extractors/binexport2/arch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/__init__.py b/capa/features/extractors/binexport2/arch/arm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py new file mode 100644 index 000000000..3fdc6fb9f --- /dev/null +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -0,0 +1,131 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import List, Tuple, Iterator, Optional + +import capa.features.extractors.binexport2.helpers +from capa.features.insn import Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + mask_immediate, + is_address_mapped, + get_operand_expressions, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: + continue + + value: int = mask_immediate(fhi.arch, immediate_expression.immediate) + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = 
be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if not is_dereference: + continue + + if mnemonic == "ldp": + # like: + # 0013a2f0 ldp x22,x9,[x21, #0x18] + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + if len(expressions) <= 2: + continue + + if expressions[1].symbol.lower().endswith("sp"): + continue + + value = mask_immediate(fhi.arch, expressions[-1].immediate) + + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) + + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + if mnemonic != "eor": + return + + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + assert len(operands) == 3 + + if operands[1] != operands[2]: + yield Characteristic("nzxor"), ih.address + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + yield from () diff --git 
a/capa/features/extractors/binexport2/arch/intel/__init__.py b/capa/features/extractors/binexport2/arch/intel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/intel/helpers.py b/capa/features/extractors/binexport2/arch/intel/helpers.py new file mode 100644 index 000000000..3696c0d93 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/intel/helpers.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from typing import List, Optional +from dataclasses import dataclass + +from capa.features.extractors.binexport2.helpers import get_operand_expressions +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA: int = 0x40 + + +@dataclass +class OperandPhraseInfo: + scale: Optional[BinExport2.Expression] = None + index: Optional[BinExport2.Expression] = None + base: Optional[BinExport2.Expression] = None + displacement: Optional[BinExport2.Expression] = None + + +def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]: + # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): + # + # Scale: A 2-bit constant factor + # Index: Any general purpose register + # Base: Any general purpose register + # Displacement: An integral offset + + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + + # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller + # has checked for BinExport2.Expression.DEREFERENCE + for i, expression in enumerate(expressions): + if expression.type == BinExport2.Expression.DEREFERENCE: + expressions = expressions[i + 1 :] + break + + expression0: BinExport2.Expression + expression1: BinExport2.Expression + expression2: BinExport2.Expression + expression3: BinExport2.Expression + expression4: BinExport2.Expression + + if len(expressions) == 1: + expression0 = expressions[0] + + assert ( + expression0.type == BinExport2.Expression.IMMEDIATE_INT + or expression0.type == BinExport2.Expression.REGISTER + ) + + if expression0.type == BinExport2.Expression.IMMEDIATE_INT: + # Displacement + return OperandPhraseInfo(displacement=expression0) + elif expression0.type == 
BinExport2.Expression.REGISTER: + # Base + return OperandPhraseInfo(base=expression0) + + elif len(expressions) == 3: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.IMMEDIATE_INT + or expression2.type == BinExport2.Expression.REGISTER + ) + + if expression2.type == BinExport2.Expression.REGISTER: + # Base + Index + return OperandPhraseInfo(base=expression0, index=expression2) + elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: + # Base + Displacement + return OperandPhraseInfo(base=expression0, displacement=expression2) + + elif len(expressions) == 5: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.REGISTER + or expression2.type == BinExport2.Expression.IMMEDIATE_INT + ) + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + + if expression1.symbol == "+" and expression3.symbol == "+": + # Base + Index + Displacement + return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4) + elif expression1.symbol == "+" and expression3.symbol == "*": + # Base + (Index * Scale) + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3) + elif expression1.symbol == "*" and expression3.symbol == "+": + # (Index * Scale) + Displacement + return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3) + else: + raise NotImplementedError(expression1.symbol, expression3.symbol) + + elif len(expressions) == 7: + expression0 = 
expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + expression5 = expressions[5] + expression6 = expressions[6] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert expression2.type == BinExport2.Expression.REGISTER + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + assert expression5.type == BinExport2.Expression.OPERATOR + assert expression6.type == BinExport2.Expression.IMMEDIATE_INT + + # Base + (Index * Scale) + Displacement + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) + + else: + raise NotImplementedError(len(expressions)) + + return None diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py new file mode 100644 index 000000000..9836ef1f7 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -0,0 +1,214 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import List, Tuple, Iterator, Optional + +import capa.features.extractors.strings +import capa.features.extractors.binexport2.helpers +from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + mask_immediate, + is_address_mapped, + get_operand_register_expression, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.intel.helpers import ( + SECURITY_COOKIE_BYTES_DELTA, + OperandPhraseInfo, + get_operand_phrase_info, +) + +logger = logging.getLogger(__name__) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + if mnemonic.startswith("ret"): + # skip things like: + # .text:0042250E retn 8 + return + + if mnemonic.startswith(("add", "sub")): + register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( + be2, be2.operand[instruction.operand_index[0]] + ) + if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): + # skip things like: + # 0x415bbc ADD ESP, 0xC + return + + for i, operand_index in 
enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: + continue + + value: int = mask_immediate(fhi.arch, immediate_expression.immediate) + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + if mnemonic.startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + value: int + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if not is_dereference: + continue + + phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) + if not phrase_info: + continue + + if phrase_info.displacement: + if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): + # skips things like: + # 00401068 MOV dword ptr [EBP + local_8],EAX + continue + + value = mask_immediate(fhi.arch, phrase_info.displacement.immediate) + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + + yield Offset(value), 
ih.address + yield OperandOffset(i, value), ih.address + + if mnemonic == "lea" and i == 1: + if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): + # like: + # 00401062 MOVZX EAX,word ptr [EDI] + yield Offset(0), ih.address + yield OperandOffset(i, 0), ih.address + + +def is_security_cookie( + fhi: FunctionContext, + bbi: BasicBlockContext, + instruction: BinExport2.Instruction, +) -> bool: + """ + check if an instruction is related to security cookie checks. + """ + be2: BinExport2 = fhi.ctx.be2 + + # security cookie check should use SP or BP + op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] + op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] + if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): + return False + + # check_nzxor_security_cookie_delta + # if insn falls at the start of first entry block of the parent function. + flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] + basic_block_index: int = bbi.basic_block_index + bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] + if flow_graph.entry_basic_block_index == basic_block_index: + first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) + if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: + return True + # or insn falls at the end before return in a terminal basic block. 
+ if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): + last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) + if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: + return True + return False + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. + """ + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name: str = mnemonic.name.lower() + if mnemonic_name not in ( + "xor", + "xorpd", + "xorps", + "pxor", + ): + return + + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): + if operands[0] == operands[1]: + return + if is_security_cookie(fhi, bbh.inner, instruction): + return + + yield Characteristic("nzxor"), ih.address + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + yield from () diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 28f95cac6..4e2ef07e0 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,133 +5,45 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and limitations under the License. -from typing import List, Optional -from dataclasses import dataclass +from typing import Set, List, Iterator, Optional +import capa.features.extractors.helpers +from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +HAS_ARCH32 = {ARCH_I386} +HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} -@dataclass -class OperandPhraseInfo: - scale: Optional[BinExport2.Expression] = None - index: Optional[BinExport2.Expression] = None - base: Optional[BinExport2.Expression] = None - displacement: Optional[BinExport2.Expression] = None +HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} +HAS_ARCH_ARM = {ARCH_AARCH64} -def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: - return vertex.HasField("type") and vertex.type == type_ +def mask_immediate(arch: Set[str], immediate: int) -> int: + if arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + return immediate -def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]: - # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): - # - # Scale: A 2-bit constant factor - # Index: Any general purpose register - # Base: Any general purpose register - # Displacement: An integral offset - - expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) - - # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller - # has checked for BinExport2.Expression.DEREFERENCE - for i, expression in enumerate(expressions): - if expression.type == BinExport2.Expression.DEREFERENCE: - expressions = expressions[i + 1 :] - break - - expression0: BinExport2.Expression - expression1: BinExport2.Expression - expression2: BinExport2.Expression 
- expression3: BinExport2.Expression - expression4: BinExport2.Expression - - if len(expressions) == 1: - expression0 = expressions[0] - - assert ( - expression0.type == BinExport2.Expression.IMMEDIATE_INT - or expression0.type == BinExport2.Expression.REGISTER - ) - - if expression0.type == BinExport2.Expression.IMMEDIATE_INT: - # Displacement - return OperandPhraseInfo(displacement=expression0) - elif expression0.type == BinExport2.Expression.REGISTER: - # Base - return OperandPhraseInfo(base=expression0) - - elif len(expressions) == 3: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert ( - expression2.type == BinExport2.Expression.IMMEDIATE_INT - or expression2.type == BinExport2.Expression.REGISTER - ) - - if expression2.type == BinExport2.Expression.REGISTER: - # Base + Index - return OperandPhraseInfo(base=expression0, index=expression2) - elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: - # Base + Displacement - return OperandPhraseInfo(base=expression0, displacement=expression2) - - elif len(expressions) == 5: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - expression3 = expressions[3] - expression4 = expressions[4] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert ( - expression2.type == BinExport2.Expression.REGISTER - or expression2.type == BinExport2.Expression.IMMEDIATE_INT - ) - assert expression3.type == BinExport2.Expression.OPERATOR - assert expression4.type == BinExport2.Expression.IMMEDIATE_INT - - if expression1.symbol == "+" and expression3.symbol == "+": - # Base + Index + Displacement - return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4) - elif expression1.symbol == "+" and expression3.symbol == 
"*": - # Base + (Index * Scale) - return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3) - elif expression1.symbol == "*" and expression3.symbol == "+": - # (Index * Scale) + Displacement - return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3) - else: - raise NotImplementedError(expression1.symbol, expression3.symbol) - - elif len(expressions) == 7: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - expression3 = expressions[3] - expression4 = expressions[4] - expression5 = expressions[5] - expression6 = expressions[6] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert expression2.type == BinExport2.Expression.REGISTER - assert expression3.type == BinExport2.Expression.OPERATOR - assert expression4.type == BinExport2.Expression.IMMEDIATE_INT - assert expression5.type == BinExport2.Expression.OPERATOR - assert expression6.type == BinExport2.Expression.IMMEDIATE_INT - - # Base + (Index * Scale) + Displacement - return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) +def twos_complement(arch: Set[str], immediate: int, default: Optional[int] = None) -> int: + if default is not None: + return capa.features.extractors.helpers.twos_complement(immediate, default) + elif arch & HAS_ARCH64: + return capa.features.extractors.helpers.twos_complement(immediate, 64) + elif arch & HAS_ARCH32: + return capa.features.extractors.helpers.twos_complement(immediate, 32) + return immediate - else: - raise NotImplementedError(len(expressions)) - return None +def is_address_mapped(be2: BinExport2, address: int) -> bool: + """return True if the given address is mapped""" + sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) + return any(section.address <= address < section.address + 
section.size for section in sections_with_perms) + + +def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: + return vertex.HasField("type") and vertex.type == type_ def _get_operand_expression_list( diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 50488e146..eab2182f8 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -6,13 +6,15 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import List, Tuple, Iterator, Optional +from typing import List, Tuple, Iterator import capa.features.extractors.helpers import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers -from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset -from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic +import capa.features.extractors.binexport2.arch.arm.insn +import capa.features.extractors.binexport2.arch.intel.insn +from capa.features.insn import API, Mnemonic +from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -20,40 +22,15 @@ BinExport2Index, FunctionContext, ReadMemoryError, - BasicBlockContext, BinExport2Analysis, InstructionContext, ) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle -from capa.features.extractors.binexport2.helpers import ( - OperandPhraseInfo, - get_operand_phrase_info, - get_operand_register_expression, - get_operand_immediate_expression, -) +from 
capa.features.extractors.binexport2.helpers import HAS_ARCH_ARM, HAS_ARCH_INTEL from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) -# security cookie checks may perform non-zeroing XORs, these are expected within a certain -# byte range within the first and returning basic blocks, this helps to reduce FP features -SECURITY_COOKIE_BYTES_DELTA: int = 0x40 - - -HAS_ARCH32 = {ARCH_I386} -HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} - -HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} -HAS_ARCH_ARM = {ARCH_AARCH64} - - -def mask_immediate(fhi: FunctionContext, immediate: int) -> int: - if fhi.arch & HAS_ARCH64: - immediate &= 0xFFFFFFFFFFFFFFFF - elif fhi.arch & HAS_ARCH32: - immediate &= 0xFFFFFFFF - return immediate - def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner @@ -103,65 +80,15 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle """ -def is_address_mapped(be2: BinExport2, address: int) -> bool: - """return True if the given address is mapped""" - sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) - return any(section.address <= address < section.address + section.size for section in sections_with_perms) - - def extract_insn_number_features( - fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - - be2: BinExport2 = fhi.ctx.be2 - - instruction_index: int = ii.instruction_index - instruction: BinExport2.Instruction = be2.instruction[instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() if fhi.arch & HAS_ARCH_INTEL: - # 
short-circut checks for intel architecture - if mnemonic.startswith("ret"): - # skip things like: - # .text:0042250E retn 8 - return - - if mnemonic.startswith(("add", "sub")): - register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( - be2, be2.operand[instruction.operand_index[0]] - ) - if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): - # skip things like: - # 0x415bbc ADD ESP, 0xC - return - - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - - immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) - if not immediate_expression: - continue - - value: int = mask_immediate(fhi, immediate_expression.immediate) - if is_address_mapped(be2, value): - continue - - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - - if fhi.arch & HAS_ARCH_INTEL: - if mnemonic.startswith("add"): - if 0 < value < MAX_STRUCTURE_SIZE: - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_number_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_number_features(fh, bbh, ih) def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: @@ -246,133 +173,26 @@ def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() - - for i, 
operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - - is_dereference = False - for expression_index in operand.expression_index: - if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if not is_dereference: - continue - if fhi.arch & HAS_ARCH_INTEL: - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if not phrase_info: - continue - - if phrase_info.displacement: - if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): - # skips things like: - # 00401068 MOV dword ptr [EBP + local_8],EAX - continue - - value: int = mask_immediate(fhi, phrase_info.displacement.immediate) - if not is_address_mapped(be2, value): - value = capa.features.extractors.helpers.twos_complement(value, 32) - - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address - - if mnemonic == "lea" and i == 1: - if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - - elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): - # like: - # 00401062 MOVZX EAX,word ptr [EDI] - yield Offset(0), ih.address - yield OperandOffset(i, 0), ih.address - - -def is_security_cookie( - fhi: FunctionContext, - bbi: BasicBlockContext, - instruction: BinExport2.Instruction, -) -> bool: - """ - check if an instruction is related to security cookie checks. 
- """ - be2: BinExport2 = fhi.ctx.be2 - - # security cookie check should use SP or BP - op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] - op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] - if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): - return False - - # check_nzxor_security_cookie_delta - # if insn falls at the start of first entry block of the parent function. - flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] - basic_block_index: int = bbi.basic_block_index - bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] - if flow_graph.entry_basic_block_index == basic_block_index: - first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) - if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: - return True - # or insn falls at the end before return in a terminal basic block. - if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): - last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) - if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: - return True - return False + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_offset_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_offset_features(fh, bbh, ih) def extract_insn_nzxor_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - """ - parse non-zeroing XOR instruction from the given instruction. - ignore expected non-zeroing XORs, e.g. security cookies. 
- """ fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name: str = mnemonic.name.lower() - if mnemonic_name not in ( - "xor", - "xorpd", - "xorps", - "pxor", # x86 / amd64 - "eor", # arm / aarch64 - ): - return - - operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - - # check whether operands are same for x86 / amd64 - if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): - if operands[0] == operands[1]: - return - if is_security_cookie(fhi, bbh.inner, instruction): - return - - # check whether 2nd/3rd operands are same for arm / aarch64 - if mnemonic_name == "eor": - assert len(operands) == 3 - if operands[1] == operands[2]: - return - - yield Characteristic("nzxor"), ih.address + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_nzxor_characteristic_features( + fh, bbh, ih + ) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_nzxor_characteristic_features( + fh, bbh, ih + ) def extract_insn_mnemonic_features( @@ -412,9 +232,16 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): complete - # https://github.com/mandiant/capa/issues/1755 - yield from () + fhi: FunctionContext = fh.inner + + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) + elif fhi.arch & HAS_ARCH_ARM: + yield from 
capa.features.extractors.binexport2.arch.arm.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 1cf8686f4..872a89d11 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -46,7 +46,7 @@ "687e79.ghidra.be2", "file", capa.features.file.Export("android::clearDir"), - "xfail: not implemented yet?!", + "xfail: name demangling is not implemented", ), ("687e79.ghidra.be2", "file", capa.features.file.Export("nope"), False), # file/imports @@ -104,19 +104,6 @@ capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), - # bb/characteristic(tight loop) - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", - ), # insn/mnemonic ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("stp"), True), ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), @@ -142,24 +129,24 @@ capa.features.insn.OperandNumber(1, 0x8), True, ), - ( - "687e79.ghidra.be2", - "function=0x107588,bb=0x107588,insn=0x1075b8", - capa.features.insn.OperandNumber(3, 0x10), - "xfail: GSM?", - ), # TODO(mr): https://github.com/mandiant/capa/issues/2102 # insn/operand.offset ( "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.insn.OperandOffset(1, 100), - "xfail: not implemented yet", + "function=0x105128,bb=0x105450", + capa.features.insn.OperandOffset(2, 0x10), + True, ), ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.insn.OperandOffset(3, 100), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + 
"function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.OperandOffset(2, -0x48), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.OperandOffset(2, 0x20), + False, ), # insn/number ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Number(0x3), True), @@ -181,7 +168,7 @@ "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), - "xfail: not implemented yet", + True, ), ( "687e79.ghidra.be2", @@ -192,34 +179,22 @@ # insn/offset ( "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0x0), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0x4), - "xfail: not implemented yet", + "function=0x105128,bb=0x105450", + capa.features.insn.Offset(0x10), + True, ), ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0xC), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.Offset(0x20), + False, ), # insn/offset: negative ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(-0x1), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(-0x2), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + "function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.Offset(-0x48), + True, ), # insn/offset from mnemonic: add # From be5f49a88d9f46b488557b580ede4b69edb7e2e8 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Mon, 12 Aug 2024 13:24:18 -0600 Subject: [PATCH 160/200] binexport: intel: emit indirect call characteristic --- .../extractors/binexport2/arch/intel/insn.py | 38 ++++++++++++++++++- .../features/extractors/binexport2/helpers.py | 4 ++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py 
b/capa/features/extractors/binexport2/arch/intel/insn.py index 9836ef1f7..20d505f8a 100644 --- a/capa/features/extractors/binexport2/arch/intel/insn.py +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -18,6 +18,7 @@ from capa.features.extractors.binexport2.helpers import ( mask_immediate, is_address_mapped, + get_instruction_mnemonic, get_operand_register_expression, get_operand_immediate_expression, ) @@ -211,4 +212,39 @@ def extract_insn_nzxor_characteristic_features( def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = get_instruction_mnemonic(be2, instruction) + if mnemonic not in ("call", "jmp"): + return + + assert len(instruction.operand_index) == 1 + + operand: BinExport2.Operand = be2.operand[instruction.operand_index[0]] + + if len(operand.expression_index) == 1: + expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] + # call edx + if expression0.type == BinExport2.Expression.REGISTER: + yield Characteristic("indirect call"), ih.address + else: + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if is_dereference: + phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) + if phrase_info and phrase_info.base: + # call dword ptr [eax+50h] + yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 4e2ef07e0..285ab17f0 100644 --- 
a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -224,3 +224,7 @@ def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operan return expression1 return None + + +def get_instruction_mnemonic(be2: BinExport2, instruction: BinExport2.Instruction) -> str: + return be2.mnemonic[instruction.mnemonic_index].name.lower() From 08c3429de3036f8e5c27cf13c9657898beda15cf Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Mon, 12 Aug 2024 13:30:48 -0600 Subject: [PATCH 161/200] binexport: use helper method for instruction mnemonic --- capa/features/extractors/binexport2/arch/arm/insn.py | 5 +++-- .../features/extractors/binexport2/arch/intel/insn.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 3fdc6fb9f..65de8da7d 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -18,6 +18,7 @@ mask_immediate, is_address_mapped, get_operand_expressions, + get_instruction_mnemonic, get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -70,7 +71,7 @@ def extract_insn_offset_features( # .text:0040116e leave return - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + mnemonic: str = get_instruction_mnemonic(be2, instruction) for i, operand_index in enumerate(instruction.operand_index): operand: BinExport2.Operand = be2.operand[operand_index] @@ -112,7 +113,7 @@ def extract_insn_nzxor_characteristic_features( be2: BinExport2 = fhi.ctx.be2 instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + mnemonic: str = get_instruction_mnemonic(be2, instruction) if mnemonic != "eor": return diff --git 
a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py index 20d505f8a..94459de14 100644 --- a/capa/features/extractors/binexport2/arch/intel/insn.py +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -48,7 +48,7 @@ def extract_insn_number_features( # .text:0040116e leave return - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + mnemonic: str = get_instruction_mnemonic(be2, instruction) if mnemonic.startswith("ret"): # skip things like: @@ -98,7 +98,7 @@ def extract_insn_offset_features( # .text:0040116e leave return - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + mnemonic: str = get_instruction_mnemonic(be2, instruction) value: int for i, operand_index in enumerate(instruction.operand_index): @@ -188,9 +188,8 @@ def extract_insn_nzxor_characteristic_features( be2: BinExport2 = fhi.ctx.be2 instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name: str = mnemonic.name.lower() - if mnemonic_name not in ( + mnemonic: str = get_instruction_mnemonic(be2, instruction) + if mnemonic not in ( "xor", "xorpd", "xorps", @@ -200,7 +199,7 @@ def extract_insn_nzxor_characteristic_features( operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): + if mnemonic in ("xor", "xorpd", "xorps", "pxor"): if operands[0] == operands[1]: return if is_security_cookie(fhi, bbh.inner, instruction): From a388b716c5fcc1862ea1967987cfd34583e22868 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Mon, 12 Aug 2024 14:09:21 -0600 Subject: [PATCH 162/200] binexport: arm: emit offset features from stp instruction --- capa/features/extractors/binexport2/arch/arm/insn.py | 2 +- tests/test_binexport_features.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 
deletion(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 65de8da7d..9da0bc1dc 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -85,7 +85,7 @@ def extract_insn_offset_features( if not is_dereference: continue - if mnemonic == "ldp": + if mnemonic in ("ldp", "stp"): # like: # 0013a2f0 ldp x22,x9,[x21, #0x18] expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 872a89d11..fa02285f1 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -189,6 +189,12 @@ capa.features.insn.Offset(0x20), False, ), + ( + "d1e650.ghidra.be2", + "function=0x1183e0,bb=0x11849c,insn=0x1184b0", + capa.features.insn.Offset(0x8), + True, + ), # insn/offset: negative ( "d1e650.ghidra.be2", From d74c1daf85cc526891af9e998b8a893aabfa1d3f Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Mon, 12 Aug 2024 14:29:39 -0600 Subject: [PATCH 163/200] binexport: arm: emit indirect call characteristic --- .../extractors/binexport2/arch/arm/insn.py | 20 ++++++++++++++++++- tests/test_binexport_features.py | 12 +++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 9da0bc1dc..7fb80f72c 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -129,4 +129,22 @@ def extract_insn_nzxor_characteristic_features( def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - yield from () + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: 
BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if mnemonic not in ("blx", "bx", "blr"): + return + + assert len(instruction.operand_index) == 1 + + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, be2.operand[instruction.operand_index[0]]) + + assert len(expressions) == 1 + + if expressions[0].type == BinExport2.Expression.REGISTER: + yield Characteristic("indirect call"), ih.address diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index fa02285f1..75a224a81 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -328,16 +328,16 @@ ), # insn/characteristic(indirect call) ( - "687e79.ghidra.be2", - "function=0x0", + "d1e650.ghidra.be2", + "function=0x118620", capa.features.common.Characteristic("indirect call"), - "xfail: not implemented yet", + True ), ( - "687e79.ghidra.be2", - "function=0x0", + "d1e650.ghidra.be2", + "function=0x118500", capa.features.common.Characteristic("indirect call"), - "xfail: not implemented yet", + False, ), # insn/characteristic(calls from) ( From fe48a75d8b04808178b70523a32a6ae2caa84efb Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Mon, 12 Aug 2024 17:26:01 -0600 Subject: [PATCH 164/200] binexport: arm: improve offset feature extraction --- .../extractors/binexport2/arch/arm/helpers.py | 15 +++ .../extractors/binexport2/arch/arm/insn.py | 106 ++++++++++++++---- .../features/extractors/binexport2/helpers.py | 4 + tests/test_binexport_features.py | 37 +++++- 4 files changed, 134 insertions(+), 28 deletions(-) create mode 100644 capa/features/extractors/binexport2/arch/arm/helpers.py diff --git a/capa/features/extractors/binexport2/arch/arm/helpers.py b/capa/features/extractors/binexport2/arch/arm/helpers.py new file mode 100644 index 000000000..13e1f8b64 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/arm/helpers.py @@ -0,0 +1,15 @@ +# Copyright (C) 2024 
Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +def is_stack_register_expression(be2: BinExport2, expression: BinExport2.Expression) -> bool: + return bool( + expression and expression.type == BinExport2.Expression.REGISTER and expression.symbol.lower().endswith("sp") + ) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 7fb80f72c..a77b6be41 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -19,9 +19,11 @@ is_address_mapped, get_operand_expressions, get_instruction_mnemonic, + get_instruction_operands, get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression logger = logging.getLogger(__name__) @@ -72,36 +74,96 @@ def extract_insn_offset_features( return mnemonic: str = get_instruction_mnemonic(be2, instruction) + value: Optional[int] = None + value_index: Optional[int] = None - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] + operands: List[BinExport2.Operand] + immediate_expression: Optional[BinExport2.Expression] - is_dereference = False - for expression_index in operand.expression_index: - if 
be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if not is_dereference: - continue + if mnemonic.startswith(("ldr", "str")): + operands = get_instruction_operands(be2, instruction) + expressions1: List[BinExport2.Expression] - if mnemonic in ("ldp", "stp"): + if len(operands) == 2: # like: - # 0013a2f0 ldp x22,x9,[x21, #0x18] - expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) - if len(expressions) <= 2: - continue + # ldr x0, [x1, 8] + expressions1 = get_operand_expressions(be2, operands[1]) + + if len(expressions1) == 4: + # like: + # ldr x0, [x1, 8] + if not is_stack_register_expression(be2, expressions1[1]): + if expressions1[3].type == BinExport2.Expression.IMMEDIATE_INT: + value = expressions1[3].immediate + value_index = 1 + + elif len(expressions1) == 5: + # like + # ldr x0, [x1, 8]! + if not is_stack_register_expression(be2, expressions1[2]): + if expressions1[4].type == BinExport2.Expression.IMMEDIATE_INT: + value = expressions1[4].immediate + value_index = 1 + + elif len(operands) == 3: + # like: + # ldr x0, [x1], 8 + expressions1 = get_operand_expressions(be2, operands[1]) + if not is_stack_register_expression(be2, expressions1[1]): + immediate_expression = get_operand_immediate_expression(be2, operands[2]) + + if immediate_expression: + value = immediate_expression.immediate + value_index = 2 - if expressions[1].symbol.lower().endswith("sp"): - continue + elif mnemonic in ("ldp", "stp"): + operands = get_instruction_operands(be2, instruction) + expressions2: List[BinExport2.Expression] - value = mask_immediate(fhi.arch, expressions[-1].immediate) + if len(operands) == 3: + # like: + # ldp x0, x1, [x3, 8]! 
+ expressions2 = get_operand_expressions(be2, operands[2]) + + if len(expressions2) == 4: + # like: + # ldp x0, x1, [x3, 8] + if not is_stack_register_expression(be2, expressions2[1]): + if expressions2[3].type == BinExport2.Expression.IMMEDIATE_INT: + value = expressions2[3].immediate + value_index = 2 + + elif len(expressions2) == 5: + # like: + # ldp x0, x1, [x3, 8]! + if not is_stack_register_expression(be2, expressions2[2]): + if expressions2[4].type == BinExport2.Expression.IMMEDIATE_INT: + value = expressions2[4].immediate + value_index = 2 + + elif len(operands) == 4: + # like + # ldp x0, x1, [x3], 8 + expressions2 = get_operand_expressions(be2, operands[2]) + + if not is_stack_register_expression(be2, expressions2[1]): + immediate_expression = get_operand_immediate_expression(be2, operands[3]) + + if immediate_expression: + value = immediate_expression.immediate + value_index = 3 + + if value is None: + return - if not is_address_mapped(be2, value): - value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) + # we shouldn't make it here if index is not set + assert value_index is not None - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + value = mask_immediate(fhi.arch, value) + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) + yield Offset(value), ih.address + yield OperandOffset(value_index, value), ih.address def extract_insn_nzxor_characteristic_features( diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 285ab17f0..b2ce75fa5 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -228,3 +228,7 @@ def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operan def get_instruction_mnemonic(be2: BinExport2, instruction: BinExport2.Instruction) -> str: return 
be2.mnemonic[instruction.mnemonic_index].name.lower() + + +def get_instruction_operands(be2: BinExport2, instruction: BinExport2.Instruction) -> List[BinExport2.Operand]: + return [be2.operand[operand_index] for operand_index in instruction.operand_index] diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 75a224a81..f685ba24d 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -183,18 +183,48 @@ capa.features.insn.Offset(0x10), True, ), + # ldp x29,x30,[sp, #0x20] ( "d1e650.ghidra.be2", "function=0x13347c,bb=0x133548,insn=0x133554", capa.features.insn.Offset(0x20), False, ), + # stp x20,x0,[x19, #0x8] ( "d1e650.ghidra.be2", "function=0x1183e0,bb=0x11849c,insn=0x1184b0", capa.features.insn.Offset(0x8), True, ), + # str xzr,[x8, #0x8]! + ( + "d1e650.ghidra.be2", + "function=0x138688,bb=0x138994,insn=0x1389a8", + capa.features.insn.Offset(0x8), + True, + ), + # ldr x9,[x8, #0x8]! + ( + "d1e650.ghidra.be2", + "function=0x138688,bb=0x138978,insn=0x138984", + capa.features.insn.Offset(0x8), + True, + ), + # ldr x19,[sp], #0x20 + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.insn.Offset(0x20), + False, + ), + # ldrb w9,[x8, #0x1] + ( + "d1e650.ghidra.be2", + "function=0x138a9c,bb=0x138b00,insn=0x138b00", + capa.features.insn.Offset(0x1), + True, + ), # insn/offset: negative ( "d1e650.ghidra.be2", @@ -327,12 +357,7 @@ True, ), # insn/characteristic(indirect call) - ( - "d1e650.ghidra.be2", - "function=0x118620", - capa.features.common.Characteristic("indirect call"), - True - ), + ("d1e650.ghidra.be2", "function=0x118620", capa.features.common.Characteristic("indirect call"), True), ( "d1e650.ghidra.be2", "function=0x118500", From c22f7736f8f8b130fa253235a22ebd067d867427 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 13:55:52 -0600 Subject: [PATCH 165/200] binexport: add workaroud for Ghidra bug that results in empty operands (no expressions) --- 
capa/features/extractors/binexport2/helpers.py | 5 +++++ scripts/inspect-binexport2.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index b2ce75fa5..3bad3162c 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -171,6 +171,11 @@ def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[Bin # 5 # # Remember, these are the indices into the entries in operand.expression_index. + if len(op.expression_index) == 0: + # Ghidra bug where empty operands (no expressions) may + # exist (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + return [] + exp_tree: List[List[int]] = [] for i, exp_index in enumerate(op.expression_index): children = [] diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 1a0abaffb..89c223d0f 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -364,7 +364,10 @@ def main(argv=None): operands = [] for operand_index in instruction.operand_index: operand = be2.operand[operand_index] - operands.append(render_operand(be2, instruction, operand, index=operand_index)) + # Ghidra bug where empty operands (no expressions) may + # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + if len(operand.expression_index) > 0: + operands.append(render_operand(be2, instruction, operand, index=operand_index)) call_targets = "" if instruction.call_target: From a9b7713df351279987c51b2172a1c4fdcb986e4c Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 14:13:06 -0600 Subject: [PATCH 166/200] binexport: skip x86 stack string tests --- tests/test_binexport_features.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index f685ba24d..4c8649a22 100644 --- 
a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import binascii +from typing import cast import pytest import fixtures @@ -447,6 +448,9 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected): if "mimikatz.exe_" not in sample.name: pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") + if isinstance(feature, capa.features.common.Characteristic) and "stack string" in cast(str, feature.value): + pytest.skip("for now only testing basic features") + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") assert sample.exists() fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) From 65e320e132aaf28a942a7306b9905bb365f1ad7a Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 15:44:34 -0600 Subject: [PATCH 167/200] binexport: update mimikatz.exe_ feature count tests for Ghidra --- tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index a2aa56aea..9b93b19ca 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -1381,7 +1381,7 @@ def parametrize(params, values, **kwargs): FEATURE_COUNT_TESTS_GHIDRA = [ # Ghidra may render functions as labels, as well as provide differing amounts of call references ("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), 0), - ("mimikatz", "function=0x401000", capa.features.common.Characteristic("calls to"), 0), + ("mimikatz", "function=0x401bf1", capa.features.common.Characteristic("calls to"), 2), ("mimikatz", "function=0x401000", capa.features.basicblock.BasicBlock(), 3), ] From 1f105197501ffbd0b99275763163ae1f7bc3f3fa Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: 
Thu, 15 Aug 2024 15:51:42 -0600 Subject: [PATCH 168/200] core: loader: update binja import --- capa/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/loader.py b/capa/loader.py index e8aad26a6..065710e7f 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -231,7 +231,7 @@ def get_extractor( elif backend == BACKEND_BINJA: import capa.helpers - from capa.features.extractors.binaryninja.find_binja_api import find_binja_path + from capa.features.extractors.binja.find_binja_api import find_binja_path # When we are running as a standalone executable, we cannot directly import binaryninja # We need to fist find the binja API installation path and add it into sys.path From 5d89c29b8e9a31dd3470ea2e1e1a0357728bf484 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 16:33:14 -0600 Subject: [PATCH 169/200] core: loader: update binja imports --- capa/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index 065710e7f..c442dcb8e 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -249,7 +249,7 @@ def get_extractor( + "https://docs.binary.ninja/dev/batch.html#install-the-api)." 
) - import capa.features.extractors.binaryninja.extractor + import capa.features.extractors.binja.extractor if input_format not in (FORMAT_SC32, FORMAT_SC64): if not is_supported_format(input_path): @@ -266,7 +266,7 @@ def get_extractor( if bv is None: raise RuntimeError(f"Binary Ninja cannot open file {input_path}") - return capa.features.extractors.binaryninja.extractor.BinjaFeatureExtractor(bv) + return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) elif backend == BACKEND_PEFILE: import capa.features.extractors.pefile From a10efe078e93ea7f23854fa4a51dabcaf4bf814e Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 16:38:38 -0600 Subject: [PATCH 170/200] binexport: arm: ignore number features for add instruction manipulating stack --- .../extractors/binexport2/arch/arm/insn.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index a77b6be41..f3d0d8743 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -20,6 +20,7 @@ get_operand_expressions, get_instruction_mnemonic, get_instruction_operands, + get_operand_register_expression, get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -44,6 +45,19 @@ def extract_insn_number_features( # .text:0040116e leave return + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if mnemonic == "add": + assert len(instruction.operand_index) == 3 + + expression1: Optional[BinExport2.Expression] = get_operand_register_expression( + be2, be2.operand[instruction.operand_index[1]] + ) + if expression1 and is_stack_register_expression(be2, expression1): + # skip things like: + # add x0,sp,#0x8 + return + for i, operand_index in enumerate(instruction.operand_index): operand: BinExport2.Operand = be2.operand[operand_index] From 
1fa7f02364257b2f6db2820b612594f2fd4aaea5 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 16:39:13 -0600 Subject: [PATCH 171/200] binexport: update unit tests --- tests/test_binexport_features.py | 65 +++++++++++--------------------- 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 4c8649a22..e31b2327d 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -111,6 +111,14 @@ ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("bl"), True), ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), + # insn/number + # 00114524 add x29,sp,#0x10 + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.insn.Number(0x10), + False, + ), # insn/operand.number ( "687e79.ghidra.be2", @@ -149,14 +157,6 @@ capa.features.insn.OperandOffset(2, 0x20), False, ), - # insn/number - ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Number(0x3), True), - ( - "687e79.ghidra.be2", - "function=0x107588", - capa.features.insn.Number(0x10), - "xfail: do we want this for ldp?", - ), ("687e79.ghidra.be2", "function=0x105C88", capa.features.insn.Number(0xF000), True), # insn/number: negative ( @@ -299,34 +299,20 @@ capa.features.common.Substring("/data/misc"), True, ), - # # insn/string, pointer to string - # ("mimikatz", "function=0x44EDEF", capa.features.common.String("INPUTEVENT"), True), - # # insn/string, direct memory reference - # ("mimikatz", "function=0x46D6CE", capa.features.common.String("(null)"), True), # insn/bytes ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.common.Bytes(binascii.unhexlify("00")), - "xfail: not implemented yet, may need other test sample", + "d1e650.ghidra.be2", + "function=0x1165a4", + 
capa.features.common.Bytes(binascii.unhexlify("E405B89370BA6B419CD7925275BF6FCC1E8360CC")), + True, ), + # # don't extract byte features for obvious strings ( "687e79.ghidra.be2", - "function=0x0", - capa.features.common.Bytes(binascii.unhexlify("00")), - "xfail: not implemented yet, may need other test sample", + "function=0x1057f8", + capa.features.common.Bytes("/system/xbin/busybox".encode("utf-16le")), + False, ), - # # don't extract byte features for obvious strings - # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardControl".encode("utf-16le")), False), - # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardTransmit".encode("utf-16le")), False), - # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("ACR > ".encode("utf-16le")), False), - # ("mimikatz", "function=0x40105D", capa.features.common.Bytes("nope".encode("ascii")), False), - # # push offset aAcsAcr1220 ; "ACS..." -> where ACS == 41 00 43 00 == valid pointer to middle of instruction - # ("mimikatz", "function=0x401000", capa.features.common.Bytes(binascii.unhexlify("FDFF59F647")), False), - # # IDA features included byte sequences read from invalid memory, fixed in #409 - # ("mimikatz", "function=0x44570F", capa.features.common.Bytes(binascii.unhexlify("FF" * 256)), False), - # # insn/bytes, pointer to string bytes - # ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), # insn/characteristic(nzxor) ( "d1e650.ghidra.be2", @@ -365,6 +351,13 @@ capa.features.common.Characteristic("indirect call"), False, ), + ("d1e650.ghidra.be2", "function=0x118620", capa.features.common.Characteristic("indirect call"), True), + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.common.Characteristic("indirect call"), + True, + ), # insn/characteristic(calls from) ( "687e79.ghidra.be2", @@ -408,18 +401,6 @@ ("687e79.ghidra.be2", "file", Format(FORMAT_PE), False), ("687e79.ghidra.be2", "function=0x107588", 
Format(FORMAT_ELF), True), ("687e79.ghidra.be2", "function=0x107588", Format(FORMAT_PE), False), - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("call $+5"), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("call $+5"), - "xfail: not implemented yet", - ), ], # order tests by (file, item) # so that our LRU cache is most effective. From 5624d9f0625ad24488fe478dd15da7e64eb48c12 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 16:45:32 -0600 Subject: [PATCH 172/200] binexport: arm: ignore number features for sub instruction manipulating stack --- capa/features/extractors/binexport2/arch/arm/insn.py | 2 +- tests/test_binexport_features.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index f3d0d8743..b8e3a51d4 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -47,7 +47,7 @@ def extract_insn_number_features( mnemonic: str = get_instruction_mnemonic(be2, instruction) - if mnemonic == "add": + if mnemonic in ("add", "sub"): assert len(instruction.operand_index) == 3 expression1: Optional[BinExport2.Expression] = get_operand_register_expression( diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index e31b2327d..3b3ea7a58 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -119,6 +119,13 @@ capa.features.insn.Number(0x10), False, ), + # 00105128 sub sp,sp,#0xE0 + ( + "687e79.ghidra.be2", + "function=0x105128", + capa.features.insn.Number(0xE0), + False, + ), # insn/operand.number ( "687e79.ghidra.be2", From c9d58a301ba084778bdb61ad78ec0becf0da317b Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 15 Aug 2024 17:00:20 -0600 Subject: [PATCH 173/200] binexport: 
arm: emit offset features for add instructions --- .../extractors/binexport2/arch/arm/insn.py | 8 ++++- tests/test_binexport_features.py | 29 +++++-------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index b8e3a51d4..6452d7111 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -9,7 +9,7 @@ from typing import List, Tuple, Iterator, Optional import capa.features.extractors.binexport2.helpers -from capa.features.insn import Number, Offset, OperandNumber, OperandOffset +from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset from capa.features.common import Feature, Characteristic from capa.features.address import Address from capa.features.extractors.binexport2 import FunctionContext, InstructionContext @@ -72,6 +72,12 @@ def extract_insn_number_features( yield Number(value), ih.address yield OperandNumber(i, value), ih.address + if mnemonic == "add": + if i == 2: + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 3b3ea7a58..3bf6d56d7 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -241,28 +241,13 @@ True, ), # insn/offset from mnemonic: add - # - # should not be considered, too big for an offset: - # .text:00401D85 81 C1 00 00 00 80 add ecx, 80000000h - # ("mimikatz", "function=0x401D64,bb=0x401D73,insn=0x401D85", capa.features.insn.Offset(0x80000000), False), - # should not be considered, relative to stack: - # .text:00401CF6 83 C4 10 add esp, 10h - # ("mimikatz", "function=0x401CC7,bb=0x401CDE,insn=0x401CF6", capa.features.insn.Offset(0x10), 
False), - # yes, this is also a offset (imagine eax is a pointer): - # .text:0040223C 83 C0 04 add eax, 4 - # ("mimikatz", "function=0x402203,bb=0x402221,insn=0x40223C", capa.features.insn.Offset(0x4), True), - # - # insn/number from mnemonic: lea - # - # should not be considered, lea operand invalid encoding - # .text:00471EE6 8D 1C 81 lea ebx, [ecx+eax*4] - # ("mimikatz", "function=0x471EAB,bb=0x471ED8,insn=0x471EE6", capa.features.insn.Number(0x4), False), - # should not be considered, lea operand invalid encoding - # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] - # ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), - # yes, this is also a number (imagine edx is zero): - # .text:004018C0 8D 4B 02 lea ecx, [ebx+2] - # ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), + # 0010514c add x23,param_1,#0x8 + ( + "687e79.ghidra.be2", + "function=0x105128,bb=0x105128,insn=0x10514c", + capa.features.insn.Offset(0x8), + True, + ), # insn/api # not extracting dll name ("687e79.ghidra.be2", "function=0x105c88", capa.features.insn.API("memset"), True), From 981e93a14cdd9ba759c49bf794c0c7424e0e4c83 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 11:25:39 -0600 Subject: [PATCH 174/200] binexport: remove TODO from tests workflow --- .github/workflows/tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ed597ee0e..c0081a699 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -82,7 +82,6 @@ jobs: run: python scripts/lint.py rules/ tests: - if: always() # TODO remove once code_style passes name: Tests in ${{ matrix.python-version }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} needs: [code_style, rule_linter] From afa02150c2fa07c8fdd94e4e1ac25ce0b655a898 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 11:29:48 -0600 Subject: [PATCH 175/200] 
binexport: update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54d41d1f2..16eb15701 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - webui: explore capa analysis results in a web-based UI online and offline #2224 @s-ff - support analyzing DRAKVUF traces #2143 @yelhamer - IDA extractor: extract names from dynamically resolved APIs stored in renamed global variables #2201 @Ana06 +- support analyzing BinExport2 files generated by Ghidra #1950 @williballenthin @mehunhoff @mr-tz ### Breaking Changes From bfbd4ad51ac88516cc1a98922df3df21bc36049b Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 11:45:12 -0600 Subject: [PATCH 176/200] binexport: remove outdated TODOs --- capa/features/extractors/binexport2/function.py | 1 - capa/features/extractors/binexport2/insn.py | 14 -------------- 2 files changed, 15 deletions(-) diff --git a/capa/features/extractors/binexport2/function.py b/capa/features/extractors/binexport2/function.py index 282e4b5f2..0c49036d1 100644 --- a/capa/features/extractors/binexport2/function.py +++ b/capa/features/extractors/binexport2/function.py @@ -42,7 +42,6 @@ def extract_function_loop(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address edges: List[Tuple[int, int]] = [] for edge in flow_graph.edge: - # TODO (meh): use Edge.is_back_edge pending https://github.com/mandiant/capa/issues/2101 edges.append((edge.source_basic_block_index, edge.target_basic_block_index)) if loops.has_loop(edges): diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index eab2182f8..d83a73088 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -65,20 +65,6 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle for name in capa.features.extractors.helpers.generate_symbols("", api_name): yield API(name), ih.address - """ - # TODO: re-enable 
pending https://github.com/google/binexport/issues/126#issuecomment-2074402906 - function_name = vertex.mangled_name - if vertex.HasField("library_index"): - # TODO: this seems to be incorrect for Ghidra extractor - library = be2.library[vertex.library_index] - library_name = library.name - - for name in capa.features.extractors.helpers.generate_symbols(library_name, function_name): - yield API(name), ih.address - else: - yield API(function_name), ih.address - """ - def extract_insn_number_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle From 5ea55d17bea58b18d9dd67d0f0ccee9854358f4b Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 12:33:19 -0600 Subject: [PATCH 177/200] binexport: re-enable support for data references in inspect-binexport2.py --- scripts/inspect-binexport2.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 89c223d0f..b984ce1ee 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -430,20 +430,14 @@ def main(argv=None): with o.section("data"): for data_address in sorted(idx.data_reference_index_by_target_address.keys()): - # TODO(wb): re-enable this - # if data_address in idx.instruction_index_by_address: - # # appears to be code - # continue - # https://github.com/mandiant/capa/issues/1755 + if data_address in idx.insn_address_by_index: + continue data_xrefs: List[int] = [] for data_reference_index in idx.data_reference_index_by_target_address[data_address]: data_reference = be2.data_reference[data_reference_index] - instruction_index = data_reference.instruction_index - # TODO(wb): uh-oh, how to reconstruct address? 
- # instruction_address = idx.instruction_address_by_index[instruction_index] - # data_xrefs.append(instruction_address) - # https://github.com/mandiant/capa/issues/1755 + instruction_address = idx.get_insn_address(data_reference.instruction_index) + data_xrefs.append(instruction_address) if not data_xrefs: continue From 80cbe2a12e673321c23fee0dc47024e21bb6c3b8 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 12:40:51 -0600 Subject: [PATCH 178/200] binexport: skip data references to code --- capa/features/extractors/binexport2/insn.py | 4 ++++ scripts/inspect-binexport2.py | 1 + 2 files changed, 5 insertions(+) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index d83a73088..8f2e6af99 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -99,6 +99,10 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl data_reference: BinExport2.DataReference = be2.data_reference[data_reference_index] data_reference_address: int = data_reference.address + if data_reference_address in idx.insn_address_by_index: + # appears to be code + continue + reference_addresses.append(data_reference_address) for reference_address in reference_addresses: diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index b984ce1ee..55462a235 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -431,6 +431,7 @@ def main(argv=None): with o.section("data"): for data_address in sorted(idx.data_reference_index_by_target_address.keys()): if data_address in idx.insn_address_by_index: + # appears to be code continue data_xrefs: List[int] = [] From 7123f1f5ce54efd50d6150a62a90b506f0c5aa9a Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 16 Aug 2024 14:44:53 -0600 Subject: [PATCH 179/200] binexport: remove outdated TODOs --- capa/features/extractors/binexport2/extractor.py | 4 ---- 1 file changed, 4 
deletions(-) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 3ed3b5d07..29eaf841a 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -67,10 +67,6 @@ def __init__(self, be2: BinExport2, buf: bytes): elif isinstance(feature, Arch): self.arch.add(feature.value) - # TODO(mr): assert supported file formats, arches - # and gradually relax restrictions as they're tested. - # https://github.com/mandiant/capa/issues/1755 - def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(self.analysis.base_address) From 1a3e63f0499ca73ad6d1a1d1c6f7b08be31548fd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 23 Aug 2024 10:46:15 +0200 Subject: [PATCH 180/200] Update scripts/inspect-binexport2.py --- scripts/inspect-binexport2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 55462a235..1d0f7a8dd 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -186,7 +186,7 @@ def render_operand( # Therefore, we expect caching to be fruitful, trading memory for CPU time. # # No caching: 6.045 s ± 0.164 s [User: 5.916 s, System: 0.129 s] - # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] + # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] # # So we can save 30% of CPU time by caching operand rendering. 
# From 210ba484b01acb5c9410acf5b1162fc44ea17cad Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 23 Aug 2024 10:46:27 +0200 Subject: [PATCH 181/200] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fcf4166e..9fd2bf7a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ - add rules website https://mandiant.github.io/capa/rules @DeeyaSingh #2310 - add .justfile @williballenthin #2325 - support analyzing BinExport2 files generated by Ghidra #1950 @williballenthin @mehunhoff @mr-tz +- add support for Android OS #1950 @williballenthin @mehunhoff @mr-tz +- add support for aarch64 architecture via BinExport2 backend #1950 @williballenthin @mehunhoff @mr-tz ### Breaking Changes From a4f849c75dc681b2fa77dac4e340b16fdbe2c5c2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 23 Aug 2024 10:46:47 +0200 Subject: [PATCH 182/200] Update capa/helpers.py --- capa/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/helpers.py b/capa/helpers.py index 5b601b263..d730fe443 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -260,7 +260,7 @@ def log_unsupported_os_error(): logger.error(" Input file does not appear to target a supported OS.") logger.error(" ") logger.error(" capa currently only analyzes executables for some operating systems") - logger.error(" (including Windows and Linux).") + logger.error(" (including Windows, Linux, and Android).") logger.error("-" * 80) From f0fc44eb04462ad773092bf3dde9d9350ebbe840 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 23 Aug 2024 10:47:04 +0200 Subject: [PATCH 183/200] Update capa/features/extractors/common.py --- capa/features/extractors/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index e37253d5c..aa2144c73 100644 --- a/capa/features/extractors/common.py +++ 
b/capa/features/extractors/common.py @@ -75,7 +75,7 @@ def extract_format(buf: bytes) -> Iterator[Tuple[Feature, Address]]: # 1. handling a file format (e.g. macho) # # for (1), this logic will need to be updated as the format is implemented. - logger.debug("unknown file format: %s", binascii.hexlify(buf[:4]).decode("ascii")) + logger.debug("unknown file format: %s", buf[:4].hex()) return From 5b3962f1d110d5ec91fa3bf246ae7f7c0b22a6d8 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Fri, 23 Aug 2024 10:47:20 +0200 Subject: [PATCH 184/200] Update capa/features/extractors/binexport2/extractor.py --- capa/features/extractors/binexport2/extractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 29eaf841a..40d61e694 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -66,6 +66,8 @@ def __init__(self, be2: BinExport2, buf: bytes): self.os.add(feature.value) elif isinstance(feature, Arch): self.arch.add(feature.value) + else: + raise ValueError("unexpected global feature: %s", feature) def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(self.analysis.base_address) From 7b7a68065bd2be769fd58a486a539171aa311ed1 Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 27 Aug 2024 09:11:31 -0600 Subject: [PATCH 185/200] Update capa/features/extractors/binexport2/arch/arm/insn.py Co-authored-by: Moritz --- capa/features/extractors/binexport2/arch/arm/insn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 6452d7111..2f2fc0f52 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -72,11 +72,10 @@ def extract_insn_number_features( yield Number(value), 
ih.address yield OperandNumber(i, value), ih.address - if mnemonic == "add": - if i == 2: - if 0 < value < MAX_STRUCTURE_SIZE: - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + if mnemonic == "add" and i == 2: + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address def extract_insn_offset_features( From 577577b08344e84308da0dc3c6990348fecee871 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 29 Aug 2024 11:13:51 +0000 Subject: [PATCH 186/200] initial add --- tests/test_binexport_accessors.py | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/test_binexport_accessors.py diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py new file mode 100644 index 000000000..357623a3b --- /dev/null +++ b/tests/test_binexport_accessors.py @@ -0,0 +1,82 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import logging +from typing import Any, Dict + +from google.protobuf.json_format import ParseDict + +from capa.features.extractors.binexport2.helpers import ( + get_operand_expressions, + get_instruction_mnemonic, + get_instruction_operands, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + +""" +mov x0, 0x20 +bl 0x100 +add x0, sp, 0x10 + +# not here yet ldr x0, [x1, 8] +""" + +BE2_DICT: Dict[str, Any] = { + "expression": [ + {"type": 1, "symbol": "x0"}, + {"type": 2, "immediate": 0x20}, + {"type": 3, "immediate": 0x100}, + {"type": 1, "symbol": "sp"}, + {"type": 3, "immediate": 0x10}, + ], + # operand consists of 1 or more expressions, linked together as a tree + "operand": [ + {"expression_index": [0]}, + {"expression_index": [1]}, + {"expression_index": [2]}, + {"expression_index": [3]}, + {"expression_index": [4]}, + ], + "mnemonic": [ + {"name": "mov"}, # mnem 0 + {"name": "bl"}, # mnem 1 + {"name": "add"}, # mnem 2 + ], + # instruction may have 0 or more operands + "instruction": [ + {"mnemonic_index": 0, "operand_index": [0, 1]}, + {"mnemonic_index": 1, "operand_index": [2]}, + {"mnemonic_index": 2, "operand_index": [0, 3, 4]}, + ], +} +BE2 = ParseDict( + BE2_DICT, + BinExport2(), +) + + +def test_get_instruction_mnemonic(): + mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) + call = ParseDict(BE2_DICT["instruction"][1], BinExport2.Instruction()) + + assert get_instruction_mnemonic(BE2, mov) == "mov" + assert get_instruction_mnemonic(BE2, call) == "bl" + + +def test_get_instruction_operands(): + insn = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) + + assert len(get_instruction_operands(BE2, insn)) == 3 + + +def test_get_operand_expressions(): + oper = ParseDict(BE2_DICT["operand"][0], BinExport2.Operand()) + + assert len(get_operand_expressions(BE2, oper)) == 1 From 5fd16c8bd3f2b098dc3225da6c277cc7b85c04f5 Mon Sep 17 00:00:00 2001 From: mr-tz 
Date: Thu, 29 Aug 2024 12:02:59 +0000 Subject: [PATCH 187/200] test binexport scripts --- tests/test_scripts.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 35bf5347f..5735143cf 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import os import sys import logging import textwrap @@ -38,6 +39,10 @@ def get_report_file_path(): ) +def get_binexport2_file_path(): + return str(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + def get_rules_path(): return str(CD / ".." / "rules") @@ -74,6 +79,22 @@ def test_scripts(script, args): assert p.returncode == 0 +@pytest.mark.parametrize( + "script,args", + [ + pytest.param("inspect-binexport2.py", [get_binexport2_file_path()]), + pytest.param("detect-binexport2-capabilities.py", [get_binexport2_file_path()]), + ], +) +def test_binexport_scripts(script, args): + # define sample bytes location + os.environ["CAPA_SAMPLES_DIR"] = str(Path(CD / "data")) + + script_path = get_script_path(script) + p = run_program(script_path, args) + assert p.returncode == 0 + + def test_bulk_process(tmp_path): # create test directory to recursively analyze t = tmp_path / "test" From b1211d91de7e6f1fa33ca7700e89e89b408190f1 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 3 Sep 2024 09:49:43 +0000 Subject: [PATCH 188/200] add tests using small ARM ELF --- tests/test_binexport_accessors.py | 254 +++++++++++++++++++++++++++--- 1 file changed, 236 insertions(+), 18 deletions(-) diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 357623a3b..cef655eb0 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -6,28 +6,247 @@ # is distributed on an "AS IS" 
BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import re import logging from typing import Any, Dict +from pathlib import Path +import pytest +import fixtures from google.protobuf.json_format import ParseDict from capa.features.extractors.binexport2.helpers import ( get_operand_expressions, get_instruction_mnemonic, get_instruction_operands, + get_operand_register_expression, + get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression logger = logging.getLogger(__name__) +CD = Path(__file__).resolve().parent + + +# found via https://www.virustotal.com/gui/search/type%253Aelf%2520and%2520size%253A1.2kb%252B%2520and%2520size%253A1.4kb-%2520and%2520tag%253Aarm%2520and%2520not%2520tag%253Arelocatable%2520and%2520tag%253A64bits/files +# Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486 +GHIDRA_DISASSEMBLY = """ + // + // segment_1 + // Loadable segment [0x200000 - 0x200157] + // ram:00200000-ram:00200157 + // + 00200000 7f 45 4c Elf64_Ehdr +... 
+ // + // .text + // SHT_PROGBITS [0x210158 - 0x2101c7] + // ram:00210158-ram:002101c7 + // + ************************************************************** + * FUNCTION * + ************************************************************** + undefined entry() + undefined w0:1 + _start XREF[4]: Entry Point(*), 00200018(*), + entry 002000c0(*), + _elfSectionHeaders::00000050(*) + 00210158 20 00 80 d2 mov x0,#0x1 + 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 = "Hello World!\n" + = 00000000002201C8h + 00210160 c2 02 00 58 ldr x2,DAT_002101b8 = 000000000000000Eh + 00210164 08 08 80 d2 mov x8,#0x40 + 00210168 01 00 00 d4 svc 0x0 + 0021016c a0 02 00 58 ldr x0=>$stringWith_Weird_Name,DAT_002101c0 = "This string has a very strang + = 00000000002201D6h + 00210170 04 00 00 94 bl printString undefined printString() + 00210174 60 0f 80 d2 mov x0,#0x7b + 00210178 a8 0b 80 d2 mov x8,#0x5d + 0021017c 01 00 00 d4 svc 0x0 + ************************************************************** + * FUNCTION * + ************************************************************** + undefined printString() + undefined w0:1 + printString XREF[1]: entry:00210170(c) + 00210180 01 00 80 d2 mov x1,#0x0 + strlenLoop XREF[1]: 00210194(j) + 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + 00210188 5f 00 00 71 cmp w2,#0x0 + 0021018c 60 00 00 54 b.eq strlenDone + 00210190 21 04 00 91 add x1,x1,#0x1 + 00210194 fc ff ff 17 b strlenLoop + strlenDone XREF[1]: 0021018c(j) + 00210198 e2 03 01 aa mov x2,x1 + 0021019c e1 03 00 aa mov x1,x0 + 002101a0 20 00 80 d2 mov x0,#0x1 + 002101a4 08 08 80 d2 mov x8,#0x40 + 002101a8 01 00 00 d4 svc 0x0 + 002101ac c0 03 5f d6 ret + DAT_002101b0 XREF[1]: entry:0021015c(R) + 002101b0 c8 01 22 undefined8 00000000002201C8h ? -> 002201c8 + 00 00 00 + 00 00 + DAT_002101b8 XREF[1]: entry:00210160(R) + 002101b8 0e 00 00 undefined8 000000000000000Eh + 00 00 00 + 00 00 + DAT_002101c0 XREF[1]: entry:0021016c(R) + 002101c0 d6 01 22 undefined8 00000000002201D6h ? 
-> 002201d6 + 00 00 00 + 00 00 + // + // .data + // SHT_PROGBITS [0x2201c8 - 0x2201fb] + // ram:002201c8-ram:002201fb + // + helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), + _elfSectionHeaders::00000090(*) + 002201c8 48 65 6c ds "Hello World!\n" + 6c 6f 20 + 57 6f 72 + $stringWith_Weird_Name XREF[1]: entry:0021016c(*) + 002201d6 54 68 69 ds "This string has a very strange label\n" + 73 20 73 + 74 72 69 +... +""" + + +def _parse_ghidra_disassembly(disasm: str) -> dict: + dd = {} + # 00210158 20 00 80 d2 mov x0,#0x1 + # ^^^^^^^^ ^^^^^^^^^^^ ^^^ ^^ ^^^^ + # address bytes mnemonic o1,o2 (,o3) + pattern = re.compile( + r"^( ){8}(?P
[0-9a-f]+) " + "(?P([0-9a-f]{2}[ ]){4})\s+" + "(?P[\w\.]+)\s*" + "(?P[\w#$=>]+)?,?" + "((?P[\w#$=>]+))?,?" + "((?P[\w#$=>]+))?" + ) + for line in disasm.splitlines()[20:]: + m = pattern.match(line) + if m: + logger.debug("Match found\t%s\n\t\t\t\t%s", line, m.groupdict()) + dd[int(m["address"], 0x10)] = { + "bytes": m["bytes"].strip(), + "mnemonic": m["mnemonic"], + "operands": [e for e in [m["operand1"], m["operand2"], m["operand3"]] if e is not None], + } + else: + logger.debug("No match\t%s", line) + return dd + + +BE2_EXTRACTOR = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486.elf_.ghidra.BinExport" +) +PARSED_DISASM = _parse_ghidra_disassembly(GHIDRA_DISASSEMBLY) + + +def test_instruction_bytes(): + # more a data sanity check here as we don't test our code + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert insn.raw_bytes == bytes.fromhex(de["bytes"]) + + +def test_get_instruction_mnemonic(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert get_instruction_mnemonic(BE2_EXTRACTOR.be2, insn) == de["mnemonic"] + + +def test_get_instruction_operands_count(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + # this line is not properly parsed from the Ghidra disassembly using the current regex + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + if addr == 0x210184: + assert len(ops) == 2 + else: + assert len(ops) == len(de["operands"]) + + +@pytest.mark.parametrize( + "addr,op_expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, ("x0", "#0x1")), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, ("x1", "DAT_002101b0")), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, ("w2", "[x0, x1, LSL ]")), + # 
00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, ("x1", "x1", "#0x1")), + ], +) +def test_get_operand_expressions(addr, op_expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + exps = get_operand_expressions(BE2_EXTRACTOR.be2, op) + assert len(exps) == 1 + assert exps[0].symbol == op_expressions[i] + + +@pytest.mark.parametrize( + "addr,reg_expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, ("x0", None)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, ("x1", None)), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, ("w2", None)), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, ("x1", "x1", None)), + ], +) +def _TODO_test_get_operand_register_expression(addr, reg_expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op) + logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) + assert reg_exp == reg_expressions[i] + + +@pytest.mark.parametrize( + "addr,expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, (None, 0x1)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, (None, None)), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + (0x210184, (None, None)), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, (None, None, 0x1)), + ], +) +def _TODO_test_get_operand_immediate_expression(addr, expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op) + logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) + assert reg_exp == expressions[i] + + """ mov x0, 0x20 bl 0x100 add x0, sp, 0x10 - -# not here yet 
ldr x0, [x1, 8] """ - BE2_DICT: Dict[str, Any] = { "expression": [ {"type": 1, "symbol": "x0"}, @@ -62,21 +281,20 @@ ) -def test_get_instruction_mnemonic(): +def _TODO_test_is_stack_register_expression(): mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) - call = ParseDict(BE2_DICT["instruction"][1], BinExport2.Instruction()) - - assert get_instruction_mnemonic(BE2, mov) == "mov" - assert get_instruction_mnemonic(BE2, call) == "bl" - - -def test_get_instruction_operands(): - insn = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) - - assert len(get_instruction_operands(BE2, insn)) == 3 - + add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) -def test_get_operand_expressions(): - oper = ParseDict(BE2_DICT["operand"][0], BinExport2.Operand()) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, mov) + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - assert len(get_operand_expressions(BE2, oper)) == 1 + ops = get_instruction_operands(BE2_EXTRACTOR.be2, add) + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is True + exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) + assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False From c662176e38920b741dccca50637a664e9c94664d Mon Sep 17 00:00:00 2001 From: mr-tz Date: Tue, 3 Sep 2024 09:52:38 +0000 Subject: [PATCH 189/200] add method to get instruction by address --- .../extractors/binexport2/__init__.py | 7 +++ tests/test_binexport_accessors.py | 54 +++++++++---------- 2 files changed, 34 insertions(+), 27 deletions(-) diff 
--git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index 76731e8ac..dd860dbf6 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -235,6 +235,13 @@ def get_function_name_by_address(self, address: int) -> str: vertex_index: int = self.vertex_index_by_address[address] return self.get_function_name_by_vertex(vertex_index) + def get_instruction_by_address(self, address: int) -> BinExport2.Instruction: + for i, be2_insn in enumerate(self.be2.instruction): + insn = self.get_insn_address(i) + if address == insn: + return be2_insn + raise ValueError(f"address 0x{address:x} not found") + class BinExport2Analysis: def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index cef655eb0..9f61f1722 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -34,14 +34,14 @@ # Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486 GHIDRA_DISASSEMBLY = """ // - // segment_1 + // segment_1 // Loadable segment [0x200000 - 0x200157] // ram:00200000-ram:00200157 // 00200000 7f 45 4c Elf64_Ehdr ... 
// - // .text + // .text // SHT_PROGBITS [0x210158 - 0x2101c7] // ram:00210158-ram:002101c7 // @@ -50,9 +50,9 @@ ************************************************************** undefined entry() undefined w0:1 - _start XREF[4]: Entry Point(*), 00200018(*), - entry 002000c0(*), - _elfSectionHeaders::00000050(*) + _start XREF[4]: Entry Point(*), 00200018(*), + entry 002000c0(*), + _elfSectionHeaders::00000050(*) 00210158 20 00 80 d2 mov x0,#0x1 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 = "Hello World!\n" = 00000000002201C8h @@ -70,47 +70,47 @@ ************************************************************** undefined printString() undefined w0:1 - printString XREF[1]: entry:00210170(c) + printString XREF[1]: entry:00210170(c) 00210180 01 00 80 d2 mov x1,#0x0 - strlenLoop XREF[1]: 00210194(j) + strlenLoop XREF[1]: 00210194(j) 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] 00210188 5f 00 00 71 cmp w2,#0x0 0021018c 60 00 00 54 b.eq strlenDone 00210190 21 04 00 91 add x1,x1,#0x1 00210194 fc ff ff 17 b strlenLoop - strlenDone XREF[1]: 0021018c(j) + strlenDone XREF[1]: 0021018c(j) 00210198 e2 03 01 aa mov x2,x1 0021019c e1 03 00 aa mov x1,x0 002101a0 20 00 80 d2 mov x0,#0x1 002101a4 08 08 80 d2 mov x8,#0x40 002101a8 01 00 00 d4 svc 0x0 002101ac c0 03 5f d6 ret - DAT_002101b0 XREF[1]: entry:0021015c(R) + DAT_002101b0 XREF[1]: entry:0021015c(R) 002101b0 c8 01 22 undefined8 00000000002201C8h ? -> 002201c8 - 00 00 00 + 00 00 00 00 00 - DAT_002101b8 XREF[1]: entry:00210160(R) + DAT_002101b8 XREF[1]: entry:00210160(R) 002101b8 0e 00 00 undefined8 000000000000000Eh - 00 00 00 + 00 00 00 00 00 - DAT_002101c0 XREF[1]: entry:0021016c(R) + DAT_002101c0 XREF[1]: entry:0021016c(R) 002101c0 d6 01 22 undefined8 00000000002201D6h ? 
-> 002201d6 - 00 00 00 + 00 00 00 00 00 // - // .data + // .data // SHT_PROGBITS [0x2201c8 - 0x2201fb] // ram:002201c8-ram:002201fb // - helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), - _elfSectionHeaders::00000090(*) + helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), + _elfSectionHeaders::00000090(*) 002201c8 48 65 6c ds "Hello World!\n" - 6c 6f 20 - 57 6f 72 - $stringWith_Weird_Name XREF[1]: entry:0021016c(*) + 6c 6f 20 + 57 6f 72 + $stringWith_Weird_Name XREF[1]: entry:0021016c(*) 002201d6 54 68 69 ds "This string has a very strange label\n" - 73 20 73 - 74 72 69 + 73 20 73 + 74 72 69 ... """ @@ -122,11 +122,11 @@ def _parse_ghidra_disassembly(disasm: str) -> dict: # address bytes mnemonic o1,o2 (,o3) pattern = re.compile( r"^( ){8}(?P
[0-9a-f]+) " - "(?P([0-9a-f]{2}[ ]){4})\s+" - "(?P[\w\.]+)\s*" - "(?P[\w#$=>]+)?,?" - "((?P[\w#$=>]+))?,?" - "((?P[\w#$=>]+))?" + + r"(?P([0-9a-f]{2}[ ]){4})\s+" + + r"(?P[\w\.]+)\s*" + + r"(?P[\w#$=>]+)?,?" + + r"((?P[\w#$=>]+))?,?" + + r"((?P[\w#$=>]+))?" ) for line in disasm.splitlines()[20:]: m = pattern.match(line) From bf38f225af9376f83966accb341b1d1682ebf6a9 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 08:31:13 +0000 Subject: [PATCH 190/200] index instructions by address --- capa/features/extractors/binexport2/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index dd860dbf6..cfe926d8f 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -104,6 +104,7 @@ def __init__(self, be2: BinExport2): self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self.insn_address_by_index: Dict[int, int] = {} + self.insn_by_address: Dict[int, BinExport2.Instruction] = {} # must index instructions first self._index_insn_addresses() @@ -186,6 +187,7 @@ def _index_insn_addresses(self): addr = next_addr next_addr += len(insn.raw_bytes) self.insn_address_by_index[idx] = addr + self.insn_by_address[addr] = insn @staticmethod def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: @@ -236,11 +238,8 @@ def get_function_name_by_address(self, address: int) -> str: return self.get_function_name_by_vertex(vertex_index) def get_instruction_by_address(self, address: int) -> BinExport2.Instruction: - for i, be2_insn in enumerate(self.be2.instruction): - insn = self.get_insn_address(i) - if address == insn: - return be2_insn - raise ValueError(f"address 0x{address:x} not found") + assert address in self.insn_by_address, f"address must be indexed, missing {address:x}" + return self.insn_by_address[address] class 
BinExport2Analysis: From 3c97edc8d2c87051ac1616d15d9ac24071f9f268 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 12:40:15 +0000 Subject: [PATCH 191/200] adjust and extend tests --- tests/test_binexport_accessors.py | 124 +++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 38 deletions(-) diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 9f61f1722..5097f4d90 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -177,47 +177,93 @@ def test_get_instruction_operands_count(): @pytest.mark.parametrize( - "addr,op_expressions", + "addr,expressions", [ # 00210158 20 00 80 d2 mov x0,#0x1 - (0x210158, ("x0", "#0x1")), + ( + 0x210158, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 - (0x21015C, ("x1", "DAT_002101b0")), + ( + 0x21015C, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression( + type=BinExport2.Expression.IMMEDIATE_INT, symbol="DAT_002101b0", immediate=0x2101B0 + ), + ), + ), # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - (0x210184, ("w2", "[x0, x1, LSL ]")), + # ^^^ issue in Ghidra? 
+ # IDA gives LDRB W2, [X0,X1] + # still need to test/handle this and it's the only complex operand expression in this test binary :/ + ( + 0x210184, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="w2"), + ( + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="["), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="LSL"), + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"), + ), + ), + ), # 00210190 21 04 00 91 add x1,x1,#0x1 - (0x210190, ("x1", "x1", "#0x1")), + ( + 0x210190, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), ], ) -def test_get_operand_expressions(addr, op_expressions): +def test_get_operand_expressions(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): + op_expression = expressions[i] exps = get_operand_expressions(BE2_EXTRACTOR.be2, op) - assert len(exps) == 1 - assert exps[0].symbol == op_expressions[i] + if len(exps) > 1: + for j, exp in enumerate(exps): + assert exp.type == op_expression[j].type + assert exp.symbol == op_expression[j].symbol + else: + assert len(exps) == 1 + assert exps[0] == op_expression @pytest.mark.parametrize( - "addr,reg_expressions", + "addr,expressions", [ # 00210158 20 00 80 d2 mov x0,#0x1 (0x210158, ("x0", None)), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 (0x21015C, ("x1", None)), - # 00210184 02 68 61 38 ldrb w2,[x0, x1, 
LSL ] - (0x210184, ("w2", None)), + # 0021019c e1 03 00 aa mov x1,x0 + (0x21019C, ("x1", "x0")), # 00210190 21 04 00 91 add x1,x1,#0x1 (0x210190, ("x1", "x1", None)), ], ) -def _TODO_test_get_operand_register_expression(addr, reg_expressions): +def test_get_operand_register_expression(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op) - logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) - assert reg_exp == reg_expressions[i] + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.symbol == expressions[i] @pytest.mark.parametrize( @@ -226,20 +272,22 @@ def _TODO_test_get_operand_register_expression(addr, reg_expressions): # 00210158 20 00 80 d2 mov x0,#0x1 (0x210158, (None, 0x1)), # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 - (0x21015C, (None, None)), - # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - (0x210184, (None, None)), + (0x21015C, (None, 0x2101B0)), + # 002101a8 01 00 00 d4 svc 0x0 + (0x2101A8, (0x0,)), # 00210190 21 04 00 91 add x1,x1,#0x1 (0x210190, (None, None, 0x1)), ], ) -def _TODO_test_get_operand_immediate_expression(addr, expressions): +def test_get_operand_immediate_expression(addr, expressions): insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) for i, op in enumerate(ops): reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op) - logger.debug("%s", get_operand_expressions(BE2_EXTRACTOR.be2, op)) - assert reg_exp == expressions[i] + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.immediate == expressions[i] """ @@ -249,11 +297,11 @@ def _TODO_test_get_operand_immediate_expression(addr, expressions): """ BE2_DICT: Dict[str, Any] = { "expression": [ - {"type": 1, "symbol": "x0"}, - {"type": 2, "immediate": 0x20}, - 
{"type": 3, "immediate": 0x100}, - {"type": 1, "symbol": "sp"}, - {"type": 3, "immediate": 0x10}, + {"type": BinExport2.Expression.REGISTER, "symbol": "x0"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x20}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x100}, + {"type": BinExport2.Expression.REGISTER, "symbol": "sp"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x10}, ], # operand consists of 1 or more expressions, linked together as a tree "operand": [ @@ -281,20 +329,20 @@ def _TODO_test_get_operand_immediate_expression(addr, expressions): ) -def _TODO_test_is_stack_register_expression(): +def test_is_stack_register_expression(): mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) - ops = get_instruction_operands(BE2_EXTRACTOR.be2, mov) - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + mov_op0, mov_op1 = get_instruction_operands(BE2, mov) + op0_exp0 = get_operand_expressions(BE2, mov_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op0_exp1 = get_operand_expressions(BE2, mov_op1)[0] + assert is_stack_register_expression(BE2, op0_exp1) is False - ops = get_instruction_operands(BE2_EXTRACTOR.be2, add) - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[0]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is True - exps = get_operand_expressions(BE2_EXTRACTOR.be2, ops[1]) - assert is_stack_register_expression(BE2_EXTRACTOR.be2, exps[0]) is False + add_op0, add_op1, add_op2 = get_instruction_operands(BE2, add) + op0_exp0 = 
get_operand_expressions(BE2, add_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op1_exp0 = get_operand_expressions(BE2, add_op1)[0] + assert is_stack_register_expression(BE2, op1_exp0) is True + op2_exp0 = get_operand_expressions(BE2, add_op2)[0] + assert is_stack_register_expression(BE2, op2_exp0) is False From 7142bf70e8a534351ce0c75f12aef5c5ca54a136 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Wed, 4 Sep 2024 12:41:36 +0000 Subject: [PATCH 192/200] handle operator with no children bug --- capa/features/extractors/binexport2/helpers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 3bad3162c..a6fd5827d 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -102,7 +102,14 @@ def _get_operand_expression_list( elif expression.type == BinExport2.Expression.OPERATOR: - if len(children_tree_indexes) == 1: + if len(children_tree_indexes) == 0: + # TODO(mr-tz): Ghidra bug? 
+ # https://github.com/mandiant/capa/pull/2340 + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + # ^^^ + pass + + elif len(children_tree_indexes) == 1: # prefix operator, like "ds:" expression_list.append(expression) child_index = children_tree_indexes[0] From e4d1b04a7b28a9a3f8b2c2a9308b10dd1e101cc8 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 9 Sep 2024 10:10:00 +0000 Subject: [PATCH 193/200] binexport: use instruction address index ref: https://github.com/mandiant/capa/pull/1950/files#r1728570811 --- .../extractors/binexport2/arch/intel/insn.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py index 94459de14..3b4621acd 100644 --- a/capa/features/extractors/binexport2/arch/intel/insn.py +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -13,7 +13,7 @@ from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset from capa.features.common import Feature, Characteristic from capa.features.address import Address -from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext, BinExport2Index from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.helpers import ( mask_immediate, @@ -145,12 +145,14 @@ def extract_insn_offset_features( def is_security_cookie( fhi: FunctionContext, bbi: BasicBlockContext, + instruction_address: int, instruction: BinExport2.Instruction, ) -> bool: """ check if an instruction is related to security cookie checks. 
""" be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx # security cookie check should use SP or BP op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] @@ -164,13 +166,13 @@ def is_security_cookie( basic_block_index: int = bbi.basic_block_index bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] if flow_graph.entry_basic_block_index == basic_block_index: - first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) - if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: + first_addr: int = min((idx.insn_address_by_index[ir.begin_index] for ir in bb.instruction_index)) + if instruction_address < first_addr + SECURITY_COOKIE_BYTES_DELTA: return True # or insn falls at the end before return in a terminal basic block. if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): - last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) - if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: + last_addr: int = max((idx.insn_address_by_index[ir.end_index - 1] for ir in bb.instruction_index)) + if instruction_address > last_addr - SECURITY_COOKIE_BYTES_DELTA: return True return False @@ -202,7 +204,8 @@ def extract_insn_nzxor_characteristic_features( if mnemonic in ("xor", "xorpd", "xorps", "pxor"): if operands[0] == operands[1]: return - if is_security_cookie(fhi, bbh.inner, instruction): + instruction_address: int = idx.insn_address_by_index[ii.instruction_index] + if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): return yield Characteristic("nzxor"), ih.address From 0a5cc8e3c3db005efeba974cbf5cfff09452ac4e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 9 Sep 2024 10:37:59 +0000 Subject: [PATCH 194/200] inspect binexport: handle lsl with no children add pruning phase to expression tree building to remove known-bad branches. 
This might address some of the data we're seeing due to: https://github.com/NationalSecurityAgency/ghidra/issues/6821 Also introduces a --instruction optional argument to dump the details of a specific instruction. --- scripts/inspect-binexport2.py | 169 +++++++++++++++++++++++++--------- 1 file changed, 123 insertions(+), 46 deletions(-) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 1d0f7a8dd..3d6a72cee 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -67,10 +67,38 @@ def getvalue(self): return self.o.getvalue() +# internal to `render_operand` +def _prune_expression_tree( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int = 0, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): + # ghidra may emit superfluous lsl nodes with no children. + # https://github.com/mandiant/capa/pull/2340/files#r1750003919 + # which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 + # + # which seems to be as if the shift wasn't there (shift of #0) + # so we want to remove references to this node from any parent nodes. 
+ for tree_node in expression_tree: + if tree_index in tree_node: + tree_node.remove(tree_index) + + return + + for child_tree_index in children_tree_indexes: + _prune_expression_tree(be2, operand, expression_tree, child_tree_index) + + # internal to `render_operand` def _render_expression_tree( be2: BinExport2, - instruction: BinExport2.Instruction, operand: BinExport2.Operand, expression_tree: List[List[int]], tree_index: int, @@ -101,7 +129,7 @@ def _render_expression_tree( # | # D child_index = children_tree_indexes[0] - _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + _render_expression_tree(be2, operand, expression_tree, child_index, o) return else: raise NotImplementedError(len(children_tree_indexes)) @@ -121,25 +149,28 @@ def _render_expression_tree( # to provide it only when necessary. assert len(children_tree_indexes) == 1 child_index = children_tree_indexes[0] - _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + _render_expression_tree(be2, operand, expression_tree, child_index, o) return elif expression.type == BinExport2.Expression.OPERATOR: if len(children_tree_indexes) == 1: # prefix operator, like "ds:" - o.write(expression.symbol) + if expression.symbol != ",": + # or there's a binary operator, like ",", that's missing a child, + # such as when we prune "lsl" branches. 
+ o.write(expression.symbol) child_index = children_tree_indexes[0] - _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + _render_expression_tree(be2, operand, expression_tree, child_index, o) return elif len(children_tree_indexes) == 2: # infix operator: like "+" in "ebp+10" child_a = children_tree_indexes[0] child_b = children_tree_indexes[1] - _render_expression_tree(be2, instruction, operand, expression_tree, child_a, o) + _render_expression_tree(be2, operand, expression_tree, child_a, o) o.write(expression.symbol) - _render_expression_tree(be2, instruction, operand, expression_tree, child_b, o) + _render_expression_tree(be2, operand, expression_tree, child_b, o) return elif len(children_tree_indexes) == 3: @@ -147,11 +178,11 @@ def _render_expression_tree( child_a = children_tree_indexes[0] child_b = children_tree_indexes[1] child_c = children_tree_indexes[2] - _render_expression_tree(be2, instruction, operand, expression_tree, child_a, o) + _render_expression_tree(be2, operand, expression_tree, child_a, o) o.write(expression.symbol) - _render_expression_tree(be2, instruction, operand, expression_tree, child_b, o) + _render_expression_tree(be2, operand, expression_tree, child_b, o) o.write(expression.symbol) - _render_expression_tree(be2, instruction, operand, expression_tree, child_c, o) + _render_expression_tree(be2, operand, expression_tree, child_c, o) return else: @@ -161,7 +192,7 @@ def _render_expression_tree( o.write("[") assert len(children_tree_indexes) == 1 child_index = children_tree_indexes[0] - _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o) + _render_expression_tree(be2, operand, expression_tree, child_index, o) o.write("]") return @@ -172,39 +203,10 @@ def _render_expression_tree( raise NotImplementedError(expression.type) -_OPERAND_CACHE: Dict[int, str] = {} - - -def render_operand( - be2: BinExport2, instruction: BinExport2.Instruction, operand: BinExport2.Operand, index: 
Optional[int] = None -) -> str: - # For the mimikatz example file, there are 138k distinct operands. - # Of those, only 11k are unique, which is less than 10% of the total. - # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. - # In other words, the most common five operands account for 100k instances, - # which is around 75% of operand instances. - # Therefore, we expect caching to be fruitful, trading memory for CPU time. - # - # No caching: 6.045 s ± 0.164 s [User: 5.916 s, System: 0.129 s] - # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] - # - # So we can save 30% of CPU time by caching operand rendering. - # - # Other measurements: - # - # perf: loading BinExport2: 0.06s - # perf: indexing BinExport2: 0.34s - # perf: rendering BinExport2: 1.96s - # perf: writing BinExport2: 1.13s - # ________________________________________________________ - # Executed in 4.40 secs fish external - # usr time 4.22 secs 0.00 micros 4.22 secs - # sys time 0.18 secs 842.00 micros 0.18 secs - if index and index in _OPERAND_CACHE: - return _OPERAND_CACHE[index] - - o = io.StringIO() - +def _build_expression_tree( + be2: BinExport2, + operand: BinExport2.Operand, +) -> List[List[int]]: # The reconstructed expression tree layout, linking parent nodes to their children. # # There is one list of integers for each expression in the operand. @@ -240,7 +242,45 @@ def render_operand( tree.append(children) - _render_expression_tree(be2, instruction, operand, tree, 0, o) + _prune_expression_tree(be2, operand, tree) + + return tree + + +_OPERAND_CACHE: Dict[int, str] = {} + + +def render_operand( + be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None +) -> str: + # For the mimikatz example file, there are 138k distinct operands. + # Of those, only 11k are unique, which is less than 10% of the total. + # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. 
+ # In other words, the most common five operands account for 100k instances, + # which is around 75% of operand instances. + # Therefore, we expect caching to be fruitful, trading memory for CPU time. + # + # No caching: 6.045 s ± 0.164 s [User: 5.916 s, System: 0.129 s] + # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] + # + # So we can save 30% of CPU time by caching operand rendering. + # + # Other measurements: + # + # perf: loading BinExport2: 0.06s + # perf: indexing BinExport2: 0.34s + # perf: rendering BinExport2: 1.96s + # perf: writing BinExport2: 1.13s + # ________________________________________________________ + # Executed in 4.40 secs fish external + # usr time 4.22 secs 0.00 micros 4.22 secs + # sys time 0.18 secs 842.00 micros 0.18 secs + if index and index in _OPERAND_CACHE: + return _OPERAND_CACHE[index] + + o = io.StringIO() + tree = _build_expression_tree(be2, operand) + _render_expression_tree(be2, operand, tree, 0, o) s = o.getvalue() if index: @@ -249,6 +289,38 @@ def render_operand( return s +def inspect_operand(be2: BinExport2, operand: BinExport2.Operand): + expression_tree = _build_expression_tree(be2, operand) + + def rec(tree_index, indent=0): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + print(f" {' ' * indent}expression: {str(expression).replace('\n', ', ')}") + for child_index in children_tree_indexes: + rec(child_index, indent+1) + + rec(0) + + +def inspect_instruction(be2: BinExport2, instruction: BinExport2.Instruction, address: int): + mnemonic = be2.mnemonic[instruction.mnemonic_index] + print("instruction:") + print(f" address: {hex(address)}") + print(f" mnemonic: {mnemonic.name}") + + print(" operands:") + operands = [] + for i, operand_index in enumerate(instruction.operand_index): + print(f" - operand {i}: [{operand_index}]") + operand = be2.operand[operand_index] + # Ghidra 
bug where empty operands (no expressions) may + # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + if len(operand.expression_index) > 0: + inspect_operand(be2, operand) + + def main(argv=None): if argv is None: @@ -256,6 +328,7 @@ def main(argv=None): parser = argparse.ArgumentParser(description="Inspect BinExport2 files") capa.main.install_common_args(parser, wanted={"input_file"}) + parser.add_argument("--instruction", type=lambda v: int(v, 0)) args = parser.parse_args(args=argv) try: @@ -367,7 +440,7 @@ def main(argv=None): # Ghidra bug where empty operands (no expressions) may # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) if len(operand.expression_index) > 0: - operands.append(render_operand(be2, instruction, operand, index=operand_index)) + operands.append(render_operand(be2, operand, index=operand_index)) call_targets = "" if instruction.call_target: @@ -453,6 +526,10 @@ def main(argv=None): with timing("writing to STDOUT"): print(o.getvalue()) + if args.instruction: + insn = idx.insn_by_address[args.instruction] + inspect_instruction(be2, insn, args.instruction) + if __name__ == "__main__": sys.exit(main()) From c59ed87e858428ae135e5ac97898e91142cc01e7 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 9 Sep 2024 11:29:23 +0000 Subject: [PATCH 195/200] binexport: consolidate expression tree logic into helpers --- .../features/extractors/binexport2/helpers.py | 219 ++++++++++++------ scripts/inspect-binexport2.py | 77 +----- tests/test_binexport_accessors.py | 3 - 3 files changed, 154 insertions(+), 145 deletions(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index a6fd5827d..4de4623dd 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -46,25 +46,156 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: 
BinExport2.CallGr return vertex.HasField("type") and vertex.type == type_ -def _get_operand_expression_list( +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _prune_expression_tree_empty_shifts( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): + # Ghidra may emit superfluous lsl nodes with no children. + # https://github.com/mandiant/capa/pull/2340/files#r1750003919 + # Which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 + # + # Which seems to be as if the shift wasn't there (shift of #0) + # so we want to remove references to this node from any parent nodes. + for tree_node in expression_tree: + if tree_index in tree_node: + tree_node.remove(tree_index) + + return + + for child_tree_index in children_tree_indexes: + _prune_expression_tree_empty_shifts(be2, operand, expression_tree, child_tree_index) + + +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _prune_expression_tree_empty_commas( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 1 and expression.symbol == ",": + # Due to the above pruning of empty LSL or LSR expressions, + # the parents might need to be fixed up. 
+ # + # Specifically, if the pruned node was part of a comma list with two children, + # now there's only a single child, which renders as an extra comma, + # so we replace references to the comma node with the immediate child. + # + # A more correct way of doing this might be to walk up the parents and do fixups, + # but I'm not quite sure how to do this yet. Just do two passes right now. + child = children_tree_indexes[0] + + for tree_node in expression_tree: + tree_node.index + if tree_index in tree_node: + tree_node[tree_node.index(tree_index)] = child + + return + + for child_tree_index in children_tree_indexes: + _prune_expression_tree_empty_commas(be2, operand, expression_tree, child_tree_index) + + +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _prune_expression_tree( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], +): + _prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0) + _prune_expression_tree_empty_commas(be2, operand, expression_tree, 0) + + +# this is unstable: it is subject to change, so don't rely on it! +def _build_expression_tree( + be2: BinExport2, + operand: BinExport2.Operand, +) -> List[List[int]]: + # The reconstructed expression tree layout, linking parent nodes to their children. + # + # There is one list of integers for each expression in the operand. + # These integers are indexes of other expressions in the same operand, + # which are the children of that expression. + # + # So: + # + # [ [1, 3], [2], [], [4], [5], []] + # + # means the first expression has two children, at index 1 and 3, + # and the tree looks like: + # + # 0 + # / \ + # 1 3 + # | | + # 2 4 + # | + # 5 + # + # Remember, these are the indices into the entries in operand.expression_index. 
+ if len(operand.expression_index) == 0: + # Ghidra bug where empty operands (no expressions) may + # exist (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + return [] + + tree: List[List[int]] = [] + for i, expression_index in enumerate(operand.expression_index): + children = [] + + # scan all subsequent expressions, looking for those that have parent_index == current.expression_index + for j, candidate_index in enumerate(operand.expression_index[i + 1 :]): + candidate = be2.expression[candidate_index] + + if candidate.parent_index == expression_index: + children.append(i + j + 1) + + tree.append(children) + + _prune_expression_tree(be2, operand, tree) + _prune_expression_tree(be2, operand, tree) + + return tree + + +def _fill_operand_expression_list( be2: BinExport2, operand: BinExport2.Operand, expression_tree: List[List[int]], tree_index: int, expression_list: List[BinExport2.Expression], ): - exp_index = operand.expression_index[tree_index] - expression = be2.expression[exp_index] + """ + Walk the given expression tree and collect the expression nodes in-order. 
+ """ + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] children_tree_indexes: List[int] = expression_tree[tree_index] if expression.type == BinExport2.Expression.REGISTER: - expression_list.append(expression) assert len(children_tree_indexes) == 0 + expression_list.append(expression) return elif expression.type == BinExport2.Expression.SYMBOL: - expression_list.append(expression) assert len(children_tree_indexes) <= 1 + expression_list.append(expression) if len(children_tree_indexes) == 0: return @@ -77,14 +208,14 @@ def _get_operand_expression_list( # | # D child_index = children_tree_indexes[0] - _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) return else: raise NotImplementedError(len(children_tree_indexes)) elif expression.type == BinExport2.Expression.IMMEDIATE_INT: - expression_list.append(expression) assert len(children_tree_indexes) == 0 + expression_list.append(expression) return elif expression.type == BinExport2.Expression.SIZE_PREFIX: @@ -97,32 +228,24 @@ def _get_operand_expression_list( # to provide it only when necessary. assert len(children_tree_indexes) == 1 child_index = children_tree_indexes[0] - _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) return elif expression.type == BinExport2.Expression.OPERATOR: - - if len(children_tree_indexes) == 0: - # TODO(mr-tz): Ghidra bug? 
- # https://github.com/mandiant/capa/pull/2340 - # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] - # ^^^ - pass - - elif len(children_tree_indexes) == 1: + if len(children_tree_indexes) == 1: # prefix operator, like "ds:" expression_list.append(expression) child_index = children_tree_indexes[0] - _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) return elif len(children_tree_indexes) == 2: # infix operator: like "+" in "ebp+10" child_a = children_tree_indexes[0] child_b = children_tree_indexes[1] - _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) expression_list.append(expression) - _get_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) return elif len(children_tree_indexes) == 3: @@ -130,22 +253,22 @@ def _get_operand_expression_list( child_a = children_tree_indexes[0] child_b = children_tree_indexes[1] child_c = children_tree_indexes[2] - _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) expression_list.append(expression) - _get_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) expression_list.append(expression) - _get_operand_expression_list(be2, operand, expression_tree, child_c, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_c, expression_list) return else: raise NotImplementedError(len(children_tree_indexes)) elif expression.type == BinExport2.Expression.DEREFERENCE: + assert len(children_tree_indexes) == 1 
expression_list.append(expression) - assert len(children_tree_indexes) == 1 child_index = children_tree_indexes[0] - _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) return elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT: @@ -156,50 +279,12 @@ def _get_operand_expression_list( def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[BinExport2.Expression]: - # The reconstructed expression tree layout, linking parent nodes to their children. - # - # There is one list of integers for each expression in the operand. - # These integers are indexes of other expressions in the same operand, - # which are the children of that expression. - # - # So: - # - # [ [1, 3], [2], [], [4], [5], []] - # - # means the first expression has two children, at index 1 and 3, - # and the tree looks like: - # - # 0 - # / \ - # 1 3 - # | | - # 2 4 - # | - # 5 - # - # Remember, these are the indices into the entries in operand.expression_index. 
- if len(op.expression_index) == 0: - # Ghidra bug where empty operands (no expressions) may - # exist (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) - return [] - - exp_tree: List[List[int]] = [] - for i, exp_index in enumerate(op.expression_index): - children = [] - - # scan all subsequent expressions, looking for those that have parent_index == current.expression_index - for j, candidate_index in enumerate(op.expression_index[i + 1 :]): - candidate = be2.expression[candidate_index] - - if candidate.parent_index == exp_index: - children.append(i + j + 1) - - exp_tree.append(children) + tree = _build_expression_tree(be2, op) - exp_list: List[BinExport2.Expression] = [] - _get_operand_expression_list(be2, op, exp_tree, 0, exp_list) + expressions: List[BinExport2.Expression] = [] + _fill_operand_expression_list(be2, op, tree, 0, expressions) - return exp_list + return expressions def get_operand_register_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index 3d6a72cee..b579fabd0 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -67,35 +67,6 @@ def getvalue(self): return self.o.getvalue() -# internal to `render_operand` -def _prune_expression_tree( - be2: BinExport2, - operand: BinExport2.Operand, - expression_tree: List[List[int]], - tree_index: int = 0, -): - expression_index = operand.expression_index[tree_index] - expression = be2.expression[expression_index] - children_tree_indexes: List[int] = expression_tree[tree_index] - - if expression.type == BinExport2.Expression.OPERATOR: - if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): - # ghidra may emit superfluous lsl nodes with no children. 
- # https://github.com/mandiant/capa/pull/2340/files#r1750003919 - # which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 - # - # which seems to be as if the shift wasn't there (shift of #0) - # so we want to remove references to this node from any parent nodes. - for tree_node in expression_tree: - if tree_index in tree_node: - tree_node.remove(tree_index) - - return - - for child_tree_index in children_tree_indexes: - _prune_expression_tree(be2, operand, expression_tree, child_tree_index) - - # internal to `render_operand` def _render_expression_tree( be2: BinExport2, @@ -203,50 +174,6 @@ def _render_expression_tree( raise NotImplementedError(expression.type) -def _build_expression_tree( - be2: BinExport2, - operand: BinExport2.Operand, -) -> List[List[int]]: - # The reconstructed expression tree layout, linking parent nodes to their children. - # - # There is one list of integers for each expression in the operand. - # These integers are indexes of other expressions in the same operand, - # which are the children of that expression. - # - # So: - # - # [ [1, 3], [2], [], [4], [5], []] - # - # means the first expression has two children, at index 1 and 3, - # and the tree looks like: - # - # 0 - # / \ - # 1 3 - # | | - # 2 4 - # | - # 5 - # - # Remember, these are the indices into the entries in operand.expression_index. 
- tree: List[List[int]] = [] - for i, expression_index in enumerate(operand.expression_index): - children = [] - - # scan all subsequent expressions, looking for those that have parent_index == current.expression_index - for j, candidate_index in enumerate(operand.expression_index[i + 1 :]): - candidate = be2.expression[candidate_index] - - if candidate.parent_index == expression_index: - children.append(i + j + 1) - - tree.append(children) - - _prune_expression_tree(be2, operand, tree) - - return tree - - _OPERAND_CACHE: Dict[int, str] = {} @@ -279,7 +206,7 @@ def render_operand( return _OPERAND_CACHE[index] o = io.StringIO() - tree = _build_expression_tree(be2, operand) + tree = capa.features.extractors.binexport2.helpers._build_expression_tree(be2, operand) _render_expression_tree(be2, operand, tree, 0, o) s = o.getvalue() @@ -290,7 +217,7 @@ def render_operand( def inspect_operand(be2: BinExport2, operand: BinExport2.Operand): - expression_tree = _build_expression_tree(be2, operand) + expression_tree = capa.features.extractors.binexport2.helpers._build_expression_tree(be2, operand) def rec(tree_index, indent=0): expression_index = operand.expression_index[tree_index] diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 5097f4d90..ba63b89c5 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -200,7 +200,6 @@ def test_get_instruction_operands_count(): # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] # ^^^ issue in Ghidra? 
# IDA gives LDRB W2, [X0,X1] - # still need to test/handle this and it's the only complex operand expression in this test binary :/ ( 0x210184, ( @@ -210,8 +209,6 @@ def test_get_instruction_operands_count(): BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), - BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), - BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="LSL"), BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"), ), ), From b5ec35d08f239777639a102aa1a7516524a80683 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 11 Sep 2024 10:13:34 +0000 Subject: [PATCH 196/200] binexport: index instruction indices by address --- capa/features/extractors/binexport2/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index cfe926d8f..ccb5ab570 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -104,6 +104,7 @@ def __init__(self, be2: BinExport2): self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) self.insn_address_by_index: Dict[int, int] = {} + self.insn_index_by_address: Dict[int, int] = {} self.insn_by_address: Dict[int, BinExport2.Instruction] = {} # must index instructions first @@ -187,6 +188,7 @@ def _index_insn_addresses(self): addr = next_addr next_addr += len(insn.raw_bytes) self.insn_address_by_index[idx] = addr + self.insn_index_by_address[addr] = idx self.insn_by_address[addr] = insn @staticmethod From 38dab5c899c64e69295580d08c666350788bb52c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 9 Sep 2024 12:13:36 +0000 Subject: [PATCH 197/200] binexport: introduce instruction pattern matching Introduce 
instruction pattern matching to declaratively describe the
instructions and operands that we want to extract.

While there's a bit more code, it's much more thoroughly
tested, and is less brittle than the prior if/else/if/else/if/else
implementation.
---
 .../extractors/binexport2/__init__.py         |   3 +
 .../extractors/binexport2/arch/arm/insn.py    | 154 +++------
 .../extractors/binexport2/arch/intel/insn.py  | 256 +++++++-------
 .../features/extractors/binexport2/helpers.py | 321 +++++++++++++++++-
 scripts/inspect-binexport2.py                 |  19 +-
 tests/fixtures.py                             |   4 +-
 tests/test_binexport_accessors.py             | 257 ++++++++++++++
 7 files changed, 758 insertions(+), 256 deletions(-)

diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py
index ccb5ab570..d3ce77d22 100644
--- a/capa/features/extractors/binexport2/__init__.py
+++ b/capa/features/extractors/binexport2/__init__.py
@@ -9,6 +9,9 @@
 Proto files generated via protobuf v24.4:
 
     protoc --python_out=. --mypy_out=. 
binexport2.proto + +from BinExport2 at 6916731d5f6693c4a4f0a052501fd3bd92cfd08b +https://github.com/google/binexport/blob/6916731/binexport2.proto """ import io import hashlib diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 2f2fc0f52..7af93aaff 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -15,11 +15,10 @@ from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, mask_immediate, is_address_mapped, - get_operand_expressions, get_instruction_mnemonic, - get_instruction_operands, get_operand_register_expression, get_operand_immediate_expression, ) @@ -50,10 +49,10 @@ def extract_insn_number_features( if mnemonic in ("add", "sub"): assert len(instruction.operand_index) == 3 - expression1: Optional[BinExport2.Expression] = get_operand_register_expression( + operand1_expression: Optional[BinExport2.Expression] = get_operand_register_expression( be2, be2.operand[instruction.operand_index[1]] ) - if expression1 and is_stack_register_expression(be2, expression1): + if operand1_expression and is_stack_register_expression(be2, operand1_expression): # skip things like: # add x0,sp,#0x8 return @@ -78,6 +77,18 @@ def extract_insn_number_features( yield OperandOffset(i, value), ih.address +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int] ; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int]! 
; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack)], #int ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int] ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int]! ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack)], #int ; capture #int + """ +) + + def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -85,104 +96,26 @@ def extract_insn_offset_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - mnemonic: str = get_instruction_mnemonic(be2, instruction) - value: Optional[int] = None - value_index: Optional[int] = None - - operands: List[BinExport2.Operand] - immediate_expression: Optional[BinExport2.Expression] - - if mnemonic.startswith(("ldr", "str")): - operands = get_instruction_operands(be2, instruction) - expressions1: List[BinExport2.Expression] - - if len(operands) == 2: - # like: - # ldr x0, [x1, 8] - expressions1 = get_operand_expressions(be2, operands[1]) - - if len(expressions1) == 4: - # like: - # ldr x0, [x1, 8] - if not is_stack_register_expression(be2, expressions1[1]): - if expressions1[3].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions1[3].immediate - value_index = 1 - - elif len(expressions1) == 5: - # like - # ldr x0, [x1, 8]! 
- if not is_stack_register_expression(be2, expressions1[2]): - if expressions1[4].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions1[4].immediate - value_index = 1 - - elif len(operands) == 3: - # like: - # ldr x0, [x1], 8 - expressions1 = get_operand_expressions(be2, operands[1]) - if not is_stack_register_expression(be2, expressions1[1]): - immediate_expression = get_operand_immediate_expression(be2, operands[2]) - - if immediate_expression: - value = immediate_expression.immediate - value_index = 2 - - elif mnemonic in ("ldp", "stp"): - operands = get_instruction_operands(be2, instruction) - expressions2: List[BinExport2.Expression] - - if len(operands) == 3: - # like: - # ldp x0, x1, [x3, 8]! - expressions2 = get_operand_expressions(be2, operands[2]) - - if len(expressions2) == 4: - # like: - # ldp x0, x1, [x3, 8] - if not is_stack_register_expression(be2, expressions2[1]): - if expressions2[3].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions2[3].immediate - value_index = 2 - - elif len(expressions2) == 5: - # like: - # ldp x0, x1, [x3, 8]! 
- if not is_stack_register_expression(be2, expressions2[2]): - if expressions2[4].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions2[4].immediate - value_index = 2 - - elif len(operands) == 4: - # like - # ldp x0, x1, [x3], 8 - expressions2 = get_operand_expressions(be2, operands[2]) - - if not is_stack_register_expression(be2, expressions2[1]): - immediate_expression = get_operand_immediate_expression(be2, operands[3]) - - if immediate_expression: - value = immediate_expression.immediate - value_index = 3 - - if value is None: + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: return - # we shouldn't make it here if index is not set - assert value_index is not None + value = match.expression.immediate value = mask_immediate(fhi.arch, value) if not is_address_mapped(be2, value): value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) yield Offset(value), ih.address - yield OperandOffset(value_index, value), ih.address + yield OperandOffset(match.operand_index, value), ih.address + + +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + eor reg, reg, reg + eor reg, reg, #int + """ +) def extract_insn_nzxor_characteristic_features( @@ -190,42 +123,33 @@ def extract_insn_nzxor_characteristic_features( ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - - if mnemonic != "eor": + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: return + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. 
operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - assert len(operands) == 3 - if operands[1] != operands[2]: yield Characteristic("nzxor"), ih.address +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + blx|bx|blr reg + """ +) + + def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - - if mnemonic not in ("blx", "bx", "blr"): - return - - assert len(instruction.operand_index) == 1 - - expressions: List[BinExport2.Expression] = get_operand_expressions(be2, be2.operand[instruction.operand_index[0]]) - - assert len(expressions) == 1 - - if expressions[0].type == BinExport2.Expression.REGISTER: + if INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) is not None: yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py index 3b4621acd..efb4a6fe5 100644 --- a/capa/features/extractors/binexport2/arch/intel/insn.py +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -6,32 +6,61 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import logging -from typing import List, Tuple, Iterator, Optional +from typing import List, Tuple, Iterator import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset from capa.features.common import Feature, Characteristic from capa.features.address import Address -from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext, BinExport2Index +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, mask_immediate, is_address_mapped, get_instruction_mnemonic, - get_operand_register_expression, - get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 -from capa.features.extractors.binexport2.arch.intel.helpers import ( - SECURITY_COOKIE_BYTES_DELTA, - OperandPhraseInfo, - get_operand_phrase_info, -) +from capa.features.extractors.binexport2.arch.intel.helpers import SECURITY_COOKIE_BYTES_DELTA logger = logging.getLogger(__name__) +IGNORE_NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ret #int + retn #int + add reg(stack), #int + sub reg(stack), #int + """ +) + +NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + push #int0 ; capture #int0 + + # its a little tedious to enumerate all the address forms + # but at least we are explicit + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar reg, #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [#int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + #int], #int0 ; capture #int0 + 
cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg + #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int + #int], #int0 ; capture #int0 + + imul reg, reg, #int ; capture #int + # note that int is first + cmp|test #int0, reg ; capture #int0 + + # imagine reg is zero'd out, then this is like `mov reg, #int` + # which is not uncommon. + lea reg, [reg + #int] ; capture #int + """ +) + + def extract_insn_number_features( fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -40,48 +69,56 @@ def extract_insn_number_features( be2: BinExport2 = fhi.ctx.be2 - instruction_index: int = ii.instruction_index - instruction: BinExport2.Instruction = be2.instruction[instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave + if IGNORE_NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index): return - mnemonic: str = get_instruction_mnemonic(be2, instruction) + match = NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return - if mnemonic.startswith("ret"): - # skip things like: - # .text:0042250E retn 8 + value: int = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): return - if mnemonic.startswith(("add", "sub")): - register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( - be2, be2.operand[instruction.operand_index[0]] - ) - if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): - # skip things like: - # 0x415bbc ADD ESP, 0xC - return + yield Number(value), ih.address + yield OperandNumber(match.operand_index, value), ih.address - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] + instruction_index: int = ii.instruction_index + instruction: 
BinExport2.Instruction = be2.instruction[instruction_index] - immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) - if not immediate_expression: - continue + mnemonic: str = get_instruction_mnemonic(be2, instruction) + if mnemonic.startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address - value: int = mask_immediate(fhi.arch, immediate_expression.immediate) - if is_address_mapped(be2, value): - continue - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb|cmp [reg + reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + """ +) - if mnemonic.startswith("add"): - if 0 < value < MAX_STRUCTURE_SIZE: - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address +# these are patterns that access offset 0 from some pointer +# (pointer is not the stack pointer). 
+OFFSET_ZERO_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb [reg(not-stack)], reg + mov|movzx|movsb [reg(not-stack)], #int + lea reg, [reg(not-stack)] + """ +) def extract_insn_offset_features( @@ -91,55 +128,23 @@ def extract_insn_offset_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = get_instruction_mnemonic(be2, instruction) - value: int - - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - is_dereference = False - for expression_index in operand.expression_index: - if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if not is_dereference: - continue - - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if not phrase_info: - continue - - if phrase_info.displacement: - if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): - # skips things like: - # 00401068 MOV dword ptr [EBP + local_8],EAX - continue - - value = mask_immediate(fhi.arch, phrase_info.displacement.immediate) - if not is_address_mapped(be2, value): - value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + match = OFFSET_ZERO_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + yield Offset(0), ih.address + yield OperandOffset(match.operand_index, 0), ih.address - if mnemonic == "lea" and i == 1: - if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): - yield Number(value), ih.address - yield OperandNumber(i, value), 
ih.address + value = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): + return - elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): - # like: - # 00401062 MOVZX EAX,word ptr [EDI] - yield Offset(0), ih.address - yield OperandOffset(i, 0), ih.address + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address def is_security_cookie( @@ -177,6 +182,14 @@ def is_security_cookie( return False +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + xor|xorpd|xorps|pxor reg, reg + xor|xorpd|xorps|pxor reg, #int + """ +) + + def extract_insn_nzxor_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -188,65 +201,48 @@ def extract_insn_nzxor_characteristic_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - if mnemonic not in ( - "xor", - "xorpd", - "xorps", - "pxor", - ): + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: return + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. 
operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - if mnemonic in ("xor", "xorpd", "xorps", "pxor"): - if operands[0] == operands[1]: - return - instruction_address: int = idx.insn_address_by_index[ii.instruction_index] - if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): - return + if operands[0] == operands[1]: + return + + instruction_address: int = idx.insn_address_by_index[ii.instruction_index] + if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): + return yield Characteristic("nzxor"), ih.address +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + call|jmp reg0 + call|jmp [reg + reg * #int + #int] + call|jmp [reg + reg * #int] + call|jmp [reg * #int + #int] + call|jmp [reg + reg + #int] + call|jmp [reg + #int] + call|jmp [reg] + """ +) + + def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = get_instruction_mnemonic(be2, instruction) - if mnemonic not in ("call", "jmp"): + match = INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) + if match is None: return - assert len(instruction.operand_index) == 1 - - operand: BinExport2.Operand = be2.operand[instruction.operand_index[0]] - - if len(operand.expression_index) == 1: - expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] - # call edx - if expression0.type == BinExport2.Expression.REGISTER: - yield Characteristic("indirect call"), ih.address - else: - is_dereference = False - for expression_index in operand.expression_index: - if 
be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if is_dereference: - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if phrase_info and phrase_info.base: - # call dword ptr [eax+50h] - yield Characteristic("indirect call"), ih.address + yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 4de4623dd..a877d31de 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,9 +5,13 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Set, List, Iterator, Optional +import re +from typing import Set, Dict, List, Tuple, Union, Iterator, Optional +from collections import defaultdict +from dataclasses import dataclass import capa.features.extractors.helpers +import capa.features.extractors.binexport2.helpers from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -329,3 +333,318 @@ def get_instruction_mnemonic(be2: BinExport2, instruction: BinExport2.Instructio def get_instruction_operands(be2: BinExport2, instruction: BinExport2.Instruction) -> List[BinExport2.Operand]: return [be2.operand[operand_index] for operand_index in instruction.operand_index] + + +def split_with_delimiters(s: str, delimiters: Tuple[str, ...]) -> Iterator[str]: + """ + Splits a string by any of the provided delimiter characters, + including the delimiters in the results. + + Args: + string: The string to split. 
+ delimiters: A string containing the characters to use as delimiters. + """ + start = 0 + for i, char in enumerate(s): + if char in delimiters: + yield s[start:i] + yield char + start = i + 1 + + if start < len(s): + yield s[start:] + + +BinExport2OperandPattern = Union[str, Tuple[str, ...]] + + +@dataclass +class BinExport2InstructionPattern: + """ + This describes a way to match disassembled instructions, with mnemonics and operands. + + You can specify constraints on the instruction, via: + - the mnemonics, like "mov", + - number of operands, and + - format of each operand, "[reg, reg, #int]". + + During matching, you can also capture a single element, to see its concrete value. + For example, given the pattern: + + mov reg0, #int0 ; capture int0 + + and the instruction: + + mov eax, 1 + + Then the capture will contain the immediate integer 1. + + This matcher uses the BinExport2 data layout under the hood. + """ + + mnemonics: Tuple[str, ...] + operands: Tuple[Union[str, BinExport2OperandPattern], ...] + capture: Optional[str] + + @classmethod + def from_str(cls, query: str): + """ + Parse a pattern string into a Pattern instance. + The supported syntax is like this: + + br reg + br reg ; capture reg + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg(stack), #int] ; capture #int + ldr|str reg, [reg(not-stack), #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + """ + # + # The implementation of the parser here is obviously ugly. + # Its handwritten and probably fragile. But since we don't + # expect this to be widely used, its probably ok. 
+ # Don't hesitate to rewrite this if it becomes more important. + # + # Note that this doesn't have to be very performant. + # We expect these patterns to be parsed once upfront and then reused + # (globally at the module level?) rather than within any loop. + # + + pattern, _, comment = query.strip().partition(";") + + # we don't support fs: yet + assert ":" not in pattern + + # from "capture #int" to "#int" + if comment: + comment = comment.strip() + assert comment.startswith("capture ") + capture = comment[len("capture ") :] + else: + capture = None + + # from "ldr|str ..." to ["ldr", "str"] + pattern = pattern.strip() + mnemonic, _, rest = pattern.partition(" ") + mnemonics = mnemonic.split("|") + + operands: List[Union[str, Tuple[str, ...]]] = [] + while rest: + rest = rest.strip() + if not rest.startswith("["): + # If its not a dereference, which looks like `[op, op, op, ...]`, + # then its a simple operand, which we can split by the next comma. + operand, _, rest = rest.partition(", ") + rest = rest.strip() + operands.append(operand) + + else: + # This looks like a dereference, something like `[op, op, op, ...]`. + # Since these can't be nested, look for the next ] and then parse backwards. + deref_end = rest.index("]") + try: + deref_end = rest.index(", ", deref_end) + deref_end += len(", ") + except ValueError: + deref = rest + rest = "" + else: + deref = rest[:deref_end] + rest = rest[deref_end:] + rest = rest.strip() + deref = deref.rstrip(" ") + deref = deref.rstrip(",") + + # like: [reg, #int]! 
+ has_postindex_writeback = deref.endswith("!") + + deref = deref.rstrip("!") + deref = deref.rstrip("]") + deref = deref.lstrip("[") + + parts = tuple(split_with_delimiters(deref, (",", "+", "*"))) + parts = tuple(s.strip() for s in parts) + + # emit operands in this order to match + # how BinExport2 expressions are flatted + # by + if has_postindex_writeback: + operands.append(("!", "[") + parts) + else: + operands.append(("[",) + parts) + + for operand in operands: # type: ignore + # Try to ensure we've parsed the operands correctly. + # This is just sanity checking. + for o in (operand,) if isinstance(operand, str) else operand: + # operands can look like: + # - reg + # - reg0 + # - reg(stack) + # - reg0(stack) + # - reg(not-stack) + # - reg0(not-stack) + # - #int + # - #int0 + # and a limited set of supported operators. + # use an inline regex so that its easy to read. not perf critical. + assert re.match(r"^(reg|#int)[0-9]?(\(stack\)|\(not-stack\))?$", o) or o in ("[", ",", "!", "+", "*") + + return cls(tuple(mnemonics), tuple(operands), capture) + + @dataclass + class MatchResult: + operand_index: int + expression_index: int + expression: BinExport2.Expression + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional["BinExport2InstructionPattern.MatchResult"]: + """ + Match the given BinExport2 data against this pattern. + + The BinExport2 expression tree must have been flattened, such as with + capa.features.extractors.binexport2.helpers.get_operand_expressions. + + If there's a match, the captured Expression instance is returned. + Otherwise, you get None back. 
+ """ + if mnemonic not in self.mnemonics: + return None + + if len(self.operands) != len(operand_expressions): + return None + + captured = None + + for operand_index, found_expressions in enumerate(operand_expressions): + wanted_expressions = self.operands[operand_index] + + # from `"reg"` to `("reg", )` + if isinstance(wanted_expressions, str): + wanted_expressions = (wanted_expressions,) + assert isinstance(wanted_expressions, tuple) + + if len(wanted_expressions) != len(found_expressions): + return None + + for expression_index, (wanted_expression, found_expression) in enumerate( + zip(wanted_expressions, found_expressions) + ): + if wanted_expression.startswith("reg"): + if found_expression.type != BinExport2.Expression.REGISTER: + return None + + if wanted_expression.endswith(")"): + if wanted_expression.endswith("(not-stack)"): + # intel 64: rsp, esp, sp, + # intel 32: ebp, ebp, bp + # arm: sp + register_name = found_expression.symbol.lower() + if register_name in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + elif wanted_expression.endswith("(stack)"): + register_name = found_expression.symbol.lower() + if register_name not in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + else: + raise ValueError("unexpected expression suffix", wanted_expression) + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif wanted_expression.startswith("#int"): + if found_expression.type != BinExport2.Expression.IMMEDIATE_INT: + return None + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif wanted_expression == "[": + if found_expression.type != BinExport2.Expression.DEREFERENCE: + return None + + elif wanted_expression in (",", "!", "+", "*"): + if found_expression.type != BinExport2.Expression.OPERATOR: + return None + + if 
found_expression.symbol != wanted_expression: + return None + + else: + raise ValueError(found_expression) + + if captured: + return captured + else: + # There were no captures, so + # return arbitrary non-None expression + return BinExport2InstructionPattern.MatchResult(operand_index, expression_index, found_expression) + + +class BinExport2InstructionPatternMatcher: + """Index and match a collection of instruction patterns.""" + + def __init__(self, queries: List[BinExport2InstructionPattern]): + self.queries = queries + # shard the patterns by (mnemonic, #operands) + self._index: Dict[Tuple[str, int], List[BinExport2InstructionPattern]] = defaultdict(list) + + for query in queries: + for mnemonic in query.mnemonics: + self._index[(mnemonic.lower(), len(query.operands))].append(query) + + @classmethod + def from_str(cls, patterns: str): + return cls( + [ + BinExport2InstructionPattern.from_str(line) + for line in filter( + lambda line: not line.startswith("#"), (line.strip() for line in patterns.split("\n")) + ) + ] + ) + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + queries = self._index.get((mnemonic.lower(), len(operand_expressions)), []) + for query in queries: + captured = query.match(mnemonic.lower(), operand_expressions) + if captured: + return captured + + return None + + def match_with_be2( + self, be2: BinExport2, instruction_index: int + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if (mnemonic.lower(), len(instruction.operand_index)) not in self._index: + # verify that we might have a hit before we realize the operand expression list + return None + + operands = [] + for operand_index in instruction.operand_index: + operands.append(get_operand_expressions(be2, be2.operand[operand_index])) + + 
return self.match(mnemonic, operands) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index b579fabd0..de2c82d86 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -127,12 +127,15 @@ def _render_expression_tree( if len(children_tree_indexes) == 1: # prefix operator, like "ds:" - if expression.symbol != ",": - # or there's a binary operator, like ",", that's missing a child, - # such as when we prune "lsl" branches. + if expression.symbol != "!": o.write(expression.symbol) + child_index = children_tree_indexes[0] _render_expression_tree(be2, operand, expression_tree, child_index, o) + + # postfix operator, like "!" in aarch operand "[x1, 8]!" + if expression.symbol == "!": + o.write(expression.symbol) return elif len(children_tree_indexes) == 2: @@ -177,9 +180,7 @@ def _render_expression_tree( _OPERAND_CACHE: Dict[int, str] = {} -def render_operand( - be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None -) -> str: +def render_operand(be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None) -> str: # For the mimikatz example file, there are 138k distinct operands. # Of those, only 11k are unique, which is less than 10% of the total. # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. 
@@ -224,9 +225,10 @@ def rec(tree_index, indent=0): expression = be2.expression[expression_index] children_tree_indexes: List[int] = expression_tree[tree_index] - print(f" {' ' * indent}expression: {str(expression).replace('\n', ', ')}") + NEWLINE = "\n" + print(f" {' ' * indent}expression: {str(expression).replace(NEWLINE, ', ')}") for child_index in children_tree_indexes: - rec(child_index, indent+1) + rec(child_index, indent + 1) rec(0) @@ -238,7 +240,6 @@ def inspect_instruction(be2: BinExport2, instruction: BinExport2.Instruction, ad print(f" mnemonic: {mnemonic.name}") print(" operands:") - operands = [] for i, operand_index in enumerate(instruction.operand_index): print(f" - operand {i}: [{operand_index}]") operand = be2.operand[operand_index] diff --git a/tests/fixtures.py b/tests/fixtures.py index a2c16ef5e..e4d0a6fa0 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -818,7 +818,9 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), # insn/offset: negative + # 0x4012b4 MOVZX ECX, [EAX+0xFFFFFFFFFFFFFFFF] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), + # 0x4012b8 MOVZX EAX, [EAX+0xFFFFFFFFFFFFFFFE] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), # # insn/offset from mnemonic: add @@ -841,7 +843,7 @@ def parametrize(params, values, **kwargs): # should not be considered, lea operand invalid encoding # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), - # yes, this is also a number (imagine edx is zero): + # yes, this is also a number (imagine ebx is zero): # .text:004018C0 8D 4B 02 lea ecx, [ebx+2] ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api diff --git a/tests/test_binexport_accessors.py 
b/tests/test_binexport_accessors.py index ba63b89c5..3aa78982f 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -15,13 +15,18 @@ import fixtures from google.protobuf.json_format import ParseDict +import capa.features.extractors.binexport2.helpers from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPattern, + BinExport2InstructionPatternMatcher, + split_with_delimiters, get_operand_expressions, get_instruction_mnemonic, get_instruction_operands, get_operand_register_expression, get_operand_immediate_expression, ) +from capa.features.extractors.binexport2.extractor import BinExport2FeatureExtractor from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression @@ -343,3 +348,255 @@ def test_is_stack_register_expression(): assert is_stack_register_expression(BE2, op1_exp0) is True op2_exp0 = get_operand_expressions(BE2, add_op2)[0] assert is_stack_register_expression(BE2, op2_exp0) is False + + +def test_split_with_delimiters(): + assert tuple(split_with_delimiters("abc|def", ("|",))) == ("abc", "|", "def") + assert tuple(split_with_delimiters("abc|def|", ("|",))) == ("abc", "|", "def", "|") + assert tuple(split_with_delimiters("abc||def", ("|",))) == ("abc", "|", "", "|", "def") + assert tuple(split_with_delimiters("abc|def-ghi", ("|", "-"))) == ("abc", "|", "def", "-", "ghi") + + +def test_pattern_parsing(): + assert BinExport2InstructionPattern.from_str( + "br reg ; capture reg" + ) == BinExport2InstructionPattern(mnemonics=("br",), operands=("reg",), capture="reg") + + assert BinExport2InstructionPattern.from_str( + "mov reg0, reg1 ; capture reg0" + ) == BinExport2InstructionPattern(mnemonics=("mov",), operands=("reg0", "reg1"), capture="reg0") + + assert BinExport2InstructionPattern.from_str( + "adrp reg, #int ; capture #int" + ) == BinExport2InstructionPattern(mnemonics=("adrp",), 
operands=("reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "add reg, reg, #int ; capture #int" + ) == BinExport2InstructionPattern(mnemonics=("add",), operands=("reg", "reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "ldr reg0, [reg1] ; capture reg1" + ) == BinExport2InstructionPattern(mnemonics=("ldr",), operands=("reg0", ("[", "reg1")), capture="reg1") + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int]! ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=( + "reg", + ( + "[", + "reg", + ), + "#int", + ), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int]! 
; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg"), "#int"), + capture="#int", + ) + + assert ( + BinExport2InstructionPatternMatcher.from_str( + """ + # comment + br reg + br reg(not-stack) + br reg ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1, reg2] ; capture reg2 + call [reg + reg * #int + #int] + call [reg + reg * #int] + call [reg * #int + #int] + call [reg + reg + #int] + call [reg + #int] + """ + ).queries + is not None + ) + + +def match_address(extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int): + instruction = extractor.idx.insn_by_address[address] + mnemonic: str = get_instruction_mnemonic(extractor.be2, instruction) + + operands = [] + for operand_index in instruction.operand_index: + operand = extractor.be2.operand[operand_index] + operands.append(capa.features.extractors.binexport2.helpers.get_operand_expressions(extractor.be2, operand)) + + return queries.match(mnemonic, operands) + + +def match_address_with_be2( + extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int +): + instruction_index = extractor.idx.insn_index_by_address[address] + return queries.match_with_be2(extractor.be2, instruction_index) + + +def test_pattern_matching(): + queries = 
BinExport2InstructionPatternMatcher.from_str( + """ + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1(not-stack), reg2] ; capture reg2 + """ + ) + + # 0x210184: ldrb w2, [x0, x1] + # query: ldrb reg0, [reg1(not-stack), reg2] ; capture reg2" + assert match_address(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + + # 0x210198: mov x2, x1 + # query: mov reg0, reg1 ; capture reg0"), + assert match_address(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + + # 0x210190: add x1, x1, 0x1 + # query: add reg, reg, #int ; capture #int + assert match_address(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + + +BE2_EXTRACTOR_687 = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" +) + + +def test_pattern_matching_exclamation(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg, #int]! ; capture #int + """ + ) + + # note this captures the sp + # 0x107918: stp x20, x19, [sp,0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg, #int]! 
; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(stack), #int]! ; capture #int + """ + ) + + # note this does capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(stack), #int]! ; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_not_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(not-stack), #int]! ; capture #int + """ + ) + + # note this does not capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(not-stack), #int]! 
; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918) is None + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918) is None + + +BE2_EXTRACTOR_MIMI = fixtures.get_binexport_extractor(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + +def test_pattern_matching_x86(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + """ + ) + + # 0x4018c0: LEA ECX, [EBX+0x2] + # query: cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + assert match_address(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2 + assert match_address_with_be2(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2 From 90d500c6c1c4c623735de559715e84991f02c018 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 12 Sep 2024 13:02:17 +0200 Subject: [PATCH 198/200] binexport: helpers: fix missing comment words --- capa/features/extractors/binexport2/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index a877d31de..e4e7f7b76 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -477,7 +477,7 @@ def from_str(cls, query: str): # emit operands in this order to match # how BinExport2 expressions are flatted - # by + # by get_operand_expressions if has_postindex_writeback: operands.append(("!", "[") + parts) else: From 38dad419d0adda35a87580b870a9929243c3e2ee Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Thu, 12 Sep 2024 09:00:24 -0600 Subject: [PATCH 199/200] binexport: update tests to reflect updated test files --- tests/test_binexport_accessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index 3aa78982f..bc9ea6db1 100644 --- a/tests/test_binexport_accessors.py +++ 
b/tests/test_binexport_accessors.py @@ -198,7 +198,7 @@ def test_get_instruction_operands_count(): ( BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), BinExport2.Expression( - type=BinExport2.Expression.IMMEDIATE_INT, symbol="DAT_002101b0", immediate=0x2101B0 + type=BinExport2.Expression.IMMEDIATE_INT, symbol="PTR_helloWorldStr_002101b0", immediate=0x2101B0 ), ), ), From b21d1c0688ec456ec3dd7227c365194e296d37d8 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 12 Sep 2024 15:54:59 +0000 Subject: [PATCH 200/200] remove testing of feature branch --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c0081a699..9aa826ef0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ on: - 'doc/**' - '**.md' pull_request: - branches: [ master, feat/1755 ] + branches: [ master ] paths-ignore: - 'web/**' - 'doc/**'