Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extract dynamic capabilities #1644

Merged
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e3f60ea
initial commit
yelhamer Jul 17, 2023
4af84e5
bugfixes
yelhamer Jul 17, 2023
bc46bf3
add vverbose rendering
yelhamer Jul 18, 2023
e5d7903
add removed tests
yelhamer Jul 18, 2023
4e4b123
mypy.ini: ignore proto issues
yelhamer Jul 18, 2023
c5d08ec
update extractors and tests
yelhamer Jul 19, 2023
7de223f
Update capa/features/extractors/ida/extractor.py: add call to get_inp…
yelhamer Jul 19, 2023
1029b36
Merge remote-tracking branch 'parentrepo/dynamic-feature-extraction' …
yelhamer Jul 20, 2023
8ac9caf
fix bugs
yelhamer Jul 20, 2023
0a4fe58
fix tests
yelhamer Jul 20, 2023
d99b16e
add copyright and remove old test
yelhamer Jul 20, 2023
482e0d3
use pathlib.Path() in binja and ida extractors
yelhamer Jul 20, 2023
fd7b926
Update capa/features/extractors/base_extractor.py
yelhamer Jul 20, 2023
2b2b2b6
Update capa/features/extractors/base_extractor.py
yelhamer Jul 20, 2023
b4cf50f
fix mypy issues
yelhamer Jul 20, 2023
ab092cb
add sample_hashes attribute to the base extractors
yelhamer Jul 20, 2023
6ee1dfd
address review comments: rename SampleHashes's from_sample() method t…
yelhamer Jul 20, 2023
806bc18
Update mypy.ini: add TODO comment
yelhamer Jul 20, 2023
24b3abd
add get_sample_hashes() to base extractor
yelhamer Jul 21, 2023
6d1a885
update static freeze test
yelhamer Jul 21, 2023
b1e468d
add tests for the get_sample_hashes() method
yelhamer Jul 21, 2023
da4e887
fix comment typo
yelhamer Jul 21, 2023
6f3fb42
update compute_dynamic_layout with the appropriate type
yelhamer Jul 21, 2023
bd83316
update compute_static_layout with the appropriate types
yelhamer Jul 21, 2023
736b2cd
address @mr-tz main.py review comments
yelhamer Jul 21, 2023
3ab3c61
use ida's hash-extraction functions
yelhamer Jul 21, 2023
8085cae
remove the usage of SampleHashes's __iter__() method
yelhamer Jul 21, 2023
6741229
migrate the `get_sample_hashes()` function to each individual extractor
yelhamer Jul 21, 2023
ab585ef
add the `skipif` mark back
yelhamer Jul 21, 2023
4ec39d4
fix linting issues
yelhamer Jul 21, 2023
c4ba5af
replace `: FeatureSet` annotations with a comment type annotation
yelhamer Jul 21, 2023
830bad5
fix bugs
yelhamer Jul 21, 2023
3d1a1fb
add get_sample_hashes() to NullFeatureExtractor
yelhamer Jul 21, 2023
90298fe
Update capa/features/extractors/base_extractor.py
yelhamer Jul 21, 2023
d13114e
remove SampleHashes __iter__method
yelhamer Jul 21, 2023
c32ac19
Update capa/features/extractors/ida/extractor.py
yelhamer Jul 21, 2023
344b3e9
Update capa/features/extractors/base_extractor.py
yelhamer Jul 21, 2023
d8c28e8
add get_sample_hashes() to elf extractor
yelhamer Jul 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/mypy/mypy.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
[mypy]

# TODO(yelhamer): remove this once proto has been added
# for the dynamic rendering
exclude = (?x)(
^capa/render/proto/__init__.py$
| ^tests/_test_proto.py$
)

[mypy-halo.*]
ignore_missing_imports = True

Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add a new thread scope for the dynamic analysis flavor #1517 @yelhamer
- Add support for flavor-based rule scopes @yelhamer
- Add ProcessesAddress and ThreadAddress #1612 @yelhamer
- Add dynamic capability extraction @yelhamer

### Breaking Changes

Expand Down
46 changes: 46 additions & 0 deletions capa/features/extractors/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.

import abc
import hashlib
import dataclasses
from typing import Any, Dict, Tuple, Union, Iterator
from dataclasses import dataclass
Expand All @@ -24,6 +25,29 @@
# the feature extractor from which they were created.


@dataclass
class SampleHashes:
    """
    the md5, sha1, and sha256 digests of a single sample, stored as hexadecimal strings.
    """

    md5: str
    sha1: str
    sha256: str

    def __iter__(self) -> Iterator[str]:
        """
        yield the three digests in a fixed order: md5, then sha1, then sha256.
        """
        yield self.md5
        yield self.sha1
        yield self.sha256

    @classmethod
    def from_bytes(cls, buf: bytes) -> "SampleHashes":
        """
        compute all three digests over the given buffer.

        args:
          buf: the raw bytes of the sample.

        returns:
          SampleHashes: instance holding the hex digest produced by each algorithm.
        """
        return cls(
            md5=hashlib.md5(buf).hexdigest(),
            sha1=hashlib.sha1(buf).hexdigest(),
            sha256=hashlib.sha256(buf).hexdigest(),
        )


@dataclass
class FunctionHandle:
"""reference to a function recognized by a feature extractor.
Expand Down Expand Up @@ -104,6 +128,12 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres
"""
raise NotImplementedError()

def get_sample_hashes(self) -> SampleHashes:
    """
    fetch the hashes for the sample contained within the extractor.

    this base implementation has no sample to hash and only raises;
    extractor subclasses provide the real implementation.

    raises:
      NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()

@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
"""
Expand Down Expand Up @@ -309,6 +339,22 @@ class DynamicFeatureExtractor:
This class is not instantiated directly; it is the base class for other implementations.
"""

__metaclass__ = abc.ABCMeta

def __init__(self):
    """
    initialize the base extractor; takes no arguments and only chains to the parent constructor.
    """
    #
    # note: a subclass should define ctor parameters for its own use.
    #  for example, the Vivisect feature extractor might require the vw and/or path.
    #  this base class doesn't know what to do with that info, though.
    #
    super().__init__()

def get_sample_hashes(self) -> SampleHashes:
    """
    fetch the hashes for the sample contained within the extractor.

    this base implementation has no sample to hash and only raises;
    extractor subclasses provide the real implementation.

    raises:
      NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()

@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
"""
Expand Down
13 changes: 12 additions & 1 deletion capa/features/extractors/binja/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Tuple, Iterator
from pathlib import Path

import binaryninja as binja

Expand All @@ -17,7 +18,13 @@
import capa.features.extractors.binja.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
SampleHashes,
FunctionHandle,
StaticFeatureExtractor,
)


class BinjaFeatureExtractor(StaticFeatureExtractor):
Expand All @@ -28,10 +35,14 @@ def __init__(self, bv: binja.BinaryView):
self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv))
self.sample_hashes = SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes())

def get_base_address(self):
    """
    return the start address of the wrapped BinaryView (`self.bv.start`) as an AbsoluteVirtualAddress.
    """
    return AbsoluteVirtualAddress(self.bv.start)

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from the bytes of the view's original file.
    """
    return self.sample_hashes

def extract_global_features(self):
    """
    yield every entry of `self.global_features`, which the constructor fills
    with the file format, OS, and arch features.
    """
    yield from self.global_features

Expand Down
10 changes: 9 additions & 1 deletion capa/features/extractors/cape/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import capa.features.extractors.cape.process
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress, _NoAddress
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicFeatureExtractor
from capa.features.extractors.base_extractor import SampleHashes, ThreadHandle, ProcessHandle, DynamicFeatureExtractor

logger = logging.getLogger(__name__)

Expand All @@ -28,13 +28,21 @@ def __init__(self, cape_version: str, static: Dict, behavior: Dict):
self.cape_version = cape_version
self.static = static
self.behavior = behavior
self.sample_hashes = SampleHashes(
md5=static["file"]["md5"].lower(),
sha1=static["file"]["sha1"].lower(),
sha256=static["file"]["sha256"].lower(),
)

self.global_features = capa.features.extractors.cape.global_.extract_features(self.static)

def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]:
    """
    return the `imagebase` value found under the `pe` key of the static report,
    wrapped as an AbsoluteVirtualAddress.
    """
    # value according to the PE header, the actual trace may use a different imagebase
    return AbsoluteVirtualAddress(self.static["pe"]["imagebase"])

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes captured in the constructor from the `file` entry of the static report
    (each value lower-cased there).
    """
    return self.sample_hashes

def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
    """
    yield every entry of `self.global_features`, which the constructor obtains
    from the cape global feature extraction over the static report.
    """
    yield from self.global_features

Expand Down
12 changes: 11 additions & 1 deletion capa/features/extractors/dnfile/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,13 @@
from capa.features.common import Feature
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress
from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
SampleHashes,
FunctionHandle,
StaticFeatureExtractor,
)
from capa.features.extractors.dnfile.helpers import (
get_dotnet_types,
get_dotnet_fields,
Expand Down Expand Up @@ -72,6 +78,7 @@ class DnfileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: Path):
super().__init__()
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(path.read_bytes())

# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
# most relevant at instruction scope
Expand All @@ -86,6 +93,9 @@ def __init__(self, path: Path):
def get_base_address(self):
    """
    return NO_ADDRESS; this extractor does not report a base address.
    """
    return NO_ADDRESS

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from the bytes read at the input path.
    """
    return self.sample_hashes

def extract_global_features(self):
    """
    yield every entry stored in `self.global_features`.
    """
    yield from self.global_features

Expand Down
6 changes: 5 additions & 1 deletion capa/features/extractors/dnfile_.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
Feature,
)
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import StaticFeatureExtractor
from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -86,10 +86,14 @@ def __init__(self, path: Path):
super().__init__()
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())

def get_base_address(self) -> AbsoluteVirtualAddress:
    """
    return a fixed base address of zero, wrapped as an AbsoluteVirtualAddress.
    """
    return AbsoluteVirtualAddress(0x0)

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from the bytes of the file at `self.path`.
    """
    return self.sample_hashes

def get_entry_point(self) -> int:
# self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT
# True: native EP: Token
Expand Down
6 changes: 5 additions & 1 deletion capa/features/extractors/dotnetfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
Characteristic,
)
from capa.features.address import NO_ADDRESS, Address, DNTokenAddress
from capa.features.extractors.base_extractor import StaticFeatureExtractor
from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor
from capa.features.extractors.dnfile.helpers import (
DnType,
iter_dotnet_table,
Expand Down Expand Up @@ -170,10 +170,14 @@ def __init__(self, path: Path):
super().__init__()
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())

def get_base_address(self):
    """
    return NO_ADDRESS; this extractor does not report a base address.
    """
    return NO_ADDRESS

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from the bytes of the file at `self.path`.
    """
    return self.sample_hashes

def get_entry_point(self) -> int:
# self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT
# True: native EP: Token
Expand Down
14 changes: 13 additions & 1 deletion capa/features/extractors/ida/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@
import capa.features.extractors.ida.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
SampleHashes,
FunctionHandle,
StaticFeatureExtractor,
)


class IdaFeatureExtractor(StaticFeatureExtractor):
Expand All @@ -28,10 +34,16 @@ def __init__(self):
self.global_features.extend(capa.features.extractors.ida.file.extract_file_format())
self.global_features.extend(capa.features.extractors.ida.global_.extract_os())
self.global_features.extend(capa.features.extractors.ida.global_.extract_arch())
self.sample_hashes = SampleHashes(
md5=idaapi.get_input_file_md5(), sha1=idaapi.get_input_file_sha1(), sha256=idaapi.get_input_file_sha256()
yelhamer marked this conversation as resolved.
Show resolved Hide resolved
)

def get_base_address(self):
    """
    return the value of `idaapi.get_imagebase()` wrapped as an AbsoluteVirtualAddress.
    """
    return AbsoluteVirtualAddress(idaapi.get_imagebase())

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes assembled in the constructor from idaapi's
    get_input_file_md5/sha1/sha256 calls.
    """
    return self.sample_hashes

def extract_global_features(self):
    """
    yield every entry of `self.global_features`, which the constructor fills
    with the file format, OS, and arch features.
    """
    yield from self.global_features

Expand Down
9 changes: 9 additions & 0 deletions capa/features/extractors/null.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
SampleHashes,
ThreadHandle,
ProcessHandle,
FunctionHandle,
Expand Down Expand Up @@ -49,6 +50,7 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor):
"""

base_address: Address
sample_hashes: SampleHashes
global_features: List[Feature]
file_features: List[Tuple[Address, Feature]]
functions: Dict[Address, FunctionFeatures]
Expand All @@ -60,6 +62,9 @@ def extract_global_features(self):
for feature in self.global_features:
yield feature, NO_ADDRESS

def get_sample_hashes(self) -> SampleHashes:
    """
    return the `sample_hashes` field this extractor was constructed with.
    """
    return self.sample_hashes

def extract_file_features(self):
    """
    yield each stored file feature as a (feature, address) pair.

    `self.file_features` holds (address, feature) tuples, so each pair is flipped on the way out.
    """
    yield from ((feature, address) for address, feature in self.file_features)
Expand Down Expand Up @@ -103,6 +108,7 @@ class ProcessFeatures:
@dataclass
class NullDynamicFeatureExtractor(DynamicFeatureExtractor):
base_address: Address
sample_hashes: SampleHashes
global_features: List[Feature]
file_features: List[Tuple[Address, Feature]]
processes: Dict[Address, ProcessFeatures]
Expand All @@ -111,6 +117,9 @@ def extract_global_features(self):
for feature in self.global_features:
yield feature, NO_ADDRESS

def get_sample_hashes(self) -> SampleHashes:
    """
    return the `sample_hashes` field this extractor was constructed with.
    """
    return self.sample_hashes

def extract_file_features(self):
    """
    yield each stored file feature as a (feature, address) pair.

    `self.file_features` holds (address, feature) tuples, so each pair is flipped on the way out.
    """
    yield from ((feature, address) for address, feature in self.file_features)
Expand Down
6 changes: 5 additions & 1 deletion capa/features/extractors/pefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from capa.features.file import Export, Import, Section
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic
from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import StaticFeatureExtractor
from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -190,10 +190,14 @@ def __init__(self, path: Path):
super().__init__()
self.path: Path = path
self.pe = pefile.PE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())

def get_base_address(self):
    """
    return the `ImageBase` field of the parsed PE's optional header as an AbsoluteVirtualAddress.
    """
    return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase)

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from the bytes of the file at `self.path`.
    """
    return self.sample_hashes

def extract_global_features(self):
buf = Path(self.path).read_bytes()

Expand Down
12 changes: 11 additions & 1 deletion capa/features/extractors/viv/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
import capa.features.extractors.viv.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor
from capa.features.extractors.base_extractor import (
BBHandle,
InsnHandle,
SampleHashes,
FunctionHandle,
StaticFeatureExtractor,
)

logger = logging.getLogger(__name__)

Expand All @@ -31,6 +37,7 @@ def __init__(self, vw, path: Path, os):
self.vw = vw
self.path = path
self.buf = path.read_bytes()
self.sample_hashes = SampleHashes.from_bytes(self.buf)

# pre-compute these because we'll yield them at *every* scope.
self.global_features: List[Tuple[Feature, Address]] = []
Expand All @@ -42,6 +49,9 @@ def get_base_address(self):
# assume there is only one file loaded into the vw
return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"])

def get_sample_hashes(self) -> SampleHashes:
    """
    return the hashes computed in the constructor from `self.buf`, the bytes read from the input path.
    """
    return self.sample_hashes

def extract_global_features(self):
    """
    yield every (feature, address) entry stored in `self.global_features`.
    """
    yield from self.global_features

Expand Down
Loading
Loading