From 8e15fc535e16eca0fb5b465340e2934562d7f039 Mon Sep 17 00:00:00 2001 From: Wyatt <53830972+whyitfor@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:31:17 -0500 Subject: [PATCH] Revamp magic identification for significant speed improvements (#492) Revamp magic identification for significant speed improvements 1. `File` now inherits from `GenericBinary` to avoid duplicative Identifier runs 2. Update auto-run component logic to run all Analyzers, not just the most specific ones 3. Refactor Magic identification 4. Update registered identifiers to make use of new `MagicIdentifier` --- .../contributor-guide/component/identifier.md | 45 +++++- .../key-concepts/component/identifier.md | 31 +--- ofrak_core/CHANGELOG.md | 12 ++ ofrak_core/ofrak/core/__init__.py | 3 + ofrak_core/ofrak/core/apk.py | 67 ++++----- ofrak_core/ofrak/core/binwalk.py | 3 +- ofrak_core/ofrak/core/bzip2.py | 6 +- ofrak_core/ofrak/core/checksum.py | 5 +- ofrak_core/ofrak/core/cpio.py | 6 +- ofrak_core/ofrak/core/dtb.py | 38 +++-- ofrak_core/ofrak/core/elf/model.py | 4 +- ofrak_core/ofrak/core/extfs.py | 8 +- ofrak_core/ofrak/core/filesystem.py | 4 +- ofrak_core/ofrak/core/gzip.py | 6 +- ofrak_core/ofrak/core/ihex.py | 2 +- ofrak_core/ofrak/core/iso9660.py | 6 +- ofrak_core/ofrak/core/java.py | 12 ++ ofrak_core/ofrak/core/jffs2.py | 4 +- ofrak_core/ofrak/core/lzma.py | 10 +- ofrak_core/ofrak/core/lzo.py | 6 +- ofrak_core/ofrak/core/magic.py | 136 +++++++++++++----- ofrak_core/ofrak/core/openwrt.py | 18 +-- ofrak_core/ofrak/core/pe/model.py | 4 +- ofrak_core/ofrak/core/rar.py | 8 +- ofrak_core/ofrak/core/seven_zip.py | 6 +- ofrak_core/ofrak/core/squashfs.py | 8 +- ofrak_core/ofrak/core/strings.py | 3 +- ofrak_core/ofrak/core/tar.py | 6 +- ofrak_core/ofrak/core/ubi.py | 24 ++-- ofrak_core/ofrak/core/ubifs.py | 23 ++- ofrak_core/ofrak/core/uf2.py | 19 ++- ofrak_core/ofrak/core/uimage.py | 4 +- ofrak_core/ofrak/core/zip.py | 6 +- ofrak_core/ofrak/core/zlib.py | 6 +- ofrak_core/ofrak/core/zstd.py | 6 +- 
ofrak_core/ofrak/service/job_service.py | 13 +- .../components/test_dtb_component.py | 12 +- ofrak_core/test_ofrak/service/conftest.py | 7 +- .../resource_service/test_resource_service.py | 33 ++++- .../2_ofrak_internals.ipynb | 6 +- .../4_simple_code_modification.ipynb | 2 +- 41 files changed, 372 insertions(+), 256 deletions(-) create mode 100644 ofrak_core/ofrak/core/java.py diff --git a/docs/contributor-guide/component/identifier.md b/docs/contributor-guide/component/identifier.md index 11807873d..1f9b50536 100644 --- a/docs/contributor-guide/component/identifier.md +++ b/docs/contributor-guide/component/identifier.md @@ -1,17 +1,50 @@ -# Registering Identifier Patterns -When [writing unpackers](./unpacker.md), OFRAK Contributors can leverage the `MagicMimeIdentifier` and `MagicDescriptionIdentifier` by registering mappings between resource tags and mime or description patterns. Doing so will ensure that `Resource.unpack` automatically calls their custom unpacker. +# Adding Identifiers +OFRAK Contributors and Users can extend the tool's identification capability in one of two ways: -For example, consider the following magic description identification registration in the file containing a `UImageUnpacker`: +1. Extend the [MagicIdentifier][ofrak.core.magic.MagicIdentifier] by registering a new magic pattern match +2. Implement a new [Identifier][ofrak.component.identifier.Identifier] + +## Extend the MagicIdentifier +First, consider extending the magic identifier by registering a new magic pattern match. 
+The [MagicIdentifier][ofrak.core.magic.MagicIdentifier] uses three pattern matchers: + +- [MagicMimePattern][ofrak.core.magic.MagicMimePattern] allows users to register matches to magic's mime output +- [MagicDescriptionPattern][ofrak.core.magic.MagicDescriptionPattern] allows users to create matching functions that run on the magic description output +- [RawMagicPattern][ofrak.core.magic.RawMagicPattern] allows users to create custom raw byte matching patterns against a resource's binary data + +Combining these pattern matching strategies can provide expanded identification coverage, particularly when libmagic's output contains false negatives. +For example, all three patterns are used to identify `DeviceTreeBlob`: ```python -MagicDescriptionIdentifier.register(UImage, lambda s: s.startswith("u-boot legacy uImage")) +MagicMimePattern.register(DeviceTreeBlob, "Device Tree Blob") +MagicDescriptionPattern.register(DeviceTreeBlob, lambda s: "device tree blob" in s.lower()) + + +def match_dtb_magic(data: bytes): + if len(data) < 4: + return False + return data[:4] == DTB_MAGIC_BYTES + + +RawMagicPattern.register(DeviceTreeBlob, match_dtb_magic) ``` -This line ensures that the `MagicDescriptionIdentifier` adds a `UImage` tag to resources matching that description pattern. As a result, any unpackers targeting a `UImage` will automatically run when `Resource.unpack` is run. +These patterns (along with all other identifier patterns) will get run when the [MagicIdentifier][ofrak.core.magic.MagicIdentifier] runs, adding a `DeviceTreeBlob` tag to matching resources. +See the docstrings for each pattern for implementation details. +Generally speaking, it makes sense to start with a magic mime or magic description pattern, implementing a raw magic pattern only when necessary. +## Implement a New Identifier +Additionally, it is possible to implement a new [Identifier][ofrak.component.identifier.Identifier]. 
+Doing so should be reserved for situations where extending the [MagicIdentifier][ofrak.core.magic.MagicIdentifier] is impractical. +The [ApkIdentifier][ofrak.core.apk.ApkIdentifier] is an example of a custom identifier implementation. -### Handling External Dependencies +!!! warning + Adding new identifiers should be done with care to minimize overall performance impact to OFRAK workflows. + Try to carefully select the resource tags the identifier targets to minimize the frequency with which + it is run: generally speaking, targeting `GenericBinary` will result in this identifier getting run on the largest + number of possible resources. `ApkIdentifier` targets `JavaArchive` and `ZipArchive` only for this reason. +### Handling External Dependencies If the Identifier makes use of tools that are not packaged in modules installable via `pip` from PyPI (commonly command-line tools), these dependencies must be explicitly declared as part of the identifier's class declaration. See the [Components Using External Tools](./external_tools.md) doc diff --git a/docs/user-guide/key-concepts/component/identifier.md b/docs/user-guide/key-concepts/component/identifier.md index db3f212dc..d7cd10a28 100644 --- a/docs/user-guide/key-concepts/component/identifier.md +++ b/docs/user-guide/key-concepts/component/identifier.md @@ -2,37 +2,8 @@ ## Overview Identifiers are components that tag resources with specific resource tags. 
-The following is an example of the `MagicMimeIdentifier`, which uses libmagic file type identification to tag resources: -```python - -class MagicMimeIdentifier(Identifier[None]): - id = b"MagicMimeIdentifier" - targets = (File,) - _tags_by_mime: Dict[str, ResourceTag] = dict() - - async def identify(self, resource: Resource, config=None): - _magic = await resource.analyze(Magic) - magic_mime = _magic.mime - tag = MagicMimeIdentifier._tags_by_mime.get(magic_mime) - if tag is not None: - resource.add_tag(tag) - @classmethod - def register(cls, resource: ResourceTag, mime_types: Union[Iterable[str], str]): - if isinstance(mime_types, str): - mime_types = [mime_types] - for mime_type in mime_types: - if mime_type in cls._tags_by_mime: - raise AlreadyExistError(f"Registering already-registered mime type: {mime_type}") - cls._tags_by_mime[mime_type] = resource - - -... - -MagicMimeIdentifier.register(GenericText, "text/plain") - -``` +The most ubiquitous identifier is the [MagicIdentifier][ofrak.core.magic.MagicIdentifier]. -The last line of the example, `MagicMimeIdentifier.register(GenericText, "text/plain")`, registers the "text/plain" pattern as one that maps to the `GenericText` resource tag. ## Usage Identifiers can be explicitly run using the `Resource.identify` method: diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 2fc6b4b0f..1ab1d8d70 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -17,7 +17,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Add generic DecompilationAnalysis classes. ([#453](https://github.com/redballoonsecurity/ofrak/pull/453)) - `PatchFromSourceModifier` bundles src and header files into same temporary directory with BOM and FEM ([#517](https://github.com/redballoonsecurity/ofrak/pull/517)) - Add support for running on Windows to the `Filesystem` component. 
([#521](https://github.com/redballoonsecurity/ofrak/pull/521)) +- Add `JavaArchive` resource tag ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) - Add new method for allocating `.bss` sections using free space ranges that aren't mapped to data ranges. ([#505](https://github.com/redballoonsecurity/ofrak/pull/505)) +- Add `JavaArchive` resource tag ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) ### Fixed - Improved flushing of filesystem entries (including symbolic links and other types) to disk. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) @@ -37,8 +39,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - Fix bugs on Windows arising from using `os.path` methods when only forward-slashes are acceptable ([#521](https://github.com/redballoonsecurity/ofrak/pull/521)) - Made some changes to OFRAK test suite to improve test coverage on Windows ([#487](https://github.com/redballoonsecurity/ofrak/pull/487)) - Fix usage of `NamedTemporaryFile` with external tools on Windows ([#486](https://github.com/redballoonsecurity/ofrak/pull/486)) +- Fixed endianness issue in DTB raw byte identifier ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) - Fix unintentional ignoring of cpio errors introduced in [#486](https://github.com/redballoonsecurity/ofrak/pull/486) ([#555](https://github.com/redballoonsecurity/ofrak/pull/555])) - `Data` resource attribute always corresponds to value of `Resource.get_data_range_within_root` ([#559](https://github.com/redballoonsecurity/ofrak/pull/559)) +- Fixed endianness issue in DTB raw byte identifier ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) @@ -50,6 +54,14 @@ The format is based on [Keep a 
Changelog](https://keepachangelog.com/en/1.0.0/) - Minor update to OFRAK Community License, add OFRAK Pro License ([#478](https://github.com/redballoonsecurity/ofrak/pull/478)) - Update python to 3.9 as main version used and tested (including in default docker image build) ([#502](https://github.com/redballoonsecurity/ofrak/pull/502)) - Update OpenJDK to version 17, remove unused qemu package ([#502](https://github.com/redballoonsecurity/ofrak/pull/502)) +- Update resource tag File to inherit from GenericBinary ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) +- Update auto-run component logic to run all Analyzers, not just the most specific ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) +- Revamp magic identification for significant speed improvements ([#492](https://github.com/redballoonsecurity/ofrak/pull/492)) + - Refactor magic identification to use one identifier, named `MagicIdentifier` + - Rename `MagicMimeIdentifier` to `MagicMimePattern`, as it is run by `MagicIdentifier` + - Rename `MagicDescriptionIdentifier` to `MagicDescriptionPattern`, as it is run by `MagicIdentifier` + - Add `RawMagicPattern` to efficiently run custom magic byte search logic within `MagicIdentifier` + - Update registered identifiers to make use of new `MagicIdentifier` for following resource tags: `Apk`, `Bzip2Data`, `CpioFilesystem`, `DeviceTreeBlob`, `Elf`, `Ext2Filesystem`, `Ext3Filesystem`, `Ext4Filesystem`, `GzipData`, `ISO9660Image`, `Jffs2Filesystem`, `LzmaData`, `XzData`, `LzoData`, `OpenWrtTrx`, `Pe`, `RarArchive`, `SevenZFilesystem`, `SquashfsFilesystem`, `TarArchive`, `Ubi`, `Ubifs`, `Uf2File`, `UImage`, `ZipArchive`, `ZlibData`, `ZstdData` ### Security - Update aiohttp to 3.10.11 ([#522](https://github.com/redballoonsecurity/ofrak/pull/522)) diff --git a/ofrak_core/ofrak/core/__init__.py b/ofrak_core/ofrak/core/__init__.py index 03ea0dfd0..b1f3a1df3 100644 --- a/ofrak_core/ofrak/core/__init__.py +++ b/ofrak_core/ofrak/core/__init__.py @@ 
-36,6 +36,9 @@ from ofrak.core.injector import * from ofrak.core.instruction import * from ofrak.core.iso9660 import * + +# Why JavaArchive only? See https://github.com/redballoonsecurity/ofrak/pull/492/files#r1905582276 +from ofrak.core.java import JavaArchive from ofrak.core.label import * from ofrak.core.lzma import * from ofrak.core.lzo import * diff --git a/ofrak_core/ofrak/core/apk.py b/ofrak_core/ofrak/core/apk.py index 9b01321e5..963b50a74 100644 --- a/ofrak_core/ofrak/core/apk.py +++ b/ofrak_core/ofrak/core/apk.py @@ -6,22 +6,17 @@ from subprocess import CalledProcessError from dataclasses import dataclass -from ofrak.core.filesystem import File, Folder - +from ofrak.component.identifier import Identifier from ofrak.component.packer import Packer - -from ofrak.resource import Resource - from ofrak.component.unpacker import Unpacker -from ofrak.component.identifier import Identifier - -from ofrak.model.component_model import ComponentConfig, ComponentExternalTool +from ofrak.core.filesystem import File, Folder +from ofrak.core.java import JavaArchive +from ofrak.core.magic import MagicMimePattern from ofrak.core.zip import ZipArchive, UNZIP_TOOL -from ofrak.core.binary import GenericBinary -from ofrak.core.magic import Magic, MagicMimeIdentifier +from ofrak.model.component_model import ComponentConfig, ComponentExternalTool +from ofrak.resource import Resource from ofrak_type.range import Range - APKTOOL = ComponentExternalTool("apktool", "https://ibotpeaches.github.io/Apktool/", "-version") JAVA = ComponentExternalTool( "java", @@ -206,30 +201,36 @@ async def pack( resource.queue_patch(Range(0, await resource.get_data_length()), new_data) +MagicMimePattern.register(Apk, "application/vnd.android.package-archive") + + class ApkIdentifier(Identifier): - targets = (File, GenericBinary) + """ + Identifier for ApkArchive. + + Some Apks are recognized by the MagicMimePattern; others are tagged as JavaArchive or + ZipArchive. 
This identifier inspects those files, and tags any with an androidmanifest.xml + as an ApkArchive. + """ + + targets = (JavaArchive, ZipArchive) external_dependencies = (UNZIP_TOOL,) async def identify(self, resource: Resource, config=None) -> None: - await resource.run(MagicMimeIdentifier) - magic = resource.get_attributes(Magic) - if magic.mime == "application/vnd.android.package-archive": - resource.add_tag(Apk) - elif magic is not None and magic.mime in ["application/java-archive", "application/zip"]: - async with resource.temp_to_disk(suffix=".zip") as temp_path: - unzip_cmd = [ - "unzip", - "-l", - temp_path, - ] - unzip_proc = await asyncio.create_subprocess_exec( - *unzip_cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await unzip_proc.communicate() - if unzip_proc.returncode: - raise CalledProcessError(returncode=unzip_proc.returncode, cmd=unzip_cmd) + async with resource.temp_to_disk(suffix=".zip") as temp_path: + unzip_cmd = [ + "unzip", + "-l", + temp_path, + ] + unzip_proc = await asyncio.create_subprocess_exec( + *unzip_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await unzip_proc.communicate() + if unzip_proc.returncode: + raise CalledProcessError(returncode=unzip_proc.returncode, cmd=unzip_cmd) - if b"androidmanifest.xml" in stdout.lower(): - resource.add_tag(Apk) + if b"androidmanifest.xml" in stdout.lower(): + resource.add_tag(Apk) diff --git a/ofrak_core/ofrak/core/binwalk.py b/ofrak_core/ofrak/core/binwalk.py index e433d4e8e..a56815bcb 100644 --- a/ofrak_core/ofrak/core/binwalk.py +++ b/ofrak_core/ofrak/core/binwalk.py @@ -18,7 +18,6 @@ BINWALK_INSTALLED = False from ofrak.core.binary import GenericBinary -from ofrak.core.filesystem import File from ofrak.model.component_model import ComponentExternalTool from ofrak.service.data_service_i import DataServiceInterface from ofrak.service.resource_service_i import ResourceServiceInterface @@ -45,7 
+44,7 @@ class BinwalkAttributes(ResourceAttributes): class BinwalkAnalyzer(Analyzer[None, BinwalkAttributes]): - targets = (GenericBinary, File) + targets = (GenericBinary,) outputs = (BinwalkAttributes,) external_dependencies = (BINWALK_TOOL,) diff --git a/ofrak_core/ofrak/core/bzip2.py b/ofrak_core/ofrak/core/bzip2.py index 03ea6487f..85a45eb4f 100644 --- a/ofrak_core/ofrak/core/bzip2.py +++ b/ofrak_core/ofrak/core/bzip2.py @@ -6,7 +6,7 @@ from ofrak.component.unpacker import Unpacker from ofrak.resource import Resource from ofrak.core.binary import GenericBinary -from ofrak.core.magic import MagicDescriptionIdentifier, MagicMimeIdentifier +from ofrak.core.magic import MagicDescriptionPattern, MagicMimePattern from ofrak_type.range import Range LOGGER = logging.getLogger(__name__) @@ -64,5 +64,5 @@ async def pack(self, resource: Resource, config=None): resource.queue_patch(Range(0, original_size), bzip2_compressed) -MagicMimeIdentifier.register(Bzip2Data, "application/x-bzip2") -MagicDescriptionIdentifier.register(Bzip2Data, lambda s: s.startswith("BZip2 archive")) +MagicMimePattern.register(Bzip2Data, "application/x-bzip2") +MagicDescriptionPattern.register(Bzip2Data, lambda s: s.startswith("BZip2 archive")) diff --git a/ofrak_core/ofrak/core/checksum.py b/ofrak_core/ofrak/core/checksum.py index 895c09693..d122c3a73 100644 --- a/ofrak_core/ofrak/core/checksum.py +++ b/ofrak_core/ofrak/core/checksum.py @@ -2,7 +2,6 @@ from dataclasses import dataclass from ofrak.core.binary import GenericBinary -from ofrak.core.filesystem import File from ofrak.component.analyzer import Analyzer from ofrak.model.resource_model import ResourceAttributes @@ -19,7 +18,7 @@ class Sha256Analyzer(Analyzer[None, Sha256Attributes]): Analyze binary data and add attributes with the SHA256 checksum of the data. 
""" - targets = (File, GenericBinary) + targets = (GenericBinary,) outputs = (Sha256Attributes,) async def analyze(self, resource: Resource, config=None) -> Sha256Attributes: @@ -39,7 +38,7 @@ class Md5Analyzer(Analyzer[None, Md5Attributes]): Analyze binary data and add attributes with the MD5 checksum of the data. """ - targets = (File, GenericBinary) + targets = (GenericBinary,) outputs = (Md5Attributes,) async def analyze(self, resource: Resource, config=None) -> Md5Attributes: diff --git a/ofrak_core/ofrak/core/cpio.py b/ofrak_core/ofrak/core/cpio.py index d1516856c..c9260abda 100644 --- a/ofrak_core/ofrak/core/cpio.py +++ b/ofrak_core/ofrak/core/cpio.py @@ -10,7 +10,7 @@ from ofrak.component.unpacker import Unpacker from ofrak.core.binary import GenericBinary from ofrak.core.filesystem import File, Folder, FilesystemRoot, SpecialFileType -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier, Magic +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern, Magic from ofrak.model.component_model import ComponentExternalTool from ofrak.resource import Resource from ofrak_type.range import Range @@ -150,5 +150,5 @@ async def pack(self, resource: Resource, config=None): resource.queue_patch(Range(0, await resource.get_data_length()), cpio_pack_output) -MagicMimeIdentifier.register(CpioFilesystem, "application/x-cpio") -MagicDescriptionIdentifier.register(CpioFilesystem, lambda s: "cpio archive" in s) +MagicMimePattern.register(CpioFilesystem, "application/x-cpio") +MagicDescriptionPattern.register(CpioFilesystem, lambda s: "cpio archive" in s) diff --git a/ofrak_core/ofrak/core/dtb.py b/ofrak_core/ofrak/core/dtb.py index aa02549e3..bf3f979e6 100644 --- a/ofrak_core/ofrak/core/dtb.py +++ b/ofrak_core/ofrak/core/dtb.py @@ -12,18 +12,23 @@ import fdt from ofrak.component.analyzer import Analyzer -from ofrak.component.identifier import Identifier from ofrak.component.packer import Packer from ofrak.component.unpacker import 
Unpacker from ofrak.model.viewable_tag_model import AttributesType from ofrak.resource import Resource from ofrak.service.resource_service_i import ResourceFilter, ResourceSort -from ofrak.core import GenericBinary, MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core import GenericBinary +from ofrak.core.magic import ( + MagicMimePattern, + MagicDescriptionPattern, + RawMagicPattern, +) from ofrak.model.component_model import ComponentConfig from ofrak.model.resource_model import index from ofrak_type.range import Range DTB_MAGIC_SIGNATURE: int = 0xD00DFEED +DTB_MAGIC_BYTES = struct.pack(">I", DTB_MAGIC_SIGNATURE) @dataclass @@ -332,22 +337,6 @@ async def pack(self, resource: Resource, config: ComponentConfig = None): resource.queue_patch(Range(0, original_size), dtb.to_dtb()) -class DeviceTreeBlobIdentifier(Identifier[None]): - """ - Identify Device Tree Blob files. - """ - - targets = (GenericBinary,) - - async def identify(self, resource: Resource, config: ComponentConfig = None) -> None: - """ - Identify DTB files based on the first four bytes being "d00dfeed". - """ - data = await resource.get_data(Range(0, 4)) - if data == struct.pack(" fdt.items.Property: """ Generates an fdt.items.property corresponding to a DtbProperty. 
@@ -402,5 +391,14 @@ def _prop_from_fdt(p: fdt.items.Property) -> Tuple[DtbPropertyType, bytes]: return _p_type, _p_data -MagicMimeIdentifier.register(DeviceTreeBlob, "Device Tree Blob") -MagicDescriptionIdentifier.register(DeviceTreeBlob, lambda s: "device tree blob" in s.lower()) +MagicMimePattern.register(DeviceTreeBlob, "Device Tree Blob") +MagicDescriptionPattern.register(DeviceTreeBlob, lambda s: "device tree blob" in s.lower()) + + +def match_dtb_magic(data: bytes): + if len(data) < 4: + return False + return data[:4] == DTB_MAGIC_BYTES + + +RawMagicPattern.register(DeviceTreeBlob, match_dtb_magic) diff --git a/ofrak_core/ofrak/core/elf/model.py b/ofrak_core/ofrak/core/elf/model.py index 7aa4abed4..e0805d8f4 100644 --- a/ofrak_core/ofrak/core/elf/model.py +++ b/ofrak_core/ofrak/core/elf/model.py @@ -17,7 +17,7 @@ ResourceSortDirection, ResourceSort, ) -from ofrak.core.magic import MagicDescriptionIdentifier +from ofrak.core.magic import MagicDescriptionPattern from ofrak_type.bit_width import BitWidth from ofrak_type.endianness import Endianness from ofrak_type.memory_permissions import MemoryPermissions @@ -869,4 +869,4 @@ async def get_program_header_by_index(self, index: int) -> ElfProgramHeader: ) -MagicDescriptionIdentifier.register(Elf, lambda s: s.startswith("ELF ")) +MagicDescriptionPattern.register(Elf, lambda s: s.startswith("ELF ")) diff --git a/ofrak_core/ofrak/core/extfs.py b/ofrak_core/ofrak/core/extfs.py index 3c50705b9..47f2e5fa8 100644 --- a/ofrak_core/ofrak/core/extfs.py +++ b/ofrak_core/ofrak/core/extfs.py @@ -10,7 +10,7 @@ File, Folder, SpecialFileType, - MagicDescriptionIdentifier, + MagicDescriptionPattern, ) from ofrak.model.component_model import ComponentExternalTool, ComponentConfig @@ -74,6 +74,6 @@ async def unpack(self, resource: Resource, config: ComponentConfig = None) -> No await fs_view.initialize_from_disk(temp_dir) -MagicDescriptionIdentifier.register(Ext2Filesystem, lambda s: "ext2 filesystem" in s.lower()) 
-MagicDescriptionIdentifier.register(Ext3Filesystem, lambda s: "ext3 filesystem" in s.lower()) -MagicDescriptionIdentifier.register(Ext4Filesystem, lambda s: "ext4 filesystem" in s.lower()) +MagicDescriptionPattern.register(Ext2Filesystem, lambda s: "ext2 filesystem" in s.lower()) +MagicDescriptionPattern.register(Ext3Filesystem, lambda s: "ext3 filesystem" in s.lower()) +MagicDescriptionPattern.register(Ext4Filesystem, lambda s: "ext4 filesystem" in s.lower()) diff --git a/ofrak_core/ofrak/core/filesystem.py b/ofrak_core/ofrak/core/filesystem.py index d62c8836f..484472f96 100644 --- a/ofrak_core/ofrak/core/filesystem.py +++ b/ofrak_core/ofrak/core/filesystem.py @@ -8,6 +8,8 @@ from dataclasses import dataclass from typing import Dict, Iterable, Optional, Type, Union +from ofrak.core import GenericBinary + try: import xattr except ImportError: @@ -263,7 +265,7 @@ async def flush_to_disk(self, root_path: str = ".", filename: Optional[str] = No ) -class File(FilesystemEntry): +class File(FilesystemEntry, GenericBinary): """ Stores the data and location of a file within a filesystem or folder's descendant file tree. 
""" diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 66df3d5ee..0ab70b9c4 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -8,7 +8,7 @@ from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker from ofrak.core.binary import GenericBinary -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak.resource import Resource from ofrak_type.range import Range @@ -131,5 +131,5 @@ async def pack_with_pigz(data: bytes) -> bytes: return stdout -MagicMimeIdentifier.register(GzipData, "application/gzip") -MagicDescriptionIdentifier.register(GzipData, lambda s: s.startswith("gzip compressed data")) +MagicMimePattern.register(GzipData, "application/gzip") +MagicDescriptionPattern.register(GzipData, lambda s: s.startswith("gzip compressed data")) diff --git a/ofrak_core/ofrak/core/ihex.py b/ofrak_core/ofrak/core/ihex.py index 429d6b447..b392fedeb 100644 --- a/ofrak_core/ofrak/core/ihex.py +++ b/ofrak_core/ofrak/core/ihex.py @@ -153,7 +153,7 @@ class IhexIdentifier(Identifier): _INTEL_HEX_PATTERN = re.compile(rb"((\:([0-9A-F]{2}){5,})(\n|\r\n)+){2}") async def identify(self, resource: Resource, config=None) -> None: - matched_ihex = await resource.search_data(self._INTEL_HEX_PATTERN, max_matches=1) + matched_ihex = await resource.search_data(self._INTEL_HEX_PATTERN, 0, 0x2000, max_matches=1) if matched_ihex: offset, bytes = matched_ihex[0] # Only tag if pattern starts at offset 0 of resource diff --git a/ofrak_core/ofrak/core/iso9660.py b/ofrak_core/ofrak/core/iso9660.py index 4549846b3..8f5061702 100644 --- a/ofrak_core/ofrak/core/iso9660.py +++ b/ofrak_core/ofrak/core/iso9660.py @@ -14,7 +14,7 @@ from ofrak.component.unpacker import Unpacker from ofrak.core.binary import GenericBinary from ofrak.core.filesystem import 
FilesystemRoot, File, Folder -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak.model.resource_model import ResourceAttributes from ofrak.model.resource_model import index @@ -336,5 +336,5 @@ async def pack(self, resource: Resource, config=None) -> None: resource.queue_patch(Range(0, await resource.get_data_length()), new_data) -MagicMimeIdentifier.register(ISO9660Image, "application/x-iso9660-image") -MagicDescriptionIdentifier.register(ISO9660Image, lambda s: s.startswith("ISO 9660 CD")) +MagicMimePattern.register(ISO9660Image, "application/x-iso9660-image") +MagicDescriptionPattern.register(ISO9660Image, lambda s: s.startswith("ISO 9660 CD")) diff --git a/ofrak_core/ofrak/core/java.py b/ofrak_core/ofrak/core/java.py new file mode 100644 index 000000000..709895803 --- /dev/null +++ b/ofrak_core/ofrak/core/java.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass + +from ofrak.core.magic import MagicMimePattern +from ofrak.core.zip import ZipArchive + + +@dataclass +class JavaArchive(ZipArchive): + pass + + +MagicMimePattern.register(JavaArchive, "application/java-archive") diff --git a/ofrak_core/ofrak/core/jffs2.py b/ofrak_core/ofrak/core/jffs2.py index 1b0b94c1a..a757071d9 100644 --- a/ofrak_core/ofrak/core/jffs2.py +++ b/ofrak_core/ofrak/core/jffs2.py @@ -9,7 +9,7 @@ from ofrak.resource import Resource from ofrak.core.filesystem import File, Folder, FilesystemRoot, SpecialFileType -from ofrak.core.magic import MagicDescriptionIdentifier +from ofrak.core.magic import MagicDescriptionPattern from ofrak.core.binary import GenericBinary from ofrak_type.range import Range from ofrak.model.component_model import ComponentExternalTool @@ -90,4 +90,4 @@ async def pack(self, resource: Resource, config=None): resource.queue_patch(Range(0, await resource.get_data_length()), new_data) 
-MagicDescriptionIdentifier.register(Jffs2Filesystem, lambda s: "jffs2 filesystem" in s.lower()) +MagicDescriptionPattern.register(Jffs2Filesystem, lambda s: "jffs2 filesystem" in s.lower()) diff --git a/ofrak_core/ofrak/core/lzma.py b/ofrak_core/ofrak/core/lzma.py index f9b7db4ba..8538bdf7e 100644 --- a/ofrak_core/ofrak/core/lzma.py +++ b/ofrak_core/ofrak/core/lzma.py @@ -7,7 +7,7 @@ from ofrak.component.unpacker import Unpacker from ofrak.resource import Resource from ofrak.core.binary import GenericBinary -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak_type.range import Range LOGGER = logging.getLogger(__name__) @@ -99,7 +99,7 @@ async def _get_lzma_format_and_tag(self, resource): return lzma_format, tag -MagicMimeIdentifier.register(LzmaData, "application/x-lzma") -MagicMimeIdentifier.register(XzData, "application/x-xz") -MagicDescriptionIdentifier.register(LzmaData, lambda s: s.startswith("LZMA compressed data")) -MagicDescriptionIdentifier.register(XzData, lambda s: s.startswith("XZ compressed data")) +MagicMimePattern.register(LzmaData, "application/x-lzma") +MagicMimePattern.register(XzData, "application/x-xz") +MagicDescriptionPattern.register(LzmaData, lambda s: s.startswith("LZMA compressed data")) +MagicDescriptionPattern.register(XzData, lambda s: s.startswith("XZ compressed data")) diff --git a/ofrak_core/ofrak/core/lzo.py b/ofrak_core/ofrak/core/lzo.py index ac22da72c..cb5e9ec08 100644 --- a/ofrak_core/ofrak/core/lzo.py +++ b/ofrak_core/ofrak/core/lzo.py @@ -5,7 +5,7 @@ from ofrak.component.unpacker import Unpacker from ofrak.resource import Resource from ofrak.core.binary import GenericBinary -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak.model.component_model 
import ComponentConfig @@ -79,5 +79,5 @@ async def pack(self, resource: Resource, config: ComponentConfig = None): resource.queue_patch(Range(0, original_size), compressed_data) -MagicMimeIdentifier.register(LzoData, "application/x-lzop") -MagicDescriptionIdentifier.register(LzoData, lambda s: s.lower().startswith("lzop compressed data")) +MagicMimePattern.register(LzoData, "application/x-lzop") +MagicDescriptionPattern.register(LzoData, lambda s: s.lower().startswith("lzop compressed data")) diff --git a/ofrak_core/ofrak/core/magic.py b/ofrak_core/ofrak/core/magic.py index e600ed01d..ba15f09bb 100644 --- a/ofrak_core/ofrak/core/magic.py +++ b/ofrak_core/ofrak/core/magic.py @@ -3,6 +3,7 @@ from typing import Callable, Dict, Iterable, Union from ofrak.component.abstract import ComponentMissingDependencyError +from ofrak_type import Range try: import magic @@ -14,7 +15,6 @@ from ofrak.component.analyzer import Analyzer from ofrak.component.identifier import Identifier from ofrak.core.binary import GenericBinary, GenericText -from ofrak.core.filesystem import File from ofrak.model.component_model import ComponentExternalTool from ofrak.model.resource_model import ResourceAttributes from ofrak.model.tag_model import ResourceTag @@ -59,7 +59,7 @@ class MagicAnalyzer(Analyzer[None, Magic]): Analyze a binary blob to extract its mimetype and magic description. """ - targets = (File, GenericBinary) + targets = (GenericBinary,) outputs = (Magic,) external_dependencies = (LIBMAGIC_DEP,) @@ -73,63 +73,131 @@ async def analyze(self, resource: Resource, config=None) -> Magic: return Magic(magic_mime, magic_description) -class MagicMimeIdentifier(Identifier[None]): - """ - Identify and add the appropriate tag for a given resource based on its mimetype. +class MagicIdentifier(Identifier[None]): """ + Identify resources using three identifier patterns: + + 1. [MagicMimePattern][ofrak.core.magic.MagicMimePattern] + 2. 
[MagicDescriptionPattern][ofrak.core.magic.MagicDescriptionPattern] + 3. [RawMagicPattern][ofrak.core.magic.RawMagicPattern] - id = b"MagicMimeIdentifier" - targets = (File, GenericBinary) - external_dependencies = (LIBMAGIC_DEP,) # Indirect thru MagicAnalyzer, but worth tagging + OFRAK component authors can "register" magic patterns to run whenever this + identifier is run: + + ```python + MagicMimePattern.register(GenericBinary, "application/octet-stream") + ``` + """ - _tags_by_mime: Dict[str, ResourceTag] = dict() + targets = (GenericBinary,) + external_dependencies = (LIBMAGIC_DEP,) - async def identify(self, resource: Resource, config=None): + async def identify(self, resource: Resource, config=None) -> None: _magic = await resource.analyze(Magic) - magic_mime = _magic.mime - tag = MagicMimeIdentifier._tags_by_mime.get(magic_mime) - if tag is not None: - resource.add_tag(tag) + MagicMimePattern.run(resource, _magic.mime) + MagicDescriptionPattern.run(resource, _magic.descriptor) + await RawMagicPattern.run(resource) + + +class MagicMimePattern: + """ + Pattern to tag resources based on their mimetype. + """ + + tags_by_mime: Dict[str, ResourceTag] = dict() @classmethod - def register(cls, resource: ResourceTag, mime_types: Union[Iterable[str], str]): + def register(cls, resource_tag: ResourceTag, mime_types: Union[Iterable[str], str]): + """ + Register what resource tags correspond to specific mime types. + """ + if isinstance(mime_types, str): mime_types = [mime_types] for mime_type in mime_types: - if mime_type in cls._tags_by_mime: + if mime_type in cls.tags_by_mime: raise AlreadyExistError(f"Registering already-registered mime type: {mime_type}") - cls._tags_by_mime[mime_type] = resource + cls.tags_by_mime[mime_type] = resource_tag + + @classmethod + def run(cls, resource: Resource, magic_mime: str): + """ + Run the pattern against a given resource, tagging it based on matching mime types. 
+ + This method is designed to be called by the [MagicIdentifier][ofrak.core.magic.MagicIdentifier]. + """ + tag = cls.tags_by_mime.get(magic_mime) + if tag is not None: + resource.add_tag(tag) -class MagicDescriptionIdentifier(Identifier[None]): +class MagicDescriptionPattern: """ - Identify and add the appropriate tag for a given resource based on its mime description. + Pattern to tag resources based on their magic description. """ - id = b"MagicDescriptionIdentifier" - targets = (File, GenericBinary) - external_dependencies = (LIBMAGIC_DEP,) # Indirect thru MagicAnalyzer, but worth tagging + matchers: Dict[Callable, ResourceTag] = dict() + + @classmethod + def register(cls, resource_tag: ResourceTag, matcher: Callable[[str], bool]): + """ + Register a callable that determines whether the given resource tag should be applied. + """ + if matcher in cls.matchers: + raise AlreadyExistError("Registering already-registered matcher") + cls.matchers[matcher] = resource_tag - _matchers: Dict[Callable, ResourceTag] = dict() + @classmethod + def run(cls, resource: Resource, magic_description: str): + """ + Run this pattern against a given resource, tagging it based on registered tags. - async def identify(self, resource: Resource, config): - _magic = await resource.analyze(Magic) - magic_description = _magic.descriptor - for matcher, resource_type in self._matchers.items(): + This method is designed to be called by the [MagicIdentifier][ofrak.core.magic.MagicIdentifier]. + """ + for matcher, resource_type in cls.matchers.items(): if matcher(magic_description): resource.add_tag(resource_type) + +class RawMagicPattern: + """ + Pattern to tag resources based on custom raw magic matching patterns. + + MAX_SEARCH_SIZE specifies how many bytes this pattern's `run` method exposes to registered + matchers (the first MAX_SEARCH_SIZE bytes of a resource are exposed). 
+ """ + + matchers: Dict[Callable, ResourceTag] = dict() + MAX_SEARCH_SIZE = 64 + @classmethod - def register(cls, resource: ResourceTag, matcher: Callable): - if matcher in cls._matchers: + def register(cls, resource_tag: ResourceTag, matcher: Callable[[bytes], bool]): + """ + Register a callable that determines whether the given resource tag should be applied. + """ + if matcher in cls.matchers: raise AlreadyExistError("Registering already-registered matcher") - cls._matchers[matcher] = resource + cls.matchers[matcher] = resource_tag + + @classmethod + async def run(cls, resource: Resource): + """ + Run the pattern against a given resource, tagging it based on registered tags. + Note that the first MAX_SEARCH_SIZE bytes of a resource are made available to the callable. + + This method is designed to be called by the [MagicIdentifier][ofrak.core.magic.MagicIdentifier]. + """ + data_length = min(await resource.get_data_length(), cls.MAX_SEARCH_SIZE) + data = await resource.get_data(range=Range(0, data_length)) + for matcher, resource_type in cls.matchers.items(): + if matcher(data): + resource.add_tag(resource_type) -MagicMimeIdentifier.register(GenericText, "text/plain") -MagicDescriptionIdentifier.register( +MagicMimePattern.register(GenericText, "text/plain") +MagicDescriptionPattern.register( GenericText, lambda desc: any([("ASCII text" in s) for s in desc.split(", ")]) ) -MagicMimeIdentifier.register(GenericBinary, "application/octet-stream") -MagicDescriptionIdentifier.register(GenericBinary, lambda s: s == "data") +MagicMimePattern.register(GenericBinary, "application/octet-stream") +MagicDescriptionPattern.register(GenericBinary, lambda s: s == "data") diff --git a/ofrak_core/ofrak/core/openwrt.py b/ofrak_core/ofrak/core/openwrt.py index c044ba06d..f7f7e68fa 100644 --- a/ofrak_core/ofrak/core/openwrt.py +++ b/ofrak_core/ofrak/core/openwrt.py @@ -5,13 +5,12 @@ from enum import Enum from typing import Optional, List -from ofrak.component.identifier import 
Identifier from ofrak.component.analyzer import Analyzer from ofrak.component.modifier import Modifier from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker, UnpackerError +from ofrak.core import RawMagicPattern from ofrak.core.binary import GenericBinary -from ofrak.core.filesystem import File from ofrak.model.component_model import ComponentConfig from ofrak.model.resource_model import ResourceAttributes from ofrak.model.viewable_tag_model import AttributesType @@ -126,18 +125,13 @@ async def get_header(self) -> OpenWrtTrxHeader: #################### # IDENTIFIER # #################### +def match_openwrt_magic(data: bytes) -> bool: + if len(data) < 4: + return False + return data[:4] == OPENWRT_TRX_MAGIC_BYTES -class OpenWrtIdentifier(Identifier[None]): - targets = ( - File, - GenericBinary, - ) - - async def identify(self, resource: Resource, config=None) -> None: - trx_magic = await resource.get_data(range=Range(0, 4)) - if trx_magic == OPENWRT_TRX_MAGIC_BYTES: - resource.add_tag(OpenWrtTrx) +RawMagicPattern.register(OpenWrtTrx, match_openwrt_magic) #################### diff --git a/ofrak_core/ofrak/core/pe/model.py b/ofrak_core/ofrak/core/pe/model.py index c1029a904..0df51fbda 100644 --- a/ofrak_core/ofrak/core/pe/model.py +++ b/ofrak_core/ofrak/core/pe/model.py @@ -10,7 +10,7 @@ ResourceAttributeValueFilter, ResourceFilter, ) -from ofrak.core.magic import MagicDescriptionIdentifier +from ofrak.core.magic import MagicDescriptionPattern from ofrak_type.error import NotFoundError @@ -293,4 +293,4 @@ async def get_optional_header(self) -> Optional[PeOptionalHeader]: return None -MagicDescriptionIdentifier.register(Pe, lambda s: s.startswith("PE32")) +MagicDescriptionPattern.register(Pe, lambda s: s.startswith("PE32")) diff --git a/ofrak_core/ofrak/core/rar.py b/ofrak_core/ofrak/core/rar.py index 09eede84a..7e9acf794 100644 --- a/ofrak_core/ofrak/core/rar.py +++ b/ofrak_core/ofrak/core/rar.py @@ -7,7 +7,7 @@ from ofrak.core.binary 
import GenericBinary from ofrak.core.filesystem import FilesystemRoot, File, Folder, SpecialFileType -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak.resource import Resource from ofrak.model.component_model import ComponentConfig @@ -58,6 +58,6 @@ async def unpack(self, resource: Resource, config: ComponentConfig = None): await rar_view.initialize_from_disk(temp_dir) -MagicMimeIdentifier.register(RarArchive, "application/x-rar-compressed") -MagicMimeIdentifier.register(RarArchive, "application/vnd.rar") -MagicDescriptionIdentifier.register(RarArchive, lambda s: "rar archive" in s.lower()) +MagicMimePattern.register(RarArchive, "application/x-rar-compressed") +MagicMimePattern.register(RarArchive, "application/vnd.rar") +MagicDescriptionPattern.register(RarArchive, lambda s: "rar archive" in s.lower()) diff --git a/ofrak_core/ofrak/core/seven_zip.py b/ofrak_core/ofrak/core/seven_zip.py index edd2d071a..e4bfe2821 100644 --- a/ofrak_core/ofrak/core/seven_zip.py +++ b/ofrak_core/ofrak/core/seven_zip.py @@ -10,7 +10,7 @@ from ofrak.resource import Resource from ofrak.core.binary import GenericBinary from ofrak.core.filesystem import File, Folder, FilesystemRoot, SpecialFileType -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak_type.range import Range @@ -88,5 +88,5 @@ async def pack(self, resource: Resource, config=None): resource.queue_patch(Range(0, await resource.get_data_length()), new_data) -MagicMimeIdentifier.register(SevenZFilesystem, "application/x-7z-compressed") -MagicDescriptionIdentifier.register(SevenZFilesystem, lambda s: s.startswith("7-zip archive")) +MagicMimePattern.register(SevenZFilesystem, 
"application/x-7z-compressed") +MagicDescriptionPattern.register(SevenZFilesystem, lambda s: s.startswith("7-zip archive")) diff --git a/ofrak_core/ofrak/core/squashfs.py b/ofrak_core/ofrak/core/squashfs.py index 3f73c2c41..40c439bdb 100644 --- a/ofrak_core/ofrak/core/squashfs.py +++ b/ofrak_core/ofrak/core/squashfs.py @@ -9,7 +9,7 @@ from ofrak.resource import Resource from ofrak.core.filesystem import File, Folder, FilesystemRoot, SpecialFileType -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.core.binary import GenericBinary from ofrak.model.component_model import ComponentExternalTool @@ -118,7 +118,5 @@ async def pack(self, resource: Resource, config=None): resource.queue_patch(Range(0, await resource.get_data_length()), new_data) -MagicMimeIdentifier.register(SquashfsFilesystem, "application/filesystem+sqsh") -MagicDescriptionIdentifier.register( - SquashfsFilesystem, lambda s: s.startswith("Squashfs filesystem") -) +MagicMimePattern.register(SquashfsFilesystem, "application/filesystem+sqsh") +MagicDescriptionPattern.register(SquashfsFilesystem, lambda s: s.startswith("Squashfs filesystem")) diff --git a/ofrak_core/ofrak/core/strings.py b/ofrak_core/ofrak/core/strings.py index 807205683..8dac1ae11 100644 --- a/ofrak_core/ofrak/core/strings.py +++ b/ofrak_core/ofrak/core/strings.py @@ -7,7 +7,6 @@ from ofrak.component.unpacker import Unpacker from ofrak.core.binary import BinaryPatchConfig, BinaryPatchModifier, GenericText, GenericBinary from ofrak.core.code_region import CodeRegion -from ofrak.core.filesystem import File from ofrak.model.component_model import ComponentConfig from ofrak.model.resource_model import index from ofrak.model.viewable_tag_model import AttributesType @@ -67,7 +66,7 @@ class StringFindReplaceModifier(Modifier[StringFindReplaceConfig]): Find and replace all instances of a given string with a replacement string. 
""" - targets = (GenericBinary, File) + targets = (GenericBinary,) async def modify(self, resource: Resource, config: StringFindReplaceConfig) -> None: to_find = config.to_find.encode("utf-8") diff --git a/ofrak_core/ofrak/core/tar.py b/ofrak_core/ofrak/core/tar.py index 6c023323a..5f6777182 100644 --- a/ofrak_core/ofrak/core/tar.py +++ b/ofrak_core/ofrak/core/tar.py @@ -9,7 +9,7 @@ from ofrak.resource import Resource from ofrak.core.binary import GenericBinary from ofrak.core.filesystem import FilesystemRoot, Folder, File, SpecialFileType -from ofrak.core.magic import MagicMimeIdentifier, MagicDescriptionIdentifier +from ofrak.core.magic import MagicMimePattern, MagicDescriptionPattern from ofrak.model.component_model import ComponentExternalTool from ofrak.model.component_model import ComponentConfig @@ -114,5 +114,5 @@ async def pack(self, resource: Resource, config: ComponentConfig = None) -> None resource.queue_patch(Range(0, await resource.get_data_length()), new_fh.read()) -MagicMimeIdentifier.register(TarArchive, "application/x-tar") -MagicDescriptionIdentifier.register(TarArchive, lambda s: "tar archive" in s.lower()) +MagicMimePattern.register(TarArchive, "application/x-tar") +MagicDescriptionPattern.register(TarArchive, lambda s: "tar archive" in s.lower()) diff --git a/ofrak_core/ofrak/core/ubi.py b/ofrak_core/ofrak/core/ubi.py index f59dc72b7..9f403df28 100644 --- a/ofrak_core/ofrak/core/ubi.py +++ b/ofrak_core/ofrak/core/ubi.py @@ -6,14 +6,14 @@ import os from subprocess import CalledProcessError +from ofrak.core import RawMagicPattern from ofrak.model.tag_model import ResourceTag -from ofrak import Identifier, Analyzer +from ofrak import Analyzer from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker from ofrak.model.component_model import ComponentExternalTool from ofrak.resource import Resource -from ofrak.core.filesystem import File from ofrak.core.binary import GenericBinary from ofrak.resource_view import 
ResourceView @@ -57,7 +57,7 @@ async def is_tool_installed(self) -> bool: return False -PY_LZO_TOOL = _PyLzoTool() +PY_LZO_TOOL: _PyLzoTool = _PyLzoTool() # For some reason mypy needs this type annotation @dataclass @@ -296,18 +296,10 @@ async def pack(self, resource: Resource, config=None) -> None: resource.queue_patch(Range(0, await resource.get_data_length()), packed_blob_data) -class UbiIdentifier(Identifier): - """ - Check the first four bytes of a resource and tag the resource as Ubi if it matches the file magic. - """ +def match_ubi_magic(data: bytes) -> bool: + if len(data) < 4: + return False + return data[:4] in [UBI_EC_HDR_MAGIC, UBI_VID_HDR_MAGIC] - targets = (File, GenericBinary) - - external_dependencies = (PY_LZO_TOOL,) - async def identify(self, resource: Resource, config=None) -> None: - datalength = await resource.get_data_length() - if datalength >= 4: - data = await resource.get_data(Range(0, 4)) - if data in [UBI_EC_HDR_MAGIC, UBI_VID_HDR_MAGIC]: - resource.add_tag(Ubi) +RawMagicPattern.register(Ubi, match_ubi_magic) diff --git a/ofrak_core/ofrak/core/ubifs.py b/ofrak_core/ofrak/core/ubifs.py index 250055125..241a91175 100644 --- a/ofrak_core/ofrak/core/ubifs.py +++ b/ofrak_core/ofrak/core/ubifs.py @@ -4,10 +4,11 @@ import logging from subprocess import CalledProcessError -from ofrak import Identifier, Analyzer +from ofrak import Analyzer from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker -from ofrak.core import PY_LZO_TOOL +from ofrak.core import RawMagicPattern +from ofrak.core.ubi import PY_LZO_TOOL from ofrak.resource import Resource from ofrak.core.filesystem import File, Folder, FilesystemRoot, SpecialFileType from ofrak.core.binary import GenericBinary @@ -205,18 +206,10 @@ async def pack(self, resource: Resource, config=None) -> None: resource.queue_patch(Range(0, await resource.get_data_length()), new_data) -class UbifsIdentifier(Identifier): - """ - Check the first four bytes of a resource and tag 
the resource as Ubifs if it matches the file magic. - """ +def match_ubifs_magic(data: bytes) -> bool: + if len(data) < 4: + return False + return data[:4] == UBIFS_NODE_MAGIC - targets = (File, GenericBinary) - - external_dependencies = (PY_LZO_TOOL,) - async def identify(self, resource: Resource, config=None) -> None: - datalength = await resource.get_data_length() - if datalength >= 4: - data = await resource.get_data(Range(0, 4)) - if data == UBIFS_NODE_MAGIC: - resource.add_tag(Ubifs) +RawMagicPattern.register(Ubifs, match_ubifs_magic) diff --git a/ofrak_core/ofrak/core/uf2.py b/ofrak_core/ofrak/core/uf2.py index b9df86a30..667264596 100644 --- a/ofrak_core/ofrak/core/uf2.py +++ b/ofrak_core/ofrak/core/uf2.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from typing import List, Tuple -from ofrak import Identifier +from ofrak.core import RawMagicPattern from ofrak.core.code_region import CodeRegion from ofrak.resource import Resource @@ -20,6 +20,7 @@ UF2_MAGIC_START_ONE = 0x0A324655 UF2_MAGIC_START_TWO = 0x9E5D5157 UF2_MAGIC_END = 0x0AB16F30 +UF2_MAGIC_START_BYTES = struct.pack(" None: - if await resource.get_data_length() < 8: - pass - else: - data = await resource.get_data(Range(0, 8)) - magic_one, magic_two = struct.unpack(" ComponentFilter: When auto-running components, most of the time only the *most specific* components should be run for a resource. For example, an APK resource is also a ZIP resource; we want to always run the APK Unpacker on resources that are tagged as both ZIP and APK, because APK is a more - specific tag. However, Identifiers are a special case because they have benign side-effects, so - it is desirable to greedily run all Identifiers that could target a resource, not only the most - specific Identifiers. + specific tag. 
However, Identifiers and Analyzers are a special case because they have benign side-effects, so + it is desirable to greedily run all Identifiers and Analyzers that could target a resource, not only the most + specific Identifiers and Analyzers. This function constructs a filter which allows only components that target at least one of the given tags, but for non-identifiers the filter is even stricter so that only the most specific @@ -562,10 +562,17 @@ def _build_tag_filter(tags: Tuple[ResourceTag]) -> ComponentFilter: IDENTIFIERS_FILTER, ComponentTargetFilter(*tags), ), + ComponentAndMetaFilter( + ANALYZERS_FILTER, + ComponentTargetFilter(*tags), + ), ComponentAndMetaFilter( ComponentNotMetaFilter( IDENTIFIERS_FILTER, ), + ComponentNotMetaFilter( + ANALYZERS_FILTER, + ), ComponentPrioritySelectingMetaFilter(*filters_prioritized_by_specificity), ), ) diff --git a/ofrak_core/test_ofrak/components/test_dtb_component.py b/ofrak_core/test_ofrak/components/test_dtb_component.py index 522c44899..877f42ded 100644 --- a/ofrak_core/test_ofrak/components/test_dtb_component.py +++ b/ofrak_core/test_ofrak/components/test_dtb_component.py @@ -6,7 +6,7 @@ from ofrak import OFRAKContext from ofrak.resource import Resource from ofrak.service.resource_service_i import ResourceFilter, ResourceAttributeValueFilter -from ofrak.core.dtb import DeviceTreeBlob, DtbProperty, DtbNode +from ofrak.core.dtb import DeviceTreeBlob, DtbProperty, DtbNode, match_dtb_magic from ofrak.core.strings import StringPatchingModifier, StringPatchingConfig from pytest_ofrak.patterns.unpack_modify_pack import UnpackModifyPackPattern @@ -110,3 +110,13 @@ async def verify(self, repacked_resource: Resource): # Assert that the repacked DTB contains an empty node named "great-new-node" assert dtb_diff_only_repacked.get_node("/great-new-node").empty + + +def test_dtb_raw_magic_pattern(): + """ + Test that DTB raw pattern callable is correct. 
+ """ + dtb_path = os.path.join(test_ofrak.components.ASSETS_DIR, "imx7d-sdb.dtb") + with open(dtb_path, "rb") as f: + data = f.read() + assert match_dtb_magic(data) diff --git a/ofrak_core/test_ofrak/service/conftest.py b/ofrak_core/test_ofrak/service/conftest.py index 0236639bd..6e4443e75 100644 --- a/ofrak_core/test_ofrak/service/conftest.py +++ b/ofrak_core/test_ofrak/service/conftest.py @@ -105,7 +105,12 @@ def tree3_resource_models() -> List[ResourceModel]: # Elf indexes are arbitrary, just to create some indexable value return [ ResourceModel.create( - R_ID_3_ROOT, tags=(File, GenericBinary), attributes=(TestIndexAttributes(5),) + R_ID_3_ROOT, + tags=( + File, + GenericBinary, + ), + attributes=(TestIndexAttributes(5),), ), ResourceModel.create( R_ID_3_1, diff --git a/ofrak_core/test_ofrak/service/resource_service/test_resource_service.py b/ofrak_core/test_ofrak/service/resource_service/test_resource_service.py index f3e4a7a56..9839903ea 100644 --- a/ofrak_core/test_ofrak/service/resource_service/test_resource_service.py +++ b/ofrak_core/test_ofrak/service/resource_service/test_resource_service.py @@ -210,7 +210,10 @@ async def test_get_depths( [R_ID_3_ROOT], r_filter=ResourceFilter( include_self=True, - tags=(File, GenericBinary), + tags=( + File, + GenericBinary, + ), tags_condition=ResourceFilterCondition.AND, attribute_filters=None, ), @@ -364,7 +367,10 @@ async def test_get_ancestors_by_id( [R_ID_3_ROOT], r_filter=ResourceFilter( include_self=True, - tags=(File, GenericBinary), + tags=( + File, + GenericBinary, + ), tags_condition=ResourceFilterCondition.AND, attribute_filters=None, ), @@ -375,11 +381,23 @@ async def test_get_ancestors_by_id( [R_ID_3_ROOT], r_filter=ResourceFilter( include_self=True, - tags=(File, GenericBinary), + tags=( + File, + GenericBinary, + ), tags_condition=ResourceFilterCondition.AND, attribute_filters=None, ), - extra_resources=[((File, GenericBinary), ())] * 10, + extra_resources=[ + ( + ( + File, + GenericBinary, + ), + (), 
+ ) + ] + * 10, ), GetDescendantsTestCase( "attributes filter: exact value (attributes filter cheapest)", @@ -903,7 +921,12 @@ async def test_get_root_resources(self, triple_populated_resource_service): R_ID_1_ROOT: ResourceModel(R_ID_1_ROOT), R_ID_2_ROOT: ResourceModel(R_ID_2_ROOT), R_ID_3_ROOT: ResourceModel.create( - R_ID_3_ROOT, tags=(File, GenericBinary), attributes=(TestIndexAttributes(5),) + R_ID_3_ROOT, + tags=( + File, + GenericBinary, + ), + attributes=(TestIndexAttributes(5),), ), } diff --git a/ofrak_tutorial/notebooks_with_outputs/2_ofrak_internals.ipynb b/ofrak_tutorial/notebooks_with_outputs/2_ofrak_internals.ipynb index 6bf808262..e6bcad7cf 100644 --- a/ofrak_tutorial/notebooks_with_outputs/2_ofrak_internals.ipynb +++ b/ofrak_tutorial/notebooks_with_outputs/2_ofrak_internals.ipynb @@ -108,7 +108,7 @@ "id": "1557d982", "metadata": {}, "source": [ - "- Several **components** were run on our resource, including `MagicMimeIdentifier` and `ElfUnpacker`:" + "- Several **components** were run on our resource, including `MagicIdentifier` and `ElfUnpacker`:" ] }, { @@ -121,7 +121,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "components run: [b'ApkIdentifier', b'DeviceTreeBlobIdentifier', b'ElfDynamicSectionUnpacker', b'ElfPointerArraySectionUnpacker', b'ElfRelaUnpacker', b'ElfSymbolUnpacker', b'ElfUnpacker', b'MagicDescriptionIdentifier', b'MagicMimeIdentifier', b'OpenWrtIdentifier', b'UbiIdentifier', b'UbifsIdentifier', b'Uf2FileIdentifier']\n" + "components run: [b'ElfDynamicSectionUnpacker', b'ElfPointerArraySectionUnpacker', b'ElfRelaUnpacker', b'ElfSymbolUnpacker', b'ElfUnpacker', b'MagicIdentifier']\n" ] } ], @@ -146,7 +146,7 @@ "\n", "(Note: [Lesson 1](1_simple_string_modification.ipynb) presented a simpler workflow: as we only needed to access and modify the binary data of the file, unpacking and repacking weren't necessary, so we only created the resource from a file, modified its binary data, and flushed the result to disk.)\n", 
"\n", - "Back to what happened when unpacking our hello world binary. Here, the `MagicMimeIdentifier` used `libmagic` on the binary to try and determine what type of file it is – in this case, it's an ELF executable. Based on this, OFRAK knows it needs to run the `ElfUnpacker`, which unpacked the binary into sections based on the known ELF file structure.\n", + "Back to what happened when unpacking our hello world binary. Here, the `MagicIdentifier` used `libmagic` on the binary to try and determine what type of file it is – in this case, it's an ELF executable. Based on this, OFRAK knows it needs to run the `ElfUnpacker`, which unpacked the binary into sections based on the known ELF file structure.\n", "\n", "Components can be manually selected and run on resources, or they can be run automatically, as we did here.\n", "\n", diff --git a/ofrak_tutorial/notebooks_with_outputs/4_simple_code_modification.ipynb b/ofrak_tutorial/notebooks_with_outputs/4_simple_code_modification.ipynb index fdd70d46a..e8ecb2da9 100644 --- a/ofrak_tutorial/notebooks_with_outputs/4_simple_code_modification.ipynb +++ b/ofrak_tutorial/notebooks_with_outputs/4_simple_code_modification.ipynb @@ -103,7 +103,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "components run: [b'ApkIdentifier', b'CodeRegionUnpacker', b'ComplexBlockUnpacker', b'DecompilationAnalysisIdentifier', b'DeviceTreeBlobIdentifier', b'ElfDynamicSectionUnpacker', b'ElfPointerArraySectionUnpacker', b'ElfRelaUnpacker', b'ElfSymbolUnpacker', b'ElfUnpacker', b'GhidraAnalysisIdentifier', b'GhidraBasicBlockUnpacker', b'LinkableSymbolIdentifier', b'MagicDescriptionIdentifier', b'MagicMimeIdentifier', b'OpenWrtIdentifier', b'UbiIdentifier', b'UbifsIdentifier', b'Uf2FileIdentifier']\n", + "components run: [b'CodeRegionUnpacker', b'ComplexBlockUnpacker', b'DecompilationAnalysisIdentifier', b'ElfDynamicSectionUnpacker', b'ElfPointerArraySectionUnpacker', b'ElfRelaUnpacker', b'ElfSymbolUnpacker', b'ElfUnpacker', 
b'GhidraAnalysisIdentifier', b'GhidraBasicBlockUnpacker', b'LinkableSymbolIdentifier', b'MagicIdentifier']\n", "309 resources created\n", "310 resources modified\n" ]