From dfc3b183747acd8b752f40f70f1daa56d9aecfde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Thu, 14 Nov 2024 13:24:08 +0100
Subject: [PATCH] chore: Remove dependency on fact_helper_file

---
 fact_extractor/helperFunctions/magic.py       | 48 ++++++++++++++++++
 fact_extractor/helperFunctions/statistics.py  |  4 +-
 fact_extractor/install/common.py              | 29 ++++++++++-
 fact_extractor/install/pre_install.sh         |  2 +-
 .../generic_carver/code/generic_carver.py     |  4 +-
 .../unpacking/generic_fs/code/generic_fs.py   |  5 +-
 fact_extractor/test/data/ros_header           | Bin 0 -> 288 bytes
 fact_extractor/test/unit/test_mime.py         | 16 ++++++
 fact_extractor/unpacker/unpackBase.py         |  7 +--
 requirements-unpackers.txt                    |  2 +-
 10 files changed, 104 insertions(+), 13 deletions(-)
 create mode 100644 fact_extractor/helperFunctions/magic.py
 create mode 100644 fact_extractor/test/data/ros_header
 create mode 100644 fact_extractor/test/unit/test_mime.py

diff --git a/fact_extractor/helperFunctions/magic.py b/fact_extractor/helperFunctions/magic.py
new file mode 100644
index 00000000..9af20d58
--- /dev/null
+++ b/fact_extractor/helperFunctions/magic.py
@@ -0,0 +1,48 @@
+"""This is a wrapper around pymagic.
+It aims to provide the same API but with the ability to load multiple magic
+files in the default api.
+"""
+
+from __future__ import annotations
+
+import os
+from os import PathLike
+
+import magic as pymagic
+
+from helperFunctions.file_system import get_src_dir
+
+# On ubuntu this is provided by the libmagic-mgc package
+_default_magic = os.getenv('MAGIC', '/usr/lib/file/magic.mgc')
+_fw_magic = f'{get_src_dir()}/bin/firmware'
+_magic_file = f'{_fw_magic}:{_default_magic}'
+
+_instances = {}
+
+
+def _get_magic_instance(**kwargs):
+    """Return a cached ``pymagic.Magic`` instance created with ``kwargs``."""
+    # Dicts are not hashable, but a sorted tuple of their items is. Use the
+    # tuple itself as key (not its hash) so colliding hashes cannot mix up
+    # instances.
+    key = tuple(sorted(kwargs.items()))
+    instance = _instances.get(key)
+    if instance is None:
+        instance = _instances[key] = pymagic.Magic(**kwargs)
+    return instance
+
+
+def from_file(filename: bytes | str | PathLike, magic_file: str | None = _magic_file, **kwargs) -> str:
+    """Like pymagic's ``magic.from_file`` but it accepts all keyword arguments
+    that ``magic.Magic`` accepts.
+    """
+    instance = _get_magic_instance(magic_file=magic_file, **kwargs)
+    return instance.from_file(filename)
+
+
+def from_buffer(buf: bytes | str, magic_file: str | None = _magic_file, **kwargs) -> str:
+    """Like pymagic's ``magic.from_buffer`` but it accepts all keyword arguments
+    that ``magic.Magic`` accepts.
+    """
+    instance = _get_magic_instance(magic_file=magic_file, **kwargs)
+    return instance.from_buffer(buf)
diff --git a/fact_extractor/helperFunctions/statistics.py b/fact_extractor/helperFunctions/statistics.py
index 1333e70d..da7151a1 100644
--- a/fact_extractor/helperFunctions/statistics.py
+++ b/fact_extractor/helperFunctions/statistics.py
@@ -3,11 +3,11 @@ from pathlib import Path
 from typing import Dict, List
 
+from helperFunctions import magic
 from common_helper_files import safe_rglob
 from common_helper_unpacking_classifier import (
     avg_entropy, get_binary_size_without_padding, is_compressed
 )
-from fact_helper_file import get_file_type_from_path
 
 from helperFunctions.config import read_list_from_config
 
 
@@ -28,7 +28,7 @@ def get_unpack_status(file_path: str, binary: bytes, extracted_files: List[Path]
     meta_data['entropy'] = avg_entropy(binary)
 
     if not extracted_files and meta_data.get('number_of_excluded_files', 0) == 0:
-        if get_file_type_from_path(file_path)['mime'] in read_list_from_config(config, 'ExpertSettings', 'compressed_file_types')\
+        if magic.from_file(file_path, mime=True) in read_list_from_config(config, 'ExpertSettings', 'compressed_file_types')\
                 or not is_compressed(binary, compress_entropy_threshold=config.getfloat('ExpertSettings', 'unpack_threshold'), classifier=avg_entropy):
             meta_data['summary'] = ['unpacked']
         else:
diff --git a/fact_extractor/install/common.py b/fact_extractor/install/common.py
index 7bd026ee..02c7d7a9 100644
--- a/fact_extractor/install/common.py
+++ b/fact_extractor/install/common.py
@@ -1,11 +1,12 @@
 import logging
+import subprocess as sp
 import os
 from contextlib import suppress
 from pathlib import Path
 
 from helperFunctions.config import load_config
 from helperFunctions.install import (
-    apt_install_packages, apt_update_sources, pip_install_packages, load_requirements_file
+    apt_install_packages, apt_update_sources, pip_install_packages, load_requirements_file, OperateInDirectory
 )
 
 APT_DEPENDENCIES = {
@@ -37,6 +38,30 @@ def install_apt_dependencies(distribution: str):
     apt_install_packages(*APT_DEPENDENCIES[distribution])
 
 
+def _install_magic():
+    """Download release v0.2.1 of the firmware magic database and decompress it with ``unxz``."""
+    bin_dir = Path(__file__).parent.parent / 'bin'
+    with OperateInDirectory(bin_dir):
+        sp.run(
+            [
+                "wget",
+                "--output-document",
+                "firmware.xz",
+                "https://github.com/fkie-cad/firmware-magic-database/releases/download/v0.2.1/firmware.xz",
+            ],
+            check=True,
+        )
+        # check=True: abort the installation if decompression fails
+        sp.run(
+            [
+                "unxz",
+                "--force",
+                "firmware.xz",
+            ],
+            check=True,
+        )
+
+
 def main(distribution):
     logging.info('Updating package lists')
     apt_update_sources()
@@ -49,6 +74,8 @@ def main(distribution):
     with suppress(FileExistsError):
         os.mkdir('../bin')
 
+    _install_magic()
+
     config = load_config('main.cfg')
     data_folder = config.get('unpack', 'data_folder')
     os.makedirs(str(Path(data_folder, 'files')), exist_ok=True)
diff --git a/fact_extractor/install/pre_install.sh b/fact_extractor/install/pre_install.sh
index c6035c55..d5b41ce0 100755
--- a/fact_extractor/install/pre_install.sh
+++ b/fact_extractor/install/pre_install.sh
@@ -4,7 +4,7 @@ echo "Install Pre-Install Requirements"
 (apt-get update && apt-get install sudo) || true
 
 sudo apt-get update
-sudo apt-get -y install git apt-transport-https ca-certificates curl software-properties-common wget libmagic-dev
+sudo apt-get -y install git apt-transport-https ca-certificates curl software-properties-common wget libmagic-dev xz-utils
 
 IS_VENV=$(python3 -c 'import sys; print(sys.exec_prefix!=sys.base_prefix)')
 if [[ $IS_VENV == "False" ]]
diff --git a/fact_extractor/plugins/unpacking/generic_carver/code/generic_carver.py b/fact_extractor/plugins/unpacking/generic_carver/code/generic_carver.py
index a11b7eee..2ba11223 100644
--- a/fact_extractor/plugins/unpacking/generic_carver/code/generic_carver.py
+++ b/fact_extractor/plugins/unpacking/generic_carver/code/generic_carver.py
@@ -3,13 +3,13 @@
 '''
 from __future__ import annotations
 
+from helperFunctions import magic
 import logging
 import re
 import shutil
 from pathlib import Path
 
 from common_helper_process import execute_shell_command
-from fact_helper_file import get_file_type_from_path
 
 NAME = 'generic_carver'
 MIME_PATTERNS = ['generic/carver']
@@ -45,7 +45,7 @@ def remove_false_positive_archives(self) -> str:
         for file_path in self.unpack_directory.glob('**/*'):
             if not file_path.is_file():
                 continue
-            file_type = get_file_type_from_path(file_path)['mime']
+            file_type = magic.from_file(file_path, mime=True)
 
             if file_type == 'application/x-tar' or self._is_possible_tar(file_type, file_path):
                 self._remove_invalid_archives(file_path, 'tar -tvf {}', 'does not look like a tar archive')
diff --git a/fact_extractor/plugins/unpacking/generic_fs/code/generic_fs.py b/fact_extractor/plugins/unpacking/generic_fs/code/generic_fs.py
index f35fd9f4..6bafe46a 100644
--- a/fact_extractor/plugins/unpacking/generic_fs/code/generic_fs.py
+++ b/fact_extractor/plugins/unpacking/generic_fs/code/generic_fs.py
@@ -2,13 +2,12 @@
 This plugin mounts filesystem images and extracts their content
 '''
 import re
+from helperFunctions import magic
 from shlex import split
 from subprocess import run, PIPE, STDOUT
 from tempfile import TemporaryDirectory
 from time import sleep
 
-from fact_helper_file import get_file_type_from_path
-
 NAME = 'genericFS'
 MIME_PATTERNS = [
     'filesystem/btrfs', 'filesystem/dosmbr', 'filesystem/f2fs', 'filesystem/jfs', 'filesystem/minix',
@@ -28,7 +27,7 @@
 
 
 def unpack_function(file_path, tmp_dir):
-    mime_type = get_file_type_from_path(file_path)['mime']
+    mime_type = magic.from_file(file_path, mime=True)
     if mime_type == 'filesystem/dosmbr':
         output = _mount_from_boot_record(file_path, tmp_dir)
     else:
diff --git a/fact_extractor/test/data/ros_header b/fact_extractor/test/data/ros_header
new file mode 100644
index 0000000000000000000000000000000000000000..286fc793a94ec281fe5b46071dff41592a2e001c
GIT binary patch
literal 288
zcmWIY3kfjPGcaT^6kw2Hy1{;!)09C*C0#1O(b<~;2-tvF0K|X-XCKdaH%}i|82lfFu`O{u(Q7l+;t2L1(m!E%@fq&`h$S}HPSOM
burdJom;D9<10zFo2nXDRSO$g$B!LV7_ogVu

literal 0
HcmV?d00001

diff --git a/fact_extractor/test/unit/test_mime.py b/fact_extractor/test/unit/test_mime.py
new file mode 100644
index 00000000..cd909a93
--- /dev/null
+++ b/fact_extractor/test/unit/test_mime.py
@@ -0,0 +1,16 @@
+from pathlib import Path
+
+from helperFunctions import magic
+
+from helperFunctions.file_system import get_fact_bin_dir, get_test_data_dir
+
+
+def test_magic():
+    """Check MIME detection for a firmware-specific sample and a regular zip archive."""
+    firmware_magic_path = Path(get_fact_bin_dir()) / 'firmware'
+    assert firmware_magic_path.is_file()
+
+    assert (
+        magic.from_file(f'{get_test_data_dir()}/ros_header', mime=True) == 'firmware/ros'
+    ), 'firmware-magic-database is not loaded'
+    assert magic.from_file(f'{get_test_data_dir()}/container/test.zip', mime=True) == 'application/zip'
diff --git a/fact_extractor/unpacker/unpackBase.py b/fact_extractor/unpacker/unpackBase.py
index fb0d629f..623e8515 100644
--- a/fact_extractor/unpacker/unpackBase.py
+++ b/fact_extractor/unpacker/unpackBase.py
@@ -1,12 +1,13 @@
+import fnmatch
 import logging
 from os import getgid, getuid
 from subprocess import PIPE, Popen
 from time import time
-import fnmatch
 from typing import Callable, Dict, List, Tuple
 
+from helperFunctions import magic
 from common_helper_files import get_files_in_dir
-from fact_helper_file import get_file_type_from_path
+
 from helperFunctions.config import read_list_from_config
 from helperFunctions.plugin import import_plugins
 
@@ -50,7 +51,7 @@ def get_unpacker(self, mime_type: str):
         return self.unpacker_plugins['generic/carver']
 
     def extract_files_from_file(self, file_path: str, tmp_dir) -> Tuple[List, Dict]:
-        current_unpacker = self.get_unpacker(get_file_type_from_path(file_path)['mime'])
+        current_unpacker = self.get_unpacker(magic.from_file(file_path, mime=True))
         return self._extract_files_from_file_using_specific_unpacker(file_path, tmp_dir, current_unpacker)
 
     def unpacking_fallback(self, file_path, tmp_dir, old_meta, fallback_plugin_mime) -> Tuple[List, Dict]:
diff --git a/requirements-unpackers.txt b/requirements-unpackers.txt
index 234c6387..69b52e02 100644
--- a/requirements-unpackers.txt
+++ b/requirements-unpackers.txt
@@ -1,7 +1,7 @@
 # FixMe: deprecated
 pluginbase~=1.0.1
 git+https://github.com/fkie-cad/common_helper_unpacking_classifier.git
-git+https://github.com/fkie-cad/fact_helper_file.git
+python-magic
 patool~=2.2.0
 # jffs2: jefferson + deps
 git+https://github.com/sviehb/jefferson.git@v0.4.1