From 14c298dee8f2c4516278cb0c27fe3e7ebd739c03 Mon Sep 17 00:00:00 2001
From: eclipsotic
Date: Fri, 3 May 2024 22:47:06 -0400
Subject: [PATCH] Use mmap instead of read_bytes() to reduce memory usage for large files

---
Review notes (text in this section is ignored by `git am`):

* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch. Blank context lines were inferred from the hunk line counts;
  verify the context against the target tree before applying.
* mmap.mmap() raises ValueError when asked to map an empty file, whereas the
  replaced Path.read_bytes() call returned b''. Empty files now take a
  b'' fallback so statistics are still computed for them.
* The annotation uses typing.Union instead of `bytes | mmap`: the `|` form is
  evaluated when the function is defined and fails on Python < 3.10, and the
  module already relies on typing.Dict / typing.List.
* The commit id in the first line and the post-image blob ids in the `index`
  lines belong to the originally submitted revision, not to this one.

 fact_extractor/helperFunctions/statistics.py | 5 +++--
 fact_extractor/unpacker/unpack.py            | 9 +++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fact_extractor/helperFunctions/statistics.py b/fact_extractor/helperFunctions/statistics.py
index 1333e70d..afbc1758 100644
--- a/fact_extractor/helperFunctions/statistics.py
+++ b/fact_extractor/helperFunctions/statistics.py
@@ -1,5 +1,6 @@
 from configparser import ConfigParser
 from contextlib import suppress
+from mmap import mmap
 from pathlib import Path
-from typing import Dict, List
+from typing import Dict, List, Union
 
@@ -23,7 +24,7 @@ def add_unpack_statistics(extraction_dir: Path, meta_data: Dict):
     meta_data['number_of_unpacked_directories'] = unpacked_directories
 
 
-def get_unpack_status(file_path: str, binary: bytes, extracted_files: List[Path], meta_data: Dict, config: ConfigParser):
+def get_unpack_status(file_path: str, binary: Union[bytes, mmap], extracted_files: List[Path], meta_data: Dict, config: ConfigParser):
     meta_data['summary'] = []
     meta_data['entropy'] = avg_entropy(binary)
 
diff --git a/fact_extractor/unpacker/unpack.py b/fact_extractor/unpacker/unpack.py
index 98fb3901..cc895e1b 100644
--- a/fact_extractor/unpacker/unpack.py
+++ b/fact_extractor/unpacker/unpack.py
@@ -2,6 +2,7 @@
 
 import json
 import logging
+import mmap
 import shutil
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -52,9 +53,13 @@ def unpack(self, file_path):
 
         compute_stats = self.config.getboolean('ExpertSettings', 'statistics', fallback=True)
         if compute_stats:
-            binary = Path(file_path).read_bytes()
             add_unpack_statistics(self._file_folder, meta_data)
-            get_unpack_status(file_path, binary, extracted_files, meta_data, self.config)
+            if Path(file_path).stat().st_size == 0:
+                # mmap() raises ValueError for empty files, so fall back to empty bytes
+                get_unpack_status(file_path, b'', extracted_files, meta_data, self.config)
+            else:
+                with open(file_path, 'rb') as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
+                    get_unpack_status(file_path, mm, extracted_files, meta_data, self.config)
 
         self.cleanup(tmp_dir)
 