From b003b6c66e0feaebd58241ad3b12bf65ba0a42dc Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Tue, 24 Sep 2024 13:24:01 +0200 Subject: [PATCH 1/2] Don't show timezone warnings for UTC (#141) UTC is the default and should not show any warnings as we can fall back to timezone.UTC. Fixes #140 --- flow/record/fieldtypes/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flow/record/fieldtypes/__init__.py b/flow/record/fieldtypes/__init__.py index 45978d4..9f1bf68 100644 --- a/flow/record/fieldtypes/__init__.py +++ b/flow/record/fieldtypes/__init__.py @@ -67,7 +67,8 @@ def flow_record_tz(*, default_tz: str = "UTC") -> Optional[ZoneInfo | UTC]: try: return ZoneInfo(tz) except ZoneInfoNotFoundError as exc: - warnings.warn(f"{exc!r}, falling back to timezone.utc") + if tz != "UTC": + warnings.warn(f"{exc!r}, falling back to timezone.utc") return UTC From 1701dcf6ff77d99923ef2523cb258fc75ade68e1 Mon Sep 17 00:00:00 2001 From: Computer Network Investigation <121175071+JSCU-CNI@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:00:59 +0200 Subject: [PATCH 2/2] Improve XLSX adapter (#137) This PR makes it easier to write records to an xlsx file. Each record descriptor gets its own sheet within the workbook and field values are sanitized to prevent openpyxl errors. 
--- flow/record/adapter/xlsx.py | 115 ++++++++++++++++++++++++++++++------ tests/test_xlsx_adapter.py | 55 +++++++++++++++++ 2 files changed, 152 insertions(+), 18 deletions(-) create mode 100644 tests/test_xlsx_adapter.py diff --git a/flow/record/adapter/xlsx.py b/flow/record/adapter/xlsx.py index 6386256..a9bf8c7 100644 --- a/flow/record/adapter/xlsx.py +++ b/flow/record/adapter/xlsx.py @@ -1,7 +1,14 @@ -import openpyxl +from base64 import b64decode, b64encode +from datetime import datetime, timezone +from typing import Any, Iterator + +from openpyxl import Workbook, load_workbook +from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE from flow import record +from flow.record import fieldtypes from flow.record.adapter import AbstractReader, AbstractWriter +from flow.record.fieldtypes.net import ipaddress from flow.record.selector import make_selector from flow.record.utils import is_stdout @@ -14,23 +21,72 @@ """ +def sanitize_fieldvalues(values: Iterator[Any]) -> Iterator[Any]: + """Sanitize field values so openpyxl will accept them.""" + + for value in values: + # openpyxl doesn't support timezone-aware datetime instances, + # so we convert to UTC and then remove the timezone info. 
+ if isinstance(value, datetime) and value.tzinfo is not None: + value = value.astimezone(timezone.utc).replace(tzinfo=None) + + elif type(value) in [ipaddress, list, fieldtypes.posix_path, fieldtypes.windows_path]: + value = str(value) + + elif isinstance(value, bytes): + base64_encode = False + try: + new_value = 'b"' + value.decode() + '"' + if ILLEGAL_CHARACTERS_RE.search(new_value): + base64_encode = True + else: + value = new_value + except UnicodeDecodeError: + base64_encode = True + if base64_encode: + value = "base64:" + b64encode(value).decode() + + yield value + + class XlsxWriter(AbstractWriter): fp = None wb = None def __init__(self, path, **kwargs): self.fp = record.open_path_or_stream(path, "wb") - self.wb = openpyxl.Workbook() + self.wb = Workbook() self.ws = self.wb.active - self.desc = None - # self.ws.title = "Records" + + # Remove the active work sheet, every Record Descriptor will have its own sheet. + self.wb.remove(self.ws) + self.descs = [] + self._last_dec = None def write(self, r): - if not self.desc: - self.desc = r._desc - self.ws.append(r._desc.fields) + if r._desc not in self.descs: + self.descs.append(r._desc) + ws = self.wb.create_sheet(r._desc.name.strip().replace("/", "-")) + field_types = [] + field_names = [] + + for field_name, field in r._desc.get_all_fields().items(): + field_types.append(field.typename) + field_names.append(field_name) + + ws.append(field_types) + ws.append(field_names) + + if r._desc != self._last_dec: + self._last_dec = r._desc + self.ws = self.wb[r._desc.name.strip().replace("/", "-")] + + values = list(sanitize_fieldvalues(value for value in r._asdict().values())) - self.ws.append(r._asdict().values()) + try: + self.ws.append(values) + except ValueError as e: + raise ValueError(f"Unable to write values to workbook: {str(e)}") def flush(self): if self.wb: @@ -53,7 +109,7 @@ def __init__(self, path, selector=None, **kwargs): self.selector = make_selector(selector) self.fp = record.open_path_or_stream(path, 
"rb") self.desc = None - self.wb = openpyxl.load_workbook(self.fp) + self.wb = load_workbook(self.fp) self.ws = self.wb.active def close(self): @@ -62,12 +118,35 @@ def close(self): self.fp = None def __iter__(self): - desc = None - for row in self.ws.rows: - if not desc: - desc = record.RecordDescriptor([col.value.replace(" ", "_").lower() for col in row]) - continue - - obj = desc(*[col.value for col in row]) - if not self.selector or self.selector.match(obj): - yield obj + for worksheet in self.wb.worksheets: + desc = None + desc_name = worksheet.title.replace("-", "/") + field_names = None + field_types = None + for row in worksheet: + if field_types is None: + field_types = [col.value for col in row if col.value] + continue + if field_names is None: + field_names = [ + col.value.replace(" ", "_").lower() + for col in row + if col.value and not col.value.startswith("_") + ] + desc = record.RecordDescriptor(desc_name, list(zip(field_types, field_names))) + continue + + record_values = [] + for idx, col in enumerate(row): + value = col.value + if field_types[idx] == "bytes": + if value[1] == '"': # If so, we know this is b"" + # Cut of the b" at the start and the trailing " + value = value[2:-1].encode() + else: + # If not, we know it is base64 encoded (so we cut of the starting 'base64:') + value = b64decode(value[7:]) + record_values.append(value) + obj = desc(*record_values) + if not self.selector or self.selector.match(obj): + yield obj diff --git a/tests/test_xlsx_adapter.py b/tests/test_xlsx_adapter.py new file mode 100644 index 0000000..cc6b80e --- /dev/null +++ b/tests/test_xlsx_adapter.py @@ -0,0 +1,55 @@ +import re +import sys +from datetime import datetime, timedelta, timezone +from typing import Iterator +from unittest.mock import MagicMock + +import pytest + +from flow.record import fieldtypes + + +@pytest.fixture +def mock_openpyxl_package(monkeypatch: pytest.MonkeyPatch) -> Iterator[MagicMock]: + with monkeypatch.context() as m: + mock_openpyxl = 
MagicMock() + mock_cell = MagicMock() + mock_cell.ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]") + m.setitem(sys.modules, "openpyxl", mock_openpyxl) + m.setitem(sys.modules, "openpyxl.cell.cell", mock_cell) + + yield mock_openpyxl + + +def test_sanitize_field_values(mock_openpyxl_package): + from flow.record.adapter.xlsx import sanitize_fieldvalues + + assert list( + sanitize_fieldvalues( + [ + 7, + datetime(1920, 11, 11, 13, 37, 0, tzinfo=timezone(timedelta(hours=2))), + "James", + b"Bond", + b"\x00\x07", + fieldtypes.net.ipaddress("13.37.13.37"), + ["Shaken", "Not", "Stirred"], + fieldtypes.posix_path("/home/user"), + fieldtypes.posix_command("/bin/bash -c 'echo hello world'"), + fieldtypes.windows_path("C:\\Users\\user\\Desktop"), + fieldtypes.windows_command("C:\\Some.exe /?"), + ] + ) + ) == [ + 7, + datetime(1920, 11, 11, 11, 37, 0), # UTC normalization + "James", + 'b"Bond"', # When possible, encode bytes in a printable way + "base64:AAc=", # If not, base64 encode + "13.37.13.37", # Stringify an ip address + "['Shaken', 'Not', 'Stirred']", # Stringify a list + "/home/user", # Stringify a posix path + "/bin/bash -c 'echo hello world'", # Stringify a posix command + "C:\\Users\\user\\Desktop", # Stringify a windows path + "C:\\Some.exe /?", # Stringify a windows command + ]