From 012dd6eebc9406fa344db94ca887b3daf5c5cd60 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 6 Nov 2024 15:12:11 +0100 Subject: [PATCH 01/38] update add_field_to function for improved error handling - Add raise_on_failure parameter to raise exceptions on failure. - Refactor function for clarity and consistency. - Update unit tests to reflect new exception handling logic. --- logprep/processor/base/exceptions.py | 6 +- logprep/processor/grokker/processor.py | 2 +- logprep/util/helper.py | 69 +++++++++-------- tests/unit/util/test_helper_add_field.py | 99 +++++++----------------- 4 files changed, 69 insertions(+), 107 deletions(-) diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index 9f020b3c6..f7c28df40 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -72,8 +72,10 @@ class ProcessingWarning(Warning): def __init__(self, message: str, rule: "Rule", event: dict, tags: List[str] = None): self.tags = tags if tags else [] - rule.metrics.number_of_warnings += 1 - message = f"{message}, {rule.id=}, {rule.description=}, {event=}" + if rule: + rule.metrics.number_of_warnings += 1 + message += f", {rule.id=}, {rule.description=}" + message += f", {event=}" super().__init__(f"{self.__class__.__name__}: {message}") diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index c0dae52be..ce968a827 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -82,9 +82,9 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): except TimeoutError as error: self._handle_missing_fields(event, rule, rule.actions.keys(), source_values) raise ProcessingError( - self, f"Grok pattern timeout for source field: '{dotted_field}' in rule '{rule}', " f"the grok pattern might be too complex.", + rule, ) from error if result is None or result == {}: continue diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 
e8e4fec45..fcdcc2a9a 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -57,10 +57,17 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) -def add_field_to(event, output_field, content, extends_lists=False, overwrite_output_field=False): +def add_field_to( + event, + output_field, + content, + extends_lists=False, + overwrite_output_field=False, + raise_on_failure=None, +): """ - Add content to an output_field in the given event. Output_field can be a dotted subfield. - In case of missing fields all intermediate fields will be created. + Add content to the output_field in the given event. Output_field can be a dotted subfield. + In case of missing fields, all intermediate fields will be created. Parameters ---------- event: dict @@ -68,47 +75,41 @@ def add_field_to(event, output_field, content, extends_lists=False, overwrite_ou output_field: str Dotted subfield string indicating the target of the output value, e.g. destination.ip content: str, float, int, list, dict - Value that should be written into the output_field, can be a str, list or dict object + Value that should be written into the output_field, can be a str, list, or dict object extends_lists: bool Flag that determines whether output_field lists should be extended overwrite_output_field: bool Flag that determines whether the output_field should be overwritten - Returns ------ - This method returns true if no conflicting fields were found during the process of the creation - of the dotted subfields. If conflicting fields were found False is returned. + bool + True if no conflicting fields were found during the process of the creation + of the dotted subfields, otherwise False. 
""" - - assert not ( - extends_lists and overwrite_output_field - ), "An output field can't be overwritten and extended at the same time" - output_field_path = [event, *get_dotted_field_list(output_field)] - target_key = output_field_path.pop() - - if overwrite_output_field: - target_field = reduce(_add_and_overwrite_key, output_field_path) - target_field |= {target_key: content} - return True - + if extends_lists and overwrite_output_field: + raise ValueError("An output field can't be overwritten and extended at the same time") + field_path = [event, *get_dotted_field_list(output_field)] + target_key = field_path.pop() try: - target_field = reduce(_add_and_not_overwrite_key, output_field_path) - except KeyError: - return False - - target_field_value = target_field.get(target_key) - if target_field_value is None: - target_field |= {target_key: content} - return True - if extends_lists: - if not isinstance(target_field_value, list): - return False + target_parent = reduce(_add_and_not_overwrite_key, field_path) + except KeyError as error: + if raise_on_failure: + raise raise_on_failure from error + return + if overwrite_output_field: + target_parent[target_key] = content + else: + existing_value = target_parent.get(target_key) + if existing_value is None: + target_parent[target_key] = content + if not extends_lists or not isinstance(existing_value, list): + if raise_on_failure: + raise raise_on_failure + return if isinstance(content, list): - target_field |= {target_key: [*target_field_value, *content]} + target_parent[target_key].extend(content) else: - target_field_value.append(content) - return True - return False + target_parent[target_key].append(content) def _get_slice_arg(slice_item): diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index 007069ed2..5e714bda8 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -2,6 +2,7 @@ # pylint: disable=missing-docstring 
import pytest +from logprep.abc.exceptions import LogprepException from logprep.util.helper import add_field_to @@ -9,18 +10,13 @@ class TestHelperAddField: def test_add_str_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "field": "content"} - - add_was_successful = add_field_to(document, "field", "content") - - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "field", "content") assert document == expected_document def test_add_str_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "content"}} - - add_was_successful = add_field_to(document, "sub.field", "content") - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "sub.field", "content") assert document == expected_document def test_add_str_content_as_partially_new_dotted_subfield(self): @@ -30,41 +26,31 @@ def test_add_str_content_as_partially_new_dotted_subfield(self): "sub": {"field": "content", "other_field": "other_content"}, } - add_was_successful = add_field_to(document, "sub.field", "content") - - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "sub.field", "content") assert document == expected_document def test_provoke_str_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": "exists already"} - - add_was_successful = add_field_to(document, "field", "content") - - assert not add_was_successful, "Found no duplicate even though there should be one" + error = LogprepException("test error") + with pytest.raises(LogprepException, match=r"test error"): + add_field_to(document, "field", "content", raise_on_failure=error) def test_provoke_str_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": 
"exists already"}} - - add_was_successful = add_field_to(document, "sub.field", "content") - - assert not add_was_successful, "Found no duplicate even though there should be one" + error = LogprepException("test error") + with pytest.raises(LogprepException, match=r"test error"): + add_field_to(document, "sub.field", "content", raise_on_failure=error) def test_add_dict_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "field": {"dict": "content"}} - - add_was_successful = add_field_to(document, "field", {"dict": "content"}) - - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "field", {"dict": "content"}) assert document == expected_document def test_add_dict_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}}} - - add_was_successful = add_field_to(document, "sub.field", {"dict": "content"}) - - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "sub.field", {"dict": "content"}) assert document == expected_document def test_add_dict_content_as_partially_new_dotted_subfield(self): @@ -73,72 +59,47 @@ def test_add_dict_content_as_partially_new_dotted_subfield(self): "source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}, "other_field": "other_content"}, } - - add_was_successful = add_field_to(document, "sub.field", {"dict": "content"}) - - assert add_was_successful, "Found duplicate even though there shouldn't be one" + add_field_to(document, "sub.field", {"dict": "content"}) assert document == expected_document def test_provoke_dict_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": {"already_existing": "dict"}} - - add_was_successful = add_field_to(document, "field", {"dict": "content"}) - - assert not 
add_was_successful, "Found no duplicate even though there should be one" + error = LogprepException("test error") + with pytest.raises(LogprepException, match=r"test error"): + add_field_to(document, "field", {"dict": "content"}, raise_on_failure=error) def test_provoke_dict_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"already_existing": "dict"}}} - - add_was_successful = add_field_to(document, "sub.field", {"dict": "content"}) - - assert not add_was_successful, "Found no duplicate even though there should be one" + error = LogprepException("test error") + with pytest.raises(LogprepException, match=r"test error"): + add_field_to(document, "sub.field", {"dict": "content"}, raise_on_failure=error) def test_add_field_to_overwrites_output_field_in_root_level(self): document = {"some": "field", "output_field": "has already content"} - - add_was_successful = add_field_to( - document, "output_field", {"dict": "content"}, overwrite_output_field=True - ) - - assert add_was_successful, "Output field was overwritten" + add_field_to(document, "output_field", {"dict": "content"}, overwrite_output_field=True) assert document.get("output_field") == {"dict": "content"} def test_add_field_to_overwrites_output_field_in_nested_level(self): document = {"some": "field", "nested": {"output": {"field": "has already content"}}} - - add_was_successful = add_field_to( + add_field_to( document, "nested.output.field", {"dict": "content"}, overwrite_output_field=True ) - - assert add_was_successful, "Output field was overwritten" assert document.get("nested", {}).get("output", {}).get("field") == {"dict": "content"} def test_add_field_to_extends_list_when_only_given_a_string(self): document = {"some": "field", "some_list": ["with a value"]} - - add_was_successful = add_field_to(document, "some_list", "new value", extends_lists=True) - - assert add_was_successful, "Output field was overwritten" + add_field_to(document, "some_list", "new value", 
extends_lists=True) assert document.get("some_list") == ["with a value", "new value"] def test_add_field_to_extends_list_when_given_a_list(self): document = {"some": "field", "some_list": ["with a value"]} - - add_was_successful = add_field_to( - document, "some_list", ["first", "second"], extends_lists=True - ) - - assert add_was_successful, "Output field was overwritten" + add_field_to(document, "some_list", ["first", "second"], extends_lists=True) assert document.get("some_list") == ["with a value", "first", "second"] def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_same_time(self): document = {"some": "field", "some_list": ["with a value"]} - - with pytest.raises( - AssertionError, - match=r"An output field can't be overwritten and " r"extended at the same time", - ): - _ = add_field_to( + with pytest.raises(ValueError, match=r"can't be overwritten and extended at the same time"): + add_field_to( document, "some_list", ["first", "second"], @@ -149,8 +110,9 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s def test_returns_false_if_dotted_field_value_key_exists(self): document = {"user": "Franz"} content = ["user_inlist"] - add_was_successful = add_field_to(document, "user.in_list", content) - assert not add_was_successful + error = LogprepException("test error") + with pytest.raises(LogprepException, match=r"test error"): + add_field_to(document, "user.in_list", content, raise_on_failure=error) def test_add_list_with_nested_keys(self): testdict = { @@ -163,8 +125,5 @@ def test_add_list_with_nested_keys(self): } } } - add_was_successful = add_field_to( - testdict, "key1.key2.key3.key4.key5.list", ["content"], extends_lists=True - ) - assert add_was_successful + add_field_to(testdict, "key1.key2.key3.key4.key5.list", ["content"], extends_lists=True) assert testdict == expected From 0adad9ce438239e0ec8042484acff4d53d9eb434 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Thu, 7 Nov 2024 12:44:25 +0100 
Subject: [PATCH 02/38] let add_field_to always raise FieldExistsWarning on failure - Catch and handle FieldExistsWarning to raise CriticalInputError. --- logprep/abc/input.py | 42 ++++++++++-------------- logprep/processor/base/exceptions.py | 11 +++++-- logprep/util/helper.py | 21 ++++++------ tests/unit/connector/base.py | 3 +- tests/unit/util/test_helper_add_field.py | 27 +++++++-------- 5 files changed, 47 insertions(+), 57 deletions(-) diff --git a/logprep/abc/input.py b/logprep/abc/input.py index b63b6d820..8fe125648 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -17,6 +17,7 @@ from logprep.abc.connector import Connector from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric +from logprep.processor.base.exceptions import FieldExistsWarning from logprep.util.helper import add_field_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -280,16 +281,19 @@ def get_next(self, timeout: float) -> dict | None: self.metrics.number_of_processed_events += 1 if not isinstance(event, dict): raise CriticalInputError(self, "not a dict", event) - if self._add_hmac: - event = self._add_hmac_to(event, raw_event) - if self._add_version_info: - self._add_version_information_to_event(event) - if self._add_log_arrival_time_information: - self._add_arrival_time_information_to_event(event) - if self._add_log_arrival_timedelta_information: - self._add_arrival_timedelta_information_to_event(event) - if self._add_env_enrichment: - self._add_env_enrichment_to_event(event) + try: + if self._add_hmac: + event = self._add_hmac_to(event, raw_event) + if self._add_version_info: + self._add_version_information_to_event(event) + if self._add_log_arrival_time_information: + self._add_arrival_time_information_to_event(event) + if self._add_log_arrival_timedelta_information: + self._add_arrival_timedelta_information_to_event(event) + if 
self._add_env_enrichment: + self._add_env_enrichment_to_event(event) + except FieldExistsWarning as error: + raise CriticalInputError(self, error.args[0], event) from error return event def batch_finished_callback(self): @@ -331,7 +335,7 @@ def _add_version_information_to_event(self, event: dict): add_field_to(event, target_field, self._config._version_information) # pylint: enable=protected-access - def _add_hmac_to(self, event_dict, raw_event) -> Tuple[dict, str]: + def _add_hmac_to(self, event_dict, raw_event) -> dict: """ Calculates an HMAC (Hash-based message authentication code) based on a given target field and adds it to the given event. If the target field has the value '' the full raw @@ -357,7 +361,7 @@ def _add_hmac_to(self, event_dict, raw_event) -> Tuple[dict, str]: ------ CriticalInputError If the hmac could not be added to the event because the desired output field already - exists or cant't be found. + exists or can't be found. """ hmac_options = self._config.preprocessing.get("hmac", {}) hmac_target_field_name = hmac_options.get("target") @@ -382,17 +386,5 @@ def _add_hmac_to(self, event_dict, raw_event) -> Tuple[dict, str]: ).hexdigest() compressed = zlib.compress(received_orig_message, level=-1) hmac_output = {"hmac": hmac, "compressed_base64": base64.b64encode(compressed).decode()} - add_was_successful = add_field_to( - event_dict, - hmac_options.get("output_field"), - hmac_output, - ) - if not add_was_successful: - raise CriticalInputError( - self, - f"Couldn't add the hmac to the input event as the desired " - f"output field '{hmac_options.get('output_field')}' already " - f"exist.", - event_dict, - ) + add_field_to(event_dict, hmac_options.get("output_field"), hmac_output) return event_dict diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index f7c28df40..05c852cf8 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -70,7 +70,7 @@ def __init__(self, message: 
str, rule: "Rule"): class ProcessingWarning(Warning): """A warning occurred - log the warning, but continue processing the event.""" - def __init__(self, message: str, rule: "Rule", event: dict, tags: List[str] = None): + def __init__(self, message: str, event: dict, rule: "Rule" = None, tags: List[str] = None): self.tags = tags if tags else [] if rule: rule.metrics.number_of_warnings += 1 @@ -82,10 +82,15 @@ def __init__(self, message: str, rule: "Rule", event: dict, tags: List[str] = No class FieldExistsWarning(ProcessingWarning): """Raised if field already exists.""" - def __init__(self, rule: "Rule", event: dict, skipped_fields: List[str]): + def __init__( + self, + event: dict, + skipped_fields: List[str], + rule: "Rule" = None, + ): message = ( "The following fields could not be written, because " "one or more subfields existed and could not be extended: " f"{', '.join(skipped_fields)}" ) - super().__init__(message, rule, event) + super().__init__(message, event, rule) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index fcdcc2a9a..ac0f44e58 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -10,6 +10,7 @@ from colorama import Back, Fore from colorama.ansi import AnsiBack, AnsiFore +from logprep.processor.base.exceptions import FieldExistsWarning from logprep.util.defaults import DEFAULT_CONFIG_LOCATION if TYPE_CHECKING: # pragma: no cover @@ -63,7 +64,6 @@ def add_field_to( content, extends_lists=False, overwrite_output_field=False, - raise_on_failure=None, ): """ Add content to the output_field in the given event. Output_field can be a dotted subfield. @@ -80,11 +80,13 @@ def add_field_to( Flag that determines whether output_field lists should be extended overwrite_output_field: bool Flag that determines whether the output_field should be overwritten - Returns + Raises ------ - bool - True if no conflicting fields were found during the process of the creation - of the dotted subfields, otherwise False. 
+ ValueError + If both extends_lists and overwrite_output_field are set to True. + FieldExistsWarning + If the output field already exists and overwrite_output_field is False, or if extends_lists is True but + the existing field is not a list. """ if extends_lists and overwrite_output_field: raise ValueError("An output field can't be overwritten and extended at the same time") @@ -93,19 +95,16 @@ def add_field_to( try: target_parent = reduce(_add_and_not_overwrite_key, field_path) except KeyError as error: - if raise_on_failure: - raise raise_on_failure from error - return + raise FieldExistsWarning(event, [output_field]) from error if overwrite_output_field: target_parent[target_key] = content else: existing_value = target_parent.get(target_key) if existing_value is None: target_parent[target_key] = content - if not extends_lists or not isinstance(existing_value, list): - if raise_on_failure: - raise raise_on_failure return + if not extends_lists or not isinstance(existing_value, list): + raise FieldExistsWarning(event, [output_field]) if isinstance(content, list): target_parent[target_key].extend(content) else: diff --git a/tests/unit/connector/base.py b/tests/unit/connector/base.py index ec8906cb9..72b242a44 100644 --- a/tests/unit/connector/base.py +++ b/tests/unit/connector/base.py @@ -266,8 +266,7 @@ def test_get_next_with_hmac_result_in_already_existing_subfield(self): connector._get_event = mock.MagicMock( return_value=(test_event.copy(), raw_encoded_test_event) ) - non_critical_error_msg = "Couldn't add the hmac to the input event as the desired output field 'message' already exist." 
- with pytest.raises(CriticalInputError, match=non_critical_error_msg) as error: + with pytest.raises(CriticalInputError, match="could not be written") as error: _ = connector.get_next(1) assert error.value.raw_input == {"message": {"with_subfield": "content"}} diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index 5e714bda8..124043cf9 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -2,7 +2,7 @@ # pylint: disable=missing-docstring import pytest -from logprep.abc.exceptions import LogprepException +from logprep.processor.base.exceptions import FieldExistsWarning from logprep.util.helper import add_field_to @@ -31,15 +31,13 @@ def test_add_str_content_as_partially_new_dotted_subfield(self): def test_provoke_str_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": "exists already"} - error = LogprepException("test error") - with pytest.raises(LogprepException, match=r"test error"): - add_field_to(document, "field", "content", raise_on_failure=error) + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, "field", "content") def test_provoke_str_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "exists already"}} - error = LogprepException("test error") - with pytest.raises(LogprepException, match=r"test error"): - add_field_to(document, "sub.field", "content", raise_on_failure=error) + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, "sub.field", "content") def test_add_dict_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} @@ -64,15 +62,13 @@ def test_add_dict_content_as_partially_new_dotted_subfield(self): def test_provoke_dict_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": {"already_existing": "dict"}} - error = LogprepException("test error") - 
with pytest.raises(LogprepException, match=r"test error"): - add_field_to(document, "field", {"dict": "content"}, raise_on_failure=error) + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, "field", {"dict": "content"}) def test_provoke_dict_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"already_existing": "dict"}}} - error = LogprepException("test error") - with pytest.raises(LogprepException, match=r"test error"): - add_field_to(document, "sub.field", {"dict": "content"}, raise_on_failure=error) + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, "sub.field", {"dict": "content"}) def test_add_field_to_overwrites_output_field_in_root_level(self): document = {"some": "field", "output_field": "has already content"} @@ -110,9 +106,8 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s def test_returns_false_if_dotted_field_value_key_exists(self): document = {"user": "Franz"} content = ["user_inlist"] - error = LogprepException("test error") - with pytest.raises(LogprepException, match=r"test error"): - add_field_to(document, "user.in_list", content, raise_on_failure=error) + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, "user.in_list", content) def test_add_list_with_nested_keys(self): testdict = { From 33f402dfa29809b6d0b4be7e3428cf8800dcedc8 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Thu, 7 Nov 2024 17:20:55 +0100 Subject: [PATCH 03/38] fix field_manager tests - Add `add_batch_to` method - Replaced `output_field` with `target_field` for consistency - Improved exception handling and reduced redundant code --- logprep/abc/processor.py | 4 +- logprep/processor/base/exceptions.py | 2 +- logprep/processor/field_manager/processor.py | 26 ++---------- logprep/processor/generic_adder/processor.py | 2 +- logprep/processor/geoip_enricher/processor.py | 2 
+- logprep/util/helper.py | 42 ++++++++++++------- 6 files changed, 37 insertions(+), 41 deletions(-) diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index c7c4510cc..34c7e21cb 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -366,7 +366,7 @@ def _handle_warning_error(self, event, rule, error, failure_tags=None): add_and_overwrite(event, "tags", sorted(list({*error.tags, *tags, *failure_tags}))) self.result.warnings.append(error) else: - self.result.warnings.append(ProcessingWarning(str(error), rule, event)) + self.result.warnings.append(ProcessingWarning(str(error), event, rule)) def _has_missing_values(self, event, rule, source_field_dict): missing_fields = list( @@ -383,7 +383,7 @@ def _has_missing_values(self, event, rule, source_field_dict): def _write_target_field(self, event: dict, rule: "Rule", result: any) -> None: add_successful = add_field_to( event, - output_field=rule.target_field, + target_field=rule.target_field, content=result, extends_lists=rule.extend_target_list, overwrite_output_field=rule.overwrite_target, diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index 05c852cf8..cb5a444ed 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -91,6 +91,6 @@ def __init__( message = ( "The following fields could not be written, because " "one or more subfields existed and could not be extended: " - f"{', '.join(skipped_fields)}" + f"{skipped_fields}" ) super().__init__(message, event, rule) diff --git a/logprep/processor/field_manager/processor.py b/logprep/processor/field_manager/processor.py index 09eeeab6f..43558a2df 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -40,6 +40,7 @@ add_field_to, get_dotted_field_value, pop_dotted_field_value, + add_batch_to, ) @@ -72,35 +73,18 @@ def _apply_single_target_processing(self, event, rule, rule_args): 
self._write_to_single_target(args, extend_target_list, overwrite_target, rule) def _apply_mapping(self, event, rule, rule_args): - source_fields, _, mapping, _, _ = rule_args + source_fields, _, mapping, extend_target_list, overwrite_target = rule_args source_fields, targets = list(zip(*mapping.items())) source_field_values = self._get_field_values(event, mapping.keys()) self._handle_missing_fields(event, rule, source_fields, source_field_values) if not any(source_field_values): return source_field_values, targets = self._filter_missing_fields(source_field_values, targets) - self._write_to_multiple_targets(event, targets, source_field_values, rule, rule_args) + add_batch_to(event, targets, source_field_values, extend_target_list, overwrite_target) if rule.delete_source_fields: for dotted_field in source_fields: pop_dotted_field_value(event, dotted_field) - def _write_to_multiple_targets(self, event, target_fields, field_values, rule, rule_args): - _, _, _, extend_target_list, overwrite_target = rule_args - results = map( - add_field_to, - itertools.repeat(event, len(target_fields)), - target_fields, - field_values, - itertools.repeat(extend_target_list, len(target_fields)), - itertools.repeat(overwrite_target, len(target_fields)), - ) - if not all(results): - unsuccessful_indices = [i for i, x in enumerate(results) if not x] - unsuccessful_targets = [ - x for i, x in enumerate(target_fields) if i in unsuccessful_indices - ] - raise FieldExistsWarning(rule, event, unsuccessful_targets) - def _write_to_single_target(self, args, extend_target_list, overwrite_target, rule): event, target_field, source_fields_values = args target_field_value = get_dotted_field_value(event, target_field) @@ -161,11 +145,9 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru return case _: - success = add_field_to( + add_field_to( event, target_field, source_fields_values, state.extend, state.overwrite ) - if not success: - raise FieldExistsWarning(rule, 
event, [target_field]) def _overwrite_from_source_values(self, source_fields_values): duplicates = [] diff --git a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index 974c01040..f3e266438 100644 --- a/logprep/processor/generic_adder/processor.py +++ b/logprep/processor/generic_adder/processor.py @@ -238,7 +238,7 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): for dotted_field, value in items_to_add: add_successful = add_field_to( event, - output_field=dotted_field, + target_field=dotted_field, content=value, extends_lists=rule.extend_target_list, overwrite_output_field=rule.overwrite_target, diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index bdf099d96..3e83ca044 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -137,7 +137,7 @@ def _apply_rules(self, event, rule): full_output_field = rule.customize_target_subfields.get(target_subfield) adding_was_successful = add_field_to( event=event, - output_field=full_output_field, + target_field=full_output_field, content=value, extends_lists=False, overwrite_output_field=rule.overwrite_target, diff --git a/logprep/util/helper.py b/logprep/util/helper.py index ac0f44e58..900abd6da 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -1,5 +1,6 @@ """This module contains helper functions that are shared by different modules.""" +import itertools import re import sys from functools import lru_cache, partial, reduce @@ -40,14 +41,6 @@ def print_fcolor(fore: AnsiFore, message: str): color_print_line(None, fore, message) -def _add_and_overwrite_key(sub_dict, key): - current_value = sub_dict.get(key) - if isinstance(current_value, dict): - return current_value - sub_dict.update({key: {}}) - return sub_dict.get(key) - - def _add_and_not_overwrite_key(sub_dict, key): current_value = sub_dict.get(key) if isinstance(current_value, dict): 
@@ -58,9 +51,30 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) +def add_field_to_silent_fail(*args): + try: + add_field_to(*args) + except FieldExistsWarning: + return args[1] + + +def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output_field=False): + unsuccessful_targets = map( + add_field_to_silent_fail, + itertools.repeat(event, len(targets)), + targets, + contents, + itertools.repeat(extends_lists, len(targets)), + itertools.repeat(overwrite_output_field, len(targets)), + ) + unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] + if unsuccessful_targets: + raise FieldExistsWarning(event, unsuccessful_targets) + + def add_field_to( event, - output_field, + target_field, content, extends_lists=False, overwrite_output_field=False, @@ -72,7 +86,7 @@ def add_field_to( ---------- event: dict Original log-event that logprep is currently processing - output_field: str + target_field: str Dotted subfield string indicating the target of the output value, e.g. 
destination.ip content: str, float, int, list, dict Value that should be written into the output_field, can be a str, list, or dict object @@ -90,12 +104,12 @@ def add_field_to( """ if extends_lists and overwrite_output_field: raise ValueError("An output field can't be overwritten and extended at the same time") - field_path = [event, *get_dotted_field_list(output_field)] + field_path = [event, *get_dotted_field_list(target_field)] target_key = field_path.pop() try: target_parent = reduce(_add_and_not_overwrite_key, field_path) except KeyError as error: - raise FieldExistsWarning(event, [output_field]) from error + raise FieldExistsWarning(event, [target_field]) from error if overwrite_output_field: target_parent[target_key] = content else: @@ -104,7 +118,7 @@ def add_field_to( target_parent[target_key] = content return if not extends_lists or not isinstance(existing_value, list): - raise FieldExistsWarning(event, [output_field]) + raise FieldExistsWarning(event, [target_field]) if isinstance(content, list): target_parent[target_key].extend(content) else: @@ -155,7 +169,7 @@ def get_dotted_field_value(event: dict, dotted_field: str) -> Optional[Union[dic @lru_cache(maxsize=100000) def get_dotted_field_list(dotted_field: str) -> list[str]: """make lookup of dotted field in the dotted_field_lookup_table and ensures - it is added if not found. Additionally the string will be interned for faster + it is added if not found. Additionally, the string will be interned for faster followup lookups. 
Parameters From 64cf75aceca3277b31eafae0633b58b4b1a3ce1c Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Thu, 7 Nov 2024 17:28:57 +0100 Subject: [PATCH 04/38] fix domain_label_extractor tests --- logprep/abc/processor.py | 4 +--- .../domain_label_extractor/processor.py | 23 +++++++------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index 34c7e21cb..832e5d104 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -381,15 +381,13 @@ def _has_missing_values(self, event, rule, source_field_dict): return False def _write_target_field(self, event: dict, rule: "Rule", result: any) -> None: - add_successful = add_field_to( + add_field_to( event, target_field=rule.target_field, content=result, extends_lists=rule.extend_target_list, overwrite_output_field=rule.overwrite_target, ) - if not add_successful: - raise FieldExistsWarning(rule, event, [rule.target_field]) def setup(self): super().setup() diff --git a/logprep/processor/domain_label_extractor/processor.py b/logprep/processor/domain_label_extractor/processor.py index afb3e84bc..342d74e4b 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ b/logprep/processor/domain_label_extractor/processor.py @@ -46,11 +46,10 @@ from filelock import FileLock from tldextract import TLDExtract -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.domain_label_extractor.rule import DomainLabelExtractorRule from logprep.processor.field_manager.processor import FieldManager from logprep.util.getter import GetterFactory -from logprep.util.helper import add_and_overwrite, add_field_to, get_dotted_field_value +from logprep.util.helper import add_and_overwrite, get_dotted_field_value, add_batch_to from logprep.util.validators import list_of_urls_validator logger = logging.getLogger("DomainLabelExtractor") @@ -136,19 +135,13 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): labels = 
self._tld_extractor(domain) if labels.suffix != "": - labels_dict = { - "registered_domain": labels.domain + "." + labels.suffix, - "top_level_domain": labels.suffix, - "subdomain": labels.subdomain, - } - for label, value in labels_dict.items(): - output_field = f"{rule.target_field}.{label}" - add_successful = add_field_to( - event, output_field, value, overwrite_output_field=rule.overwrite_target - ) - - if not add_successful: - raise FieldExistsWarning(rule, event, [output_field]) + targets = [ + f"{rule.target_field}.registered_domain", + f"{rule.target_field}.top_level_domain", + f"{rule.target_field}.subdomain", + ] + contents = [f"{labels.domain}.{labels.suffix}", labels.suffix, labels.subdomain] + add_batch_to(event, targets, contents, overwrite_output_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") add_and_overwrite(event, self._config.tagging_field_name, tagging_field) From f3da215bf561668a3f919624aaf679a4f3d74cdf Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Thu, 7 Nov 2024 17:39:06 +0100 Subject: [PATCH 05/38] fix generic_adder tests --- logprep/processor/base/exceptions.py | 2 +- logprep/processor/generic_adder/processor.py | 22 ++++---------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index cb5a444ed..05c852cf8 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -91,6 +91,6 @@ def __init__( message = ( "The following fields could not be written, because " "one or more subfields existed and could not be extended: " - f"{skipped_fields}" + f"{', '.join(skipped_fields)}" ) super().__init__(message, event, rule) diff --git a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index f3e266438..842f9894e 100644 --- a/logprep/processor/generic_adder/processor.py +++ 
b/logprep/processor/generic_adder/processor.py @@ -46,10 +46,9 @@ from logprep.abc.processor import Processor from logprep.factory_error import InvalidConfigurationError -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.generic_adder.mysql_connector import MySQLConnector from logprep.processor.generic_adder.rule import GenericAdderRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import get_dotted_field_value, add_batch_to def sql_config_validator(_, attribute, value): @@ -225,8 +224,6 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): FieldExistsWarning Raises if an addition would overwrite an existing field or value. """ - conflicting_fields = [] - use_db = rule.db_target and self._db_table if use_db: self._update_db_table() @@ -234,20 +231,9 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): else: items_to_add = rule.add.items() - # Add the items to the event - for dotted_field, value in items_to_add: - add_successful = add_field_to( - event, - target_field=dotted_field, - content=value, - extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, - ) - if not add_successful: - conflicting_fields.append(dotted_field) - - if conflicting_fields: - raise FieldExistsWarning(rule, event, conflicting_fields) + if items_to_add: + targets, contents = zip(*items_to_add) + add_batch_to(event, targets, contents, rule.extend_target_list, rule.overwrite_target) def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> list: """Get the sub part of the value from the event using a regex pattern""" From 28fd58ca0458c6bc7c55c88c6457dc2b78096c27 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Thu, 7 Nov 2024 18:20:54 +0100 Subject: [PATCH 06/38] fix geoip_enricher tests --- logprep/processor/geoip_enricher/processor.py | 31 +++++++++---------- logprep/util/helper.py | 18 ++++++++--- 2 files changed, 28 insertions(+), 21 
deletions(-) diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index 3e83ca044..b83791218 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -38,11 +38,10 @@ from geoip2 import database from geoip2.errors import AddressNotFoundError -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.geoip_enricher.rule import GEOIP_DATA_STUBS, GeoipEnricherRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import get_dotted_field_value, add_batch_to logger = logging.getLogger("GeoipEnricher") @@ -129,18 +128,16 @@ def _apply_rules(self, event, rule): geoip_data = self._try_getting_geoip_data(ip_string) if not geoip_data: return - for target_subfield, value in geoip_data.items(): - if value is None: - continue - full_output_field = f"{rule.target_field}.{target_subfield}" - if target_subfield in rule.customize_target_subfields: - full_output_field = rule.customize_target_subfields.get(target_subfield) - adding_was_successful = add_field_to( - event=event, - target_field=full_output_field, - content=value, - extends_lists=False, - overwrite_output_field=rule.overwrite_target, - ) - if not adding_was_successful: - raise FieldExistsWarning(rule, event, [full_output_field]) + filtered_geoip_data = {k: v for k, v in geoip_data.items() if v is not None} + targets, contents = zip(*filtered_geoip_data.items()) + targets = [ + rule.customize_target_subfields.get(target, f"{rule.target_field}.{target}") + for target in targets + ] + add_batch_to( + event, + targets, + contents, + extends_lists=False, + overwrite_output_field=rule.overwrite_target, + ) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 900abd6da..34ef10d05 100644 --- a/logprep/util/helper.py +++ 
b/logprep/util/helper.py @@ -41,6 +41,14 @@ def print_fcolor(fore: AnsiFore, message: str): color_print_line(None, fore, message) +def _add_and_overwrite_key(sub_dict, key): + current_value = sub_dict.get(key) + if isinstance(current_value, dict): + return current_value + sub_dict.update({key: {}}) + return sub_dict.get(key) + + def _add_and_not_overwrite_key(sub_dict, key): current_value = sub_dict.get(key) if isinstance(current_value, dict): @@ -106,13 +114,15 @@ def add_field_to( raise ValueError("An output field can't be overwritten and extended at the same time") field_path = [event, *get_dotted_field_list(target_field)] target_key = field_path.pop() - try: - target_parent = reduce(_add_and_not_overwrite_key, field_path) - except KeyError as error: - raise FieldExistsWarning(event, [target_field]) from error + if overwrite_output_field: + target_parent = reduce(_add_and_overwrite_key, field_path) target_parent[target_key] = content else: + try: + target_parent = reduce(_add_and_not_overwrite_key, field_path) + except KeyError as error: + raise FieldExistsWarning(event, [target_field]) from error existing_value = target_parent.get(target_key) if existing_value is None: target_parent[target_key] = content From ad52e08c9ab5b4f31de521ebb1e35dccb18ff3f4 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 10:35:39 +0100 Subject: [PATCH 07/38] fig grokker processor --- logprep/processor/grokker/processor.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index ce968a827..ad8872a2b 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -46,7 +46,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.grokker.rule import GrokkerRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value +from 
logprep.util.helper import add_field_to, get_dotted_field_value, add_batch_to logger = logging.getLogger("Grokker") @@ -89,20 +89,19 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): if result is None or result == {}: continue matches.append(True) - for dotted_field, value in result.items(): - if value is None: - continue - success = add_field_to( - event, dotted_field, value, rule.extend_target_list, rule.overwrite_target - ) - if not success: - conflicting_fields.append(dotted_field) + filtered_items = {k: v for k, v in result.items() if v is not None} + targets, contents = zip(*filtered_items.items()) + add_batch_to( + event, + targets, + contents, + extends_lists=rule.extend_target_list, + overwrite_output_field=rule.overwrite_target, + ) if self._handle_missing_fields(event, rule, rule.actions.keys(), source_values): return - if conflicting_fields: - raise FieldExistsWarning(rule, event, conflicting_fields) if not matches: - raise ProcessingWarning("no grok pattern matched", rule, event) + raise ProcessingWarning("no grok pattern matched", event, rule) def setup(self): """Loads the action mapping. 
Has to be called before processing""" From 817d980f1f46b2ed1efba61a1a4bee9fb2530e07 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 10:36:58 +0100 Subject: [PATCH 08/38] fig ip_informer processor --- logprep/processor/ip_informer/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logprep/processor/ip_informer/processor.py b/logprep/processor/ip_informer/processor.py index 1c75bf702..5d8ce35a4 100644 --- a/logprep/processor/ip_informer/processor.py +++ b/logprep/processor/ip_informer/processor.py @@ -54,7 +54,7 @@ def _apply_rules(self, event: dict, rule: IpInformerRule) -> None: if results: self._write_target_field(event, rule, results) for msg, error in self._processing_warnings: - raise ProcessingWarning(msg, rule, event) from error + raise ProcessingWarning(msg, event, rule) from error def _get_results(self, ip_address_list: Iterable, rule: IpInformerRule) -> dict: results = [(ip, self._ip_properties(ip, rule)) for ip in ip_address_list] From ad8d0d4686d6b490cb388aab7a8f7af756719af4 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 12:04:41 +0100 Subject: [PATCH 09/38] fig labeler tests processor --- logprep/processor/labeler/processor.py | 43 ++++++-------------------- logprep/util/helper.py | 15 +++++++++ 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 97fb7b21d..5473e697c 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -33,7 +33,10 @@ from logprep.abc.processor import Processor from logprep.processor.labeler.labeling_schema import LabelingSchema from logprep.processor.labeler.rule import LabelerRule -from logprep.util.helper import add_field_to, get_dotted_field_value, add_and_overwrite +from logprep.util.helper import ( + get_dotted_field_value, + add_batch_to_silent_fail, +) class Labeler(Processor): @@ -73,35 +76,9 @@ def setup(self): def 
_apply_rules(self, event, rule): """Applies the rule to the current event""" - self._add_label_fields(event, rule) - self._add_label_values(event, rule) - self._convert_label_categories_to_sorted_list(event) - - @staticmethod - def _add_label_fields(event: dict, rule: LabelerRule): - """Prepares the event by adding empty label fields""" - add_field_to(event, "label", {}) - for key in rule.label: - add_field_to(event, f"label.{key}", set()) - - @staticmethod - def _add_label_values(event: dict, rule: LabelerRule): - """Adds the labels from the rule to the event""" - for key in rule.label: - label_key = f"label.{key}" - label = get_dotted_field_value(event, label_key) - if not isinstance(label, set): - label = set(label) - add_and_overwrite(event, label_key, label) - label.update(rule.label[key]) - - @staticmethod - def _convert_label_categories_to_sorted_list(event: dict): - label = get_dotted_field_value(event, "label") - if label is None: - return - for category in label: - category_key = f"label.{category}" - category_value = get_dotted_field_value(event, category_key) - sorted_category = sorted(list(category_value)) - add_and_overwrite(event, category_key, sorted_category) + targets = [f"label.{key}" for key in rule.label.keys()] + contents = rule.label.values() + add_batch_to_silent_fail(event, targets, contents) + # convert sets into sorted lists + contents = [sorted(list(get_dotted_field_value(event, target))) for target in targets] + add_batch_to_silent_fail(event, targets, contents, overwrite_output_field=True) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 34ef10d05..9ba563471 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -80,6 +80,13 @@ def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output raise FieldExistsWarning(event, unsuccessful_targets) +def add_batch_to_silent_fail(*args, **kwargs): + try: + add_batch_to(*args, **kwargs) + except FieldExistsWarning: + ... 
+ + def add_field_to( event, target_field, @@ -309,6 +316,14 @@ def add_and_overwrite(event, target_field, content, *_): add_field_to(event, target_field, content, overwrite_output_field=True) +def add_and_overwrite_silent_fail(event, target_field, content, *_): + """wrapper for add_field_to""" + try: + add_field_to(event, target_field, content, overwrite_output_field=True) + except FieldExistsWarning: + ... + + def append(event, target_field, content, separator): """appends to event""" target_value = get_dotted_field_value(event, target_field) From 9dfd7abaa17b7e4d32b848435597c3818a144f60 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 12:08:03 +0100 Subject: [PATCH 10/38] fig list_comparison processor tests --- logprep/processor/list_comparison/processor.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index e3166ec2e..deb350b68 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -36,13 +36,6 @@ from logprep.util.helper import add_field_to, get_dotted_field_value -class ListComparisonError(Exception): - """Base class for ListComparison related exceptions.""" - - def __init__(self, name: str, message: str): - super().__init__(f"ListComparison ({name}): {message}") - - class ListComparison(Processor): """Resolve values in documents by referencing a mapping list.""" @@ -79,14 +72,10 @@ def _apply_rules(self, event, rule): Currently applied list comparison rule. 
""" - comparison_result, comparison_key = self._list_comparison(rule, event) - if comparison_result is not None: output_field = f"{ rule.target_field }.{ comparison_key }" - add_successful = add_field_to(event, output_field, comparison_result, True) - if not add_successful: - raise FieldExistsWarning(rule, event, [output_field]) + add_field_to(event, output_field, comparison_result, True) def _list_comparison(self, rule: ListComparisonRule, event: dict): """ From d7582f9993ce2950ea838066975d672c1efec2e6 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 12:14:45 +0100 Subject: [PATCH 11/38] fig pre_detector processor tests --- logprep/processor/pre_detector/processor.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/logprep/processor/pre_detector/processor.py b/logprep/processor/pre_detector/processor.py index 9533cf1a1..7d8378108 100644 --- a/logprep/processor/pre_detector/processor.py +++ b/logprep/processor/pre_detector/processor.py @@ -103,14 +103,11 @@ def normalize_timestamp(self, rule: PreDetectorRule, timestamp: str) -> str: parsed_datetime.astimezone(rule.target_timezone).isoformat().replace("+00:00", "Z") ) except TimeParserException as error: - error_message = "Could not parse timestamp" - raise ( - ProcessingWarning( - error_message, - rule, - self.result.event, - tags=["_pre_detector_timeparsing_failure"], - ) + raise ProcessingWarning( + "Could not parse timestamp", + self.result.event, + rule, + tags=["_pre_detector_timeparsing_failure"], ) from error def _apply_rules(self, event: dict, rule: PreDetectorRule): @@ -130,7 +127,6 @@ def _get_detection_result(self, event: dict, rule: PreDetectorRule): if pre_detection_id is None: pre_detection_id = str(uuid4()) add_field_to(event, "pre_detection_id", pre_detection_id) - detection_result = self._generate_detection_result(pre_detection_id, event, rule) self.result.data.append((detection_result, self._config.outputs)) From b124195a20002e132ab52a6a833a4c98748fda35 
Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 12:48:18 +0100 Subject: [PATCH 12/38] fix requester processor tests --- logprep/processor/base/exceptions.py | 1 + logprep/processor/requester/processor.py | 35 ++++++++----------- logprep/util/helper.py | 4 +-- .../processor/requester/test_requester.py | 21 +++++++++++ 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index 05c852cf8..4ecb9008d 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -88,6 +88,7 @@ def __init__( skipped_fields: List[str], rule: "Rule" = None, ): + self.skipped_fields = skipped_fields message = ( "The following fields could not be written, because " "one or more subfields existed and could not be extended: " diff --git a/logprep/processor/requester/processor.py b/logprep/processor/requester/processor.py index 6b863e045..b94a547fc 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -45,9 +45,9 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.requester.rule import RequesterRule from logprep.util.helper import ( - add_field_to, - get_dotted_field_value, get_source_fields_dict, + add_field_to_silent_fail, + add_batch_to_silent_fail, ) TEMPLATE_KWARGS = ("url", "json", "data", "params") @@ -71,33 +71,28 @@ def _apply_rules(self, event, rule): self._handle_response(event, rule, response) def _handle_response(self, event, rule, response): - conflicting_fields = [] + failed_targets = [] if rule.target_field: result = self._get_result(response) - successful = add_field_to( + failed_target = add_field_to_silent_fail( event, rule.target_field, result, rule.extend_target_list, rule.overwrite_target, ) - if not successful: - conflicting_fields.append(rule.target_field) + failed_targets.append(failed_target) if rule.target_field_mapping: - result = 
self._get_result(response) - for source_field, target_field in rule.target_field_mapping.items(): - source_field_value = get_dotted_field_value(result, source_field) - successful = add_field_to( - event, - target_field, - source_field_value, - rule.extend_target_list, - rule.overwrite_target, - ) - if not successful: - conflicting_fields.append(rule.target_field) - if conflicting_fields: - raise FieldExistsWarning(rule, event, [rule.target_field]) + source_fields = rule.target_field_mapping.keys() + contents = self._get_field_values(self._get_result(response), source_fields) + targets = rule.target_field_mapping.values() + failed = add_batch_to_silent_fail( + event, targets, contents, rule.extend_target_list, rule.overwrite_target + ) + failed_targets.append(failed) + failed_targets = [failed for failed in failed_targets if failed is not None] + if failed_targets: + raise FieldExistsWarning(event, failed_targets, rule) def _request(self, event, rule, kwargs): try: diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 9ba563471..e18be080a 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -83,8 +83,8 @@ def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output def add_batch_to_silent_fail(*args, **kwargs): try: add_batch_to(*args, **kwargs) - except FieldExistsWarning: - ... 
+ except FieldExistsWarning as error: + return error.skipped_fields def add_field_to( diff --git a/tests/unit/processor/requester/test_requester.py b/tests/unit/processor/requester/test_requester.py index f9ba5d1f9..17decf03a 100644 --- a/tests/unit/processor/requester/test_requester.py +++ b/tests/unit/processor/requester/test_requester.py @@ -171,6 +171,27 @@ "status": 200, }, ), + ( + "use target_field and target_field_mapping at the same time, with error in target_field", + { + "filter": "message", + "requester": { + "url": "http://mock-mock/", + "method": "GET", + "target_field": "message", # will fail as it is already present + "target_field_mapping": {"key1.key2.key3": "result.custom"}, + }, + }, + {"message": "the message"}, + {"message": "the message", "result": {"custom": "value"}, "tags": ["_requester_failure"]}, + { + "method": "GET", + "url": "http://mock-mock/", + "json": {"key1": {"key2": {"key3": "value"}}}, + "content_type": "text/plain", + "status": 200, + }, + ), ( "parses json result with simple target field mapping and overwrite target", { From 1ea252686458930db2bb320c11ee38a14e7cf3c9 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 13:59:31 +0100 Subject: [PATCH 13/38] fix ProcessingWarning init --- logprep/processor/string_splitter/processor.py | 2 +- tests/unit/framework/test_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/logprep/processor/string_splitter/processor.py b/logprep/processor/string_splitter/processor.py index f2b94e260..9d06d335d 100644 --- a/logprep/processor/string_splitter/processor.py +++ b/logprep/processor/string_splitter/processor.py @@ -42,6 +42,6 @@ def _apply_rules(self, event: dict, rule: StringSplitterRule): source_field_content = get_dotted_field_value(event, source_field) self._handle_missing_fields(event, rule, rule.source_fields, [source_field_content]) if not isinstance(source_field_content, str): - raise ProcessingWarning(f"source_field '{source_field}' is not a 
string", rule, event) + raise ProcessingWarning(f"source_field '{source_field}' is not a string", event, rule) result = source_field_content.split(rule.delimeter) self._write_target_field(event, rule, result) diff --git a/tests/unit/framework/test_pipeline.py b/tests/unit/framework/test_pipeline.py index 3dec94242..3e70794e8 100644 --- a/tests/unit/framework/test_pipeline.py +++ b/tests/unit/framework/test_pipeline.py @@ -253,7 +253,7 @@ def test_processor_warning_error_is_logged_but_processing_continues(self, mock_w self.pipeline._setup() self.pipeline._input.get_next.return_value = {"message": "test"} mock_rule = mock.MagicMock() - processing_warning = ProcessingWarning("not so bad", mock_rule, {"message": "test"}) + processing_warning = ProcessingWarning("not so bad", {"message": "test"}, mock_rule) self.pipeline._pipeline[1].process.return_value = ProcessorResult( processor_name="mock_processor", warnings=[processing_warning] ) From adfd50957769d51f56d20e55be13dc1f31721840 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 14:50:42 +0100 Subject: [PATCH 14/38] fix timestamper processor --- logprep/processor/timestamper/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logprep/processor/timestamper/processor.py b/logprep/processor/timestamper/processor.py index 71f2bcaa3..ce1c579b3 100644 --- a/logprep/processor/timestamper/processor.py +++ b/logprep/processor/timestamper/processor.py @@ -61,4 +61,4 @@ def _apply_rules(self, event, rule): parsed_successfully = True break if not parsed_successfully: - raise ProcessingWarning(str("Could not parse timestamp"), rule, event) + raise ProcessingWarning(str("Could not parse timestamp"), event, rule) From f68d7c3b43930734a0d7db0c8b6f9e010afa316c Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 14:51:43 +0100 Subject: [PATCH 15/38] fix template_replacer processor --- logprep/processor/template_replacer/processor.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 
deletions(-) diff --git a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index 7a0502809..538d635c1 100644 --- a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -114,16 +114,8 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl If target value isn't None, then it exists and its parents must be dicts. Therefore, they wouldn't be replaced, and we can overwrite the existing target field. """ - if get_dotted_field_value(event, self._target_field) is None: - add_successful = add_field_to( - event, - self._target_field, - replacement, - ) - if not add_successful: - raise FieldExistsWarning(rule, event, [self._target_field]) - else: - add_field_to(event, self._target_field, replacement, overwrite_output_field=True) + overwrite = get_dotted_field_value(event, self._target_field) is not None + add_field_to(event, self._target_field, replacement, overwrite_output_field=overwrite) def setup(self): super().setup() From 66f6696dbb18693e2921997b303ed379a02dff30 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 15:38:40 +0100 Subject: [PATCH 16/38] fix input connector tests --- logprep/abc/input.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 8fe125648..6c89cbb2c 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -18,7 +18,7 @@ from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value, add_field_to_silent_fail from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -310,7 +310,7 @@ def _add_env_enrichment_to_event(self, event: dict): def 
_add_arrival_time_information_to_event(self, event: dict): now = TimeParser.now() target_field = self._config.preprocessing.get("log_arrival_time_target_field") - add_field_to(event, target_field, now.isoformat()) + add_field_to_silent_fail(event, target_field, now.isoformat()) def _add_arrival_timedelta_information_to_event(self, event: dict): log_arrival_timedelta_config = self._config.preprocessing.get("log_arrival_timedelta") @@ -332,7 +332,7 @@ def _add_version_information_to_event(self, event: dict): """Add the version information to the event""" target_field = self._config.preprocessing.get("version_info_target_field") # pylint: disable=protected-access - add_field_to(event, target_field, self._config._version_information) + add_field_to_silent_fail(event, target_field, self._config._version_information) # pylint: enable=protected-access def _add_hmac_to(self, event_dict, raw_event) -> dict: From 113bb256c4ced24ca68b587ccd3bbbc1e31b29c5 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 15:54:20 +0100 Subject: [PATCH 17/38] fix FieldExistsWarning init tests --- tests/unit/exceptions/base.py | 2 +- tests/unit/exceptions/test_processing_exceptions.py | 2 +- tests/unit/framework/test_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/exceptions/base.py b/tests/unit/exceptions/base.py index 72013e228..3a658ad1f 100644 --- a/tests/unit/exceptions/base.py +++ b/tests/unit/exceptions/base.py @@ -22,7 +22,7 @@ class ExceptionBaseTest: def setup_method(self): self.object = Rule._create_from_dict({"filter": "message", "rule": {}}) self.event = {"message": "test_event"} - self.exception_args = ("the error message", self.object, self.event) + self.exception_args = ("the error message", self.event, self.object) def test_error_message(self): with pytest.raises(self.exception, match=self.error_message): diff --git a/tests/unit/exceptions/test_processing_exceptions.py b/tests/unit/exceptions/test_processing_exceptions.py 
index 52c8534b6..5d24244ef 100644 --- a/tests/unit/exceptions/test_processing_exceptions.py +++ b/tests/unit/exceptions/test_processing_exceptions.py @@ -35,7 +35,7 @@ class TestFieldExistsWarning(ExceptionBaseTest): def setup_method(self): super().setup_method() - self.exception_args = (self.object, self.event, ["my_field"]) + self.exception_args = (self.event, ["my_field"], self.object) class TestProcessingCriticalError(ExceptionBaseTest): diff --git a/tests/unit/framework/test_pipeline.py b/tests/unit/framework/test_pipeline.py index 3e70794e8..2c8f4b8f7 100644 --- a/tests/unit/framework/test_pipeline.py +++ b/tests/unit/framework/test_pipeline.py @@ -308,7 +308,7 @@ def test_processor_logs_processing_error_and_warnings_separately( mock_create({"mock_processor1": {"type": "mock_processor"}}), mock_create({"mock_processor2": {"type": "mock_processor"}}), ] - warning = FieldExistsWarning(mock_rule, input_event1, ["foo"]) + warning = FieldExistsWarning(input_event1, ["foo"], mock_rule) self.pipeline._pipeline[0].process.return_value = ProcessorResult( processor_name="", warnings=[warning] ) From 1db38f3fc7cb3842a21b1d04b5a70d87a80e9c24 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Fri, 8 Nov 2024 16:49:30 +0100 Subject: [PATCH 18/38] fix auto_rule_tester - one test is still broken, needs further investigation why --- .../rules/generic/rule_with_custom_tests.yml | 3 ++- .../rules/specific/rule_with_custom_tests.yml | 3 ++- .../dissector/rules/specific/auto_test_mismatch.json | 3 ++- .../dissector/rules/specific/auto_test_no_test_.json | 3 ++- .../auto_tests/dropper/rules/generic/drop_field.json | 3 ++- .../auto_tests/dropper/rules/specific/drop_field.json | 3 ++- .../rules/generic/auto_test_labeling_match.json | 3 ++- .../generic/auto_test_labeling_match_existing.json | 3 ++- .../specific/auto_test_pre_detector_mismatch.json | 4 ++-- .../specific/auto_test_pre_detector_no_test_.json | 4 ++-- .../rules/generic/template_replacer.json | 6 ++++-- 
.../rules/specific/template_replacer.json | 6 ++++-- .../template_replacer/test_template_replacer.py | 11 +++++++++++ tests/unit/util/test_auto_rule_tester.py | 2 ++ 14 files changed, 41 insertions(+), 16 deletions(-) diff --git a/tests/testdata/auto_tests/clusterer/rules/generic/rule_with_custom_tests.yml b/tests/testdata/auto_tests/clusterer/rules/generic/rule_with_custom_tests.yml index d6e6c21a6..aa91c5ed4 100644 --- a/tests/testdata/auto_tests/clusterer/rules/generic/rule_with_custom_tests.yml +++ b/tests/testdata/auto_tests/clusterer/rules/generic/rule_with_custom_tests.yml @@ -1,9 +1,10 @@ filter: message clusterer: + id: clusterer-rule-2 source_fields: [message] pattern: '(bytes|Bytes|Byte)' repl: 'byte' description: '...' tests: raw: 'Byte is a Bytes is a bytes is a byte' - result: 'byte is a byte is a byte is a byte' \ No newline at end of file + result: 'byte is a byte is a byte is a byte' diff --git a/tests/testdata/auto_tests/clusterer/rules/specific/rule_with_custom_tests.yml b/tests/testdata/auto_tests/clusterer/rules/specific/rule_with_custom_tests.yml index d6e6c21a6..9e51adc01 100644 --- a/tests/testdata/auto_tests/clusterer/rules/specific/rule_with_custom_tests.yml +++ b/tests/testdata/auto_tests/clusterer/rules/specific/rule_with_custom_tests.yml @@ -1,9 +1,10 @@ filter: message clusterer: + id: clusterer-rule-1 source_fields: [message] pattern: '(bytes|Bytes|Byte)' repl: 'byte' description: '...' 
tests: raw: 'Byte is a Bytes is a bytes is a byte' - result: 'byte is a byte is a byte is a byte' \ No newline at end of file + result: 'byte is a byte is a byte is a byte' diff --git a/tests/testdata/auto_tests/dissector/rules/specific/auto_test_mismatch.json b/tests/testdata/auto_tests/dissector/rules/specific/auto_test_mismatch.json index a84bb8ffc..d566851b1 100644 --- a/tests/testdata/auto_tests/dissector/rules/specific/auto_test_mismatch.json +++ b/tests/testdata/auto_tests/dissector/rules/specific/auto_test_mismatch.json @@ -1,9 +1,10 @@ [{ "filter": "message", "dissector": { + "id": "dissector-1", "mapping": { "message": "%{source}-%{target}" } }, "description": "Test-rule with matching auto-test" -}] \ No newline at end of file +}] diff --git a/tests/testdata/auto_tests/dissector/rules/specific/auto_test_no_test_.json b/tests/testdata/auto_tests/dissector/rules/specific/auto_test_no_test_.json index 77537bb95..a99118ee8 100644 --- a/tests/testdata/auto_tests/dissector/rules/specific/auto_test_no_test_.json +++ b/tests/testdata/auto_tests/dissector/rules/specific/auto_test_no_test_.json @@ -1,9 +1,10 @@ [{ "filter": "message", "dissector": { + "id": "dissector-2", "mapping": { "message": "%{source} %{target}" } }, "description": "Test-rule with matching auto-test" -}] \ No newline at end of file +}] diff --git a/tests/testdata/auto_tests/dropper/rules/generic/drop_field.json b/tests/testdata/auto_tests/dropper/rules/generic/drop_field.json index 30ebc797a..884d8cd3e 100644 --- a/tests/testdata/auto_tests/dropper/rules/generic/drop_field.json +++ b/tests/testdata/auto_tests/dropper/rules/generic/drop_field.json @@ -2,9 +2,10 @@ { "filter": "drop_me", "dropper": { + "id": "dropper-1", "drop": [ "drop_me" ] } } -] \ No newline at end of file +] diff --git a/tests/testdata/auto_tests/dropper/rules/specific/drop_field.json b/tests/testdata/auto_tests/dropper/rules/specific/drop_field.json index 30ebc797a..6b561618a 100644 --- 
a/tests/testdata/auto_tests/dropper/rules/specific/drop_field.json +++ b/tests/testdata/auto_tests/dropper/rules/specific/drop_field.json @@ -2,9 +2,10 @@ { "filter": "drop_me", "dropper": { + "id": "dropper-2", "drop": [ "drop_me" ] } } -] \ No newline at end of file +] diff --git a/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match.json b/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match.json index 4162a3add..e90c0e9ed 100644 --- a/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match.json +++ b/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match.json @@ -2,6 +2,7 @@ { "filter": "some_field: (stop OR end)", "labeler": { + "id": "labeler-1", "label": { "action": [ "terminate" @@ -10,4 +11,4 @@ }, "description": "Test-rule with matching auto-test" } -] \ No newline at end of file +] diff --git a/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match_existing.json b/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match_existing.json index 4162a3add..7e5c50de4 100644 --- a/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match_existing.json +++ b/tests/testdata/auto_tests/labeler/rules/generic/auto_test_labeling_match_existing.json @@ -2,6 +2,7 @@ { "filter": "some_field: (stop OR end)", "labeler": { + "id": "labeler-2", "label": { "action": [ "terminate" @@ -10,4 +11,4 @@ }, "description": "Test-rule with matching auto-test" } -] \ No newline at end of file +] diff --git a/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_mismatch.json b/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_mismatch.json index a081ea335..98cf64b99 100644 --- a/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_mismatch.json +++ b/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_mismatch.json @@ -1,7 +1,7 @@ [{ "filter": "some_field", 
"pre_detector": { - "id": "SOME_TEST_RULE_ID", + "id": "SOME_TEST_RULE_ID_1", "title": "SOME_TEST_RULE", "severity": "critical", "mitre": [], @@ -9,4 +9,4 @@ }, "sigma_fields": true, "description": "Test-rule with mismatching auto-test" -}] \ No newline at end of file +}] diff --git a/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_no_test_.json b/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_no_test_.json index ebc751cac..e9f441735 100644 --- a/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_no_test_.json +++ b/tests/testdata/auto_tests/pre_detector/rules/specific/auto_test_pre_detector_no_test_.json @@ -1,7 +1,7 @@ [{ "filter": "some_field", "pre_detector": { - "id": "SOME_TEST_RULE_ID", + "id": "SOME_TEST_RULE_ID_2", "title": "SOME_TEST_RULE", "severity": "critical", "mitre": [], @@ -9,4 +9,4 @@ }, "sigma_fields": true, "description": "Test-rule without auto-test" -}] \ No newline at end of file +}] diff --git a/tests/testdata/auto_tests/template_replacer/rules/generic/template_replacer.json b/tests/testdata/auto_tests/template_replacer/rules/generic/template_replacer.json index 051249872..a2895b171 100644 --- a/tests/testdata/auto_tests/template_replacer/rules/generic/template_replacer.json +++ b/tests/testdata/auto_tests/template_replacer/rules/generic/template_replacer.json @@ -1,5 +1,7 @@ [{ "filter": "winlog.provider_name: \"the provider\" AND winlog.event_id: 123", - "template_replacer": {}, + "template_replacer": { + "id": "template-replacer-1" + }, "description": "" -}] \ No newline at end of file +}] diff --git a/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json b/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json index 9723abc3e..1073e6624 100644 --- a/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json +++ 
b/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json @@ -1,7 +1,9 @@ [ { "filter": "winlog.provider_name: \"the provider\" AND winlog.event_id: 123", - "template_replacer": {}, + "template_replacer": { + "id": "template-replacer-2" + }, "description": "" } -] \ No newline at end of file +] diff --git a/tests/unit/processor/template_replacer/test_template_replacer.py b/tests/unit/processor/template_replacer/test_template_replacer.py index 2eca9f0fd..008830ee9 100644 --- a/tests/unit/processor/template_replacer/test_template_replacer.py +++ b/tests/unit/processor/template_replacer/test_template_replacer.py @@ -163,3 +163,14 @@ def _create_template_replacer(self, config): template_replacer = Factory.create({"test instance": config}) template_replacer.setup() return template_replacer + + def test_replace_message_via_template(self): + document = { + "winlog": {"channel": "System", "provider_name": "Test", "event_id": 123}, + "message": "foo", + } + + self.object.process(document) + + assert document.get("message") + assert document["message"] == "Test %1 Test %2" diff --git a/tests/unit/util/test_auto_rule_tester.py b/tests/unit/util/test_auto_rule_tester.py index d66b1f3ac..6424a0557 100644 --- a/tests/unit/util/test_auto_rule_tester.py +++ b/tests/unit/util/test_auto_rule_tester.py @@ -7,6 +7,7 @@ import pytest from logprep.util.auto_rule_tester.auto_rule_tester import AutoRuleTester +from logprep.util.configuration import Configuration LOGGER = logging.getLogger() @@ -14,6 +15,7 @@ @pytest.fixture(name="auto_rule_tester") def fixture_auto_rule_tester(): config_path = "tests/testdata/config/config-auto-tests.yml" + Configuration.from_source(config_path)._verify() return AutoRuleTester(config_path) From 770a80ae0d3a33f7b5edd1c64601a9f675a3ac78 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Mon, 11 Nov 2024 11:25:54 +0100 Subject: [PATCH 19/38] fix and refactor generic_resolver --- .../processor/generic_resolver/processor.py | 93 
+++++++------------ tests/unit/util/test_helper_add_field.py | 7 ++ 2 files changed, 39 insertions(+), 61 deletions(-) diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index b2ca66c26..fec01512c 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -26,9 +26,7 @@ """ import re -from typing import Union -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.generic_resolver.rule import GenericResolverRule from logprep.util.helper import add_field_to, get_dotted_field_value @@ -41,67 +39,40 @@ class GenericResolver(FieldManager): def _apply_rules(self, event, rule): """Apply the given rule to the current event""" - conflicting_fields = [] - - source_values = [] + source_field_values = [ + get_dotted_field_value(event, source_field) + for source_field in rule.field_mapping.keys() + ] + self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_field_values) for source_field, target_field in rule.field_mapping.items(): - source_value = get_dotted_field_value(event, source_field) - source_values.append(source_value) - if source_value is None: + source_field_value = get_dotted_field_value(event, source_field) + if source_field_value is None: continue - - # FILE - if rule.resolve_from_file: - pattern = f'^{rule.resolve_from_file["pattern"]}$' - replacements = rule.resolve_from_file["additions"] - matches = re.match(pattern, source_value) - if matches: - dest_val = replacements.get(matches.group("mapping")) - if dest_val: - success = self._add_uniquely_to_list(event, rule, target_field, dest_val) - if not success: - conflicting_fields.append(target_field) - - # LIST - for pattern, dest_val in rule.resolve_list.items(): - if re.search(pattern, source_value): - success = add_field_to( - event, - target_field, - dest_val, - 
extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, - ) - if not success: - conflicting_fields.append(target_field) - break - self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_values) - if conflicting_fields: - raise FieldExistsWarning(rule, event, conflicting_fields) - - @staticmethod - def _add_uniquely_to_list( - event: dict, - rule: GenericResolverRule, - target: str, - content: Union[str, float, int, list, dict], - ) -> bool: - """Extend list if content is not already in the list""" - add_success = True - target_val = get_dotted_field_value(event, target) - target_is_list = isinstance(target_val, list) - if rule.extend_target_list and not target_is_list: - empty_list = [] - add_success &= add_field_to( + content = self._find_content_of_first_matching_pattern(rule, source_field_value) + if not content: + continue + current_content = get_dotted_field_value(event, target_field) + if isinstance(current_content, list) and content in current_content: + continue + if rule.extend_target_list and current_content is None: + content = [content] + add_field_to( event, - target, - empty_list, + target_field, + content, + extends_lists=rule.extend_target_list, overwrite_output_field=rule.overwrite_target, ) - if add_success: - target_is_list = True - target_val = empty_list - if target_is_list and content in target_val: - return add_success - add_success = add_field_to(event, target, content, extends_lists=rule.extend_target_list) - return add_success + + def _find_content_of_first_matching_pattern(self, rule, source_field_value): + if rule.resolve_from_file: + pattern = f'^{rule.resolve_from_file["pattern"]}$' + replacements = rule.resolve_from_file["additions"] + matches = re.match(pattern, source_field_value) + if matches: + content = replacements.get(matches.group("mapping")) + if content: + return content + for pattern, content in rule.resolve_list.items(): + if re.search(pattern, source_field_value): + return 
content diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index 124043cf9..d622fdd37 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -122,3 +122,10 @@ def test_add_list_with_nested_keys(self): } add_field_to(testdict, "key1.key2.key3.key4.key5.list", ["content"], extends_lists=True) assert testdict == expected + + def test_add_value_not_as_list_if_it_is_a_new_value_even_though_extends_lists_is_true(self): + document = { + "some": "field", + } + add_field_to(document, "new", "list", extends_lists=True) + assert document.get("new") == "list" From 52d1823c309eb3e33efb86f3033f79afd5e3ef41 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Mon, 11 Nov 2024 11:31:56 +0100 Subject: [PATCH 20/38] fix and refactor hyperscan_resolver --- .../processor/generic_resolver/processor.py | 10 +++- .../processor/hyperscan_resolver/processor.py | 51 +++++++------------ logprep/util/helper.py | 4 +- 3 files changed, 28 insertions(+), 37 deletions(-) diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index fec01512c..9315713fe 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -27,9 +27,10 @@ import re +from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.generic_resolver.rule import GenericResolverRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import get_dotted_field_value, add_field_to_silent_fail class GenericResolver(FieldManager): @@ -44,6 +45,7 @@ def _apply_rules(self, event, rule): for source_field in rule.field_mapping.keys() ] self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_field_values) + conflicting_fields = [] for source_field, target_field in rule.field_mapping.items(): 
source_field_value = get_dotted_field_value(event, source_field) if source_field_value is None: @@ -56,13 +58,17 @@ def _apply_rules(self, event, rule): continue if rule.extend_target_list and current_content is None: content = [content] - add_field_to( + failed_target = add_field_to_silent_fail( event, target_field, content, extends_lists=rule.extend_target_list, overwrite_output_field=rule.overwrite_target, ) + if failed_target: + conflicting_fields.append(failed_target) + if conflicting_fields: + raise FieldExistsWarning(event, conflicting_fields, rule) def _find_content_of_first_matching_pattern(self, rule, source_field_value): if rule.resolve_from_file: diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index d942d1ee2..8dedbc4a1 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -33,7 +33,7 @@ import errno from os import makedirs, path -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Tuple from attr import define, field @@ -43,7 +43,7 @@ ProcessingCriticalError, ) from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import get_dotted_field_value, add_field_to_silent_fail from logprep.util.validators import directory_validator # pylint: disable=no-name-in-module @@ -57,6 +57,7 @@ # pylint: disable=ungrouped-imports from logprep.processor.hyperscan_resolver.rule import HyperscanResolverRule + # pylint: enable=ungrouped-imports @@ -113,39 +114,23 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): if matches: dest_val = pattern_id_to_dest_val_map[matches[matches.index(min(matches))]] if dest_val: - add_success = self._add_uniquely_to_list(event, rule, resolve_target, dest_val) - if not add_success: - conflicting_fields.append(resolve_target) + current_content = 
get_dotted_field_value(event, resolve_target) + if isinstance(current_content, list) and dest_val in current_content: + continue + if rule.extend_target_list and current_content is None: + dest_val = [dest_val] + failed_target = add_field_to_silent_fail( + event, + resolve_target, + dest_val, + extends_lists=rule.extend_target_list, + overwrite_output_field=rule.overwrite_target, + ) + if failed_target: + conflicting_fields.append(failed_target) self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_values) if conflicting_fields: - raise FieldExistsWarning(rule, event, conflicting_fields) - - @staticmethod - def _add_uniquely_to_list( - event: dict, - rule: HyperscanResolverRule, - target: str, - content: Union[str, float, int, list, dict], - ) -> bool: - """Extend list if content is not already in the list""" - add_success = True - target_val = get_dotted_field_value(event, target) - target_is_list = isinstance(target_val, list) - if rule.extend_target_list and not target_is_list: - empty_list = [] - add_success &= add_field_to( - event, - target, - empty_list, - overwrite_output_field=rule.overwrite_target, - ) - if add_success: - target_is_list = True - target_val = empty_list - if target_is_list and content in target_val: - return add_success - add_success = add_field_to(event, target, content, extends_lists=rule.extend_target_list) - return add_success + raise FieldExistsWarning(event, conflicting_fields, rule) @staticmethod def _match_with_hyperscan(hyperscan_db: Database, src_val: str) -> list: diff --git a/logprep/util/helper.py b/logprep/util/helper.py index e18be080a..16c5efc31 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -59,9 +59,9 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) -def add_field_to_silent_fail(*args): +def add_field_to_silent_fail(*args, **kwargs): try: - add_field_to(*args) + add_field_to(*args, **kwargs) except FieldExistsWarning: return args[1] From 
28d12b0303d511bbae122ce1e0aa41c6d1ca0aeb Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Mon, 11 Nov 2024 11:39:49 +0100 Subject: [PATCH 21/38] fix auto_rule_tester --- tests/unit/util/test_auto_rule_tester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/util/test_auto_rule_tester.py b/tests/unit/util/test_auto_rule_tester.py index 6424a0557..ffbd7ab85 100644 --- a/tests/unit/util/test_auto_rule_tester.py +++ b/tests/unit/util/test_auto_rule_tester.py @@ -281,8 +281,8 @@ def test_full_auto_rule_test_run(self, auto_rule_tester, capsys): ] expected_overall_results = [ - "+ Successful Tests: 32", - "- Failed Tests: 6", + "+ Successful Tests: 31", + "- Failed Tests: 7", "~ Warning: 2", "Rule Test Coverage: 72.72727272727273", "Total Tests: 38", From 04e115dc53fdd9e5f67b953f8bc0dbdb3699a1c9 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Mon, 11 Nov 2024 12:26:06 +0100 Subject: [PATCH 22/38] fix labeler and add new test --- logprep/processor/labeler/processor.py | 4 ++-- logprep/util/helper.py | 2 +- tests/unit/processor/labeler/test_labeler.py | 8 ++++++++ tests/unit/util/test_auto_rule_tester.py | 4 ++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 5473e697c..fba67e310 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -78,7 +78,7 @@ def _apply_rules(self, event, rule): """Applies the rule to the current event""" targets = [f"label.{key}" for key in rule.label.keys()] contents = rule.label.values() - add_batch_to_silent_fail(event, targets, contents) + add_batch_to_silent_fail(event, targets, contents, extends_lists=True) # convert sets into sorted lists - contents = [sorted(list(get_dotted_field_value(event, target))) for target in targets] + contents = [sorted(set(get_dotted_field_value(event, target))) for target in targets] add_batch_to_silent_fail(event, targets, contents, 
overwrite_output_field=True) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 16c5efc31..7dbcd1107 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -136,7 +136,7 @@ def add_field_to( return if not extends_lists or not isinstance(existing_value, list): raise FieldExistsWarning(event, [target_field]) - if isinstance(content, list): + if isinstance(content, list | set): target_parent[target_key].extend(content) else: target_parent[target_key].append(content) diff --git a/tests/unit/processor/labeler/test_labeler.py b/tests/unit/processor/labeler/test_labeler.py index c6085d7bc..cae9105b5 100644 --- a/tests/unit/processor/labeler/test_labeler.py +++ b/tests/unit/processor/labeler/test_labeler.py @@ -257,3 +257,11 @@ def test_create_loads_the_specified_labeling_schema(self): labeler = Factory.create({"test instance": config}) assert labeler._schema == expected_schema + + def test_extend_list_of_existing_labels(self): + rule = {"filter": "applyrule", "labeler": {"label": {"reporter": ["windows", "foo"]}}} + document = {"applyrule": "yes", "label": {"reporter": ["windows"]}} + expected = {"applyrule": "yes", "label": {"reporter": ["foo", "windows"]}} + self._load_specific_rule(rule) + self.object.process(document) + assert document == expected diff --git a/tests/unit/util/test_auto_rule_tester.py b/tests/unit/util/test_auto_rule_tester.py index ffbd7ab85..6424a0557 100644 --- a/tests/unit/util/test_auto_rule_tester.py +++ b/tests/unit/util/test_auto_rule_tester.py @@ -281,8 +281,8 @@ def test_full_auto_rule_test_run(self, auto_rule_tester, capsys): ] expected_overall_results = [ - "+ Successful Tests: 31", - "- Failed Tests: 7", + "+ Successful Tests: 32", + "- Failed Tests: 6", "~ Warning: 2", "Rule Test Coverage: 72.72727272727273", "Total Tests: 38", From bb7cbd0e6c9bdebf326bd113aa6213f98da202e8 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Mon, 11 Nov 2024 12:58:25 +0100 Subject: [PATCH 23/38] clean up - add CHANGELOG.md - remove 
duplicate test - write documentation --- CHANGELOG.md | 4 + logprep/util/helper.py | 105 +++++++++++++----- .../test_template_replacer.py | 11 -- 3 files changed, 81 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6402c1ab8..6e78dac9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ * replace `BaseException` with `Exception` for custom errors * refactor `generic_resolver` to validate rules on startup instead of application of each rule +* rewrite the helper method `add_field_to` such that it always raises an `FieldExistsWarning` instead of return a bool. +* add new helper method `add_batch_to` to directly add multiple fields to one event +* refactored some processors to make use of the new helper methods + ### Bugfix diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 7dbcd1107..a9cc49a41 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -59,34 +59,6 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) -def add_field_to_silent_fail(*args, **kwargs): - try: - add_field_to(*args, **kwargs) - except FieldExistsWarning: - return args[1] - - -def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output_field=False): - unsuccessful_targets = map( - add_field_to_silent_fail, - itertools.repeat(event, len(targets)), - targets, - contents, - itertools.repeat(extends_lists, len(targets)), - itertools.repeat(overwrite_output_field, len(targets)), - ) - unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] - if unsuccessful_targets: - raise FieldExistsWarning(event, unsuccessful_targets) - - -def add_batch_to_silent_fail(*args, **kwargs): - try: - add_batch_to(*args, **kwargs) - except FieldExistsWarning as error: - return error.skipped_fields - - def add_field_to( event, target_field, @@ -142,6 +114,83 @@ def add_field_to( target_parent[target_key].append(content) +def add_field_to_silent_fail(*args, **kwargs): + """ + Adds a field 
to an object, ignoring the FieldExistsWarning if the field already exists. + + Parameters: + args: tuple + Positional arguments to pass to the add_field_to function. + kwargs: dict + Keyword arguments to pass to the add_field_to function. + + Returns: + The field that was attempted to be added, if the field already exists. + + Raises: + FieldExistsWarning: If the field already exists, but this warning is caught and ignored. + """ + try: + add_field_to(*args, **kwargs) + except FieldExistsWarning: + return args[1] + + +def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output_field=False): + """ + Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. + + Parameters: + event: dict + The event object to which fields are to be added. + targets: list + A list of target field names where the contents will be added. + contents: list + A list of contents corresponding to each target field. + extends_lists: bool + A boolean indicating whether to extend lists if the target field already exists. + overwrite_output_field: bool + A boolean indicating whether to overwrite the target field if it already exists. + + Raises: + FieldExistsWarning: If there are targets to which the content could not be added due to field + existence restrictions. + """ + unsuccessful_targets = map( + add_field_to_silent_fail, + itertools.repeat(event, len(targets)), + targets, + contents, + itertools.repeat(extends_lists, len(targets)), + itertools.repeat(overwrite_output_field, len(targets)), + ) + unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] + if unsuccessful_targets: + raise FieldExistsWarning(event, unsuccessful_targets) + + +def add_batch_to_silent_fail(*args, **kwargs) -> None | list: + """ + Handles the batch addition operation while silently handling FieldExistsWarning. + + Parameters + ---------- + *args : tuple + Variable length argument list. 
+ **kwargs : dict + Arbitrary keyword arguments. + + Returns + ------- + skipped_fields : list + A list of fields that were skipped due to FieldExistsWarning. + """ + try: + add_batch_to(*args, **kwargs) + except FieldExistsWarning as error: + return error.skipped_fields + + def _get_slice_arg(slice_item): return int(slice_item) if slice_item else None diff --git a/tests/unit/processor/template_replacer/test_template_replacer.py b/tests/unit/processor/template_replacer/test_template_replacer.py index 008830ee9..2eca9f0fd 100644 --- a/tests/unit/processor/template_replacer/test_template_replacer.py +++ b/tests/unit/processor/template_replacer/test_template_replacer.py @@ -163,14 +163,3 @@ def _create_template_replacer(self, config): template_replacer = Factory.create({"test instance": config}) template_replacer.setup() return template_replacer - - def test_replace_message_via_template(self): - document = { - "winlog": {"channel": "System", "provider_name": "Test", "event_id": 123}, - "message": "foo", - } - - self.object.process(document) - - assert document.get("message") - assert document["message"] == "Test %1 Test %2" From bcae7ba4bf7145f5fccbaac9113cc993c8d9448a Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 09:17:15 +0100 Subject: [PATCH 24/38] replace else statement with early return --- logprep/util/helper.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index a9cc49a41..aaa73dc5e 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -97,21 +97,21 @@ def add_field_to( if overwrite_output_field: target_parent = reduce(_add_and_overwrite_key, field_path) target_parent[target_key] = content + return + try: + target_parent = reduce(_add_and_not_overwrite_key, field_path) + except KeyError as error: + raise FieldExistsWarning(event, [target_field]) from error + existing_value = target_parent.get(target_key) + if existing_value is None: + 
target_parent[target_key] = content + return + if not extends_lists or not isinstance(existing_value, list): + raise FieldExistsWarning(event, [target_field]) + if isinstance(content, list | set): + target_parent[target_key].extend(content) else: - try: - target_parent = reduce(_add_and_not_overwrite_key, field_path) - except KeyError as error: - raise FieldExistsWarning(event, [target_field]) from error - existing_value = target_parent.get(target_key) - if existing_value is None: - target_parent[target_key] = content - return - if not extends_lists or not isinstance(existing_value, list): - raise FieldExistsWarning(event, [target_field]) - if isinstance(content, list | set): - target_parent[target_key].extend(content) - else: - target_parent[target_key].append(content) + target_parent[target_key].append(content) def add_field_to_silent_fail(*args, **kwargs): From 297aa9740e7014377b1b46b9df2032c78a4d8063 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 10:02:47 +0100 Subject: [PATCH 25/38] remove silent fail functions for field addition - Refactored helper functions to consistently raise exceptions. - Improved error handling by eliminating silent failures. - Updated related unit tests to expect raised exceptions. 
- highlight breaking change in CHANGELOG.md --- CHANGELOG.md | 4 ++ logprep/abc/input.py | 6 +-- .../processor/generic_resolver/processor.py | 21 ++++----- .../processor/hyperscan_resolver/processor.py | 21 ++++----- logprep/processor/labeler/processor.py | 6 +-- logprep/processor/requester/processor.py | 39 +++++++++-------- logprep/util/helper.py | 43 ++++--------------- tests/unit/connector/base.py | 15 ++++--- 8 files changed, 70 insertions(+), 85 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e78dac9b..bcc2e73b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## next release ### Breaking + +* `CriticalInputError` is raised when the input preprocessor values can't be set, this was so far only true + for the hmac preprocessor, but is now also applied for all other preprocessors. + ### Features ### Improvements diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 6c89cbb2c..8fe125648 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -18,7 +18,7 @@ from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_field_to, get_dotted_field_value, add_field_to_silent_fail +from logprep.util.helper import add_field_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -310,7 +310,7 @@ def _add_env_enrichment_to_event(self, event: dict): def _add_arrival_time_information_to_event(self, event: dict): now = TimeParser.now() target_field = self._config.preprocessing.get("log_arrival_time_target_field") - add_field_to_silent_fail(event, target_field, now.isoformat()) + add_field_to(event, target_field, now.isoformat()) def _add_arrival_timedelta_information_to_event(self, event: dict): log_arrival_timedelta_config = self._config.preprocessing.get("log_arrival_timedelta") @@ -332,7 +332,7 @@ def 
_add_version_information_to_event(self, event: dict): """Add the version information to the event""" target_field = self._config.preprocessing.get("version_info_target_field") # pylint: disable=protected-access - add_field_to_silent_fail(event, target_field, self._config._version_information) + add_field_to(event, target_field, self._config._version_information) # pylint: enable=protected-access def _add_hmac_to(self, event_dict, raw_event) -> dict: diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index 9315713fe..803639b12 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -30,7 +30,7 @@ from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.generic_resolver.rule import GenericResolverRule -from logprep.util.helper import get_dotted_field_value, add_field_to_silent_fail +from logprep.util.helper import get_dotted_field_value, add_field_to class GenericResolver(FieldManager): @@ -58,15 +58,16 @@ def _apply_rules(self, event, rule): continue if rule.extend_target_list and current_content is None: content = [content] - failed_target = add_field_to_silent_fail( - event, - target_field, - content, - extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, - ) - if failed_target: - conflicting_fields.append(failed_target) + try: + add_field_to( + event, + target_field, + content, + extends_lists=rule.extend_target_list, + overwrite_output_field=rule.overwrite_target, + ) + except FieldExistsWarning as error: + conflicting_fields.extend(error.skipped_fields) if conflicting_fields: raise FieldExistsWarning(event, conflicting_fields, rule) diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 8dedbc4a1..57eccb890 100644 --- 
a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -43,7 +43,7 @@ ProcessingCriticalError, ) from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import get_dotted_field_value, add_field_to_silent_fail +from logprep.util.helper import get_dotted_field_value, add_field_to from logprep.util.validators import directory_validator # pylint: disable=no-name-in-module @@ -119,15 +119,16 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): continue if rule.extend_target_list and current_content is None: dest_val = [dest_val] - failed_target = add_field_to_silent_fail( - event, - resolve_target, - dest_val, - extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, - ) - if failed_target: - conflicting_fields.append(failed_target) + try: + add_field_to( + event, + resolve_target, + dest_val, + extends_lists=rule.extend_target_list, + overwrite_output_field=rule.overwrite_target, + ) + except FieldExistsWarning as error: + conflicting_fields.extend(error.skipped_fields) self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_values) if conflicting_fields: raise FieldExistsWarning(event, conflicting_fields, rule) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index fba67e310..452d3711a 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -35,7 +35,7 @@ from logprep.processor.labeler.rule import LabelerRule from logprep.util.helper import ( get_dotted_field_value, - add_batch_to_silent_fail, + add_batch_to, ) @@ -78,7 +78,7 @@ def _apply_rules(self, event, rule): """Applies the rule to the current event""" targets = [f"label.{key}" for key in rule.label.keys()] contents = rule.label.values() - add_batch_to_silent_fail(event, targets, contents, extends_lists=True) + add_batch_to(event, targets, contents, extends_lists=True) 
# convert sets into sorted lists contents = [sorted(set(get_dotted_field_value(event, target))) for target in targets] - add_batch_to_silent_fail(event, targets, contents, overwrite_output_field=True) + add_batch_to(event, targets, contents, overwrite_output_field=True) diff --git a/logprep/processor/requester/processor.py b/logprep/processor/requester/processor.py index b94a547fc..a9aeed78b 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -46,8 +46,8 @@ from logprep.processor.requester.rule import RequesterRule from logprep.util.helper import ( get_source_fields_dict, - add_field_to_silent_fail, - add_batch_to_silent_fail, + add_field_to, + add_batch_to, ) TEMPLATE_KWARGS = ("url", "json", "data", "params") @@ -71,28 +71,31 @@ def _apply_rules(self, event, rule): self._handle_response(event, rule, response) def _handle_response(self, event, rule, response): - failed_targets = [] + conflicting_fields = [] if rule.target_field: result = self._get_result(response) - failed_target = add_field_to_silent_fail( - event, - rule.target_field, - result, - rule.extend_target_list, - rule.overwrite_target, - ) - failed_targets.append(failed_target) + try: + add_field_to( + event, + rule.target_field, + result, + rule.extend_target_list, + rule.overwrite_target, + ) + except FieldExistsWarning as error: + conflicting_fields.extend(error.skipped_fields) if rule.target_field_mapping: source_fields = rule.target_field_mapping.keys() contents = self._get_field_values(self._get_result(response), source_fields) targets = rule.target_field_mapping.values() - failed = add_batch_to_silent_fail( - event, targets, contents, rule.extend_target_list, rule.overwrite_target - ) - failed_targets.append(failed) - failed_targets = [failed for failed in failed_targets if failed is not None] - if failed_targets: - raise FieldExistsWarning(event, failed_targets, rule) + try: + add_batch_to( + event, targets, contents, rule.extend_target_list, 
rule.overwrite_target + ) + except FieldExistsWarning as error: + conflicting_fields.extend(error.skipped_fields) + if conflicting_fields: + raise FieldExistsWarning(event, conflicting_fields, rule) def _request(self, event, rule, kwargs): try: diff --git a/logprep/util/helper.py b/logprep/util/helper.py index aaa73dc5e..2db41d418 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -65,7 +65,7 @@ def add_field_to( content, extends_lists=False, overwrite_output_field=False, -): +) -> None: """ Add content to the output_field in the given event. Output_field can be a dotted subfield. In case of missing fields, all intermediate fields will be created. @@ -114,9 +114,10 @@ def add_field_to( target_parent[target_key].append(content) -def add_field_to_silent_fail(*args, **kwargs): +def _add_field_to_silent_fail(*args, **kwargs) -> None | str: """ - Adds a field to an object, ignoring the FieldExistsWarning if the field already exists. + Adds a field to an object, ignoring the FieldExistsWarning if the field already exists. Is only needed in the + add_batch_to map function. Without this the map would terminate early. Parameters: args: tuple @@ -136,7 +137,9 @@ def add_field_to_silent_fail(*args, **kwargs): return args[1] -def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output_field=False): +def add_batch_to( + event, targets, contents, extends_lists=False, overwrite_output_field=False +) -> None: """ Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. @@ -157,7 +160,7 @@ def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output existence restrictions. 
""" unsuccessful_targets = map( - add_field_to_silent_fail, + _add_field_to_silent_fail, itertools.repeat(event, len(targets)), targets, contents, @@ -169,28 +172,6 @@ def add_batch_to(event, targets, contents, extends_lists=False, overwrite_output raise FieldExistsWarning(event, unsuccessful_targets) -def add_batch_to_silent_fail(*args, **kwargs) -> None | list: - """ - Handles the batch addition operation while silently handling FieldExistsWarning. - - Parameters - ---------- - *args : tuple - Variable length argument list. - **kwargs : dict - Arbitrary keyword arguments. - - Returns - ------- - skipped_fields : list - A list of fields that were skipped due to FieldExistWarning. - """ - try: - add_batch_to(*args, **kwargs) - except FieldExistsWarning as error: - return error.skipped_fields - - def _get_slice_arg(slice_item): return int(slice_item) if slice_item else None @@ -365,14 +346,6 @@ def add_and_overwrite(event, target_field, content, *_): add_field_to(event, target_field, content, overwrite_output_field=True) -def add_and_overwrite_silent_fail(event, target_field, content, *_): - """wrapper for add_field_to""" - try: - add_field_to(event, target_field, content, overwrite_output_field=True) - except FieldExistsWarning: - ... 
- - def append(event, target_field, content, separator): """appends to event""" target_value = get_dotted_field_value(event, target_field) diff --git a/tests/unit/connector/base.py b/tests/unit/connector/base.py index 72b242a44..729aec312 100644 --- a/tests/unit/connector/base.py +++ b/tests/unit/connector/base.py @@ -311,8 +311,9 @@ def test_pipeline_preprocessing_does_not_add_versions_if_target_field_exists_alr connector = Factory.create({"test connector": connector_config}) test_event = {"any": "content", "version_info": "something random"} connector._get_event = mock.MagicMock(return_value=(test_event, None)) - result = connector.get_next(0.01) - assert result == {"any": "content", "version_info": "something random"} + with pytest.raises(CriticalInputError, match="could not be written") as error: + _ = connector.get_next(0.01) + assert error.value.raw_input == {"any": "content", "version_info": "something random"} def test_pipeline_preprocessing_only_version_information(self): preprocessing_config = { @@ -325,8 +326,9 @@ def test_pipeline_preprocessing_only_version_information(self): connector = Factory.create({"test connector": connector_config}) test_event = {"any": "content", "version_info": "something random"} connector._get_event = mock.MagicMock(return_value=(test_event, None)) - result = connector.get_next(0.01) - assert result == {"any": "content", "version_info": "something random"} + with pytest.raises(CriticalInputError, match="could not be written") as error: + _ = connector.get_next(0.01) + assert error.value.raw_input == {"any": "content", "version_info": "something random"} def test_get_raw_event_is_callable(self): # should be overwritten for special implementation @@ -376,8 +378,9 @@ def test_pipeline_preprocessing_does_not_add_log_arrival_time_if_target_field_ex connector = Factory.create({"test connector": connector_config}) test_event = {"any": "content", "arrival_time": "does not matter"} connector._get_event = 
mock.MagicMock(return_value=(test_event, None)) - result = connector.get_next(0.01) - assert result == {"any": "content", "arrival_time": "does not matter"} + with pytest.raises(CriticalInputError, match="could not be written") as error: + _ = connector.get_next(0.01) + assert error.value.raw_input == {"any": "content", "arrival_time": "does not matter"} def test_pipeline_preprocessing_adds_timestamp_delta_if_configured(self): preprocessing_config = { From 4cd6db7711dd3bfcfc259197e2274ef3ea9c9820 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 10:22:39 +0100 Subject: [PATCH 26/38] Refactor overwrite argument for field addition functions - Renames `overwrite_output_field` to `overwrite_target_field` in multiple files. - Ensures consistency in method signatures --- logprep/abc/processor.py | 3 +- logprep/processor/clusterer/processor.py | 2 +- logprep/processor/dissector/processor.py | 2 +- .../domain_label_extractor/processor.py | 4 +-- .../processor/domain_resolver/processor.py | 2 +- .../processor/generic_resolver/processor.py | 4 +-- logprep/processor/geoip_enricher/processor.py | 4 +-- logprep/processor/grokker/processor.py | 10 ++----- .../processor/hyperscan_resolver/processor.py | 7 ++--- logprep/processor/labeler/processor.py | 7 ++--- logprep/processor/pseudonymizer/processor.py | 2 +- .../processor/template_replacer/processor.py | 3 +- logprep/util/helper.py | 28 +++++++++---------- tests/unit/util/test_helper_add_field.py | 6 ++-- 14 files changed, 37 insertions(+), 47 deletions(-) diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index 832e5d104..df199ac4b 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -12,7 +12,6 @@ from logprep.framework.rule_tree.rule_tree import RuleTree, RuleTreeType from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import ( - FieldExistsWarning, ProcessingCriticalError, ProcessingError, ProcessingWarning, @@ -386,7 +385,7 @@ def 
_write_target_field(self, event: dict, rule: "Rule", result: any) -> None: target_field=rule.target_field, content=result, extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) def setup(self): diff --git a/logprep/processor/clusterer/processor.py b/logprep/processor/clusterer/processor.py index 914a8c1a5..d6c985d8d 100644 --- a/logprep/processor/clusterer/processor.py +++ b/logprep/processor/clusterer/processor.py @@ -143,7 +143,7 @@ def _cluster(self, event: dict, rule: ClustererRule): self._config.output_field_name, cluster_signature, extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) self._last_non_extracted_signature = sig_text diff --git a/logprep/processor/dissector/processor.py b/logprep/processor/dissector/processor.py index 1da24a239..0b4b7bc1d 100644 --- a/logprep/processor/dissector/processor.py +++ b/logprep/processor/dissector/processor.py @@ -90,6 +90,6 @@ def _apply_convert_datatype(self, event, rule): for target_field, converter in rule.convert_actions: try: target_value = converter(get_dotted_field_value(event, target_field)) - add_field_to(event, target_field, target_value, overwrite_output_field=True) + add_field_to(event, target_field, target_value, overwrite_target_field=True) except ValueError as error: self._handle_warning_error(event, rule, error) diff --git a/logprep/processor/domain_label_extractor/processor.py b/logprep/processor/domain_label_extractor/processor.py index 342d74e4b..c4883035d 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ b/logprep/processor/domain_label_extractor/processor.py @@ -49,7 +49,7 @@ from logprep.processor.domain_label_extractor.rule import DomainLabelExtractorRule from logprep.processor.field_manager.processor import FieldManager from logprep.util.getter import GetterFactory -from logprep.util.helper import 
add_and_overwrite, get_dotted_field_value, add_batch_to +from logprep.util.helper import add_and_overwrite, add_batch_to, get_dotted_field_value from logprep.util.validators import list_of_urls_validator logger = logging.getLogger("DomainLabelExtractor") @@ -141,7 +141,7 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): f"{rule.target_field}.subdomain", ] contents = [f"{labels.domain}.{labels.suffix}", labels.suffix, labels.subdomain] - add_batch_to(event, targets, contents, overwrite_output_field=rule.overwrite_target) + add_batch_to(event, targets, contents, overwrite_target_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") add_and_overwrite(event, self._config.tagging_field_name, tagging_field) diff --git a/logprep/processor/domain_resolver/processor.py b/logprep/processor/domain_resolver/processor.py index 5f3231d2b..f9b918d45 100644 --- a/logprep/processor/domain_resolver/processor.py +++ b/logprep/processor/domain_resolver/processor.py @@ -225,4 +225,4 @@ def _store_debug_infos(self, event, requires_storing): "obtained_from_cache": not requires_storing, "cache_size": len(self._domain_ip_map.keys()), } - add_field_to(event, "resolved_ip_debug", event_dbg, overwrite_output_field=True) + add_field_to(event, "resolved_ip_debug", event_dbg, overwrite_target_field=True) diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index 803639b12..c0b4eccb3 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -30,7 +30,7 @@ from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.generic_resolver.rule import GenericResolverRule -from logprep.util.helper import get_dotted_field_value, add_field_to +from logprep.util.helper import add_field_to, get_dotted_field_value class 
GenericResolver(FieldManager): @@ -64,7 +64,7 @@ def _apply_rules(self, event, rule): target_field, content, extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index b83791218..e6afce2e4 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -41,7 +41,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.geoip_enricher.rule import GEOIP_DATA_STUBS, GeoipEnricherRule from logprep.util.getter import GetterFactory -from logprep.util.helper import get_dotted_field_value, add_batch_to +from logprep.util.helper import add_batch_to, get_dotted_field_value logger = logging.getLogger("GeoipEnricher") @@ -139,5 +139,5 @@ def _apply_rules(self, event, rule): targets, contents, extends_lists=False, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index ad8872a2b..a54a5ab23 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -38,15 +38,11 @@ from attrs import define, field, validators -from logprep.processor.base.exceptions import ( - FieldExistsWarning, - ProcessingError, - ProcessingWarning, -) +from logprep.processor.base.exceptions import ProcessingError, ProcessingWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.grokker.rule import GrokkerRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value, add_batch_to +from logprep.util.helper import add_batch_to, get_dotted_field_value logger = 
logging.getLogger("Grokker") @@ -96,7 +92,7 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): targets, contents, extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) if self._handle_missing_fields(event, rule, rule.actions.keys(), source_values): return diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 57eccb890..81d050952 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -39,11 +39,11 @@ from logprep.processor.base.exceptions import ( FieldExistsWarning, - SkipImportError, ProcessingCriticalError, + SkipImportError, ) from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import get_dotted_field_value, add_field_to +from logprep.util.helper import add_field_to, get_dotted_field_value from logprep.util.validators import directory_validator # pylint: disable=no-name-in-module @@ -57,7 +57,6 @@ # pylint: disable=ungrouped-imports from logprep.processor.hyperscan_resolver.rule import HyperscanResolverRule - # pylint: enable=ungrouped-imports @@ -125,7 +124,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): resolve_target, dest_val, extends_lists=rule.extend_target_list, - overwrite_output_field=rule.overwrite_target, + overwrite_target_field=rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 452d3711a..d5b52bd77 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -33,10 +33,7 @@ from logprep.abc.processor import Processor from logprep.processor.labeler.labeling_schema import LabelingSchema from logprep.processor.labeler.rule import LabelerRule -from logprep.util.helper import ( 
- get_dotted_field_value, - add_batch_to, -) +from logprep.util.helper import add_batch_to, get_dotted_field_value class Labeler(Processor): @@ -81,4 +78,4 @@ def _apply_rules(self, event, rule): add_batch_to(event, targets, contents, extends_lists=True) # convert sets into sorted lists contents = [sorted(set(get_dotted_field_value(event, target))) for target in targets] - add_batch_to(event, targets, contents, overwrite_output_field=True) + add_batch_to(event, targets, contents, overwrite_target_field=True) diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index b4ec2159a..a117e83c7 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -264,7 +264,7 @@ def _apply_rules(self, event: dict, rule: PseudonymizerRule): ] else: field_value = self._pseudonymize_field(rule, dotted_field, regex, field_value) - _ = add_field_to(event, dotted_field, field_value, overwrite_output_field=True) + _ = add_field_to(event, dotted_field, field_value, overwrite_target_field=True) if "@timestamp" in event: for pseudonym, _ in self.result.data: pseudonym["@timestamp"] = event["@timestamp"] diff --git a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index 538d635c1..94fa18ac2 100644 --- a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -38,7 +38,6 @@ from attr import define, field, validators -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.template_replacer.rule import TemplateReplacerRule from logprep.util.getter import GetterFactory @@ -115,7 +114,7 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl Therefore, they wouldn't be replaced, and we can overwrite the existing target field. 
""" overwrite = get_dotted_field_value(event, self._target_field) is not None - add_field_to(event, self._target_field, replacement, overwrite_output_field=overwrite) + add_field_to(event, self._target_field, replacement, overwrite_target_field=overwrite) def setup(self): super().setup() diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 2db41d418..8158ba98f 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -64,10 +64,10 @@ def add_field_to( target_field, content, extends_lists=False, - overwrite_output_field=False, + overwrite_target_field=False, ) -> None: """ - Add content to the output_field in the given event. Output_field can be a dotted subfield. + Add content to the target_field in the given event. target_field can be a dotted subfield. In case of missing fields, all intermediate fields will be created. Parameters ---------- @@ -76,25 +76,25 @@ def add_field_to( target_field: str Dotted subfield string indicating the target of the output value, e.g. destination.ip content: str, float, int, list, dict - Value that should be written into the output_field, can be a str, list, or dict object + Value that should be written into the target_field extends_lists: bool - Flag that determines whether output_field lists should be extended - overwrite_output_field: bool - Flag that determines whether the output_field should be overwritten + Flag that determines whether target_field lists should be extended + overwrite_target_field: bool + Flag that determines whether the target_field should be overwritten Raises ------ ValueError - If both extends_lists and overwrite_output_field are set to True. + If both extends_lists and overwrite_target_field are set to True. FieldExistsWarning - If the output field already exists and overwrite_output_field is False, or if extends_lists is True but + If the target_field already exists and overwrite_target_field is False, or if extends_lists is True but the existing field is not a list. 
""" - if extends_lists and overwrite_output_field: + if extends_lists and overwrite_target_field: raise ValueError("An output field can't be overwritten and extended at the same time") field_path = [event, *get_dotted_field_list(target_field)] target_key = field_path.pop() - if overwrite_output_field: + if overwrite_target_field: target_parent = reduce(_add_and_overwrite_key, field_path) target_parent[target_key] = content return @@ -138,7 +138,7 @@ def _add_field_to_silent_fail(*args, **kwargs) -> None | str: def add_batch_to( - event, targets, contents, extends_lists=False, overwrite_output_field=False + event, targets, contents, extends_lists=False, overwrite_target_field=False ) -> None: """ Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. @@ -152,7 +152,7 @@ def add_batch_to( A list of contents corresponding to each target field. extends_lists: bool A boolean indicating whether to extend lists if the target field already exists. - overwrite_output_field: bool + overwrite_target_field: bool A boolean indicating whether to overwrite the target field if it already exists. 
Raises: @@ -165,7 +165,7 @@ def add_batch_to( targets, contents, itertools.repeat(extends_lists, len(targets)), - itertools.repeat(overwrite_output_field, len(targets)), + itertools.repeat(overwrite_target_field, len(targets)), ) unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] if unsuccessful_targets: @@ -343,7 +343,7 @@ def snake_to_camel(snake: str) -> str: def add_and_overwrite(event, target_field, content, *_): """wrapper for add_field_to""" - add_field_to(event, target_field, content, overwrite_output_field=True) + add_field_to(event, target_field, content, overwrite_target_field=True) def append(event, target_field, content, separator): diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index d622fdd37..b8dd67aec 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -72,13 +72,13 @@ def test_provoke_dict_duplicate_in_dotted_subfield(self): def test_add_field_to_overwrites_output_field_in_root_level(self): document = {"some": "field", "output_field": "has already content"} - add_field_to(document, "output_field", {"dict": "content"}, overwrite_output_field=True) + add_field_to(document, "output_field", {"dict": "content"}, overwrite_target_field=True) assert document.get("output_field") == {"dict": "content"} def test_add_field_to_overwrites_output_field_in_nested_level(self): document = {"some": "field", "nested": {"output": {"field": "has already content"}}} add_field_to( - document, "nested.output.field", {"dict": "content"}, overwrite_output_field=True + document, "nested.output.field", {"dict": "content"}, overwrite_target_field=True ) assert document.get("nested", {}).get("output", {}).get("field") == {"dict": "content"} @@ -100,7 +100,7 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s "some_list", ["first", "second"], extends_lists=True, - overwrite_output_field=True, + 
overwrite_target_field=True, ) def test_returns_false_if_dotted_field_value_key_exists(self): From 0507313529c839462029fb86f31f3ab4e9531df9 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 10:28:03 +0100 Subject: [PATCH 27/38] add assertions to verify document state after exceptions - Ensure document state is asserted correctly after exceptions are raised. - Update test name and add a comment for clarity. --- tests/unit/util/test_helper_add_field.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index b8dd67aec..d0323f0bf 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -33,11 +33,13 @@ def test_provoke_str_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": "exists already"} with pytest.raises(FieldExistsWarning, match=r"could not be written"): add_field_to(document, "field", "content") + assert document def test_provoke_str_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "exists already"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): add_field_to(document, "sub.field", "content") + assert document def test_add_dict_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} @@ -64,6 +66,7 @@ def test_provoke_dict_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": {"already_existing": "dict"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): add_field_to(document, "field", {"dict": "content"}) + assert document def test_provoke_dict_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"already_existing": "dict"}}} @@ -102,12 +105,14 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s extends_lists=True, overwrite_target_field=True, ) + assert document 
def test_returns_false_if_dotted_field_value_key_exists(self): document = {"user": "Franz"} content = ["user_inlist"] with pytest.raises(FieldExistsWarning, match=r"could not be written"): add_field_to(document, "user.in_list", content) + assert document def test_add_list_with_nested_keys(self): testdict = { @@ -123,9 +128,11 @@ def test_add_list_with_nested_keys(self): add_field_to(testdict, "key1.key2.key3.key4.key5.list", ["content"], extends_lists=True) assert testdict == expected - def test_add_value_not_as_list_if_it_is_a_new_value_even_though_extends_lists_is_true(self): + def test_add_field_to_adds_value_not_as_list(self): + # checks if a newly added field is added not as list, even when `extends_list` is True document = { "some": "field", } add_field_to(document, "new", "list", extends_lists=True) assert document.get("new") == "list" + assert not isinstance(document.get("new"), list) From f56d45bad894bcf4ec21058d0cdbd41a581acdc7 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 10:35:00 +0100 Subject: [PATCH 28/38] optimize imports --- logprep/processor/field_manager/processor.py | 4 +--- logprep/processor/list_comparison/processor.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/logprep/processor/field_manager/processor.py b/logprep/processor/field_manager/processor.py index 43558a2df..a058e7cd4 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -29,18 +29,16 @@ .. 
automodule:: logprep.processor.field_manager.rule """ -import itertools from collections import namedtuple from logprep.abc.processor import Processor -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.rule import FieldManagerRule from logprep.util.helper import ( add_and_overwrite, + add_batch_to, add_field_to, get_dotted_field_value, pop_dotted_field_value, - add_batch_to, ) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index deb350b68..626411554 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -31,7 +31,6 @@ from attr import define, field, validators from logprep.abc.processor import Processor -from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.list_comparison.rule import ListComparisonRule from logprep.util.helper import add_field_to, get_dotted_field_value From ea781f5205648290e898d093ad9a8fca08f9f9bd Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Tue, 12 Nov 2024 14:58:22 +0100 Subject: [PATCH 29/38] update add_field_to signature - Merges target and content arguments into a single field argument. 
--- logprep/abc/input.py | 31 ++++++++---- logprep/abc/processor.py | 11 ++-- logprep/metrics/metrics.py | 6 +-- logprep/processor/clusterer/processor.py | 3 +- logprep/processor/dissector/processor.py | 9 ++-- .../domain_label_extractor/processor.py | 17 +++---- .../processor/domain_resolver/processor.py | 8 +-- logprep/processor/field_manager/processor.py | 21 ++++---- logprep/processor/generic_adder/processor.py | 33 ++++++------ .../processor/generic_resolver/processor.py | 3 +- logprep/processor/geoip_enricher/processor.py | 13 ++--- logprep/processor/grokker/processor.py | 5 +- .../processor/hyperscan_resolver/processor.py | 3 +- logprep/processor/labeler/processor.py | 12 +++-- logprep/processor/labeler/rule.py | 4 ++ .../processor/list_comparison/processor.py | 4 +- logprep/processor/pre_detector/processor.py | 18 ++++--- logprep/processor/pseudonymizer/processor.py | 2 +- logprep/processor/requester/processor.py | 19 +++---- .../selective_extractor/processor.py | 5 +- .../processor/template_replacer/processor.py | 4 +- logprep/util/helper.py | 50 ++++++++++--------- tests/unit/processor/test_process.py | 2 +- tests/unit/util/test_helper_add_field.py | 38 +++++++------- 24 files changed, 165 insertions(+), 156 deletions(-) diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 8fe125648..5aba34714 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -18,7 +18,7 @@ from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_batch_to, add_field_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -304,13 +304,19 @@ def _add_env_enrichment_to_event(self, event: dict): enrichments = self._config.preprocessing.get("enrich_by_env_variables") if not 
enrichments: return - for target_field, variable_name in enrichments.items(): - add_field_to(event, target_field, os.environ.get(variable_name, "")) + fields = { + target: os.environ.get(variable_name, "") + for target, variable_name in enrichments.items() + } + add_batch_to(event, fields) def _add_arrival_time_information_to_event(self, event: dict): - now = TimeParser.now() - target_field = self._config.preprocessing.get("log_arrival_time_target_field") - add_field_to(event, target_field, now.isoformat()) + new_field = { + self._config.preprocessing.get( + "log_arrival_time_target_field" + ): TimeParser.now().isoformat() + } + add_field_to(event, new_field) def _add_arrival_timedelta_information_to_event(self, event: dict): log_arrival_timedelta_config = self._config.preprocessing.get("log_arrival_timedelta") @@ -326,13 +332,13 @@ def _add_arrival_timedelta_information_to_event(self, event: dict): TimeParser.from_string(log_arrival_time).astimezone(UTC) - TimeParser.from_string(time_reference).astimezone(UTC) ).total_seconds() - add_field_to(event, target_field, delta_time_sec) + add_field_to(event, field={target_field: delta_time_sec}) def _add_version_information_to_event(self, event: dict): """Add the version information to the event""" target_field = self._config.preprocessing.get("version_info_target_field") # pylint: disable=protected-access - add_field_to(event, target_field, self._config._version_information) + add_field_to(event, field={target_field: self._config._version_information}) # pylint: enable=protected-access def _add_hmac_to(self, event_dict, raw_event) -> dict: @@ -385,6 +391,11 @@ def _add_hmac_to(self, event_dict, raw_event) -> dict: digestmod=hashlib.sha256, ).hexdigest() compressed = zlib.compress(received_orig_message, level=-1) - hmac_output = {"hmac": hmac, "compressed_base64": base64.b64encode(compressed).decode()} - add_field_to(event_dict, hmac_options.get("output_field"), hmac_output) + new_field = { + 
hmac_options.get("output_field"): { + "hmac": hmac, + "compressed_base64": base64.b64encode(compressed).decode(), + } + } + add_field_to(event_dict, new_field) return event_dict diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index df199ac4b..b13497a18 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -356,13 +356,15 @@ def _handle_warning_error(self, event, rule, error, failure_tags=None): if failure_tags is None: failure_tags = rule.failure_tags if tags is None: - add_and_overwrite(event, "tags", sorted(list({*failure_tags}))) + new_field = {"tags": sorted(list({*failure_tags}))} else: - add_and_overwrite(event, "tags", sorted(list({*tags, *failure_tags}))) + new_field = {"tags": sorted(list({*tags, *failure_tags}))} + add_and_overwrite(event, new_field) if isinstance(error, ProcessingWarning): if error.tags: tags = tags if tags else [] - add_and_overwrite(event, "tags", sorted(list({*error.tags, *tags, *failure_tags}))) + new_field = {"tags": sorted(list({*error.tags, *tags, *failure_tags}))} + add_and_overwrite(event, new_field) self.result.warnings.append(error) else: self.result.warnings.append(ProcessingWarning(str(error), event, rule)) @@ -382,8 +384,7 @@ def _has_missing_values(self, event, rule, source_field_dict): def _write_target_field(self, event: dict, rule: "Rule", result: any) -> None: add_field_to( event, - target_field=rule.target_field, - content=result, + field={rule.target_field: result}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/metrics/metrics.py b/logprep/metrics/metrics.py index caf12bcf7..9f1524366 100644 --- a/logprep/metrics/metrics.py +++ b/logprep/metrics/metrics.py @@ -222,12 +222,12 @@ def inner(self, *args, **kwargs): # nosemgrep if hasattr(self, "rule_type"): event = args[0] if event: - add_field_to(event, f"processing_times.{self.rule_type}", duration) + add_field_to(event, field={f"processing_times.{self.rule_type}": 
duration}) if hasattr(self, "_logprep_config"): # attribute of the Pipeline class event = args[0] if event: - add_field_to(event, "processing_times.pipeline", duration) - add_field_to(event, "processing_times.hostname", gethostname()) + add_field_to(event, field={"processing_times.pipeline": duration}) + add_field_to(event, field={"processing_times.hostname": gethostname()}) return result return inner diff --git a/logprep/processor/clusterer/processor.py b/logprep/processor/clusterer/processor.py index d6c985d8d..80c1f85f3 100644 --- a/logprep/processor/clusterer/processor.py +++ b/logprep/processor/clusterer/processor.py @@ -140,8 +140,7 @@ def _cluster(self, event: dict, rule: ClustererRule): cluster_signature = cluster_signature_based_on_message add_field_to( event, - self._config.output_field_name, - cluster_signature, + field={self._config.output_field_name: cluster_signature}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/dissector/processor.py b/logprep/processor/dissector/processor.py index 0b4b7bc1d..2257a1a35 100644 --- a/logprep/processor/dissector/processor.py +++ b/logprep/processor/dissector/processor.py @@ -46,12 +46,12 @@ def _apply_rules(self, event, rule): def _apply_mapping(self, event, rule): action_mappings_sorted_by_position = sorted( - self._get_mappings(event, rule), key=lambda x: x[5] + self._get_mappings(event, rule), key=lambda x: x[-1] ) for action, *args, _ in action_mappings_sorted_by_position: action(*args) - def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, str, str, str, int]]: + def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, dict, str, int]]: current_field = None target_field_mapping = {} for rule_action in rule.actions: @@ -84,12 +84,13 @@ def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, str, str, str target_field = target_field_mapping.get(target_field.lstrip("&")) if strip_char: content = 
content.strip(strip_char) - yield rule_action, event, target_field, content, separator, position + field = {target_field: content} + yield rule_action, event, field, separator, position def _apply_convert_datatype(self, event, rule): for target_field, converter in rule.convert_actions: try: target_value = converter(get_dotted_field_value(event, target_field)) - add_field_to(event, target_field, target_value, overwrite_target_field=True) + add_field_to(event, {target_field: target_value}, overwrite_target_field=True) except ValueError as error: self._handle_warning_error(event, rule, error) diff --git a/logprep/processor/domain_label_extractor/processor.py b/logprep/processor/domain_label_extractor/processor.py index c4883035d..c5cdd11d6 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ b/logprep/processor/domain_label_extractor/processor.py @@ -130,21 +130,20 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): if self._is_valid_ip(domain): tagging_field.append(f"ip_in_{rule.source_fields[0].replace('.', '_')}") - add_and_overwrite(event, self._config.tagging_field_name, tagging_field) + add_and_overwrite(event, field={self._config.tagging_field_name: tagging_field}) return labels = self._tld_extractor(domain) if labels.suffix != "": - targets = [ - f"{rule.target_field}.registered_domain", - f"{rule.target_field}.top_level_domain", - f"{rule.target_field}.subdomain", - ] - contents = [f"{labels.domain}.{labels.suffix}", labels.suffix, labels.subdomain] - add_batch_to(event, targets, contents, overwrite_target_field=rule.overwrite_target) + fields = { + f"{rule.target_field}.registered_domain": f"{labels.domain}.{labels.suffix}", + f"{rule.target_field}.top_level_domain": labels.suffix, + f"{rule.target_field}.subdomain": labels.subdomain, + } + add_batch_to(event, fields, overwrite_target_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") - 
add_and_overwrite(event, self._config.tagging_field_name, tagging_field) + add_and_overwrite(event, field={self._config.tagging_field_name: tagging_field}) @staticmethod def _is_valid_ip(domain): diff --git a/logprep/processor/domain_resolver/processor.py b/logprep/processor/domain_resolver/processor.py index f9b918d45..53692ea0a 100644 --- a/logprep/processor/domain_resolver/processor.py +++ b/logprep/processor/domain_resolver/processor.py @@ -222,7 +222,9 @@ def _resolve_ip(self, domain, hash_string=None): def _store_debug_infos(self, event, requires_storing): event_dbg = { - "obtained_from_cache": not requires_storing, - "cache_size": len(self._domain_ip_map.keys()), + "resolved_ip_debug": { + "obtained_from_cache": not requires_storing, + "cache_size": len(self._domain_ip_map.keys()), + } } - add_field_to(event, "resolved_ip_debug", event_dbg, overwrite_target_field=True) + add_field_to(event, event_dbg, overwrite_target_field=True) diff --git a/logprep/processor/field_manager/processor.py b/logprep/processor/field_manager/processor.py index a058e7cd4..47a00499a 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -78,7 +78,9 @@ def _apply_mapping(self, event, rule, rule_args): if not any(source_field_values): return source_field_values, targets = self._filter_missing_fields(source_field_values, targets) - add_batch_to(event, targets, source_field_values, extend_target_list, overwrite_target) + add_batch_to( + event, dict(zip(targets, source_field_values)), extend_target_list, overwrite_target + ) if rule.delete_source_fields: for dotted_field in source_fields: pop_dotted_field_value(event, dotted_field) @@ -104,7 +106,7 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru case State( extend=True, overwrite=True, single_source_element=False, target_is_list=False ): - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, 
field={target_field: source_fields_values}) return case State( @@ -116,16 +118,16 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, field={target_field: source_fields_values}) return case State(extend=True, overwrite=False, target_is_list=False, target_is_none=True): - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, field={target_field: source_fields_values}) return case State(extend=True, overwrite=False, target_is_list=False): source_fields_values = [target_field_value, *source_fields_values] - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, field={target_field: source_fields_values}) return case State( @@ -133,19 +135,18 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*target_field_value, *flattened_source_fields] - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, field={target_field: source_fields_values}) return case State(overwrite=True, extend=True): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, target_field, source_fields_values) + add_and_overwrite(event, field={target_field: source_fields_values}) return case _: - add_field_to( - event, target_field, source_fields_values, state.extend, state.overwrite - ) + field = {target_field: source_fields_values} + add_field_to(event, field, state.extend, state.overwrite) def _overwrite_from_source_values(self, source_fields_values): duplicates = [] diff --git 
a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index 842f9894e..8f9b2b37c 100644 --- a/logprep/processor/generic_adder/processor.py +++ b/logprep/processor/generic_adder/processor.py @@ -48,7 +48,7 @@ from logprep.factory_error import InvalidConfigurationError from logprep.processor.generic_adder.mysql_connector import MySQLConnector from logprep.processor.generic_adder.rule import GenericAdderRule -from logprep.util.helper import get_dotted_field_value, add_batch_to +from logprep.util.helper import add_batch_to, get_dotted_field_value def sql_config_validator(_, attribute, value): @@ -224,35 +224,32 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): FieldExistsWarning Raises if an addition would overwrite an existing field or value. """ + items_to_add = rule.add use_db = rule.db_target and self._db_table if use_db: self._update_db_table() items_to_add = self._get_items_to_add_from_db(event, rule) - else: - items_to_add = rule.add.items() - if items_to_add: - targets, contents = zip(*items_to_add) - add_batch_to(event, targets, contents, rule.extend_target_list, rule.overwrite_target) + add_batch_to(event, items_to_add, rule.extend_target_list, rule.overwrite_target) - def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> list: + def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> dict | None: """Get the sub part of the value from the event using a regex pattern""" - items_to_add = [] if not rule.db_pattern: - return items_to_add - + return value_to_check_in_db = get_dotted_field_value(event, rule.db_target) match_with_value_in_db = rule.db_pattern.match(value_to_check_in_db) if match_with_value_in_db: # Get values to add from db table using the sub part value_to_map = match_with_value_in_db.group(1).upper() add_from_db = self._db_table.get(value_to_map, []) - if rule.db_destination_prefix: - for idx, _ in enumerate(add_from_db): - if not 
add_from_db[idx][0].startswith(rule.db_destination_prefix): - add_from_db[idx][0] = f"{rule.db_destination_prefix}.{add_from_db[idx][0]}" - - for item in add_from_db: - items_to_add.append(item) - return items_to_add + add_from_db = [ + (self._add_prefix_if_not_present(key, rule), value) + for key, value in add_from_db + ] + return dict(add_from_db) + + def _add_prefix_if_not_present(self, key: str, rule: "GenericAdderRule") -> str: + if not key.startswith(rule.db_destination_prefix): + return f"{rule.db_destination_prefix}.{key}" + return key diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index c0b4eccb3..fce8b2373 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -61,8 +61,7 @@ def _apply_rules(self, event, rule): try: add_field_to( event, - target_field, - content, + field={target_field: content}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index e6afce2e4..9374b8b45 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -128,16 +128,13 @@ def _apply_rules(self, event, rule): geoip_data = self._try_getting_geoip_data(ip_string) if not geoip_data: return - filtered_geoip_data = {k: v for k, v in geoip_data.items() if v is not None} - targets, contents = zip(*filtered_geoip_data.items()) - targets = [ - rule.customize_target_subfields.get(target, f"{rule.target_field}.{target}") - for target in targets - ] + fields = { + rule.customize_target_subfields.get(target, f"{rule.target_field}.{target}"): value + for target, value in geoip_data.items() + } add_batch_to( event, - targets, - contents, + fields, extends_lists=False, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/grokker/processor.py 
b/logprep/processor/grokker/processor.py index a54a5ab23..dac33017e 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -85,12 +85,9 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): if result is None or result == {}: continue matches.append(True) - filtered_items = {k: v for k, v in result.items() if v is not None} - targets, contents = zip(*filtered_items.items()) add_batch_to( event, - targets, - contents, + result, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 81d050952..49a05b92c 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -121,8 +121,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): try: add_field_to( event, - resolve_target, - dest_val, + field={resolve_target: dest_val}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index d5b52bd77..2c878e4d6 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -73,9 +73,11 @@ def setup(self): def _apply_rules(self, event, rule): """Applies the rule to the current event""" - targets = [f"label.{key}" for key in rule.label.keys()] - contents = rule.label.values() - add_batch_to(event, targets, contents, extends_lists=True) + fields = {key: value for key, value in rule.prefixed_label.items()} + add_batch_to(event, fields, extends_lists=True) # convert sets into sorted lists - contents = [sorted(set(get_dotted_field_value(event, target))) for target in targets] - add_batch_to(event, targets, contents, overwrite_target_field=True) + fields = { + key: sorted(set(get_dotted_field_value(event, key))) + for key, _ in 
rule.prefixed_label.items() + } + add_batch_to(event, fields, overwrite_target_field=True) diff --git a/logprep/processor/labeler/rule.py b/logprep/processor/labeler/rule.py index 804e1f109..0b7f9ea90 100644 --- a/logprep/processor/labeler/rule.py +++ b/logprep/processor/labeler/rule.py @@ -60,6 +60,10 @@ def label(self) -> dict: # pylint: enable=C0111 + @property + def prefixed_label(self) -> dict: + return {f"label.{key}": value for key, value in self.label.items()} + def conforms_to_schema(self, schema: LabelingSchema) -> bool: """Check if labels are valid.""" return schema.validate_labels(self._config.label) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index 626411554..ae488f956 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -73,8 +73,8 @@ def _apply_rules(self, event, rule): """ comparison_result, comparison_key = self._list_comparison(rule, event) if comparison_result is not None: - output_field = f"{ rule.target_field }.{ comparison_key }" - add_field_to(event, output_field, comparison_result, True) + field = {f"{rule.target_field}.{comparison_key}": comparison_result} + add_field_to(event, field, extends_lists=True) def _list_comparison(self, rule: ListComparisonRule, event: dict): """ diff --git a/logprep/processor/pre_detector/processor.py b/logprep/processor/pre_detector/processor.py index 7d8378108..abaa24850 100644 --- a/logprep/processor/pre_detector/processor.py +++ b/logprep/processor/pre_detector/processor.py @@ -126,7 +126,7 @@ def _get_detection_result(self, event: dict, rule: PreDetectorRule): pre_detection_id = get_dotted_field_value(event, "pre_detection_id") if pre_detection_id is None: pre_detection_id = str(uuid4()) - add_field_to(event, "pre_detection_id", pre_detection_id) + add_field_to(event, {"pre_detection_id": pre_detection_id}) detection_result = self._generate_detection_result(pre_detection_id, event, 
rule) self.result.data.append((detection_result, self._config.outputs)) @@ -135,11 +135,13 @@ def _generate_detection_result( pre_detection_id: str, event: dict, rule: PreDetectorRule ) -> dict: detection_result = rule.detection_data - detection_result["rule_filter"] = rule.filter_str - detection_result["description"] = rule.description - detection_result["pre_detection_id"] = pre_detection_id - - host_name = get_dotted_field_value(event, "host.name") - if host_name is not None: - detection_result["host"] = {"name": host_name} + detection_result.update( + { + "rule_filter": rule.filter_str, + "description": rule.description, + "pre_detection_id": pre_detection_id, + } + ) + if host_name := get_dotted_field_value(event, "host.name"): + detection_result.update({"host": {"name": host_name}}) return detection_result diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index a117e83c7..78b13dccb 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -264,7 +264,7 @@ def _apply_rules(self, event: dict, rule: PseudonymizerRule): ] else: field_value = self._pseudonymize_field(rule, dotted_field, regex, field_value) - _ = add_field_to(event, dotted_field, field_value, overwrite_target_field=True) + add_field_to(event, field={dotted_field: field_value}, overwrite_target_field=True) if "@timestamp" in event: for pseudonym, _ in self.result.data: pseudonym["@timestamp"] = event["@timestamp"] diff --git a/logprep/processor/requester/processor.py b/logprep/processor/requester/processor.py index a9aeed78b..8de56e3d3 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -44,11 +44,7 @@ from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.requester.rule import RequesterRule -from logprep.util.helper import ( - 
get_source_fields_dict, - add_field_to, - add_batch_to, -) +from logprep.util.helper import add_batch_to, add_field_to, get_source_fields_dict TEMPLATE_KWARGS = ("url", "json", "data", "params") @@ -73,14 +69,12 @@ def _apply_rules(self, event, rule): def _handle_response(self, event, rule, response): conflicting_fields = [] if rule.target_field: - result = self._get_result(response) try: add_field_to( event, - rule.target_field, - result, - rule.extend_target_list, - rule.overwrite_target, + field={rule.target_field: self._get_result(response)}, + extends_lists=rule.extend_target_list, + overwrite_target_field=rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) @@ -90,7 +84,10 @@ def _handle_response(self, event, rule, response): targets = rule.target_field_mapping.values() try: add_batch_to( - event, targets, contents, rule.extend_target_list, rule.overwrite_target + event, + dict(zip(targets, contents)), + rule.extend_target_list, + rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) diff --git a/logprep/processor/selective_extractor/processor.py b/logprep/processor/selective_extractor/processor.py index 4656b5eb3..b0b7e58dc 100644 --- a/logprep/processor/selective_extractor/processor.py +++ b/logprep/processor/selective_extractor/processor.py @@ -31,7 +31,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.selective_extractor.rule import SelectiveExtractorRule -from logprep.util.helper import add_field_to, get_source_fields_dict +from logprep.util.helper import add_batch_to, get_source_fields_dict class SelectiveExtractor(FieldManager): @@ -64,6 +64,5 @@ def _apply_rules(self, event: dict, rule: SelectiveExtractorRule): } if flattened_fields: filtered_event = {} - for field, content in flattened_fields.items(): - add_field_to(filtered_event, field, content) + add_batch_to(filtered_event, flattened_fields) 
self.result.data.append((filtered_event, rule.outputs)) diff --git a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index 94fa18ac2..0a4fdd725 100644 --- a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -114,7 +114,9 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl Therefore, they wouldn't be replaced, and we can overwrite the existing target field. """ overwrite = get_dotted_field_value(event, self._target_field) is not None - add_field_to(event, self._target_field, replacement, overwrite_target_field=overwrite) + add_field_to( + event, field={self._target_field: replacement}, overwrite_target_field=overwrite + ) def setup(self): super().setup() diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 8158ba98f..f4fafda6b 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -60,11 +60,10 @@ def _add_and_not_overwrite_key(sub_dict, key): def add_field_to( - event, - target_field, - content, - extends_lists=False, - overwrite_target_field=False, + event: dict, + field: dict, + extends_lists: bool = False, + overwrite_target_field: bool = False, ) -> None: """ Add content to the target_field in the given event. target_field can be a dotted subfield. @@ -73,10 +72,10 @@ def add_field_to( ---------- event: dict Original log-event that logprep is currently processing - target_field: str - Dotted subfield string indicating the target of the output value, e.g. destination.ip - content: str, float, int, list, dict - Value that should be written into the target_field + field: dict + A key value pair describing the field that should be added. The key is the dotted subfield string indicating + the target. The value is the content that should be added to the named target. The content can be of type + str, float, int, list, dict. 
extends_lists: bool Flag that determines whether target_field lists should be extended overwrite_target_field: bool @@ -91,6 +90,9 @@ def add_field_to( """ if extends_lists and overwrite_target_field: raise ValueError("An output field can't be overwritten and extended at the same time") + if isinstance(field, dict): + field = list(field.items())[0] + target_field, content = field field_path = [event, *get_dotted_field_list(target_field)] target_key = field_path.pop() @@ -133,13 +135,11 @@ def _add_field_to_silent_fail(*args, **kwargs) -> None | str: """ try: add_field_to(*args, **kwargs) - except FieldExistsWarning: - return args[1] + except FieldExistsWarning as error: + return error.skipped_fields[0] -def add_batch_to( - event, targets, contents, extends_lists=False, overwrite_target_field=False -) -> None: +def add_batch_to(event, fields, extends_lists=False, overwrite_target_field=False) -> None: """ Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. @@ -159,13 +159,14 @@ def add_batch_to( FieldExistsWarning: If there are targets to which the content could not be added due to field existence restrictions. 
""" + fields = {key: value for key, value in fields.items() if value is not None} + number_fields = len(dict(fields)) unsuccessful_targets = map( _add_field_to_silent_fail, - itertools.repeat(event, len(targets)), - targets, - contents, - itertools.repeat(extends_lists, len(targets)), - itertools.repeat(overwrite_target_field, len(targets)), + itertools.repeat(event, number_fields), + fields.items(), + itertools.repeat(extends_lists, number_fields), + itertools.repeat(overwrite_target_field, number_fields), ) unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] if unsuccessful_targets: @@ -341,20 +342,21 @@ def snake_to_camel(snake: str) -> str: append_as_list = partial(add_field_to, extends_lists=True) -def add_and_overwrite(event, target_field, content, *_): +def add_and_overwrite(event, field, *_): """wrapper for add_field_to""" - add_field_to(event, target_field, content, overwrite_target_field=True) + add_field_to(event, field, overwrite_target_field=True) -def append(event, target_field, content, separator): +def append(event, field, separator): """appends to event""" + target_field, content = list(field.items())[0] target_value = get_dotted_field_value(event, target_field) if not isinstance(target_value, list): target_value = "" if target_value is None else target_value target_value = f"{target_value}{separator}{content}" - add_and_overwrite(event, target_field, target_value) + add_and_overwrite(event, field={target_field: target_value}) else: - append_as_list(event, target_field, content) + append_as_list(event, field) def get_source_fields_dict(event, rule): diff --git a/tests/unit/processor/test_process.py b/tests/unit/processor/test_process.py index 5d985c99b..704622751 100644 --- a/tests/unit/processor/test_process.py +++ b/tests/unit/processor/test_process.py @@ -79,7 +79,7 @@ def test_apply_processor_multiple_times_until_no_new_rule_matches(self): "url": "url", } processor.process(event) - assert expected_event == event + 
assert event == expected_event def test_apply_processor_multiple_times_not_enabled(self): config = {"type": "dissector", "specific_rules": [], "generic_rules": []} diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index d0323f0bf..1605a33bc 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -10,13 +10,13 @@ class TestHelperAddField: def test_add_str_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "field": "content"} - add_field_to(document, "field", "content") + add_field_to(document, {"field": "content"}) assert document == expected_document def test_add_str_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "content"}} - add_field_to(document, "sub.field", "content") + add_field_to(document, {"sub.field": "content"}) assert document == expected_document def test_add_str_content_as_partially_new_dotted_subfield(self): @@ -26,31 +26,31 @@ def test_add_str_content_as_partially_new_dotted_subfield(self): "sub": {"field": "content", "other_field": "other_content"}, } - add_field_to(document, "sub.field", "content") + add_field_to(document, {"sub.field": "content"}) assert document == expected_document def test_provoke_str_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": "exists already"} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, "field", "content") + add_field_to(document, {"field": "content"}) assert document def test_provoke_str_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "exists already"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, "sub.field", "content") + add_field_to(document, {"sub.field": 
"content"}) assert document def test_add_dict_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "field": {"dict": "content"}} - add_field_to(document, "field", {"dict": "content"}) + add_field_to(document, {"field": {"dict": "content"}}) assert document == expected_document def test_add_dict_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}}} - add_field_to(document, "sub.field", {"dict": "content"}) + add_field_to(document, {"sub.field": {"dict": "content"}}) assert document == expected_document def test_add_dict_content_as_partially_new_dotted_subfield(self): @@ -59,40 +59,40 @@ def test_add_dict_content_as_partially_new_dotted_subfield(self): "source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}, "other_field": "other_content"}, } - add_field_to(document, "sub.field", {"dict": "content"}) + add_field_to(document, {"sub.field": {"dict": "content"}}) assert document == expected_document def test_provoke_dict_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": {"already_existing": "dict"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, "field", {"dict": "content"}) + add_field_to(document, {"field": {"dict": "content"}}) assert document def test_provoke_dict_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"already_existing": "dict"}}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, "sub.field", {"dict": "content"}) + add_field_to(document, {"sub.field": {"dict": "content"}}) def test_add_field_to_overwrites_output_field_in_root_level(self): document = {"some": "field", "output_field": "has already content"} - add_field_to(document, "output_field", {"dict": "content"}, 
overwrite_target_field=True) + add_field_to(document, {"output_field": {"dict": "content"}}, overwrite_target_field=True) assert document.get("output_field") == {"dict": "content"} def test_add_field_to_overwrites_output_field_in_nested_level(self): document = {"some": "field", "nested": {"output": {"field": "has already content"}}} add_field_to( - document, "nested.output.field", {"dict": "content"}, overwrite_target_field=True + document, {"nested.output.field": {"dict": "content"}}, overwrite_target_field=True ) assert document.get("nested", {}).get("output", {}).get("field") == {"dict": "content"} def test_add_field_to_extends_list_when_only_given_a_string(self): document = {"some": "field", "some_list": ["with a value"]} - add_field_to(document, "some_list", "new value", extends_lists=True) + add_field_to(document, {"some_list": "new value"}, extends_lists=True) assert document.get("some_list") == ["with a value", "new value"] def test_add_field_to_extends_list_when_given_a_list(self): document = {"some": "field", "some_list": ["with a value"]} - add_field_to(document, "some_list", ["first", "second"], extends_lists=True) + add_field_to(document, {"some_list": ["first", "second"]}, extends_lists=True) assert document.get("some_list") == ["with a value", "first", "second"] def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_same_time(self): @@ -100,8 +100,7 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s with pytest.raises(ValueError, match=r"can't be overwritten and extended at the same time"): add_field_to( document, - "some_list", - ["first", "second"], + {"some_list": ["first", "second"]}, extends_lists=True, overwrite_target_field=True, ) @@ -109,9 +108,8 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s def test_returns_false_if_dotted_field_value_key_exists(self): document = {"user": "Franz"} - content = ["user_inlist"] with pytest.raises(FieldExistsWarning, 
match=r"could not be written"): - add_field_to(document, "user.in_list", content) + add_field_to(document, {"user.in_list": ["user_inlist"]}) assert document def test_add_list_with_nested_keys(self): @@ -125,7 +123,7 @@ def test_add_list_with_nested_keys(self): } } } - add_field_to(testdict, "key1.key2.key3.key4.key5.list", ["content"], extends_lists=True) + add_field_to(testdict, {"key1.key2.key3.key4.key5.list": ["content"]}, extends_lists=True) assert testdict == expected def test_add_field_to_adds_value_not_as_list(self): @@ -133,6 +131,6 @@ def test_add_field_to_adds_value_not_as_list(self): document = { "some": "field", } - add_field_to(document, "new", "list", extends_lists=True) + add_field_to(document, {"new": "list"}, extends_lists=True) assert document.get("new") == "list" assert not isinstance(document.get("new"), list) From bdb445fe4cfa8e48f920e96a36c7684bb81e11a4 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 08:36:06 +0100 Subject: [PATCH 30/38] enable `add_field_to` to always take a batch of fields - Replace `add_batch_to` with `add_field_to` throughout code. - Update helper functions to streamline `add_field_to` usage. 
--- logprep/abc/input.py | 8 ++--- logprep/abc/processor.py | 2 +- logprep/metrics/metrics.py | 6 ++-- logprep/processor/clusterer/processor.py | 2 +- .../domain_label_extractor/processor.py | 8 ++--- logprep/processor/field_manager/processor.py | 15 ++++---- logprep/processor/generic_adder/processor.py | 4 +-- .../processor/generic_resolver/processor.py | 2 +- logprep/processor/geoip_enricher/processor.py | 4 +-- logprep/processor/grokker/processor.py | 4 +-- .../processor/hyperscan_resolver/processor.py | 2 +- logprep/processor/labeler/processor.py | 6 ++-- .../processor/list_comparison/processor.py | 4 +-- logprep/processor/pseudonymizer/processor.py | 2 +- logprep/processor/requester/processor.py | 6 ++-- .../selective_extractor/processor.py | 4 +-- .../processor/template_replacer/processor.py | 2 +- logprep/util/helper.py | 36 ++++++++++--------- 18 files changed, 60 insertions(+), 57 deletions(-) diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 5aba34714..2e28eabf4 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -18,7 +18,7 @@ from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_batch_to, add_field_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -308,7 +308,7 @@ def _add_env_enrichment_to_event(self, event: dict): target: os.environ.get(variable_name, "") for target, variable_name in enrichments.items() } - add_batch_to(event, fields) + add_field_to(event, fields) def _add_arrival_time_information_to_event(self, event: dict): new_field = { @@ -332,13 +332,13 @@ def _add_arrival_timedelta_information_to_event(self, event: dict): TimeParser.from_string(log_arrival_time).astimezone(UTC) - 
TimeParser.from_string(time_reference).astimezone(UTC) ).total_seconds() - add_field_to(event, field={target_field: delta_time_sec}) + add_field_to(event, fields={target_field: delta_time_sec}) def _add_version_information_to_event(self, event: dict): """Add the version information to the event""" target_field = self._config.preprocessing.get("version_info_target_field") # pylint: disable=protected-access - add_field_to(event, field={target_field: self._config._version_information}) + add_field_to(event, fields={target_field: self._config._version_information}) # pylint: enable=protected-access def _add_hmac_to(self, event_dict, raw_event) -> dict: diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index b13497a18..f6f2a7f10 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -384,7 +384,7 @@ def _has_missing_values(self, event, rule, source_field_dict): def _write_target_field(self, event: dict, rule: "Rule", result: any) -> None: add_field_to( event, - field={rule.target_field: result}, + fields={rule.target_field: result}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/metrics/metrics.py b/logprep/metrics/metrics.py index 9f1524366..99fdb7702 100644 --- a/logprep/metrics/metrics.py +++ b/logprep/metrics/metrics.py @@ -222,12 +222,12 @@ def inner(self, *args, **kwargs): # nosemgrep if hasattr(self, "rule_type"): event = args[0] if event: - add_field_to(event, field={f"processing_times.{self.rule_type}": duration}) + add_field_to(event, fields={f"processing_times.{self.rule_type}": duration}) if hasattr(self, "_logprep_config"): # attribute of the Pipeline class event = args[0] if event: - add_field_to(event, field={"processing_times.pipeline": duration}) - add_field_to(event, field={"processing_times.hostname": gethostname()}) + add_field_to(event, fields={"processing_times.pipeline": duration}) + add_field_to(event, fields={"processing_times.hostname": gethostname()}) 
return result return inner diff --git a/logprep/processor/clusterer/processor.py b/logprep/processor/clusterer/processor.py index 80c1f85f3..04ae30014 100644 --- a/logprep/processor/clusterer/processor.py +++ b/logprep/processor/clusterer/processor.py @@ -140,7 +140,7 @@ def _cluster(self, event: dict, rule: ClustererRule): cluster_signature = cluster_signature_based_on_message add_field_to( event, - field={self._config.output_field_name: cluster_signature}, + fields={self._config.output_field_name: cluster_signature}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/domain_label_extractor/processor.py b/logprep/processor/domain_label_extractor/processor.py index c5cdd11d6..22683e523 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ b/logprep/processor/domain_label_extractor/processor.py @@ -49,7 +49,7 @@ from logprep.processor.domain_label_extractor.rule import DomainLabelExtractorRule from logprep.processor.field_manager.processor import FieldManager from logprep.util.getter import GetterFactory -from logprep.util.helper import add_and_overwrite, add_batch_to, get_dotted_field_value +from logprep.util.helper import add_and_overwrite, add_field_to, get_dotted_field_value from logprep.util.validators import list_of_urls_validator logger = logging.getLogger("DomainLabelExtractor") @@ -130,7 +130,7 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): if self._is_valid_ip(domain): tagging_field.append(f"ip_in_{rule.source_fields[0].replace('.', '_')}") - add_and_overwrite(event, field={self._config.tagging_field_name: tagging_field}) + add_and_overwrite(event, fields={self._config.tagging_field_name: tagging_field}) return labels = self._tld_extractor(domain) @@ -140,10 +140,10 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): f"{rule.target_field}.top_level_domain": labels.suffix, f"{rule.target_field}.subdomain": labels.subdomain, } - 
add_batch_to(event, fields, overwrite_target_field=rule.overwrite_target) + add_field_to(event, fields, overwrite_target_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") - add_and_overwrite(event, field={self._config.tagging_field_name: tagging_field}) + add_and_overwrite(event, fields={self._config.tagging_field_name: tagging_field}) @staticmethod def _is_valid_ip(domain): diff --git a/logprep/processor/field_manager/processor.py b/logprep/processor/field_manager/processor.py index 47a00499a..7f93b0ffe 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -35,7 +35,6 @@ from logprep.processor.field_manager.rule import FieldManagerRule from logprep.util.helper import ( add_and_overwrite, - add_batch_to, add_field_to, get_dotted_field_value, pop_dotted_field_value, @@ -78,7 +77,7 @@ def _apply_mapping(self, event, rule, rule_args): if not any(source_field_values): return source_field_values, targets = self._filter_missing_fields(source_field_values, targets) - add_batch_to( + add_field_to( event, dict(zip(targets, source_field_values)), extend_target_list, overwrite_target ) if rule.delete_source_fields: @@ -106,7 +105,7 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru case State( extend=True, overwrite=True, single_source_element=False, target_is_list=False ): - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return case State( @@ -118,16 +117,16 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return 
case State(extend=True, overwrite=False, target_is_list=False, target_is_none=True): - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return case State(extend=True, overwrite=False, target_is_list=False): source_fields_values = [target_field_value, *source_fields_values] - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return case State( @@ -135,13 +134,13 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*target_field_value, *flattened_source_fields] - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return case State(overwrite=True, extend=True): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, field={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}) return case _: diff --git a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index 8f9b2b37c..0dd8a0574 100644 --- a/logprep/processor/generic_adder/processor.py +++ b/logprep/processor/generic_adder/processor.py @@ -48,7 +48,7 @@ from logprep.factory_error import InvalidConfigurationError from logprep.processor.generic_adder.mysql_connector import MySQLConnector from logprep.processor.generic_adder.rule import GenericAdderRule -from logprep.util.helper import add_batch_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value def sql_config_validator(_, attribute, value): @@ -230,7 +230,7 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): 
self._update_db_table() items_to_add = self._get_items_to_add_from_db(event, rule) if items_to_add: - add_batch_to(event, items_to_add, rule.extend_target_list, rule.overwrite_target) + add_field_to(event, items_to_add, rule.extend_target_list, rule.overwrite_target) def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> dict | None: """Get the sub part of the value from the event using a regex pattern""" diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index fce8b2373..aee580b25 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -61,7 +61,7 @@ def _apply_rules(self, event, rule): try: add_field_to( event, - field={target_field: content}, + fields={target_field: content}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index 9374b8b45..c3cad0745 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -41,7 +41,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.geoip_enricher.rule import GEOIP_DATA_STUBS, GeoipEnricherRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_batch_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value logger = logging.getLogger("GeoipEnricher") @@ -132,7 +132,7 @@ def _apply_rules(self, event, rule): rule.customize_target_subfields.get(target, f"{rule.target_field}.{target}"): value for target, value in geoip_data.items() } - add_batch_to( + add_field_to( event, fields, extends_lists=False, diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index dac33017e..b2c830367 100644 --- a/logprep/processor/grokker/processor.py +++ 
b/logprep/processor/grokker/processor.py @@ -42,7 +42,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.grokker.rule import GrokkerRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_batch_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value logger = logging.getLogger("Grokker") @@ -85,7 +85,7 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): if result is None or result == {}: continue matches.append(True) - add_batch_to( + add_field_to( event, result, extends_lists=rule.extend_target_list, diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 49a05b92c..81a4b89ee 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -121,7 +121,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): try: add_field_to( event, - field={resolve_target: dest_val}, + fields={resolve_target: dest_val}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 2c878e4d6..6b2d47a6f 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -33,7 +33,7 @@ from logprep.abc.processor import Processor from logprep.processor.labeler.labeling_schema import LabelingSchema from logprep.processor.labeler.rule import LabelerRule -from logprep.util.helper import add_batch_to, get_dotted_field_value +from logprep.util.helper import add_field_to, get_dotted_field_value class Labeler(Processor): @@ -74,10 +74,10 @@ def setup(self): def _apply_rules(self, event, rule): """Applies the rule to the current event""" fields = {key: value for key, value in rule.prefixed_label.items()} - add_batch_to(event, fields, extends_lists=True) + add_field_to(event, fields, 
extends_lists=True) # convert sets into sorted lists fields = { key: sorted(set(get_dotted_field_value(event, key))) for key, _ in rule.prefixed_label.items() } - add_batch_to(event, fields, overwrite_target_field=True) + add_field_to(event, fields, overwrite_target_field=True) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index ae488f956..4d1adc2da 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -73,8 +73,8 @@ def _apply_rules(self, event, rule): """ comparison_result, comparison_key = self._list_comparison(rule, event) if comparison_result is not None: - field = {f"{rule.target_field}.{comparison_key}": comparison_result} - add_field_to(event, field, extends_lists=True) + fields = {f"{rule.target_field}.{comparison_key}": comparison_result} + add_field_to(event, fields, extends_lists=True) def _list_comparison(self, rule: ListComparisonRule, event: dict): """ diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 78b13dccb..b324e8134 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -264,7 +264,7 @@ def _apply_rules(self, event: dict, rule: PseudonymizerRule): ] else: field_value = self._pseudonymize_field(rule, dotted_field, regex, field_value) - add_field_to(event, field={dotted_field: field_value}, overwrite_target_field=True) + add_field_to(event, fields={dotted_field: field_value}, overwrite_target_field=True) if "@timestamp" in event: for pseudonym, _ in self.result.data: pseudonym["@timestamp"] = event["@timestamp"] diff --git a/logprep/processor/requester/processor.py b/logprep/processor/requester/processor.py index 8de56e3d3..dc56a8d8c 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -44,7 +44,7 @@ from logprep.processor.base.exceptions import 
FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.requester.rule import RequesterRule -from logprep.util.helper import add_batch_to, add_field_to, get_source_fields_dict +from logprep.util.helper import add_field_to, get_source_fields_dict TEMPLATE_KWARGS = ("url", "json", "data", "params") @@ -72,7 +72,7 @@ def _handle_response(self, event, rule, response): try: add_field_to( event, - field={rule.target_field: self._get_result(response)}, + fields={rule.target_field: self._get_result(response)}, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) @@ -83,7 +83,7 @@ def _handle_response(self, event, rule, response): contents = self._get_field_values(self._get_result(response), source_fields) targets = rule.target_field_mapping.values() try: - add_batch_to( + add_field_to( event, dict(zip(targets, contents)), rule.extend_target_list, diff --git a/logprep/processor/selective_extractor/processor.py b/logprep/processor/selective_extractor/processor.py index b0b7e58dc..fee75a67f 100644 --- a/logprep/processor/selective_extractor/processor.py +++ b/logprep/processor/selective_extractor/processor.py @@ -31,7 +31,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.selective_extractor.rule import SelectiveExtractorRule -from logprep.util.helper import add_batch_to, get_source_fields_dict +from logprep.util.helper import add_field_to, get_source_fields_dict class SelectiveExtractor(FieldManager): @@ -64,5 +64,5 @@ def _apply_rules(self, event: dict, rule: SelectiveExtractorRule): } if flattened_fields: filtered_event = {} - add_batch_to(filtered_event, flattened_fields) + add_field_to(filtered_event, flattened_fields) self.result.data.append((filtered_event, rule.outputs)) diff --git a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index 0a4fdd725..9b17fbcfa 100644 --- 
a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -115,7 +115,7 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl """ overwrite = get_dotted_field_value(event, self._target_field) is not None add_field_to( - event, field={self._target_field: replacement}, overwrite_target_field=overwrite + event, fields={self._target_field: replacement}, overwrite_target_field=overwrite ) def setup(self): diff --git a/logprep/util/helper.py b/logprep/util/helper.py index f4fafda6b..26ad4178c 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -59,9 +59,9 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) -def add_field_to( +def _add_one_field_to( event: dict, - field: dict, + field: tuple, extends_lists: bool = False, overwrite_target_field: bool = False, ) -> None: @@ -72,7 +72,7 @@ def add_field_to( ---------- event: dict Original log-event that logprep is currently processing - field: dict + field: tuple A key value pair describing the field that should be added. The key is the dotted subfield string indicating the target. The value is the content that should be added to the named target. The content can be of type str, float, int, list, dict. @@ -90,8 +90,6 @@ def add_field_to( """ if extends_lists and overwrite_target_field: raise ValueError("An output field can't be overwritten and extended at the same time") - if isinstance(field, dict): - field = list(field.items())[0] target_field, content = field field_path = [event, *get_dotted_field_list(target_field)] target_key = field_path.pop() @@ -116,7 +114,7 @@ def add_field_to( target_parent[target_key].append(content) -def _add_field_to_silent_fail(*args, **kwargs) -> None | str: +def _add_one_field_to_silent_fail(*args, **kwargs) -> None | str: """ Adds a field to an object, ignoring the FieldExistsWarning if the field already exists. Is only needed in the add_batch_to map function. 
Without this the map would terminate early. @@ -134,22 +132,24 @@ FieldExistsWarning: If the field already exists, but this warning is caught and ignored. """ try: - add_field_to(*args, **kwargs) + _add_one_field_to(*args, **kwargs) except FieldExistsWarning as error: return error.skipped_fields[0] -def add_batch_to(event, fields, extends_lists=False, overwrite_target_field=False) -> None: +def add_field_to( + event: dict, fields: dict, extends_lists: bool = False, overwrite_target_field: bool = False +) -> None: """ Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. Parameters: event: dict The event object to which fields are to be added. - targets: list - A list of target field names where the contents will be added. - contents: list - A list of contents corresponding to each target field. + fields: dict + A dict with key value pairs describing the fields that should be added. The key is the dotted subfield + string indicating the target. The value is the content that should be added to the named target. The + content can be of type: str, float, int, list, dict. extends_lists: bool A boolean indicating whether to extend lists if the target field already exists. overwrite_target_field: bool @@ -159,10 +159,14 @@ def add_batch_to(event, fields, extends_lists=False, overwrite_target_field=Fals FieldExistsWarning: If there are targets to which the content could not be added due to field existence restrictions. 
""" + # filter out None values fields = {key: value for key, value in fields.items() if value is not None} number_fields = len(dict(fields)) + if number_fields == 1: + _add_one_field_to(event, list(fields.items())[0], extends_lists, overwrite_target_field) + return unsuccessful_targets = map( - _add_field_to_silent_fail, + _add_one_field_to_silent_fail, itertools.repeat(event, number_fields), fields.items(), itertools.repeat(extends_lists, number_fields), @@ -342,9 +346,9 @@ def snake_to_camel(snake: str) -> str: append_as_list = partial(add_field_to, extends_lists=True) -def add_and_overwrite(event, field, *_): +def add_and_overwrite(event, fields, *_): """wrapper for add_field_to""" - add_field_to(event, field, overwrite_target_field=True) + add_field_to(event, fields, overwrite_target_field=True) def append(event, field, separator): @@ -354,7 +358,7 @@ def append(event, field, separator): if not isinstance(target_value, list): target_value = "" if target_value is None else target_value target_value = f"{target_value}{separator}{content}" - add_and_overwrite(event, field={target_field: target_value}) + add_and_overwrite(event, fields={target_field: target_value}) else: append_as_list(event, field) From 90038c04c179c513741c80e2216538167a8c0dd3 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 09:36:03 +0100 Subject: [PATCH 31/38] revert exception signature and add rule to add_field_to method as argument --- logprep/abc/processor.py | 6 ++-- logprep/processor/base/exceptions.py | 11 ++----- logprep/processor/dissector/processor.py | 4 +-- .../domain_label_extractor/processor.py | 10 ++++-- logprep/processor/field_manager/processor.py | 20 +++++++----- logprep/processor/generic_adder/processor.py | 2 +- .../processor/generic_resolver/processor.py | 3 +- logprep/processor/geoip_enricher/processor.py | 1 + logprep/processor/grokker/processor.py | 3 +- .../processor/hyperscan_resolver/processor.py | 3 +- logprep/processor/ip_informer/processor.py | 2 +- 
logprep/processor/labeler/processor.py | 4 +-- .../processor/list_comparison/processor.py | 2 +- logprep/processor/pre_detector/processor.py | 4 +-- logprep/processor/pseudonymizer/processor.py | 4 ++- logprep/processor/requester/processor.py | 4 ++- .../selective_extractor/processor.py | 2 +- .../processor/string_splitter/processor.py | 2 +- .../processor/template_replacer/processor.py | 5 ++- logprep/processor/timestamper/processor.py | 2 +- logprep/util/helper.py | 31 +++++++++++++------ tests/unit/exceptions/base.py | 2 +- .../exceptions/test_processing_exceptions.py | 2 +- tests/unit/framework/test_pipeline.py | 4 +-- 24 files changed, 80 insertions(+), 53 deletions(-) diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index f6f2a7f10..0eff6d044 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -359,15 +359,15 @@ def _handle_warning_error(self, event, rule, error, failure_tags=None): new_field = {"tags": sorted(list({*failure_tags}))} else: new_field = {"tags": sorted(list({*tags, *failure_tags}))} - add_and_overwrite(event, new_field) + add_and_overwrite(event, new_field, rule) if isinstance(error, ProcessingWarning): if error.tags: tags = tags if tags else [] new_field = {"tags": sorted(list({*error.tags, *tags, *failure_tags}))} - add_and_overwrite(event, new_field) + add_and_overwrite(event, new_field, rule) self.result.warnings.append(error) else: - self.result.warnings.append(ProcessingWarning(str(error), event, rule)) + self.result.warnings.append(ProcessingWarning(str(error), rule, event)) def _has_missing_values(self, event, rule, source_field_dict): missing_fields = list( diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index 4ecb9008d..caaa40f68 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -70,7 +70,7 @@ def __init__(self, message: str, rule: "Rule"): class ProcessingWarning(Warning): """A warning occurred - log the warning, 
but continue processing the event.""" - def __init__(self, message: str, event: dict, rule: "Rule" = None, tags: List[str] = None): + def __init__(self, message: str, rule: "Rule | None", event: dict, tags: List[str] = None): self.tags = tags if tags else [] if rule: rule.metrics.number_of_warnings += 1 @@ -82,16 +82,11 @@ def __init__(self, message: str, event: dict, rule: "Rule" = None, tags: List[st class FieldExistsWarning(ProcessingWarning): """Raised if field already exists.""" - def __init__( - self, - event: dict, - skipped_fields: List[str], - rule: "Rule" = None, - ): + def __init__(self, rule: "Rule | None", event: dict, skipped_fields: List[str]): self.skipped_fields = skipped_fields message = ( "The following fields could not be written, because " "one or more subfields existed and could not be extended: " f"{', '.join(skipped_fields)}" ) - super().__init__(message, event, rule) + super().__init__(message, rule, event) diff --git a/logprep/processor/dissector/processor.py b/logprep/processor/dissector/processor.py index 2257a1a35..ed1aeb0d8 100644 --- a/logprep/processor/dissector/processor.py +++ b/logprep/processor/dissector/processor.py @@ -85,12 +85,12 @@ def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, dict, str, in if strip_char: content = content.strip(strip_char) field = {target_field: content} - yield rule_action, event, field, separator, position + yield rule_action, event, field, separator, rule, position def _apply_convert_datatype(self, event, rule): for target_field, converter in rule.convert_actions: try: target_value = converter(get_dotted_field_value(event, target_field)) - add_field_to(event, {target_field: target_value}, overwrite_target_field=True) + add_field_to(event, {target_field: target_value}, rule, overwrite_target_field=True) except ValueError as error: self._handle_warning_error(event, rule, error) diff --git a/logprep/processor/domain_label_extractor/processor.py 
b/logprep/processor/domain_label_extractor/processor.py index 22683e523..f3354938f 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ b/logprep/processor/domain_label_extractor/processor.py @@ -130,7 +130,9 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): if self._is_valid_ip(domain): tagging_field.append(f"ip_in_{rule.source_fields[0].replace('.', '_')}") - add_and_overwrite(event, fields={self._config.tagging_field_name: tagging_field}) + add_and_overwrite( + event, fields={self._config.tagging_field_name: tagging_field}, rule=rule + ) return labels = self._tld_extractor(domain) @@ -140,10 +142,12 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): f"{rule.target_field}.top_level_domain": labels.suffix, f"{rule.target_field}.subdomain": labels.subdomain, } - add_field_to(event, fields, overwrite_target_field=rule.overwrite_target) + add_field_to(event, fields, rule, overwrite_target_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") - add_and_overwrite(event, fields={self._config.tagging_field_name: tagging_field}) + add_and_overwrite( + event, fields={self._config.tagging_field_name: tagging_field}, rule=rule + ) @staticmethod def _is_valid_ip(domain): diff --git a/logprep/processor/field_manager/processor.py b/logprep/processor/field_manager/processor.py index 7f93b0ffe..97f095636 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -78,7 +78,11 @@ def _apply_mapping(self, event, rule, rule_args): return source_field_values, targets = self._filter_missing_fields(source_field_values, targets) add_field_to( - event, dict(zip(targets, source_field_values)), extend_target_list, overwrite_target + event, + dict(zip(targets, source_field_values)), + rule, + extend_target_list, + overwrite_target, ) if rule.delete_source_fields: for dotted_field in source_fields: @@ -105,7 +109,7 @@ def 
_write_to_single_target(self, args, extend_target_list, overwrite_target, ru case State( extend=True, overwrite=True, single_source_element=False, target_is_list=False ): - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}, rule=rule) return case State( @@ -117,16 +121,16 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}, rule=rule) return case State(extend=True, overwrite=False, target_is_list=False, target_is_none=True): - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}, rule=rule) return case State(extend=True, overwrite=False, target_is_list=False): source_fields_values = [target_field_value, *source_fields_values] - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}, rule=rule) return case State( @@ -134,18 +138,18 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru ): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*target_field_value, *flattened_source_fields] - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, fields={target_field: source_fields_values}, rule=rule) return case State(overwrite=True, extend=True): flattened_source_fields = self._overwrite_from_source_values(source_fields_values) source_fields_values = [*flattened_source_fields] - add_and_overwrite(event, fields={target_field: source_fields_values}) + add_and_overwrite(event, 
fields={target_field: source_fields_values}, rule=rule) return case _: field = {target_field: source_fields_values} - add_field_to(event, field, state.extend, state.overwrite) + add_field_to(event, field, rule, state.extend, state.overwrite) def _overwrite_from_source_values(self, source_fields_values): duplicates = [] diff --git a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index 0dd8a0574..2f9224560 100644 --- a/logprep/processor/generic_adder/processor.py +++ b/logprep/processor/generic_adder/processor.py @@ -230,7 +230,7 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): self._update_db_table() items_to_add = self._get_items_to_add_from_db(event, rule) if items_to_add: - add_field_to(event, items_to_add, rule.extend_target_list, rule.overwrite_target) + add_field_to(event, items_to_add, rule, rule.extend_target_list, rule.overwrite_target) def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> dict | None: """Get the sub part of the value from the event using a regex pattern""" diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index aee580b25..70da7f0b2 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -62,13 +62,14 @@ def _apply_rules(self, event, rule): add_field_to( event, fields={target_field: content}, + rule=rule, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) if conflicting_fields: - raise FieldExistsWarning(event, conflicting_fields, rule) + raise FieldExistsWarning(rule, event, conflicting_fields) def _find_content_of_first_matching_pattern(self, rule, source_field_value): if rule.resolve_from_file: diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index 
c3cad0745..3fc35a074 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -135,6 +135,7 @@ def _apply_rules(self, event, rule): add_field_to( event, fields, + rule=rule, extends_lists=False, overwrite_target_field=rule.overwrite_target, ) diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index b2c830367..983c69f8e 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -88,13 +88,14 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): add_field_to( event, result, + rule=rule, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) if self._handle_missing_fields(event, rule, rule.actions.keys(), source_values): return if not matches: - raise ProcessingWarning("no grok pattern matched", event, rule) + raise ProcessingWarning("no grok pattern matched", rule, event) def setup(self): """Loads the action mapping. 
Has to be called before processing""" diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 81a4b89ee..34dd7cc3c 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -122,6 +122,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): add_field_to( event, fields={resolve_target: dest_val}, + rule=rule, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) @@ -129,7 +130,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): conflicting_fields.extend(error.skipped_fields) self._handle_missing_fields(event, rule, rule.field_mapping.keys(), source_values) if conflicting_fields: - raise FieldExistsWarning(event, conflicting_fields, rule) + raise FieldExistsWarning(rule, event, conflicting_fields) @staticmethod def _match_with_hyperscan(hyperscan_db: Database, src_val: str) -> list: diff --git a/logprep/processor/ip_informer/processor.py b/logprep/processor/ip_informer/processor.py index 5d8ce35a4..1c75bf702 100644 --- a/logprep/processor/ip_informer/processor.py +++ b/logprep/processor/ip_informer/processor.py @@ -54,7 +54,7 @@ def _apply_rules(self, event: dict, rule: IpInformerRule) -> None: if results: self._write_target_field(event, rule, results) for msg, error in self._processing_warnings: - raise ProcessingWarning(msg, event, rule) from error + raise ProcessingWarning(msg, rule, event) from error def _get_results(self, ip_address_list: Iterable, rule: IpInformerRule) -> dict: results = [(ip, self._ip_properties(ip, rule)) for ip in ip_address_list] diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 6b2d47a6f..f74286d82 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -74,10 +74,10 @@ def setup(self): def _apply_rules(self, event, rule): """Applies the rule to the 
current event""" fields = {key: value for key, value in rule.prefixed_label.items()} - add_field_to(event, fields, extends_lists=True) + add_field_to(event, fields, rule=rule, extends_lists=True) # convert sets into sorted lists fields = { key: sorted(set(get_dotted_field_value(event, key))) for key, _ in rule.prefixed_label.items() } - add_field_to(event, fields, overwrite_target_field=True) + add_field_to(event, fields, rule=rule, overwrite_target_field=True) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index 4d1adc2da..dbbbe4c48 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -74,7 +74,7 @@ def _apply_rules(self, event, rule): comparison_result, comparison_key = self._list_comparison(rule, event) if comparison_result is not None: fields = {f"{rule.target_field}.{comparison_key}": comparison_result} - add_field_to(event, fields, extends_lists=True) + add_field_to(event, fields, rule=rule, extends_lists=True) def _list_comparison(self, rule: ListComparisonRule, event: dict): """ diff --git a/logprep/processor/pre_detector/processor.py b/logprep/processor/pre_detector/processor.py index abaa24850..d2efb64ba 100644 --- a/logprep/processor/pre_detector/processor.py +++ b/logprep/processor/pre_detector/processor.py @@ -105,8 +105,8 @@ def normalize_timestamp(self, rule: PreDetectorRule, timestamp: str) -> str: except TimeParserException as error: raise ProcessingWarning( "Could not parse timestamp", - self.result.event, rule, + self.result.event, tags=["_pre_detector_timeparsing_failure"], ) from error @@ -126,7 +126,7 @@ def _get_detection_result(self, event: dict, rule: PreDetectorRule): pre_detection_id = get_dotted_field_value(event, "pre_detection_id") if pre_detection_id is None: pre_detection_id = str(uuid4()) - add_field_to(event, {"pre_detection_id": pre_detection_id}) + add_field_to(event, {"pre_detection_id": pre_detection_id}, 
rule=rule) detection_result = self._generate_detection_result(pre_detection_id, event, rule) self.result.data.append((detection_result, self._config.outputs)) diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index b324e8134..4873e65db 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -264,7 +264,9 @@ def _apply_rules(self, event: dict, rule: PseudonymizerRule): ] else: field_value = self._pseudonymize_field(rule, dotted_field, regex, field_value) - add_field_to(event, fields={dotted_field: field_value}, overwrite_target_field=True) + add_field_to( + event, fields={dotted_field: field_value}, rule=rule, overwrite_target_field=True + ) if "@timestamp" in event: for pseudonym, _ in self.result.data: pseudonym["@timestamp"] = event["@timestamp"] diff --git a/logprep/processor/requester/processor.py b/logprep/processor/requester/processor.py index dc56a8d8c..306691e97 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -73,6 +73,7 @@ def _handle_response(self, event, rule, response): add_field_to( event, fields={rule.target_field: self._get_result(response)}, + rule=rule, extends_lists=rule.extend_target_list, overwrite_target_field=rule.overwrite_target, ) @@ -86,13 +87,14 @@ def _handle_response(self, event, rule, response): add_field_to( event, dict(zip(targets, contents)), + rule, rule.extend_target_list, rule.overwrite_target, ) except FieldExistsWarning as error: conflicting_fields.extend(error.skipped_fields) if conflicting_fields: - raise FieldExistsWarning(event, conflicting_fields, rule) + raise FieldExistsWarning(rule, event, conflicting_fields) def _request(self, event, rule, kwargs): try: diff --git a/logprep/processor/selective_extractor/processor.py b/logprep/processor/selective_extractor/processor.py index fee75a67f..01cf10f23 100644 --- 
a/logprep/processor/selective_extractor/processor.py +++ b/logprep/processor/selective_extractor/processor.py @@ -64,5 +64,5 @@ def _apply_rules(self, event: dict, rule: SelectiveExtractorRule): } if flattened_fields: filtered_event = {} - add_field_to(filtered_event, flattened_fields) + add_field_to(filtered_event, flattened_fields, rule) self.result.data.append((filtered_event, rule.outputs)) diff --git a/logprep/processor/string_splitter/processor.py b/logprep/processor/string_splitter/processor.py index 9d06d335d..f2b94e260 100644 --- a/logprep/processor/string_splitter/processor.py +++ b/logprep/processor/string_splitter/processor.py @@ -42,6 +42,6 @@ def _apply_rules(self, event: dict, rule: StringSplitterRule): source_field_content = get_dotted_field_value(event, source_field) self._handle_missing_fields(event, rule, rule.source_fields, [source_field_content]) if not isinstance(source_field_content, str): - raise ProcessingWarning(f"source_field '{source_field}' is not a string", event, rule) + raise ProcessingWarning(f"source_field '{source_field}' is not a string", rule, event) result = source_field_content.split(rule.delimeter) self._write_target_field(event, rule, result) diff --git a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index 9b17fbcfa..a91951846 100644 --- a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -115,7 +115,10 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl """ overwrite = get_dotted_field_value(event, self._target_field) is not None add_field_to( - event, fields={self._target_field: replacement}, overwrite_target_field=overwrite + event, + fields={self._target_field: replacement}, + rule=rule, + overwrite_target_field=overwrite, ) def setup(self): diff --git a/logprep/processor/timestamper/processor.py b/logprep/processor/timestamper/processor.py index ce1c579b3..71f2bcaa3 100644 --- 
a/logprep/processor/timestamper/processor.py +++ b/logprep/processor/timestamper/processor.py @@ -61,4 +61,4 @@ def _apply_rules(self, event, rule): parsed_successfully = True break if not parsed_successfully: - raise ProcessingWarning(str("Could not parse timestamp"), event, rule) + raise ProcessingWarning(str("Could not parse timestamp"), rule, event) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 26ad4178c..37c47b0e1 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -15,6 +15,7 @@ from logprep.util.defaults import DEFAULT_CONFIG_LOCATION if TYPE_CHECKING: # pragma: no cover + from logprep.processor.base.rule import Rule from logprep.util.configuration import Configuration @@ -62,6 +63,7 @@ def _add_and_not_overwrite_key(sub_dict, key): def _add_one_field_to( event: dict, field: tuple, + rule: "Rule", extends_lists: bool = False, overwrite_target_field: bool = False, ) -> None: @@ -76,6 +78,8 @@ def _add_one_field_to( A key value pair describing the field that should be added. The key is the dotted subfield string indicating the target. The value is the content that should be added to the named target. The content can be of type str, float, int, list, dict. + rule: Rule + A rule that initiated the field addition, is used for proper error handling. 
extends_lists: bool Flag that determines whether target_field lists should be extended overwrite_target_field: bool @@ -101,13 +105,13 @@ try: target_parent = reduce(_add_and_not_overwrite_key, field_path) except KeyError as error: - raise FieldExistsWarning(event, [target_field]) from error + raise FieldExistsWarning(rule, event, [target_field]) from error existing_value = target_parent.get(target_key) if existing_value is None: target_parent[target_key] = content return if not extends_lists or not isinstance(existing_value, list): - raise FieldExistsWarning(event, [target_field]) + raise FieldExistsWarning(rule, event, [target_field]) if isinstance(content, list | set): target_parent[target_key].extend(content) else: @@ -138,7 +142,11 @@ def _add_one_field_to_silent_fail(*args, **kwargs) -> None | str: def add_field_to( - event: dict, fields: dict, extends_lists: bool = False, overwrite_target_field: bool = False + event: dict, + fields: dict, + rule: "Rule" = None, + extends_lists: bool = False, + overwrite_target_field: bool = False, ) -> None: """ Handles the batch addition operation while raising a FieldExistsWarning with all unsuccessful targets. Parameters ---------- event: dict Original log message that is used to store the current state of the processing pipeline fields: dict A dict with key value pairs describing the fields that should be added. The key is the dotted subfield string indicating the target. The value is the content that should be added to the named target. The content can be of type: str, float, int, list, dict. + rule: Rule + A rule that initiated the field addition, is used for proper error handling. extends_lists: bool A boolean indicating whether to extend lists if the target field already exists. 
overwrite_target_field: bool @@ -163,18 +173,21 @@ def add_field_to( fields = {key: value for key, value in fields.items() if value is not None} number_fields = len(dict(fields)) if number_fields == 1: - _add_one_field_to(event, list(fields.items())[0], extends_lists, overwrite_target_field) + _add_one_field_to( + event, list(fields.items())[0], rule, extends_lists, overwrite_target_field + ) return unsuccessful_targets = map( _add_one_field_to_silent_fail, itertools.repeat(event, number_fields), fields.items(), + itertools.repeat(rule, number_fields), itertools.repeat(extends_lists, number_fields), itertools.repeat(overwrite_target_field, number_fields), ) unsuccessful_targets = [item for item in unsuccessful_targets if item is not None] if unsuccessful_targets: - raise FieldExistsWarning(event, unsuccessful_targets) + raise FieldExistsWarning(rule, event, unsuccessful_targets) def _get_slice_arg(slice_item): @@ -346,19 +359,19 @@ def snake_to_camel(snake: str) -> str: append_as_list = partial(add_field_to, extends_lists=True) -def add_and_overwrite(event, fields, *_): +def add_and_overwrite(event, fields, rule, *_): """wrapper for add_field_to""" - add_field_to(event, fields, overwrite_target_field=True) + add_field_to(event, fields, rule, overwrite_target_field=True) -def append(event, field, separator): +def append(event, field, separator, rule): """appends to event""" target_field, content = list(field.items())[0] target_value = get_dotted_field_value(event, target_field) if not isinstance(target_value, list): target_value = "" if target_value is None else target_value target_value = f"{target_value}{separator}{content}" - add_and_overwrite(event, fields={target_field: target_value}) + add_and_overwrite(event, fields={target_field: target_value}, rule=rule) else: append_as_list(event, field) diff --git a/tests/unit/exceptions/base.py b/tests/unit/exceptions/base.py index 3a658ad1f..72013e228 100644 --- a/tests/unit/exceptions/base.py +++ 
b/tests/unit/exceptions/base.py @@ -22,7 +22,7 @@ class ExceptionBaseTest: def setup_method(self): self.object = Rule._create_from_dict({"filter": "message", "rule": {}}) self.event = {"message": "test_event"} - self.exception_args = ("the error message", self.event, self.object) + self.exception_args = ("the error message", self.object, self.event) def test_error_message(self): with pytest.raises(self.exception, match=self.error_message): diff --git a/tests/unit/exceptions/test_processing_exceptions.py b/tests/unit/exceptions/test_processing_exceptions.py index 5d24244ef..52c8534b6 100644 --- a/tests/unit/exceptions/test_processing_exceptions.py +++ b/tests/unit/exceptions/test_processing_exceptions.py @@ -35,7 +35,7 @@ class TestFieldExistsWarning(ExceptionBaseTest): def setup_method(self): super().setup_method() - self.exception_args = (self.event, ["my_field"], self.object) + self.exception_args = (self.object, self.event, ["my_field"]) class TestProcessingCriticalError(ExceptionBaseTest): diff --git a/tests/unit/framework/test_pipeline.py b/tests/unit/framework/test_pipeline.py index 2c8f4b8f7..3dec94242 100644 --- a/tests/unit/framework/test_pipeline.py +++ b/tests/unit/framework/test_pipeline.py @@ -253,7 +253,7 @@ def test_processor_warning_error_is_logged_but_processing_continues(self, mock_w self.pipeline._setup() self.pipeline._input.get_next.return_value = {"message": "test"} mock_rule = mock.MagicMock() - processing_warning = ProcessingWarning("not so bad", {"message": "test"}, mock_rule) + processing_warning = ProcessingWarning("not so bad", mock_rule, {"message": "test"}) self.pipeline._pipeline[1].process.return_value = ProcessorResult( processor_name="mock_processor", warnings=[processing_warning] ) @@ -308,7 +308,7 @@ def test_processor_logs_processing_error_and_warnings_separately( mock_create({"mock_processor1": {"type": "mock_processor"}}), mock_create({"mock_processor2": {"type": "mock_processor"}}), ] - warning = 
FieldExistsWarning(input_event1, ["foo"], mock_rule) + warning = FieldExistsWarning(mock_rule, input_event1, ["foo"]) self.pipeline._pipeline[0].process.return_value = ProcessorResult( processor_name="", warnings=[warning] ) From 6439b8c8dfa7f674d0ec49f77e720a8c742ac9df Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 10:55:51 +0100 Subject: [PATCH 32/38] revert key indexing in dissector back to original --- logprep/processor/dissector/processor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/logprep/processor/dissector/processor.py b/logprep/processor/dissector/processor.py index ed1aeb0d8..e42f6de8b 100644 --- a/logprep/processor/dissector/processor.py +++ b/logprep/processor/dissector/processor.py @@ -28,12 +28,15 @@ .. automodule:: logprep.processor.dissector.rule """ -from typing import Callable, List, Tuple +from typing import TYPE_CHECKING, Callable, List, Tuple from logprep.processor.dissector.rule import DissectorRule from logprep.processor.field_manager.processor import FieldManager from logprep.util.helper import add_field_to, get_dotted_field_value +if TYPE_CHECKING: + from logprep.processor.base.rule import Rule + class Dissector(FieldManager): """A processor that tokenizes field values to new fields and converts datatypes""" @@ -46,12 +49,12 @@ def _apply_rules(self, event, rule): def _apply_mapping(self, event, rule): action_mappings_sorted_by_position = sorted( - self._get_mappings(event, rule), key=lambda x: x[-1] + self._get_mappings(event, rule), key=lambda x: x[5] ) for action, *args, _ in action_mappings_sorted_by_position: action(*args) - def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, dict, str, int]]: + def _get_mappings(self, event, rule) -> List[Tuple[Callable, dict, dict, str, "Rule", int]]: current_field = None target_field_mapping = {} for rule_action in rule.actions: From ad2bc78ae09cb0bf154a23f8c9c4552f06cb8e1d Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 
11:08:09 +0100 Subject: [PATCH 33/38] add tests for multiple field additions in add_field_to - Add tests to cover adding multiple fields at once. - Include scenarios for overwriting, extending lists, and raising warnings. --- tests/unit/util/test_helper_add_field.py | 46 ++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index 1605a33bc..9c9a29236 100644 --- a/tests/unit/util/test_helper_add_field.py +++ b/tests/unit/util/test_helper_add_field.py @@ -128,9 +128,49 @@ def test_add_list_with_nested_keys(self): def test_add_field_to_adds_value_not_as_list(self): # checks if a newly added field is added not as list, even when `extends_list` is True - document = { - "some": "field", - } + document = {"some": "field"} add_field_to(document, {"new": "list"}, extends_lists=True) assert document.get("new") == "list" assert not isinstance(document.get("new"), list) + + def test_add_field_to_adds_multiple_fields(self): + document = {"some": "field"} + expected = { + "some": "field", + "new": "foo", + "new2": "bar", + } + add_field_to(document, {"new": "foo", "new2": "bar"}) + assert document == expected + + def test_add_field_too_adds_multiple_fields_and_overwrites_one(self): + document = {"some": "field", "exists_already": "original content"} + expected = { + "some": "field", + "exists_already": {"updated": "content"}, + "new": "another content", + } + new_fields = {"exists_already": {"updated": "content"}, "new": "another content"} + add_field_to(document, new_fields, overwrite_target_field=True) + assert document == expected + + def test_add_field_too_adds_multiple_fields_and_extends_one(self): + document = {"some": "field", "exists_already": ["original content"]} + expected = { + "some": "field", + "exists_already": ["original content", "extended content"], + "new": "another content", + } + new_fields = {"exists_already": ["extended content"], "new": 
"another content"} + add_field_to(document, new_fields, extends_lists=True) + assert document == expected + + def test_add_field_adds_multiple_fields_and_raises_one_field_exists_warning(self): + document = {"some": "field", "exists_already": "original content"} + with pytest.raises(FieldExistsWarning, match=r"could not be written"): + add_field_to(document, {"exists_already": "new content", "new": "another content"}) + assert document == { + "some": "field", + "exists_already": "original content", + "new": "another content", + } From a22bf0a06e5a5fc69afd3b0ffb687de816167b13 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 11:17:14 +0100 Subject: [PATCH 34/38] renamed 'add_field_to' to 'add_fields_to' --- logprep/abc/input.py | 12 ++--- logprep/abc/processor.py | 4 +- logprep/metrics/metrics.py | 10 ++-- logprep/processor/clusterer/processor.py | 4 +- logprep/processor/dissector/processor.py | 6 ++- .../domain_label_extractor/processor.py | 4 +- .../processor/domain_resolver/processor.py | 4 +- logprep/processor/field_manager/processor.py | 6 +-- logprep/processor/generic_adder/processor.py | 4 +- .../processor/generic_resolver/processor.py | 4 +- logprep/processor/geoip_enricher/processor.py | 4 +- logprep/processor/grokker/processor.py | 4 +- .../processor/hyperscan_resolver/processor.py | 4 +- logprep/processor/labeler/processor.py | 6 +-- .../processor/list_comparison/processor.py | 4 +- logprep/processor/pre_detector/processor.py | 4 +- logprep/processor/pseudonymizer/processor.py | 4 +- logprep/processor/requester/processor.py | 6 +-- .../selective_extractor/processor.py | 4 +- .../processor/template_replacer/processor.py | 4 +- logprep/util/helper.py | 6 +-- tests/unit/util/test_helper_add_field.py | 46 +++++++++---------- 22 files changed, 79 insertions(+), 75 deletions(-) diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 2e28eabf4..396d995c0 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -18,7 +18,7 @@ from 
logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser from logprep.util.validators import dict_structure_validator @@ -308,7 +308,7 @@ def _add_env_enrichment_to_event(self, event: dict): target: os.environ.get(variable_name, "") for target, variable_name in enrichments.items() } - add_field_to(event, fields) + add_fields_to(event, fields) def _add_arrival_time_information_to_event(self, event: dict): new_field = { @@ -316,7 +316,7 @@ def _add_arrival_time_information_to_event(self, event: dict): "log_arrival_time_target_field" ): TimeParser.now().isoformat() } - add_field_to(event, new_field) + add_fields_to(event, new_field) def _add_arrival_timedelta_information_to_event(self, event: dict): log_arrival_timedelta_config = self._config.preprocessing.get("log_arrival_timedelta") @@ -332,13 +332,13 @@ def _add_arrival_timedelta_information_to_event(self, event: dict): TimeParser.from_string(log_arrival_time).astimezone(UTC) - TimeParser.from_string(time_reference).astimezone(UTC) ).total_seconds() - add_field_to(event, fields={target_field: delta_time_sec}) + add_fields_to(event, fields={target_field: delta_time_sec}) def _add_version_information_to_event(self, event: dict): """Add the version information to the event""" target_field = self._config.preprocessing.get("version_info_target_field") # pylint: disable=protected-access - add_field_to(event, fields={target_field: self._config._version_information}) + add_fields_to(event, fields={target_field: self._config._version_information}) # pylint: enable=protected-access def _add_hmac_to(self, event_dict, raw_event) -> dict: @@ -397,5 +397,5 @@ def _add_hmac_to(self, event_dict, raw_event) -> dict: "compressed_base64": 
base64.b64encode(compressed).decode(), } } - add_field_to(event_dict, new_field) + add_fields_to(event_dict, new_field) return event_dict diff --git a/logprep/abc/processor.py b/logprep/abc/processor.py index 0eff6d044..802ba31c3 100644 --- a/logprep/abc/processor.py +++ b/logprep/abc/processor.py @@ -19,7 +19,7 @@ from logprep.util import getter from logprep.util.helper import ( add_and_overwrite, - add_field_to, + add_fields_to, get_dotted_field_value, pop_dotted_field_value, ) @@ -382,7 +382,7 @@ def _has_missing_values(self, event, rule, source_field_dict): return False def _write_target_field(self, event: dict, rule: "Rule", result: any) -> None: - add_field_to( + add_fields_to( event, fields={rule.target_field: result}, extends_lists=rule.extend_target_list, diff --git a/logprep/metrics/metrics.py b/logprep/metrics/metrics.py index 99fdb7702..c89c6621c 100644 --- a/logprep/metrics/metrics.py +++ b/logprep/metrics/metrics.py @@ -124,7 +124,7 @@ from attrs import define, field, validators from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram -from logprep.util.helper import add_field_to +from logprep.util.helper import add_fields_to @define(kw_only=True, slots=False) @@ -222,12 +222,14 @@ def inner(self, *args, **kwargs): # nosemgrep if hasattr(self, "rule_type"): event = args[0] if event: - add_field_to(event, fields={f"processing_times.{self.rule_type}": duration}) + add_fields_to( + event, fields={f"processing_times.{self.rule_type}": duration} + ) if hasattr(self, "_logprep_config"): # attribute of the Pipeline class event = args[0] if event: - add_field_to(event, fields={"processing_times.pipeline": duration}) - add_field_to(event, fields={"processing_times.hostname": gethostname()}) + add_fields_to(event, fields={"processing_times.pipeline": duration}) + add_fields_to(event, fields={"processing_times.hostname": gethostname()}) return result return inner diff --git a/logprep/processor/clusterer/processor.py 
b/logprep/processor/clusterer/processor.py index 04ae30014..1bc375c83 100644 --- a/logprep/processor/clusterer/processor.py +++ b/logprep/processor/clusterer/processor.py @@ -53,7 +53,7 @@ SignaturePhaseStreaming, ) from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value class Clusterer(FieldManager): @@ -138,7 +138,7 @@ def _cluster(self, event: dict, rule: ClustererRule): ) else: cluster_signature = cluster_signature_based_on_message - add_field_to( + add_fields_to( event, fields={self._config.output_field_name: cluster_signature}, extends_lists=rule.extend_target_list, diff --git a/logprep/processor/dissector/processor.py b/logprep/processor/dissector/processor.py index e42f6de8b..3cc3bc34a 100644 --- a/logprep/processor/dissector/processor.py +++ b/logprep/processor/dissector/processor.py @@ -32,7 +32,7 @@ from logprep.processor.dissector.rule import DissectorRule from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value if TYPE_CHECKING: from logprep.processor.base.rule import Rule @@ -94,6 +94,8 @@ def _apply_convert_datatype(self, event, rule): for target_field, converter in rule.convert_actions: try: target_value = converter(get_dotted_field_value(event, target_field)) - add_field_to(event, {target_field: target_value}, rule, overwrite_target_field=True) + add_fields_to( + event, {target_field: target_value}, rule, overwrite_target_field=True + ) except ValueError as error: self._handle_warning_error(event, rule, error) diff --git a/logprep/processor/domain_label_extractor/processor.py b/logprep/processor/domain_label_extractor/processor.py index f3354938f..5edca61d4 100644 --- a/logprep/processor/domain_label_extractor/processor.py +++ 
b/logprep/processor/domain_label_extractor/processor.py @@ -49,7 +49,7 @@ from logprep.processor.domain_label_extractor.rule import DomainLabelExtractorRule from logprep.processor.field_manager.processor import FieldManager from logprep.util.getter import GetterFactory -from logprep.util.helper import add_and_overwrite, add_field_to, get_dotted_field_value +from logprep.util.helper import add_and_overwrite, add_fields_to, get_dotted_field_value from logprep.util.validators import list_of_urls_validator logger = logging.getLogger("DomainLabelExtractor") @@ -142,7 +142,7 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule): f"{rule.target_field}.top_level_domain": labels.suffix, f"{rule.target_field}.subdomain": labels.subdomain, } - add_field_to(event, fields, rule, overwrite_target_field=rule.overwrite_target) + add_fields_to(event, fields, rule, overwrite_target_field=rule.overwrite_target) else: tagging_field.append(f"invalid_domain_in_{rule.source_fields[0].replace('.', '_')}") add_and_overwrite( diff --git a/logprep/processor/domain_resolver/processor.py b/logprep/processor/domain_resolver/processor.py index 53692ea0a..3b5655651 100644 --- a/logprep/processor/domain_resolver/processor.py +++ b/logprep/processor/domain_resolver/processor.py @@ -53,7 +53,7 @@ from logprep.util.cache import Cache from logprep.util.getter import GetterFactory from logprep.util.hasher import SHA256Hasher -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value from logprep.util.validators import list_of_urls_validator logger = logging.getLogger("DomainResolver") @@ -227,4 +227,4 @@ def _store_debug_infos(self, event, requires_storing): "cache_size": len(self._domain_ip_map.keys()), } } - add_field_to(event, event_dbg, overwrite_target_field=True) + add_fields_to(event, event_dbg, overwrite_target_field=True) diff --git a/logprep/processor/field_manager/processor.py 
b/logprep/processor/field_manager/processor.py index 97f095636..422dccb0f 100644 --- a/logprep/processor/field_manager/processor.py +++ b/logprep/processor/field_manager/processor.py @@ -35,7 +35,7 @@ from logprep.processor.field_manager.rule import FieldManagerRule from logprep.util.helper import ( add_and_overwrite, - add_field_to, + add_fields_to, get_dotted_field_value, pop_dotted_field_value, ) @@ -77,7 +77,7 @@ def _apply_mapping(self, event, rule, rule_args): if not any(source_field_values): return source_field_values, targets = self._filter_missing_fields(source_field_values, targets) - add_field_to( + add_fields_to( event, dict(zip(targets, source_field_values)), rule, @@ -149,7 +149,7 @@ def _write_to_single_target(self, args, extend_target_list, overwrite_target, ru case _: field = {target_field: source_fields_values} - add_field_to(event, field, rule, state.extend, state.overwrite) + add_fields_to(event, field, rule, state.extend, state.overwrite) def _overwrite_from_source_values(self, source_fields_values): duplicates = [] diff --git a/logprep/processor/generic_adder/processor.py b/logprep/processor/generic_adder/processor.py index 2f9224560..2e9677894 100644 --- a/logprep/processor/generic_adder/processor.py +++ b/logprep/processor/generic_adder/processor.py @@ -48,7 +48,7 @@ from logprep.factory_error import InvalidConfigurationError from logprep.processor.generic_adder.mysql_connector import MySQLConnector from logprep.processor.generic_adder.rule import GenericAdderRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value def sql_config_validator(_, attribute, value): @@ -230,7 +230,7 @@ def _apply_rules(self, event: dict, rule: GenericAdderRule): self._update_db_table() items_to_add = self._get_items_to_add_from_db(event, rule) if items_to_add: - add_field_to(event, items_to_add, rule, rule.extend_target_list, rule.overwrite_target) + add_fields_to(event, 
items_to_add, rule, rule.extend_target_list, rule.overwrite_target) def _get_items_to_add_from_db(self, event: dict, rule: GenericAdderRule) -> dict | None: """Get the sub part of the value from the event using a regex pattern""" diff --git a/logprep/processor/generic_resolver/processor.py b/logprep/processor/generic_resolver/processor.py index 70da7f0b2..e386deda2 100644 --- a/logprep/processor/generic_resolver/processor.py +++ b/logprep/processor/generic_resolver/processor.py @@ -30,7 +30,7 @@ from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.generic_resolver.rule import GenericResolverRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value class GenericResolver(FieldManager): @@ -59,7 +59,7 @@ def _apply_rules(self, event, rule): if rule.extend_target_list and current_content is None: content = [content] try: - add_field_to( + add_fields_to( event, fields={target_field: content}, rule=rule, diff --git a/logprep/processor/geoip_enricher/processor.py b/logprep/processor/geoip_enricher/processor.py index 3fc35a074..9a917f3fa 100644 --- a/logprep/processor/geoip_enricher/processor.py +++ b/logprep/processor/geoip_enricher/processor.py @@ -41,7 +41,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.geoip_enricher.rule import GEOIP_DATA_STUBS, GeoipEnricherRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value logger = logging.getLogger("GeoipEnricher") @@ -132,7 +132,7 @@ def _apply_rules(self, event, rule): rule.customize_target_subfields.get(target, f"{rule.target_field}.{target}"): value for target, value in geoip_data.items() } - add_field_to( + add_fields_to( event, fields, rule=rule, diff 
--git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index 983c69f8e..cf7a5e32d 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -42,7 +42,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.grokker.rule import GrokkerRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value logger = logging.getLogger("Grokker") @@ -85,7 +85,7 @@ def _apply_rules(self, event: dict, rule: GrokkerRule): if result is None or result == {}: continue matches.append(True) - add_field_to( + add_fields_to( event, result, rule=rule, diff --git a/logprep/processor/hyperscan_resolver/processor.py b/logprep/processor/hyperscan_resolver/processor.py index 34dd7cc3c..e19a32e4e 100644 --- a/logprep/processor/hyperscan_resolver/processor.py +++ b/logprep/processor/hyperscan_resolver/processor.py @@ -43,7 +43,7 @@ SkipImportError, ) from logprep.processor.field_manager.processor import FieldManager -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value from logprep.util.validators import directory_validator # pylint: disable=no-name-in-module @@ -119,7 +119,7 @@ def _apply_rules(self, event: dict, rule: HyperscanResolverRule): if rule.extend_target_list and current_content is None: dest_val = [dest_val] try: - add_field_to( + add_fields_to( event, fields={resolve_target: dest_val}, rule=rule, diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index f74286d82..7ee44ed0d 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -33,7 +33,7 @@ from logprep.abc.processor import Processor from logprep.processor.labeler.labeling_schema import LabelingSchema from 
logprep.processor.labeler.rule import LabelerRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value class Labeler(Processor): @@ -74,10 +74,10 @@ def setup(self): def _apply_rules(self, event, rule): """Applies the rule to the current event""" fields = {key: value for key, value in rule.prefixed_label.items()} - add_field_to(event, fields, rule=rule, extends_lists=True) + add_fields_to(event, fields, rule=rule, extends_lists=True) # convert sets into sorted lists fields = { key: sorted(set(get_dotted_field_value(event, key))) for key, _ in rule.prefixed_label.items() } - add_field_to(event, fields, rule=rule, overwrite_target_field=True) + add_fields_to(event, fields, rule=rule, overwrite_target_field=True) diff --git a/logprep/processor/list_comparison/processor.py b/logprep/processor/list_comparison/processor.py index dbbbe4c48..d2064362a 100644 --- a/logprep/processor/list_comparison/processor.py +++ b/logprep/processor/list_comparison/processor.py @@ -32,7 +32,7 @@ from logprep.abc.processor import Processor from logprep.processor.list_comparison.rule import ListComparisonRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value class ListComparison(Processor): @@ -74,7 +74,7 @@ def _apply_rules(self, event, rule): comparison_result, comparison_key = self._list_comparison(rule, event) if comparison_result is not None: fields = {f"{rule.target_field}.{comparison_key}": comparison_result} - add_field_to(event, fields, rule=rule, extends_lists=True) + add_fields_to(event, fields, rule=rule, extends_lists=True) def _list_comparison(self, rule: ListComparisonRule, event: dict): """ diff --git a/logprep/processor/pre_detector/processor.py b/logprep/processor/pre_detector/processor.py index d2efb64ba..ad10ea062 100644 --- a/logprep/processor/pre_detector/processor.py +++ 
b/logprep/processor/pre_detector/processor.py @@ -39,7 +39,7 @@ from logprep.processor.base.exceptions import ProcessingWarning from logprep.processor.pre_detector.ip_alerter import IPAlerter from logprep.processor.pre_detector.rule import PreDetectorRule -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value from logprep.util.time import TimeParser, TimeParserException @@ -126,7 +126,7 @@ def _get_detection_result(self, event: dict, rule: PreDetectorRule): pre_detection_id = get_dotted_field_value(event, "pre_detection_id") if pre_detection_id is None: pre_detection_id = str(uuid4()) - add_field_to(event, {"pre_detection_id": pre_detection_id}, rule=rule) + add_fields_to(event, {"pre_detection_id": pre_detection_id}, rule=rule) detection_result = self._generate_detection_result(pre_detection_id, event, rule) self.result.data.append((detection_result, self._config.outputs)) diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 4873e65db..ff57c66e8 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -64,7 +64,7 @@ from logprep.processor.pseudonymizer.rule import PseudonymizerRule from logprep.util.getter import GetterFactory from logprep.util.hasher import SHA256Hasher -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value from logprep.util.pseudo.encrypter import ( DualPKCS1HybridCTREncrypter, DualPKCS1HybridGCMEncrypter, @@ -264,7 +264,7 @@ def _apply_rules(self, event: dict, rule: PseudonymizerRule): ] else: field_value = self._pseudonymize_field(rule, dotted_field, regex, field_value) - add_field_to( + add_fields_to( event, fields={dotted_field: field_value}, rule=rule, overwrite_target_field=True ) if "@timestamp" in event: diff --git a/logprep/processor/requester/processor.py 
b/logprep/processor/requester/processor.py index 306691e97..aa2e6edea 100644 --- a/logprep/processor/requester/processor.py +++ b/logprep/processor/requester/processor.py @@ -44,7 +44,7 @@ from logprep.processor.base.exceptions import FieldExistsWarning from logprep.processor.field_manager.processor import FieldManager from logprep.processor.requester.rule import RequesterRule -from logprep.util.helper import add_field_to, get_source_fields_dict +from logprep.util.helper import add_fields_to, get_source_fields_dict TEMPLATE_KWARGS = ("url", "json", "data", "params") @@ -70,7 +70,7 @@ def _handle_response(self, event, rule, response): conflicting_fields = [] if rule.target_field: try: - add_field_to( + add_fields_to( event, fields={rule.target_field: self._get_result(response)}, rule=rule, @@ -84,7 +84,7 @@ def _handle_response(self, event, rule, response): contents = self._get_field_values(self._get_result(response), source_fields) targets = rule.target_field_mapping.values() try: - add_field_to( + add_fields_to( event, dict(zip(targets, contents)), rule, diff --git a/logprep/processor/selective_extractor/processor.py b/logprep/processor/selective_extractor/processor.py index 01cf10f23..c0bcf2ddd 100644 --- a/logprep/processor/selective_extractor/processor.py +++ b/logprep/processor/selective_extractor/processor.py @@ -31,7 +31,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.selective_extractor.rule import SelectiveExtractorRule -from logprep.util.helper import add_field_to, get_source_fields_dict +from logprep.util.helper import add_fields_to, get_source_fields_dict class SelectiveExtractor(FieldManager): @@ -64,5 +64,5 @@ def _apply_rules(self, event: dict, rule: SelectiveExtractorRule): } if flattened_fields: filtered_event = {} - add_field_to(filtered_event, flattened_fields, rule) + add_fields_to(filtered_event, flattened_fields, rule) self.result.data.append((filtered_event, rule.outputs)) diff --git 
a/logprep/processor/template_replacer/processor.py b/logprep/processor/template_replacer/processor.py index a91951846..e5101a292 100644 --- a/logprep/processor/template_replacer/processor.py +++ b/logprep/processor/template_replacer/processor.py @@ -41,7 +41,7 @@ from logprep.processor.field_manager.processor import FieldManager from logprep.processor.template_replacer.rule import TemplateReplacerRule from logprep.util.getter import GetterFactory -from logprep.util.helper import add_field_to, get_dotted_field_value +from logprep.util.helper import add_fields_to, get_dotted_field_value class TemplateReplacerError(Exception): @@ -114,7 +114,7 @@ def _perform_replacement(self, event: dict, replacement: str, rule: TemplateRepl Therefore, they wouldn't be replaced, and we can overwrite the existing target field. """ overwrite = get_dotted_field_value(event, self._target_field) is not None - add_field_to( + add_fields_to( event, fields={self._target_field: replacement}, rule=rule, diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 37c47b0e1..43b1dc827 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -141,7 +141,7 @@ def _add_one_field_to_silent_fail(*args, **kwargs) -> None | str: return error.skipped_fields[0] -def add_field_to( +def add_fields_to( event: dict, fields: dict, rule: "Rule" = None, @@ -356,12 +356,12 @@ def snake_to_camel(snake: str) -> str: return camel -append_as_list = partial(add_field_to, extends_lists=True) +append_as_list = partial(add_fields_to, extends_lists=True) def add_and_overwrite(event, fields, rule, *_): """wrapper for add_field_to""" - add_field_to(event, fields, rule, overwrite_target_field=True) + add_fields_to(event, fields, rule, overwrite_target_field=True) def append(event, field, separator, rule): diff --git a/tests/unit/util/test_helper_add_field.py b/tests/unit/util/test_helper_add_field.py index 9c9a29236..7e7731415 100644 --- a/tests/unit/util/test_helper_add_field.py +++ 
b/tests/unit/util/test_helper_add_field.py @@ -3,20 +3,20 @@ import pytest from logprep.processor.base.exceptions import FieldExistsWarning -from logprep.util.helper import add_field_to +from logprep.util.helper import add_fields_to class TestHelperAddField: def test_add_str_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "field": "content"} - add_field_to(document, {"field": "content"}) + add_fields_to(document, {"field": "content"}) assert document == expected_document def test_add_str_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "content"}} - add_field_to(document, {"sub.field": "content"}) + add_fields_to(document, {"sub.field": "content"}) assert document == expected_document def test_add_str_content_as_partially_new_dotted_subfield(self): @@ -26,31 +26,31 @@ def test_add_str_content_as_partially_new_dotted_subfield(self): "sub": {"field": "content", "other_field": "other_content"}, } - add_field_to(document, {"sub.field": "content"}) + add_fields_to(document, {"sub.field": "content"}) assert document == expected_document def test_provoke_str_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": "exists already"} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"field": "content"}) + add_fields_to(document, {"field": "content"}) assert document def test_provoke_str_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": "exists already"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"sub.field": "content"}) + add_fields_to(document, {"sub.field": "content"}) assert document def test_add_dict_content_as_new_root_field(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, 
"field": {"dict": "content"}} - add_field_to(document, {"field": {"dict": "content"}}) + add_fields_to(document, {"field": {"dict": "content"}}) assert document == expected_document def test_add_dict_content_as_completely_new_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}} expected_document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}}} - add_field_to(document, {"sub.field": {"dict": "content"}}) + add_fields_to(document, {"sub.field": {"dict": "content"}}) assert document == expected_document def test_add_dict_content_as_partially_new_dotted_subfield(self): @@ -59,46 +59,46 @@ def test_add_dict_content_as_partially_new_dotted_subfield(self): "source": {"ip": "8.8.8.8"}, "sub": {"field": {"dict": "content"}, "other_field": "other_content"}, } - add_field_to(document, {"sub.field": {"dict": "content"}}) + add_fields_to(document, {"sub.field": {"dict": "content"}}) assert document == expected_document def test_provoke_dict_duplicate_in_root_field(self): document = {"source": {"ip": "8.8.8.8"}, "field": {"already_existing": "dict"}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"field": {"dict": "content"}}) + add_fields_to(document, {"field": {"dict": "content"}}) assert document def test_provoke_dict_duplicate_in_dotted_subfield(self): document = {"source": {"ip": "8.8.8.8"}, "sub": {"field": {"already_existing": "dict"}}} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"sub.field": {"dict": "content"}}) + add_fields_to(document, {"sub.field": {"dict": "content"}}) def test_add_field_to_overwrites_output_field_in_root_level(self): document = {"some": "field", "output_field": "has already content"} - add_field_to(document, {"output_field": {"dict": "content"}}, overwrite_target_field=True) + add_fields_to(document, {"output_field": {"dict": "content"}}, overwrite_target_field=True) assert document.get("output_field") == {"dict": 
"content"} def test_add_field_to_overwrites_output_field_in_nested_level(self): document = {"some": "field", "nested": {"output": {"field": "has already content"}}} - add_field_to( + add_fields_to( document, {"nested.output.field": {"dict": "content"}}, overwrite_target_field=True ) assert document.get("nested", {}).get("output", {}).get("field") == {"dict": "content"} def test_add_field_to_extends_list_when_only_given_a_string(self): document = {"some": "field", "some_list": ["with a value"]} - add_field_to(document, {"some_list": "new value"}, extends_lists=True) + add_fields_to(document, {"some_list": "new value"}, extends_lists=True) assert document.get("some_list") == ["with a value", "new value"] def test_add_field_to_extends_list_when_given_a_list(self): document = {"some": "field", "some_list": ["with a value"]} - add_field_to(document, {"some_list": ["first", "second"]}, extends_lists=True) + add_fields_to(document, {"some_list": ["first", "second"]}, extends_lists=True) assert document.get("some_list") == ["with a value", "first", "second"] def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_same_time(self): document = {"some": "field", "some_list": ["with a value"]} with pytest.raises(ValueError, match=r"can't be overwritten and extended at the same time"): - add_field_to( + add_fields_to( document, {"some_list": ["first", "second"]}, extends_lists=True, @@ -109,7 +109,7 @@ def test_add_field_to_raises_if_list_should_be_extended_and_overwritten_at_the_s def test_returns_false_if_dotted_field_value_key_exists(self): document = {"user": "Franz"} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"user.in_list": ["user_inlist"]}) + add_fields_to(document, {"user.in_list": ["user_inlist"]}) assert document def test_add_list_with_nested_keys(self): @@ -123,13 +123,13 @@ def test_add_list_with_nested_keys(self): } } } - add_field_to(testdict, {"key1.key2.key3.key4.key5.list": 
["content"]}, extends_lists=True) + add_fields_to(testdict, {"key1.key2.key3.key4.key5.list": ["content"]}, extends_lists=True) assert testdict == expected def test_add_field_to_adds_value_not_as_list(self): # checks if a newly added field is added not as list, even when `extends_list` is True document = {"some": "field"} - add_field_to(document, {"new": "list"}, extends_lists=True) + add_fields_to(document, {"new": "list"}, extends_lists=True) assert document.get("new") == "list" assert not isinstance(document.get("new"), list) @@ -140,7 +140,7 @@ def test_add_field_to_adds_multiple_fields(self): "new": "foo", "new2": "bar", } - add_field_to(document, {"new": "foo", "new2": "bar"}) + add_fields_to(document, {"new": "foo", "new2": "bar"}) assert document == expected def test_add_field_too_adds_multiple_fields_and_overwrites_one(self): @@ -151,7 +151,7 @@ def test_add_field_too_adds_multiple_fields_and_overwrites_one(self): "new": "another content", } new_fields = {"exists_already": {"updated": "content"}, "new": "another content"} - add_field_to(document, new_fields, overwrite_target_field=True) + add_fields_to(document, new_fields, overwrite_target_field=True) assert document == expected def test_add_field_too_adds_multiple_fields_and_extends_one(self): @@ -162,13 +162,13 @@ def test_add_field_too_adds_multiple_fields_and_extends_one(self): "new": "another content", } new_fields = {"exists_already": ["extended content"], "new": "another content"} - add_field_to(document, new_fields, extends_lists=True) + add_fields_to(document, new_fields, extends_lists=True) assert document == expected def test_add_field_adds_multiple_fields_and_raises_one_field_exists_warning(self): document = {"some": "field", "exists_already": "original content"} with pytest.raises(FieldExistsWarning, match=r"could not be written"): - add_field_to(document, {"exists_already": "new content", "new": "another content"}) + add_fields_to(document, {"exists_already": "new content", "new": "another 
content"}) assert document == { "some": "field", "exists_already": "original content", From 0facf6110166b6125288be32441261256f29e8b5 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 11:33:05 +0100 Subject: [PATCH 35/38] fix typo in StringSplitterRule - Corrected 'delimiter' typo across test and implementation files. - Updated changelog to reflect this fix. --- CHANGELOG.md | 1 + logprep/processor/string_splitter/processor.py | 2 +- logprep/processor/string_splitter/rule.py | 10 +++++----- .../processor/string_splitter/test_string_splitter.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcc2e73b6..04ff2620e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * `CriticalInputError` is raised when the input preprocessor values can't be set, this was so far only true for the hmac preprocessor, but is now also applied for all other preprocessors. +* fix `delimiter` typo in `StringSplitterRule` configuration ### Features ### Improvements diff --git a/logprep/processor/string_splitter/processor.py b/logprep/processor/string_splitter/processor.py index f2b94e260..7d81a1d20 100644 --- a/logprep/processor/string_splitter/processor.py +++ b/logprep/processor/string_splitter/processor.py @@ -43,5 +43,5 @@ def _apply_rules(self, event: dict, rule: StringSplitterRule): self._handle_missing_fields(event, rule, rule.source_fields, [source_field_content]) if not isinstance(source_field_content, str): raise ProcessingWarning(f"source_field '{source_field}' is not a string", rule, event) - result = source_field_content.split(rule.delimeter) + result = source_field_content.split(rule.delimiter) self._write_target_field(event, rule, result) diff --git a/logprep/processor/string_splitter/rule.py b/logprep/processor/string_splitter/rule.py index e2ced13e4..b25897d90 100644 --- a/logprep/processor/string_splitter/rule.py +++ b/logprep/processor/string_splitter/rule.py @@ -61,12 +61,12 @@ class 
Config(FieldManagerRule.Config): validators.max_len(1), ], ) - delimeter: str = field(validator=validators.instance_of(str), default=" ") - """The delimeter for splitting. Defaults to whitespace""" + delimiter: str = field(validator=validators.instance_of(str), default=" ") + """The delimiter for splitting. Defaults to whitespace""" mapping: dict = field(default="", init=False, repr=False, eq=False) ignore_missing_fields: bool = field(default=False, init=False, repr=False, eq=False) @property - def delimeter(self): - """returns the configured delimeter""" - return self._config.delimeter + def delimiter(self): + """returns the configured delimiter""" + return self._config.delimiter diff --git a/tests/unit/processor/string_splitter/test_string_splitter.py b/tests/unit/processor/string_splitter/test_string_splitter.py index 3db351526..692a8c649 100644 --- a/tests/unit/processor/string_splitter/test_string_splitter.py +++ b/tests/unit/processor/string_splitter/test_string_splitter.py @@ -22,7 +22,7 @@ "string_splitter": { "source_fields": ["message"], "target_field": "result", - "delimeter": ", ", + "delimiter": ", ", }, }, {"message": "this, is, the, message"}, From 827b7f4739f7a3fbaf9afbb3c8fe2db0964aa873 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 11:43:04 +0100 Subject: [PATCH 36/38] remove unused conflicting_fields list --- logprep/processor/grokker/processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/logprep/processor/grokker/processor.py b/logprep/processor/grokker/processor.py index cf7a5e32d..654a1f8f1 100644 --- a/logprep/processor/grokker/processor.py +++ b/logprep/processor/grokker/processor.py @@ -65,7 +65,6 @@ class Config(FieldManager.Config): """ def _apply_rules(self, event: dict, rule: GrokkerRule): - conflicting_fields = [] matches = [] source_values = [] for dotted_field, grok in rule.actions.items(): From 8405eddd67bff9c140f6eb5f99cb16ee886cc1f1 Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 14:02:29 +0100 
Subject: [PATCH 37/38] rename _add_one_field_to to _add_field_to for clarity - Simplified function name to better reflect its purpose. - Updated all instances where the function is invoked to maintain consistency. --- logprep/util/helper.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/logprep/util/helper.py b/logprep/util/helper.py index 43b1dc827..a040ac112 100644 --- a/logprep/util/helper.py +++ b/logprep/util/helper.py @@ -60,7 +60,7 @@ def _add_and_not_overwrite_key(sub_dict, key): return sub_dict.get(key) -def _add_one_field_to( +def _add_field_to( event: dict, field: tuple, rule: "Rule", @@ -118,7 +118,7 @@ def _add_one_field_to( target_parent[target_key].append(content) -def _add_one_field_to_silent_fail(*args, **kwargs) -> None | str: +def _add_field_to_silent_fail(*args, **kwargs) -> None | str: """ Adds a field to an object, ignoring the FieldExistsWarning if the field already exists. Is only needed in the add_batch_to map function. Without this the map would terminate early. @@ -136,7 +136,7 @@ def _add_one_field_to_silent_fail(*args, **kwargs) -> None | str: FieldExistsWarning: If the field already exists, but this warning is caught and ignored. 
""" try: - _add_one_field_to(*args, **kwargs) + _add_field_to(*args, **kwargs) except FieldExistsWarning as error: return error.skipped_fields[0] @@ -173,12 +173,10 @@ def add_fields_to( fields = {key: value for key, value in fields.items() if value is not None} number_fields = len(dict(fields)) if number_fields == 1: - _add_one_field_to( - event, list(fields.items())[0], rule, extends_lists, overwrite_target_field - ) + _add_field_to(event, list(fields.items())[0], rule, extends_lists, overwrite_target_field) return unsuccessful_targets = map( - _add_one_field_to_silent_fail, + _add_field_to_silent_fail, itertools.repeat(event, number_fields), fields.items(), itertools.repeat(rule, number_fields), From 2506df6f077630c9ad0c814a116cf18f2e0c4c4c Mon Sep 17 00:00:00 2001 From: dtrai2 Date: Wed, 13 Nov 2024 14:26:12 +0100 Subject: [PATCH 38/38] fix CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04ff2620e..00bdda7da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ * replace `BaseException` with `Exception` for custom errors * refactor `generic_resolver` to validate rules on startup instead of application of each rule * rewrite the helper method `add_field_to` such that it always raises an `FieldExistsWarning` instead of return a bool. -* add new helper method `add_batch_to` to directly add multiple fields to one event +* add new helper method `add_fields_to` to directly add multiple fields to one event * refactored some processors to make use of the new helper methods