From 1fec611b89cf476aa78da81bd18aab23c8ef3761 Mon Sep 17 00:00:00 2001 From: dtrai2 <95028228+dtrai2@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:13:18 +0200 Subject: [PATCH] remove normalizer processor (#400) * remove normalizer processor * remove pygrok dependency * add regex dependency * update changelog --------- Co-authored-by: ekneg54 --- CHANGELOG.md | 8 +- .../user_manual/configuration/processor.rst | 1 - logprep/connector/file/input.py | 2 +- logprep/processor/normalizer/__init__.py | 0 logprep/processor/normalizer/processor.py | 388 ------ logprep/processor/normalizer/rule.py | 1071 --------------- logprep/registry.py | 2 - pyproject.toml | 3 +- quickstart/docker-compose.yml | 1 + quickstart/exampledata/config/pipeline.yml | 9 +- .../rules/dissector/generic/example_rule.yml | 5 + .../rules/dissector/specific/example_rule.yml | 5 + .../additional_grok_patterns/pattern_file_one | 2 - .../additional_grok_patterns/pattern_file_two | 1 - .../rules/normalizer/generic/example_rule.yml | 5 - .../normalizer/normalizer_regex_mapping.yml | 4 - .../normalizer/specific/example_rule.yml | 5 - .../auto_tests/normalizer/regex_mapping.yml | 4 - .../generic/auto_test_normalizer_match.json | 7 - .../auto_test_normalizer_match_test.json | 13 - .../auto_test_normalizer_mismatch.json | 7 - .../auto_test_normalizer_mismatch_test.json | 13 - .../auto_test_normalizer_no_test_.json | 7 - .../additional_grok_patterns/pattern_file_one | 3 - .../additional_grok_patterns/pattern_file_two | 1 - .../pattern_timestamp | 1 - .../unit/normalizer/html_replace_fields.yml | 4 - .../normalizer/normalizer_regex_mapping.yml | 4 - .../unit/normalizer/regex_mapping.yml | 1 - ...st1_NOT_1111_to_test_normalized_test1.json | 7 - ...ta_test1_to_test_normalized_something.json | 7 - ...ormalize_to_test_normalized_something.json | 7 - .../rules/generic/this_is_not_a_rule.not_json | 1 - .../rules/specific/Test1_id_1111.json | 9 - .../rules/specific/Test2_id_2222.json | 8 - .../specific/this_is_not_a_rule.not_json | 1 - tests/unit/processor/grokker/test_grokker.py | 14 +- tests/unit/processor/normalizer/__init__.py | 0 .../processor/normalizer/test_normalizer.py | 1172 ----------------- .../normalizer/test_normalizer_rule.py | 279 ---- tests/unit/test_configuration.py | 38 +- tests/unit/util/test_helper.py | 2 +- 42 files changed, 46 insertions(+), 3076 deletions(-) delete mode 100644 logprep/processor/normalizer/__init__.py delete mode 100644 logprep/processor/normalizer/processor.py delete mode 100644 logprep/processor/normalizer/rule.py create mode 100644 quickstart/exampledata/rules/dissector/generic/example_rule.yml create mode 100644 quickstart/exampledata/rules/dissector/specific/example_rule.yml delete mode 100644 quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_one delete mode 100644 quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_two delete mode 100644 quickstart/exampledata/rules/normalizer/generic/example_rule.yml delete mode 100644 quickstart/exampledata/rules/normalizer/normalizer_regex_mapping.yml delete mode 100644 quickstart/exampledata/rules/normalizer/specific/example_rule.yml delete mode 100644 tests/testdata/auto_tests/normalizer/regex_mapping.yml delete mode 100644 tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match.json delete mode 100644 tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match_test.json delete mode 100644 
tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch.json delete mode 100644 tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch_test.json delete mode 100644 tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_no_test_.json delete mode 100644 tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_one delete mode 100644 tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_two delete mode 100644 tests/testdata/unit/normalizer/additional_grok_patterns/pattern_timestamp delete mode 100644 tests/testdata/unit/normalizer/html_replace_fields.yml delete mode 100644 tests/testdata/unit/normalizer/normalizer_regex_mapping.yml delete mode 100644 tests/testdata/unit/normalizer/regex_mapping.yml delete mode 100644 tests/testdata/unit/normalizer/rules/generic/event_data_test1_NOT_1111_to_test_normalized_test1.json delete mode 100644 tests/testdata/unit/normalizer/rules/generic/event_data_test1_to_test_normalized_something.json delete mode 100644 tests/testdata/unit/normalizer/rules/generic/event_data_test_normalize_to_test_normalized_something.json delete mode 100644 tests/testdata/unit/normalizer/rules/generic/this_is_not_a_rule.not_json delete mode 100644 tests/testdata/unit/normalizer/rules/specific/Test1_id_1111.json delete mode 100644 tests/testdata/unit/normalizer/rules/specific/Test2_id_2222.json delete mode 100644 tests/testdata/unit/normalizer/rules/specific/this_is_not_a_rule.not_json delete mode 100644 tests/unit/processor/normalizer/__init__.py delete mode 100644 tests/unit/processor/normalizer/test_normalizer.py delete mode 100644 tests/unit/processor/normalizer/test_normalizer_rule.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 136844eb2..a72146306 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,13 @@ ## Upcoming Changes - ## next release +### Breaking -This release limits the maximum python version to `3.12.3` because of the issue +* This release limits the maximum python version to `3.12.3` because of the issue [#612](https://github.com/fkie-cad/Logprep/issues/612). +* Remove `normalizer` processor, as its functionality was replaced by the `grokker`, `timestamper` +and `field_manager` processors -### Breaking ### Features ### Improvements @@ -417,7 +418,6 @@ In case of positive detection results, rule attributions are now inserted in the * Bump `requests` to `>=2.31.0` to circumvent `CVE-2023-32681` * Include a lucene representation of the rule filter into the predetector results. The representation is not completely lucene compatible due to non-existing regex functionality. -* Remove direct dependency of `python-dateutil` ### Bugfix diff --git a/doc/source/user_manual/configuration/processor.rst b/doc/source/user_manual/configuration/processor.rst index 2b79ecc82..54165aba3 100644 --- a/doc/source/user_manual/configuration/processor.rst +++ b/doc/source/user_manual/configuration/processor.rst @@ -24,7 +24,6 @@ Processors .. automodule:: logprep.processor.key_checker.processor .. automodule:: logprep.processor.labeler.processor .. automodule:: logprep.processor.list_comparison.processor -.. automodule:: logprep.processor.normalizer.processor .. automodule:: logprep.processor.pre_detector.processor .. automodule:: logprep.processor.pseudonymizer.processor ..
automodule:: logprep.processor.requester.processor diff --git a/logprep/connector/file/input.py b/logprep/connector/file/input.py index 8b0c218fb..6e9d89477 100644 --- a/logprep/connector/file/input.py +++ b/logprep/connector/file/input.py @@ -210,7 +210,7 @@ class Config(Input.Config): logfile_path: str = field(validator=file_validator) """A path to a file in generic raw format, which can be in any string based - format. Needs to be parsed with normalizer or another processor""" + format. Needs to be parsed with dissector or another processor""" start: str = field( validator=[validators.instance_of(str), validators.in_(("begin", "end"))], diff --git a/logprep/processor/normalizer/__init__.py b/logprep/processor/normalizer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/logprep/processor/normalizer/processor.py b/logprep/processor/normalizer/processor.py deleted file mode 100644 index 44e0fa6b8..000000000 --- a/logprep/processor/normalizer/processor.py +++ /dev/null @@ -1,388 +0,0 @@ -""" -Normalizer -========== - -The Normalizer copies specific values to configurable fields. - -Processor Configuration -^^^^^^^^^^^^^^^^^^^^^^^ - -.. code-block:: yaml - :linenos: - - - normalizername: - type: normalizer - generic_rules: - - tests/testdata/labeler_rules/rules/ - specific_rules: - - tests/testdata/labeler_rules/rules/ - regex_mapping: tests/testdata/unit/normalizer/normalizer_regex_mapping.yml - html_replace_fields: tests/testdata/unit/normalizer/html_replace_fields.yml - count_grok_pattern_matches: - count_directory_path: "path/to/directory" - write_period: 0.1 - lock_file_path: "path/to/lock/file" - -.. autoclass:: logprep.processor.normalizer.processor.Normalizer.Config - :members: - :undoc-members: - :inherited-members: - :noindex: - -.. automodule:: logprep.processor.normalizer.rule - -""" - -import calendar -import html -import json -import os -import re -from functools import reduce -from pathlib import Path -from time import time -from typing import List, Optional, Tuple, Union -from zoneinfo import ZoneInfo - -import msgspec -from attr import define, field, validators -from filelock import FileLock -from pytz import timezone - -from logprep.abc.processor import Processor -from logprep.processor.base.exceptions import FieldExistsWarning, ProcessingWarning -from logprep.processor.normalizer.rule import NormalizerRule -from logprep.util.getter import GetterFactory -from logprep.util.helper import ( - add_field_to, - get_dotted_field_list, - get_dotted_field_value, -) -from logprep.util.time import TimeParser, TimeParserException -from logprep.util.validators import directory_validator - - -class NormalizerError(ProcessingWarning): - """Base class for Normalizer related exceptions.""" - - -class Normalizer(Processor): - """Normalize log events by copying specific values to standardized fields.""" - - @define(kw_only=True) - class Config(Processor.Config): - """config description for Normalizer""" - - regex_mapping: str = field(validator=validators.instance_of(str)) - """Path to regex mapping file with regex keywords that are replaced with regex expressions - by the normalizer. For string format see :ref:`getters`.""" - html_replace_fields: Optional[str] = field( - default=None, validator=[validators.optional(validators.instance_of(str))] - ) - """Path to yaml file with html replace fields. 
For string format see :ref:`getters`""" - count_grok_pattern_matches: Optional[dict] = field( - default=None, validator=validators.optional(validators.instance_of(dict)) - ) - """Optional configuration to count matches of grok patterns. - Counting will be disabled if this value is omitted.""" - grok_patterns: Optional[str] = field(default=None, validator=directory_validator) - """Optional path to a directory with grok patterns.""" - - __slots__ = [ - "_conflicting_fields", - "_regex_mapping", - "_html_replace_fields", - "_count_grok_pattern_matches", - "_grok_matches_path", - "_file_lock_path", - "_grok_cnt_period", - "_grok_cnt_timer", - "_grok_pattern_matches", - ] - - _grok_pattern_matches: dict - - _grok_cnt_timer: float - - _grok_cnt_period: str - - _file_lock_path: str - - _grok_matches_path: str - - _count_grok_pattern_matches: str - - _regex_mapping: str - - _html_replace_fields: str - - _conflicting_fields: list - - rule_class = NormalizerRule - _encoder = msgspec.json.Encoder() - _decoder = msgspec.json.Decoder() - - def __init__(self, name: str, configuration: Processor.Config): - self._event = None - self._conflicting_fields = [] - - self._regex_mapping = configuration.regex_mapping - self._html_replace_fields = configuration.html_replace_fields - - self._count_grok_pattern_matches = configuration.count_grok_pattern_matches - if self._count_grok_pattern_matches: - self._grok_matches_path = self._count_grok_pattern_matches["count_directory_path"] - self._file_lock_path = self._count_grok_pattern_matches.get( - "lock_file_path", "count_grok_pattern_matches.lock" - ) - self._grok_cnt_period = self._count_grok_pattern_matches["write_period"] - self._grok_cnt_timer = time() - self._grok_pattern_matches = {} - - NormalizerRule.additional_grok_patterns = configuration.grok_patterns - - self._regex_mapping = GetterFactory.from_string(self._regex_mapping).get_yaml() - - if self._html_replace_fields: - getter = GetterFactory.from_string(self._html_replace_fields) - self._html_replace_fields = getter.get_yaml() - super().__init__(name=name, configuration=configuration) - - # pylint: enable=arguments-differ - - def _write_grok_matches(self): - """Write count of matches for each grok pattern into a file if configured time has passed. - - If enabled, grok pattern matches are being counted. - This method writes them into a file after a configured time has passed. - First, it reads the existing counts from the file, then it sums them with the current - counts and writes the result back into the file. - - One file is created per day if anything is written. 
- """ - now = time() - if now < self._grok_cnt_timer + self._grok_cnt_period: - return - self._grok_cnt_timer = now - - current_date = TimeParser.now().date() - weekday = calendar.day_name[current_date.weekday()].lower() - - file_name = f"{current_date}_{weekday}.json" - file_path = os.path.join(self._grok_matches_path, file_name) - Path(self._grok_matches_path).mkdir(parents=True, exist_ok=True) - with FileLock(self._file_lock_path): - json_dict = {} - if os.path.isfile(file_path): - with open(file_path, "r", encoding="utf8") as grok_json_file: - json_dict = self._decoder.decode(grok_json_file.read()) - - for key, value in self._grok_pattern_matches.items(): - json_dict[key] = json_dict.get(key, 0) + value - self._grok_pattern_matches[key] = 0 - - with open(file_path, "w", encoding="utf8") as grok_json_file: - json_dict = dict(reversed(sorted(json_dict.items(), key=lambda items: items[1]))) - json.dump(json_dict, grok_json_file, indent=4) - - def _try_add_field(self, event: dict, target: Union[str, List[str]], value: str): - target, value = self._get_transformed_value(target, value) - - if self._field_exists(event, target): - if get_dotted_field_value(event, target) != value: - self._conflicting_fields.append(target) - else: - self._add_field(event, target, value) - - def _get_transformed_value(self, target: Union[str, List[str]], value: str) -> Tuple[str, str]: - if isinstance(target, list): - matching_pattern = self._regex_mapping.get(target[1], None) - if matching_pattern: - substitution_pattern = target[2] - value = re.sub(matching_pattern, substitution_pattern, value) - target = target[0] - return target, value - - def _add_field(self, event: dict, dotted_field: str, value: Union[str, int]): - fields = get_dotted_field_list(dotted_field) - missing_fields = self._decoder.decode(self._encoder.encode(fields)) - for event_field in fields: - if isinstance(event, dict) and event_field in event: - event = event[event_field] - missing_fields.pop(0) - else: - break - if not isinstance(event, dict): - self._conflicting_fields.append(dotted_field) - return - for event_field in missing_fields[:-1]: - event[event_field] = {} - event = event[event_field] - event[missing_fields[-1]] = value - - if self._html_replace_fields and dotted_field in self._html_replace_fields: - if self._has_html_entity(value): - event[missing_fields[-1] + "_decodiert"] = html.unescape(value) - - @staticmethod - def _has_html_entity(value): - return re.search("&#[0-9]{2,4};", value) - - @staticmethod - def _replace_field(event: dict, dotted_field: str, value: str): - fields = get_dotted_field_list(dotted_field) - reduce(lambda dict_, key: dict_[key], fields[:-1], event)[fields[-1]] = value - - def process(self, event: dict): - self._conflicting_fields.clear() - super().process(event) - if self._count_grok_pattern_matches: - self._write_grok_matches() - - def _apply_rules(self, event, rule): - """Normalizes Windows Event Logs. - - The rules in this function are applied on a first-come, first-serve basis: If a rule copies - a source field to a normalized field and a subsequent rule tries to write the same - normalized field, it will not be overwritten and a ProcessingWarning will be raised - as a last step after processing the current event has finished. The same holds true if a - rule tries to overwrite a field that already exists in the original event. 
The rules should - be written in a way that no such warnings are produced during normal operation because each - warning should be an indicator of incorrect rules or unexpected/changed events. - """ - self._apply_grok(event, rule) - self._apply_timestamp_normalization(event, rule) - for source_field, target_field in rule.substitutions.items(): - self._apply_field_copy(event, source_field, target_field) - self._raise_warning_if_fields_already_existed(rule, event) - - def _apply_grok(self, event: dict, rule: NormalizerRule): - """ - Applies the grok pattern of a given NormalizerRule to a given event, by matching the field - value against the grok pattern. If no pattern matches, a grok failure field is written to the - event (if configured). - """ - one_matched = False - source_field, source_value = None, None - for source_field, grok in rule.grok.items(): - source_value = get_dotted_field_value(event, source_field) - if source_value is None: - continue - matches = self._get_grok_matches(grok, source_value) - for normalized_field, field_value in matches.items(): - if field_value is not None: - self._try_add_field(event, normalized_field, field_value) - one_matched = True - if not one_matched: - self._write_grok_failure_field(event, rule, source_field, source_value) - - def _get_grok_matches(self, grok, source_value): - if self._count_grok_pattern_matches: - return grok.match(source_value, self._grok_pattern_matches) - return grok.match(source_value) - - def _write_grok_failure_field(self, event, rule, source_field, source_value): - """ - If grok patterns exist and are configured with a failure target field, then add the source - field path and the first 100 characters of the source field's content to the event. - The source field path, which is usually converted to subfields according to the dotted field - path, is converted first to a path that is split by '>'. That way the separate fields are - not all added to the event and instead only a path identifier is added. For example, the - source_field 'field.subfield.key' will be converted to 'field>subfield>key'. This makes - it easy to understand in which field the grok failure happened without having to add - all subfields separately. - - Parameters - ---------- - event : dict - The event that is currently being processed - rule : NormalizerRule - The current rule that should be applied to the event - source_field : str - The dotted source field path to which the grok pattern was applied - source_value : str - The content of the source_field - """ - if not rule.grok.items(): - return - grok_wrapper = rule.grok.get(source_field) - failure_target_field = grok_wrapper.failure_target_field - if grok_wrapper.failure_target_field: - source_field_path = source_field.replace(".", ">") - add_field_to(event, f"{failure_target_field}.{source_field_path}", source_value[:100]) - - def _apply_timestamp_normalization(self, event: dict, rule: NormalizerRule): - """ - Normalizes the timestamps of an event by applying the given rule.
- """ - for source_field, normalization in rule.timestamps.items(): - source_timestamp = get_dotted_field_value(event, source_field) - if source_timestamp is None: - continue - - timestamp_normalization = normalization.get("timestamp") - timestamp = self._transform_timestamp( - source_timestamp, timestamp_normalization, rule, event - ) - timestamp = self._convert_timezone(timestamp, timestamp_normalization) - iso_timestamp = timestamp.isoformat().replace("+00:00", "Z") - self._write_normalized_timestamp(event, iso_timestamp, timestamp_normalization) - - def _transform_timestamp(self, source_timestamp, timestamp_normalization, rule, event): - source_timezone = timestamp_normalization["source_timezone"] - timestamp = None - format_parsed = False - source_formats = timestamp_normalization["source_formats"] - for source_format in source_formats: - try: - timestamp = TimeParser.parse_datetime( - source_timestamp, source_format, ZoneInfo(source_timezone) - ) - if timestamp is not None: - format_parsed = True - break - except TimeParserException: - pass - if not format_parsed: - error_message = ( - f"Could not parse source timestamp '" - f"{source_timestamp}' with formats '{source_formats}'" - ) - raise NormalizerError(error_message, rule, event) - return timestamp - - def _convert_timezone(self, timestamp, timestamp_normalization): - source_timezone = timestamp_normalization["source_timezone"] - destination_timezone = timestamp_normalization["destination_timezone"] - time_zone = timezone(source_timezone) - if not timestamp.tzinfo: - timestamp = time_zone.localize(timestamp) - timestamp = timezone(source_timezone).normalize(timestamp) - timestamp = timestamp.astimezone(timezone(destination_timezone)) - timestamp = timezone(destination_timezone).normalize(timestamp) - return timestamp - - def _write_normalized_timestamp(self, event, iso_timestamp, timestamp_normalization): - allow_override = timestamp_normalization.get("allow_override", True) - normalization_target = timestamp_normalization["destination"] - if allow_override: - self._replace_field(event, normalization_target, iso_timestamp) - else: - self._try_add_field(event, normalization_target, iso_timestamp) - - def _apply_field_copy(self, event: dict, source_field: str, target_field: str): - if self._field_exists(event, source_field): - source_value = get_dotted_field_value(event, source_field) - self._try_add_field(event, target_field, source_value) - - def _raise_warning_if_fields_already_existed(self, rule, event): - if self._conflicting_fields: - raise FieldExistsWarning(rule, event, self._conflicting_fields) - - def shut_down(self): - """ - Stop processing of this processor and finish outstanding actions. - Optional: Called when stopping the pipeline - """ - if self._count_grok_pattern_matches: - self._write_grok_matches() diff --git a/logprep/processor/normalizer/rule.py b/logprep/processor/normalizer/rule.py deleted file mode 100644 index a68704b0c..000000000 --- a/logprep/processor/normalizer/rule.py +++ /dev/null @@ -1,1071 +0,0 @@ -# pylint: disable=anomalous-backslash-in-string -""" -Rule Configuration -^^^^^^^^^^^^^^^^^^ - -The normalizer requires the additional field :code:`normalize`. -It contains key-value pairs that define if and how fields get normalized. -The keys describe fields that are going to be normalized and the values describe the new -normalized fields. Through normalizing, old fields are being copied to new fields, but the old -fields are not deleted.
- -In the following example the field :code:`event_data.ClientAddress` is normalized to -:code:`client.ip`. - -.. code-block:: yaml - :linenos: - :caption: Example - - filter: 'event_data.ClientAddress' - normalize: - event_data.ClientAddress: client.ip - description: '...' - -Extraction and Replacement --------------------------- - -Instead of copying a whole field, it is possible to copy only parts of it via regex capture groups. -These can then be extracted and rearranged in a new field. -The groups are defined in a configurable file as keywords and can be referenced from within the -rules via the Python regex syntax. - -Instead of specifying a target field, a list with three elements has to be used. -The first element is the target field, the second element is a regex keyword and the third element is -a regex expression that defines how the value should be inserted into the new field. - -In the following example :code:`event_data.address_text: "The IP is 1.2.3.4 and the port is 1234!"` -is normalized to :code:`address: "1.2.3.4:1234"`. - -.. code-block:: json - :linenos: - :caption: Example - Definition of regex keywords in the regex mapping file - - { - "RE_IP_PORT_CAP": ".*(?P<IP>[\\d.]+).*(?P<PORT>\\d+).*", - "RE_WHOLE_FIELD": "(.*)" - } - -.. code-block:: yaml - :linenos: - :caption: Example - Rule with extraction - - filter: event_id - normalize: - event_data.address_text: - - address - - RE_IP_PORT_CAP - - '\g<IP>:\g<PORT>' - -Grok ---- - -Grok functionality is fully supported for field normalization. -This can be combined with the normalizations that have already been introduced or it can be used -instead of them. -By combining both types of normalization it is possible to perform transformations on results of -Grok that can not be achieved by Grok alone. -All Grok normalizations are always performed before other normalizations. -An example for this is the creation of nested fields. - -The following example would normalize -:code:`event_data.ip_and_port: "Linus has the address 1.2.3.4 1234", event_data.address_text: -"This is an address: 1.2.3.4:1234"` to :code:`address.ip: "1.2.3.4"`, :code:`address.port: 1234`, -:code:`name: Linus` and :code:`address.combined: 1.2.3.4 and 1234`. - -.. code-block:: yaml - :linenos: - :caption: Example - Grok normalization and subsequent normalization of a result - - filter: event_id - normalize: - event_data.ip_and_port: '{"grok": "%{USER:name} has the address %{IP:[address][ip]} %{NUMBER:[address][port]:int}"}' - event_data.address_text: - - address.combined - - RE_IP_PORT_CAP - - '\g<IP> and \g<PORT>' - -It is furthermore possible to use more than one Grok pattern for a field by specifying them -in a list. -The patterns will be sequentially checked until one of them matches. - -The following example would normalize :code:`some_field_with_an_ip: "1.2.3.4 1234"` to -:code:`ip: "1.2.3.4"`, :code:`port: 1234`, skipping the first Grok pattern. -:code:`some_field_with_an_ip: "1.2.3.4 1234 foo"` would however be normalized to -:code:`ip_foo: "1.2.3.4"`, :code:`port_foo: 1234`. - -.. code-block:: yaml - :linenos: - :caption: Example - Grok normalization with multiple patterns - - filter: 'some_field_with_an_ip' - normalize: - some_field_with_an_ip: - grok: - - '%{IP:ip_foo} %{NUMBER:port_foo:int} foo' - - '%{IP:ip} %{NUMBER:port:int}' - -As Grok patterns are only applied when they match a given input string, it is sometimes desired to
know when none of the given patterns matches. -This is helpful in identifying new, unknown or reconfigured log sources that are not correctly -covered by the current rule set. -To activate the output of this information it is required to add the field -:code:`failure_target_field` to the grok rule. -This will describe the output field to which the grok failure should be written. -It can be a dotted field path. -An example rule would look like: - -.. code-block:: yaml - :linenos: - :caption: Example - Grok normalization with grok failure target field - - filter: 'some_field_with_an_ip' - normalize: - some_field_with_an_ip: - grok: - - '%{IP:ip_foo} %{NUMBER:port_foo:int} foo' - - '%{IP:ip} %{NUMBER:port:int}' - failure_target_field: 'grok_failure' - -If this is applied to an event which has the field :code:`some_field_with_an_ip`, but its content -is not matched by any grok-filter, then the :code:`grok_failure` field will be added. -This failure field will contain a subfield which identifies the grok source field as well as the -first 100 characters of the field's content. -By adding the failure information as a separate object it is possible to add more failures to it -in case many different grok rules exist and multiple events are not matched by any grok pattern. - -Given this example event: - -.. code-block:: json - :linenos: - :caption: Example Input Event - - { - "some_field_with_an_ip": "content which is not an ip", - "other event": "content" - } - -The normalizer would produce the following output event: - -.. code-block:: json - :linenos: - :caption: Example Output Event - - { - "some_field_with_an_ip": "content which is not an ip", - "other event": "content", - "grok_failure": { - "some_field_with_an_ip": "content which is not an ip" - } - } - -If the grok field is a subfield somewhere inside the event, then the keys of the grok_failure object -will contain the path to this subfield separated by :code:`>`. -This helps in identifying the original source field to which the grok pattern was applied. -A grok failure output example would look like: - -.. code-block:: json - :linenos: - :caption: Example Output Event - - { - "nested_ip": { - "some_field_with_an_ip": "content which is not an ip" - }, - "other event": "content", - "grok_failure": { - "nested_ip>some_field_with_an_ip": "content which is not an ip" - } - } - -Normalization of Timestamps ---------------------------- - -There is a special functionality that allows the normalization of timestamps. -With this functionality different timestamp formats can be converted to ISO8601 and timezones can -be adapted. Instead of giving a target field, the special field `timestamp` is used. -Under this field additional configurations for the normalization can be specified. -Under `timestamp.source_formats` a list of possible source formats for the timestamp must be -defined. The original timezone of the timestamp must be specified in `timestamp.source_timezone`. -Furthermore, in `timestamp.destination_timezone` the new timezone must be specified. -Finally, `timestamp.destination` defines the target field to which the new timestamp should be -written. Optionally, it can be defined if the normalization is allowed to override existing values -by setting `timestamp.allow_override` to `true` or `false`. It is allowed to override by default. - -Valid formats for timestamps are defined by the notation of the Python datetime module. -Additionally, the values `ISO8601` and `UNIX` can be used for the `source_formats` field.
-The former can be used if the -timestamp already exists in the ISO8601 format, such that only a timezone conversion should be -applied. The latter can be used if the timestamp is given as UNIX epoch time. This supports -Unix timestamps in seconds and milliseconds. - -Valid timezones are defined in the pytz module: - -.. raw:: html - -
- List of all timezones - -.. code-block:: text - :linenos: - :caption: Timezones from the Python pytz module - - Africa/Abidjan - Africa/Accra - Africa/Addis_Ababa - Africa/Algiers - Africa/Asmara - Africa/Asmera - Africa/Bamako - Africa/Bangui - Africa/Banjul - Africa/Bissau - Africa/Blantyre - Africa/Brazzaville - Africa/Bujumbura - Africa/Cairo - Africa/Casablanca - Africa/Ceuta - Africa/Conakry - Africa/Dakar - Africa/Dar_es_Salaam - Africa/Djibouti - Africa/Douala - Africa/El_Aaiun - Africa/Freetown - Africa/Gaborone - Africa/Harare - Africa/Johannesburg - Africa/Juba - Africa/Kampala - Africa/Khartoum - Africa/Kigali - Africa/Kinshasa - Africa/Lagos - Africa/Libreville - Africa/Lome - Africa/Luanda - Africa/Lubumbashi - Africa/Lusaka - Africa/Malabo - Africa/Maputo - Africa/Maseru - Africa/Mbabane - Africa/Mogadishu - Africa/Monrovia - Africa/Nairobi - Africa/Ndjamena - Africa/Niamey - Africa/Nouakchott - Africa/Ouagadougou - Africa/Porto-Novo - Africa/Sao_Tome - Africa/Timbuktu - Africa/Tripoli - Africa/Tunis - Africa/Windhoek - America/Adak - America/Anchorage - America/Anguilla - America/Antigua - America/Araguaina - America/Argentina/Buenos_Aires - America/Argentina/Catamarca - America/Argentina/ComodRivadavia - America/Argentina/Cordoba - America/Argentina/Jujuy - America/Argentina/La_Rioja - America/Argentina/Mendoza - America/Argentina/Rio_Gallegos - America/Argentina/Salta - America/Argentina/San_Juan - America/Argentina/San_Luis - America/Argentina/Tucuman - America/Argentina/Ushuaia - America/Aruba - America/Asuncion - America/Atikokan - America/Atka - America/Bahia - America/Bahia_Banderas - America/Barbados - America/Belem - America/Belize - America/Blanc-Sablon - America/Boa_Vista - America/Bogota - America/Boise - America/Buenos_Aires - America/Cambridge_Bay - America/Campo_Grande - America/Cancun - America/Caracas - America/Catamarca - America/Cayenne - America/Cayman - America/Chicago - America/Chihuahua - America/Coral_Harbour - America/Cordoba - America/Costa_Rica - America/Creston - America/Cuiaba - America/Curacao - America/Danmarkshavn - America/Dawson - America/Dawson_Creek - America/Denver - America/Detroit - America/Dominica - America/Edmonton - America/Eirunepe - America/El_Salvador - America/Ensenada - America/Fort_Wayne - America/Fortaleza - America/Glace_Bay - America/Godthab - America/Goose_Bay - America/Grand_Turk - America/Grenada - America/Guadeloupe - America/Guatemala - America/Guayaquil - America/Guyana - America/Halifax - America/Havana - America/Hermosillo - America/Indiana/Indianapolis - America/Indiana/Knox - America/Indiana/Marengo - America/Indiana/Petersburg - America/Indiana/Tell_City - America/Indiana/Vevay - America/Indiana/Vincennes - America/Indiana/Winamac - America/Indianapolis - America/Inuvik - America/Iqaluit - America/Jamaica - America/Jujuy - America/Juneau - America/Kentucky/Louisville - America/Kentucky/Monticello - America/Knox_IN - America/Kralendijk - America/La_Paz - America/Lima - America/Los_Angeles - America/Louisville - America/Lower_Princes - America/Maceio - America/Managua - America/Manaus - America/Marigot - America/Martinique - America/Matamoros - America/Mazatlan - America/Mendoza - America/Menominee - America/Merida - America/Metlakatla - America/Mexico_City - America/Miquelon - America/Moncton - America/Monterrey - America/Montevideo - America/Montreal - America/Montserrat - America/Nassau - America/New_York - America/Nipigon - America/Nome - America/Noronha - America/North_Dakota/Beulah - 
America/North_Dakota/Center - America/North_Dakota/New_Salem - America/Ojinaga - America/Panama - America/Pangnirtung - America/Paramaribo - America/Phoenix - America/Port-au-Prince - America/Port_of_Spain - America/Porto_Acre - America/Porto_Velho - America/Puerto_Rico - America/Rainy_River - America/Rankin_Inlet - America/Recife - America/Regina - America/Resolute - America/Rio_Branco - America/Rosario - America/Santa_Isabel - America/Santarem - America/Santiago - America/Santo_Domingo - America/Sao_Paulo - America/Scoresbysund - America/Shiprock - America/Sitka - America/St_Barthelemy - America/St_Johns - America/St_Kitts - America/St_Lucia - America/St_Thomas - America/St_Vincent - America/Swift_Current - America/Tegucigalpa - America/Thule - America/Thunder_Bay - America/Tijuana - America/Toronto - America/Tortola - America/Vancouver - America/Virgin - America/Whitehorse - America/Winnipeg - America/Yakutat - America/Yellowknife - Antarctica/Casey - Antarctica/Davis - Antarctica/DumontDUrville - Antarctica/Macquarie - Antarctica/Mawson - Antarctica/McMurdo - Antarctica/Palmer - Antarctica/Rothera - Antarctica/South_Pole - Antarctica/Syowa - Antarctica/Vostok - Arctic/Longyearbyen - Asia/Aden - Asia/Almaty - Asia/Amman - Asia/Anadyr - Asia/Aqtau - Asia/Aqtobe - Asia/Ashgabat - Asia/Ashkhabad - Asia/Baghdad - Asia/Bahrain - Asia/Baku - Asia/Bangkok - Asia/Beirut - Asia/Bishkek - Asia/Brunei - Asia/Calcutta - Asia/Choibalsan - Asia/Chongqing - Asia/Chungking - Asia/Colombo - Asia/Dacca - Asia/Damascus - Asia/Dhaka - Asia/Dili - Asia/Dubai - Asia/Dushanbe - Asia/Gaza - Asia/Harbin - Asia/Hebron - Asia/Ho_Chi_Minh - Asia/Hong_Kong - Asia/Hovd - Asia/Irkutsk - Asia/Istanbul - Asia/Jakarta - Asia/Jayapura - Asia/Jerusalem - Asia/Kabul - Asia/Kamchatka - Asia/Karachi - Asia/Kashgar - Asia/Kathmandu - Asia/Katmandu - Asia/Kolkata - Asia/Krasnoyarsk - Asia/Kuala_Lumpur - Asia/Kuching - Asia/Kuwait - Asia/Macao - Asia/Macau - Asia/Magadan - Asia/Makassar - Asia/Manila - Asia/Muscat - Asia/Nicosia - Asia/Novokuznetsk - Asia/Novosibirsk - Asia/Omsk - Asia/Oral - Asia/Phnom_Penh - Asia/Pontianak - Asia/Pyongyang - Asia/Qatar - Asia/Qyzylorda - Asia/Rangoon - Asia/Riyadh - Asia/Saigon - Asia/Sakhalin - Asia/Samarkand - Asia/Seoul - Asia/Shanghai - Asia/Singapore - Asia/Taipei - Asia/Tashkent - Asia/Tbilisi - Asia/Tehran - Asia/Tel_Aviv - Asia/Thimbu - Asia/Thimphu - Asia/Tokyo - Asia/Ujung_Pandang - Asia/Ulaanbaatar - Asia/Ulan_Bator - Asia/Urumqi - Asia/Vientiane - Asia/Vladivostok - Asia/Yakutsk - Asia/Yekaterinburg - Asia/Yerevan - Atlantic/Azores - Atlantic/Bermuda - Atlantic/Canary - Atlantic/Cape_Verde - Atlantic/Faeroe - Atlantic/Faroe - Atlantic/Jan_Mayen - Atlantic/Madeira - Atlantic/Reykjavik - Atlantic/South_Georgia - Atlantic/St_Helena - Atlantic/Stanley - Australia/ACT - Australia/Adelaide - Australia/Brisbane - Australia/Broken_Hill - Australia/Canberra - Australia/Currie - Australia/Darwin - Australia/Eucla - Australia/Hobart - Australia/LHI - Australia/Lindeman - Australia/Lord_Howe - Australia/Melbourne - Australia/NSW - Australia/North - Australia/Perth - Australia/Queensland - Australia/South - Australia/Sydney - Australia/Tasmania - Australia/Victoria - Australia/West - Australia/Yancowinna - Brazil/Acre - Brazil/DeNoronha - Brazil/East - Brazil/West - CET - CST6CDT - Canada/Atlantic - Canada/Central - Canada/East-Saskatchewan - Canada/Eastern - Canada/Mountain - Canada/Newfoundland - Canada/Pacific - Canada/Saskatchewan - Canada/Yukon - Chile/Continental - Chile/EasterIsland - 
Cuba - EET - EST - EST5EDT - Egypt - Eire - Etc/GMT - Etc/GMT+0 - Etc/GMT+1 - Etc/GMT+10 - Etc/GMT+11 - Etc/GMT+12 - Etc/GMT+2 - Etc/GMT+3 - Etc/GMT+4 - Etc/GMT+5 - Etc/GMT+6 - Etc/GMT+7 - Etc/GMT+8 - Etc/GMT+9 - Etc/GMT-0 - Etc/GMT-1 - Etc/GMT-10 - Etc/GMT-11 - Etc/GMT-12 - Etc/GMT-13 - Etc/GMT-14 - Etc/GMT-2 - Etc/GMT-3 - Etc/GMT-4 - Etc/GMT-5 - Etc/GMT-6 - Etc/GMT-7 - Etc/GMT-8 - Etc/GMT-9 - Etc/GMT0 - Etc/Greenwich - Etc/UCT - Etc/UTC - Etc/Universal - Etc/Zulu - Europe/Amsterdam - Europe/Andorra - Europe/Athens - Europe/Belfast - Europe/Belgrade - Europe/Berlin - Europe/Bratislava - Europe/Brussels - Europe/Bucharest - Europe/Budapest - Europe/Chisinau - Europe/Copenhagen - Europe/Dublin - Europe/Gibraltar - Europe/Guernsey - Europe/Helsinki - Europe/Isle_of_Man - Europe/Istanbul - Europe/Jersey - Europe/Kaliningrad - Europe/Kiev - Europe/Lisbon - Europe/Ljubljana - Europe/London - Europe/Luxembourg - Europe/Madrid - Europe/Malta - Europe/Mariehamn - Europe/Minsk - Europe/Monaco - Europe/Moscow - Europe/Nicosia - Europe/Oslo - Europe/Paris - Europe/Podgorica - Europe/Prague - Europe/Riga - Europe/Rome - Europe/Samara - Europe/San_Marino - Europe/Sarajevo - Europe/Simferopol - Europe/Skopje - Europe/Sofia - Europe/Stockholm - Europe/Tallinn - Europe/Tirane - Europe/Tiraspol - Europe/Uzhgorod - Europe/Vaduz - Europe/Vatican - Europe/Vienna - Europe/Vilnius - Europe/Volgograd - Europe/Warsaw - Europe/Zagreb - Europe/Zaporozhye - Europe/Zurich - GB - GB-Eire - GMT - GMT+0 - GMT-0 - GMT0 - Greenwich - HST - Hongkong - Iceland - Indian/Antananarivo - Indian/Chagos - Indian/Christmas - Indian/Cocos - Indian/Comoro - Indian/Kerguelen - Indian/Mahe - Indian/Maldives - Indian/Mauritius - Indian/Mayotte - Indian/Reunion - Iran - Israel - Jamaica - Japan - Kwajalein - Libya - MET - MST - MST7MDT - Mexico/BajaNorte - Mexico/BajaSur - Mexico/General - NZ - NZ-CHAT - Navajo - PRC - PST8PDT - Pacific/Apia - Pacific/Auckland - Pacific/Chatham - Pacific/Chuuk - Pacific/Easter - Pacific/Efate - Pacific/Enderbury - Pacific/Fakaofo - Pacific/Fiji - Pacific/Funafuti - Pacific/Galapagos - Pacific/Gambier - Pacific/Guadalcanal - Pacific/Guam - Pacific/Honolulu - Pacific/Johnston - Pacific/Kiritimati - Pacific/Kosrae - Pacific/Kwajalein - Pacific/Majuro - Pacific/Marquesas - Pacific/Midway - Pacific/Nauru - Pacific/Niue - Pacific/Norfolk - Pacific/Noumea - Pacific/Pago_Pago - Pacific/Palau - Pacific/Pitcairn - Pacific/Pohnpei - Pacific/Ponape - Pacific/Port_Moresby - Pacific/Rarotonga - Pacific/Saipan - Pacific/Samoa - Pacific/Tahiti - Pacific/Tarawa - Pacific/Tongatapu - Pacific/Truk - Pacific/Wake - Pacific/Wallis - Pacific/Yap - Poland - Portugal - ROC - ROK - Singapore - Turkey - UCT - US/Alaska - US/Aleutian - US/Arizona - US/Central - US/East-Indiana - US/Eastern - US/Hawaii - US/Indiana-Starke - US/Michigan - US/Mountain - US/Pacific - US/Pacific-New - US/Samoa - UTC - Universal - W-SU - WET - Zulu - -.. raw:: html - -
-
- -In the following example :code:`@timestamp: 2000 12 31 - 22:59:59` would be normalized to -:code:`@timestamp: 2000-12-31T23:59:59+01:00`. - -.. code-block:: yaml - :linenos: - :caption: Example - Normalization of a timestamp - - filter: '@timestamp' - normalize: - '@timestamp': - timestamp: - destination: '@timestamp' - source_formats: - - '%Y %m %d - %H:%M:%S' - source_timezone: 'UTC' - destination_timezone: 'Europe/Berlin' - description: 'Test-rule with matching auto-test' - -If Grok and a timestamp normalization is being used in the same rule, -then Grok is being applied first, so that a time normalization can be performed -on the Grok results. -""" -# pylint: enable=anomalous-backslash-in-string - -import re -import uuid -from typing import Dict, List, Union - -from pygrok import Grok - -from logprep.filter.expression.filter_expression import FilterExpression -from logprep.processor.base.rule import InvalidRuleDefinitionError, Rule - -GROK_DELIMITER = "__________________" - - -class NormalizerRuleError(InvalidRuleDefinitionError): - """Base class for Normalizer rule related exceptions.""" - - def __init__(self, message): - super().__init__(f"Normalizer rule ({message}): ") - - -class InvalidNormalizationDefinition(NormalizerRuleError): - """Raise if normalization definition invalid.""" - - def __init__(self, definition: Union[list, dict]): - message = f"The following normalization definition is invalid: {definition}" - super().__init__(message) - - -class InvalidGrokDefinition(NormalizerRuleError): - """Raise if grok definition invalid.""" - - def __init__(self, definition: Union[list, dict]): - message = f"The following grok-expression is invalid: {definition}" - super().__init__(message) - - -class InvalidTimestampDefinition(NormalizerRuleError): - """Raise if timestamp definition invalid.""" - - def __init__(self, definition: Union[list, dict]): - message = f"The following timestamp normalization definition is invalid: {definition}" - super().__init__(message) - - -class GrokWrapper: - """Wrap around pygrok to add delimiter support.""" - - grok_delimiter_pattern = re.compile(GROK_DELIMITER) - - def __init__(self, patterns: Union[str, List[str]], failure_target_field=None, **kwargs): - if isinstance(patterns, str): - self._grok_list = [Grok(f"^{patterns}$", **kwargs)] - else: - patterns = [f"^{pattern}$" for pattern in patterns] - self._grok_list = [Grok(pattern_item, **kwargs) for pattern_item in patterns] - - self._match_cnt_initialized = False - self.failure_target_field = failure_target_field - - def __eq__(self, other: "GrokWrapper") -> bool: - return set(grok_item.regex_obj for grok_item in self._grok_list) == set( - grok_item.regex_obj for grok_item in other._grok_list - ) - - def match(self, text: str, pattern_matches: dict = None) -> Dict[str, str]: - """Match string via grok using delimiter and count matches if enabled.""" - if pattern_matches is not None and not self._match_cnt_initialized: - for grok in self._grok_list: - pattern_matches[grok.pattern] = 0 - self._match_cnt_initialized = True - - for grok in self._grok_list: - matches = grok.match(text) - if matches: - if pattern_matches is not None: - pattern_matches[grok.pattern] += 1 - dotted_matches = {} - for key, value in matches.items(): - dotted_matches[self.grok_delimiter_pattern.sub(".", key)] = value - return dotted_matches - return {} - - -class NormalizerRule(Rule): - """Check if documents match a filter.""" - - additional_grok_patterns = None - extract_field_pattern = 
re.compile(r"%{(\w+):([\w\[\]]+)(?::\w+)?}") - sub_fields_pattern = re.compile(r"(\[(\w+)\])") - failure_tags = ["_normalizer_failure"] - - # pylint: disable=super-init-not-called - # TODO: this is not refactored, because this processor should be dissected - def __init__( - self, filter_rule: FilterExpression, normalizations: dict, description: str = None - ): - self.__class__.__hash__ = Rule.__hash__ - self.filter_str = str(filter_rule) - self._filter = filter_rule - self._special_fields = None - self.file_name = None - self._tests = [] - self.metrics = self.Metrics( - labels={ - "component": "rule", - "description": f"{str(uuid.uuid4())} - {description}", - "type": "normalizer", - "name": "normalizer", - } - ) - self._substitutions = {} - self._grok = {} - self._timestamps = {} - self._description = description - self._id = str(uuid.uuid4()) - - self._parse_normalizations(normalizations) - - # pylint: enable=super-init-not-called - - def _parse_normalizations(self, normalizations): - for source_field, normalization in normalizations.items(): - if isinstance(normalization, dict) and normalization.get("grok"): - self._extract_grok_pattern(normalization, source_field) - elif isinstance(normalization, dict) and normalization.get("timestamp"): - self._timestamps.update({source_field: normalization}) - else: - self._substitutions.update({source_field: normalization}) - - def _extract_grok_pattern(self, normalization, source_field): - """Checks the rule file for grok pattern, reformats them and adds them to self._grok""" - if isinstance(normalization["grok"], str): - normalization["grok"] = [normalization["grok"]] - for idx, grok in enumerate(normalization["grok"]): - patterns = self.extract_field_pattern.findall(grok) - self._reformat_grok_pattern(idx, normalization, patterns) - self._add_grok_patterns(normalization, source_field) - - def _add_grok_patterns(self, normalization, source_field): - failure_target_field = normalization.get("failure_target_field") - self._grok.update( - { - source_field: GrokWrapper( - patterns=normalization["grok"], - custom_patterns_dir=NormalizerRule.additional_grok_patterns, - failure_target_field=failure_target_field, - ) - } - ) - - def _reformat_grok_pattern(self, idx, normalization, patterns): - """ - Changes the grok pattern format by removing the square brackets and introducing - the GROK_DELIMITER. 
- """ - for pattern in patterns: - if len(pattern) >= 2: - sub_fields = re.findall(self.sub_fields_pattern, pattern[1]) - if sub_fields: - mutable_pattern = list(pattern) - mutable_pattern[1] = GROK_DELIMITER.join( - (sub_field[1] for sub_field in sub_fields) - ) - to_replace = re.escape(r"%{" + r":".join(pattern)) - transformed_fields_names = "%{" + ":".join(mutable_pattern) - normalization["grok"][idx] = re.sub( - to_replace, transformed_fields_names, normalization["grok"][idx] - ) - - def __eq__(self, other: "NormalizerRule") -> bool: - return all( - [ - other.filter == self._filter, - self._substitutions == other.substitutions, - self._grok == other.grok, - self._timestamps == other.timestamps, - ] - ) - - # pylint: disable=C0111 - @property - def substitutions(self) -> dict: - return self._substitutions - - @property - def grok(self) -> dict: - return self._grok - - @property - def timestamps(self) -> dict: - return self._timestamps - - @property - def description(self) -> str: - return self._description - - @property - def id(self) -> str: - return self._id - - # pylint: enable=C0111 - - @staticmethod - def _create_from_dict(rule: dict, processor_name: str = None) -> "NormalizerRule": - NormalizerRule._check_rule_validity(rule, "normalize") - NormalizerRule._check_if_normalization_valid(rule) - - filter_expression = Rule._create_filter_expression(rule) - description = rule.get("description") - return NormalizerRule(filter_expression, rule["normalize"], description) - - @staticmethod - def _check_if_normalization_valid(rule: dict): - for value in rule["normalize"].values(): - if isinstance(value, list): - if len(value) != 3: - raise InvalidNormalizationDefinition(value) - if isinstance(value, dict): - NormalizerRule._validate_allowed_keys(value) - if "grok" in value.keys(): - NormalizerRule._validate_grok(value) - if "timestamp" in value.keys(): - NormalizerRule._validate_timestamp(value) - - @staticmethod - def _validate_allowed_keys(value): - allowed_keys = ["grok", "timestamp", "failure_target_field"] - if any(key for key in value.keys() if key not in allowed_keys): - raise InvalidNormalizationDefinition(value) - - @staticmethod - def _validate_grok(value): - grok = value["grok"] - if not grok: - raise InvalidNormalizationDefinition(value) - if isinstance(grok, list): - if any(not isinstance(pattern, str) for pattern in grok): - raise InvalidNormalizationDefinition(value) - try: - GrokWrapper(grok, custom_patterns_dir=NormalizerRule.additional_grok_patterns) - except Exception as error: - raise InvalidGrokDefinition(grok) from error - - @staticmethod - def _validate_timestamp(value): - timestamp = value.get("timestamp") - if not timestamp: - raise InvalidNormalizationDefinition(value) - if not isinstance(timestamp.get("destination"), str): - raise InvalidTimestampDefinition(timestamp) - if not isinstance(timestamp.get("source_formats"), list): - raise InvalidTimestampDefinition(timestamp) - if not isinstance(timestamp.get("source_timezone"), str): - raise InvalidTimestampDefinition(timestamp) - if not isinstance(timestamp.get("destination_timezone"), str): - raise InvalidTimestampDefinition(timestamp) diff --git a/logprep/registry.py b/logprep/registry.py index cab9cbf05..c6090d7fd 100644 --- a/logprep/registry.py +++ b/logprep/registry.py @@ -37,7 +37,6 @@ from logprep.processor.key_checker.processor import KeyChecker from logprep.processor.labeler.processor import Labeler from logprep.processor.list_comparison.processor import ListComparison -from 
logprep.processor.normalizer.processor import Normalizer from logprep.processor.pre_detector.processor import PreDetector from logprep.processor.pseudonymizer.processor import Pseudonymizer from logprep.processor.requester.processor import Requester @@ -73,7 +72,6 @@ class Registry: "key_checker": KeyChecker, "labeler": Labeler, "list_comparison": ListComparison, - "normalizer": Normalizer, "pre_detector": PreDetector, "pseudonymizer": Pseudonymizer, "requester": Requester, diff --git a/pyproject.toml b/pyproject.toml index 7a7714639..5ccb5031d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,13 +63,12 @@ dependencies = [ "protobuf>=3.20.2", "pycryptodome", "pyparsing", - "pytz", # can be removed with normalizer code "scikit-learn>=1.2.0", "scipy>=1.9.2", "joblib", - "pygrok", "pyyaml", "requests>=2.31.0", + "regex", "ruamel.yaml", "schedule", "tldextract", diff --git a/quickstart/docker-compose.yml b/quickstart/docker-compose.yml index 199861d1e..886dfc64d 100644 --- a/quickstart/docker-compose.yml +++ b/quickstart/docker-compose.yml @@ -58,6 +58,7 @@ services: - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093 - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://127.0.0.1:9092 - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=1@127.0.0.1:9093 + - KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE - ALLOW_PLAINTEXT_LISTENER=yes volumes: - /var/run/docker.sock:/var/run/docker.sock diff --git a/quickstart/exampledata/config/pipeline.yml b/quickstart/exampledata/config/pipeline.yml index f28b8c6f5..57542c64e 100644 --- a/quickstart/exampledata/config/pipeline.yml +++ b/quickstart/exampledata/config/pipeline.yml @@ -22,13 +22,12 @@ pipeline: generic_rules: - quickstart/exampledata/rules/labeler/generic - - normalizer: - type: normalizer + - dissector: + type: dissector specific_rules: - - quickstart/exampledata/rules/normalizer/specific/ + - quickstart/exampledata/rules/dissector/specific/ generic_rules: - - quickstart/exampledata/rules/normalizer/generic/ - regex_mapping: quickstart/exampledata/rules/normalizer/normalizer_regex_mapping.yml + - quickstart/exampledata/rules/dissector/generic/ - dropper: type: dropper diff --git a/quickstart/exampledata/rules/dissector/generic/example_rule.yml b/quickstart/exampledata/rules/dissector/generic/example_rule.yml new file mode 100644 index 000000000..e7a6e664f --- /dev/null +++ b/quickstart/exampledata/rules/dissector/generic/example_rule.yml @@ -0,0 +1,5 @@ +filter: message +dissector: + mapping: + message: "%{generic.msg.part_one} %{generic.msg.part_two}" +description: '...' diff --git a/quickstart/exampledata/rules/dissector/specific/example_rule.yml b/quickstart/exampledata/rules/dissector/specific/example_rule.yml new file mode 100644 index 000000000..e50663c08 --- /dev/null +++ b/quickstart/exampledata/rules/dissector/specific/example_rule.yml @@ -0,0 +1,5 @@ +filter: message +dissector: + mapping: + message: "%{specific.msg.part_one} %{specific.msg.part_two}" +description: '...' 
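The two dissector rules above replace the normalizer example rules in the quickstart pipeline. As a rough sketch of the expected behavior, the generic rule splits `message` at the space between the two `%{...}` references and writes the parts into nested target fields; the event content below is hypothetical and only illustrates the mapping:

.. code-block:: json
   :caption: Hypothetical example - event before and after the generic dissector rule

   {"message": "foo bar"}

   {"message": "foo bar", "generic": {"msg": {"part_one": "foo", "part_two": "bar"}}}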
diff --git a/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_one b/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_one deleted file mode 100644 index df1b3109a..000000000 --- a/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_one +++ /dev/null @@ -1,2 +0,0 @@ -CUSTOM_PATTERN_123456 123456 -CUSTOM_PATTERN_Test Test diff --git a/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_two b/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_two deleted file mode 100644 index 58216ab4b..000000000 --- a/quickstart/exampledata/rules/normalizer/additional_grok_patterns/pattern_file_two +++ /dev/null @@ -1 +0,0 @@ -CUSTOM_PATTERN_OTHER_FILE other file! \ No newline at end of file diff --git a/quickstart/exampledata/rules/normalizer/generic/example_rule.yml b/quickstart/exampledata/rules/normalizer/generic/example_rule.yml deleted file mode 100644 index 6a4f8bc85..000000000 --- a/quickstart/exampledata/rules/normalizer/generic/example_rule.yml +++ /dev/null @@ -1,5 +0,0 @@ -filter: "test_normalizer" -normalize: - id: normalizer-1352bc0a-53ae-4740-bb9e-1e865f63375f - something: normalized_field -description: "..." diff --git a/quickstart/exampledata/rules/normalizer/normalizer_regex_mapping.yml b/quickstart/exampledata/rules/normalizer/normalizer_regex_mapping.yml deleted file mode 100644 index a8922be72..000000000 --- a/quickstart/exampledata/rules/normalizer/normalizer_regex_mapping.yml +++ /dev/null @@ -1,4 +0,0 @@ -RE_IP_PORT_CAP: .*(?P[\d.]+):(?P\d+).* -RE_FULL_CAP: (?P.*) -RE_ONLY_THIS_CAP: .*(?POnly this!).* -RE_SWITCH_CAP: (?PSecond).*(?PFirst) \ No newline at end of file diff --git a/quickstart/exampledata/rules/normalizer/specific/example_rule.yml b/quickstart/exampledata/rules/normalizer/specific/example_rule.yml deleted file mode 100644 index 589392be7..000000000 --- a/quickstart/exampledata/rules/normalizer/specific/example_rule.yml +++ /dev/null @@ -1,5 +0,0 @@ -filter: "test_normalizer" -normalize: - id: normalizer-1a3c69b2-5d54-4b6b-ab07-c7ddbea7917c - something_special: normalized_field -description: "..." 
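The deleted normalizer example rules above only copied a source field to a target field. Following the changelog's migration note, such a plain copy can be expressed with the `field_manager` processor; the rule below is a sketch assuming the `field_manager` rule schema with `source_fields` and `target_field`, and is not part of this patch:

.. code-block:: yaml
   :caption: Hypothetical field_manager equivalent of the deleted specific rule

   filter: "test_normalizer"
   field_manager:
     source_fields: ["something_special"]
     target_field: "normalized_field"
   description: "..."

By default `field_manager` keeps the source field, which matches the old normalizer behavior of copying without deleting.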
diff --git a/tests/testdata/auto_tests/normalizer/regex_mapping.yml b/tests/testdata/auto_tests/normalizer/regex_mapping.yml deleted file mode 100644 index a8922be72..000000000 --- a/tests/testdata/auto_tests/normalizer/regex_mapping.yml +++ /dev/null @@ -1,4 +0,0 @@ -RE_IP_PORT_CAP: .*(?P[\d.]+):(?P\d+).* -RE_FULL_CAP: (?P.*) -RE_ONLY_THIS_CAP: .*(?POnly this!).* -RE_SWITCH_CAP: (?PSecond).*(?PFirst) \ No newline at end of file diff --git a/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match.json b/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match.json deleted file mode 100644 index e0e765794..000000000 --- a/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "some_field", - "normalize": { - "some_field": "some_normalized_field" - }, - "description": "Test-rule with matching auto-test" -}] \ No newline at end of file diff --git a/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match_test.json b/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match_test.json deleted file mode 100644 index a1d2b5a05..000000000 --- a/tests/testdata/auto_tests/normalizer/rules/generic/auto_test_normalizer_match_test.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "raw": { - "some_field": "doesnt_matter", - "tags": ["syslog"] - }, - "processed": { - "some_field": "doesnt_matter", - "some_normalized_field": "doesnt_matter", - "tags": ["syslog"] - } - } -] \ No newline at end of file diff --git a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch.json b/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch.json deleted file mode 100644 index 8240ee881..000000000 --- a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "some_field", - "normalize": { - "some_field": "some_normalized_field" - }, - "description": "Test-rule with mismatching auto-test" -}] \ No newline at end of file diff --git a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch_test.json b/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch_test.json deleted file mode 100644 index ac915bac5..000000000 --- a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_mismatch_test.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "raw": { - "some_field": "doesnt_matter", - "tags": ["syslog"] - }, - "processed": { - "some_field": "doesnt_matter", - "some_normalized_field": "does_matter", - "tags": ["syslog"] - } - } -] \ No newline at end of file diff --git a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_no_test_.json b/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_no_test_.json deleted file mode 100644 index 8240ee881..000000000 --- a/tests/testdata/auto_tests/normalizer/rules/specific/auto_test_normalizer_no_test_.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "some_field", - "normalize": { - "some_field": "some_normalized_field" - }, - "description": "Test-rule with mismatching auto-test" -}] \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_one b/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_one deleted file mode 100644 index af79c8dcb..000000000 --- a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_one +++ 
/dev/null @@ -1,3 +0,0 @@ -CUSTOM_PATTERN_123456 123456 -CUSTOM_PATTERN_Test Test -CUSTOM_PATTERN_TEST Test diff --git a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_two b/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_two deleted file mode 100644 index 58216ab4b..000000000 --- a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_file_two +++ /dev/null @@ -1 +0,0 @@ -CUSTOM_PATTERN_OTHER_FILE other file! \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_timestamp b/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_timestamp deleted file mode 100644 index e1bb71a00..000000000 --- a/tests/testdata/unit/normalizer/additional_grok_patterns/pattern_timestamp +++ /dev/null @@ -1 +0,0 @@ -CUSTOM_TIMESTAMP \d{4} \d{2} \d{2} - \d{2}\:\d{2}\:\d{2} \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/html_replace_fields.yml b/tests/testdata/unit/normalizer/html_replace_fields.yml deleted file mode 100644 index d7ca06d2d..000000000 --- a/tests/testdata/unit/normalizer/html_replace_fields.yml +++ /dev/null @@ -1,4 +0,0 @@ -- inpol.vorname -- inpol.familienname -- inpol.geburtsname -- test.replace \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/normalizer_regex_mapping.yml b/tests/testdata/unit/normalizer/normalizer_regex_mapping.yml deleted file mode 100644 index a8922be72..000000000 --- a/tests/testdata/unit/normalizer/normalizer_regex_mapping.yml +++ /dev/null @@ -1,4 +0,0 @@ -RE_IP_PORT_CAP: .*(?P[\d.]+):(?P\d+).* -RE_FULL_CAP: (?P.*) -RE_ONLY_THIS_CAP: .*(?POnly this!).* -RE_SWITCH_CAP: (?PSecond).*(?PFirst) \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/regex_mapping.yml b/tests/testdata/unit/normalizer/regex_mapping.yml deleted file mode 100644 index 9e26dfeeb..000000000 --- a/tests/testdata/unit/normalizer/regex_mapping.yml +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/tests/testdata/unit/normalizer/rules/generic/event_data_test1_NOT_1111_to_test_normalized_test1.json b/tests/testdata/unit/normalizer/rules/generic/event_data_test1_NOT_1111_to_test_normalized_test1.json deleted file mode 100644 index 24f765ca0..000000000 --- a/tests/testdata/unit/normalizer/rules/generic/event_data_test1_NOT_1111_to_test_normalized_test1.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "NOT winlog.event_id: 4444 AND winlog.event_data.test2", - "normalize": { - "winlog.event_data.test2": "test_normalized.test2" - }, - "description": "..." -}] diff --git a/tests/testdata/unit/normalizer/rules/generic/event_data_test1_to_test_normalized_something.json b/tests/testdata/unit/normalizer/rules/generic/event_data_test1_to_test_normalized_something.json deleted file mode 100644 index 282c6c905..000000000 --- a/tests/testdata/unit/normalizer/rules/generic/event_data_test1_to_test_normalized_something.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "winlog.event_data.test1", - "normalize": { - "winlog.event_data.test1": "test_normalized.something" - }, - "description": "..." 
-}] diff --git a/tests/testdata/unit/normalizer/rules/generic/event_data_test_normalize_to_test_normalized_something.json b/tests/testdata/unit/normalizer/rules/generic/event_data_test_normalize_to_test_normalized_something.json deleted file mode 100644 index 81766eeff..000000000 --- a/tests/testdata/unit/normalizer/rules/generic/event_data_test_normalize_to_test_normalized_something.json +++ /dev/null @@ -1,7 +0,0 @@ -[{ - "filter": "winlog.event_data.test_normalize", - "normalize": { - "winlog.event_data.test_normalize": "test_normalized.something" - }, - "description": "..." -}] diff --git a/tests/testdata/unit/normalizer/rules/generic/this_is_not_a_rule.not_json b/tests/testdata/unit/normalizer/rules/generic/this_is_not_a_rule.not_json deleted file mode 100644 index 129b1e3f3..000000000 --- a/tests/testdata/unit/normalizer/rules/generic/this_is_not_a_rule.not_json +++ /dev/null @@ -1 +0,0 @@ -I'm not a json file and should not be loaded as a rule! diff --git a/tests/testdata/unit/normalizer/rules/specific/Test1_id_1111.json b/tests/testdata/unit/normalizer/rules/specific/Test1_id_1111.json deleted file mode 100644 index 71d3eba7b..000000000 --- a/tests/testdata/unit/normalizer/rules/specific/Test1_id_1111.json +++ /dev/null @@ -1,9 +0,0 @@ -[{ - "filter": "winlog.event_id: 1111", - "normalize": { - "winlog.event_data.test1": "test_normalized.test1", - "winlog.event_data.test2": "test_normalized.test2", - "winlog.event_data.test3": "test_normalized.test3" - }, - "description": "..." -}] diff --git a/tests/testdata/unit/normalizer/rules/specific/Test2_id_2222.json b/tests/testdata/unit/normalizer/rules/specific/Test2_id_2222.json deleted file mode 100644 index b205eb837..000000000 --- a/tests/testdata/unit/normalizer/rules/specific/Test2_id_2222.json +++ /dev/null @@ -1,8 +0,0 @@ -[{ - "filter": "winlog.event_id: 2222", - "normalize": { - "winlog.event_data.Test1": "test_normalized.test.field1", - "winlog.event_data.Test2": "test_normalized.test.field2" - }, - "description": "..." -}] diff --git a/tests/testdata/unit/normalizer/rules/specific/this_is_not_a_rule.not_json b/tests/testdata/unit/normalizer/rules/specific/this_is_not_a_rule.not_json deleted file mode 100644 index 129b1e3f3..000000000 --- a/tests/testdata/unit/normalizer/rules/specific/this_is_not_a_rule.not_json +++ /dev/null @@ -1 +0,0 @@ -I'm not a json file and should not be loaded as a rule! 
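The normalizer rules deleted above are plain field moves guarded by a filter (for example `winlog.event_data.test1` to `test_normalized.test1` when `winlog.event_id: 1111`). As a minimal sketch of how such a mapping could be expressed without the normalizer, assuming a `field_manager` rule with `source_fields`, `target_field` and `delete_source_fields` attributes (these names are assumptions, not taken from this patch), written as a rule dict in the style of the unit tests:

field_manager_rule = {
    # Hypothetical counterpart to the deleted rule Test1_id_1111.json;
    # the attribute names below are assumed, not confirmed by this patch.
    "filter": "winlog.event_id: 1111 AND winlog.event_data.test1",
    "field_manager": {
        "source_fields": ["winlog.event_data.test1"],
        "target_field": "test_normalized.test1",
        "delete_source_fields": False,  # keep the source field, as the normalizer did
    },
    "description": "field mapping formerly expressed as a normalize rule",
}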
diff --git a/tests/unit/processor/grokker/test_grokker.py b/tests/unit/processor/grokker/test_grokker.py index 394810523..73e717690 100644 --- a/tests/unit/processor/grokker/test_grokker.py +++ b/tests/unit/processor/grokker/test_grokker.py @@ -473,28 +473,24 @@ def test_loads_patterns_without_custom_patterns_dir(self): def test_loads_custom_patterns(self): rule = { "filter": "winlog.event_id: 123456789", - "grokker": { - "mapping": {"winlog.event_data.normalize me!": "%{CUSTOM_PATTERN_TEST:normalized}"} - }, + "grokker": {"mapping": {"winlog.event_data.normalize me!": "%{ID:normalized}"}}, } event = { "winlog": { "api": "wineventlog", "event_id": 123456789, - "event_data": {"normalize me!": "Test"}, + "event_data": {"normalize me!": "id-1"}, } } expected = { "winlog": { "api": "wineventlog", "event_id": 123456789, - "event_data": {"normalize me!": "Test"}, + "event_data": {"normalize me!": "id-1"}, }, - "normalized": "Test", + "normalized": "id-1", } - self.object._config.custom_patterns_dir = ( - "tests/testdata/unit/normalizer/additional_grok_patterns" - ) + self.object._config.custom_patterns_dir = "tests/testdata/unit/grokker/patterns/" self._load_specific_rule(rule) self.object.setup() self.object.process(event) diff --git a/tests/unit/processor/normalizer/__init__.py b/tests/unit/processor/normalizer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/unit/processor/normalizer/test_normalizer.py b/tests/unit/processor/normalizer/test_normalizer.py deleted file mode 100644 index 71a97aa4b..000000000 --- a/tests/unit/processor/normalizer/test_normalizer.py +++ /dev/null @@ -1,1172 +0,0 @@ -# pylint: disable=missing-docstring -# pylint: disable=protected-access -# pylint: disable=too-many-lines -# pylint: disable=line-too-long -import calendar -import copy -import json -import logging -import os -import re -import tempfile -from copy import deepcopy - -import pytest - -from logprep.factory import Factory -from logprep.processor.base.exceptions import ProcessingWarning -from logprep.processor.normalizer.rule import ( - InvalidGrokDefinition, - InvalidNormalizationDefinition, - NormalizerRule, -) -from logprep.util.time import TimeParser -from tests.unit.processor.base import BaseProcessorTestCase - - -class TestNormalizer(BaseProcessorTestCase): - CONFIG = { - "type": "normalizer", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], - "regex_mapping": "tests/testdata/unit/normalizer/normalizer_regex_mapping.yml", - "html_replace_fields": "tests/testdata/unit/normalizer/html_replace_fields.yml", - } - - @property - def specific_rules_dirs(self): - return self.CONFIG["specific_rules"] - - @property - def generic_rules_dirs(self): - return self.CONFIG["generic_rules"] - - def test_process_normalized_field_already_exists_with_same_content(self): - document = { - "winlog": { - "api": "wineventlog", - "event_id": 1234, - "event_data": {"test_normalize": "Existing and normalized have the same value"}, - "test_normalized": {"something": "Existing and normalized have the same value"}, - } - } - try: - self.object.process(document) - except ProcessingWarning: - pytest.fail( - "Normalization over an existing field with the same value as the normalized" - " field should not raise a ProcessingWarning!" 
- ) - - assert ( - document["test_normalized"]["something"] - == "Existing and normalized have the same value" - ) - - def test_process_normalized_field_already_exists_with_different_content(self, caplog): - document = { - "winlog": { - "api": "wineventlog", - "event_id": 1234, - "event_data": {"test_normalize": "I am new and want to be normalized!"}, - }, - "test_normalized": {"something": "I already exist but I am different!"}, - } - with caplog.at_level(logging.WARNING): - self.object.process(document) - assert re.match(".*FieldExistsWarning.*", caplog.text) - - assert document["test_normalized"]["something"] == "I already exist but I am different!" - - def test_apply_windows_rules_catch_all(self): - document = { - "winlog": { - "api": "wineventlog", - "event_id": 1234, - "event_data": {"test_normalize": "foo"}, - } - } - self.object.process(document) - assert document["test_normalized"]["something"] == "foo" - - def test_apply_windows_rules_for_specific_event_id(self): - document = { - "winlog": { - "api": "wineventlog", - "event_id": 1111, - "event_data": {"test1": "foo"}, - } - } - self.object.process(document) - assert document["test_normalized"]["test1"] == "foo" - - document = { - "winlog": { - "api": "wineventlog", - "event_id": 1112, - "event_data": {"test1": "foo"}, - } - } - self.object.process(document) - assert "test1" not in document.get("test_normalized", {}) - - def test_add_field_without_conflicts(self): - event = {"host": {"ip": "127.0.0.1"}, "client": {"port": 22222}} - self.object._add_field(event, "foo.bar.baz", 1234) - self.object._add_field(event, "host.user.name", "admin") - self.object._add_field(event, "client.address", "localhost") - assert event == { - "foo": {"bar": {"baz": 1234}}, - "host": {"ip": "127.0.0.1", "user": {"name": "admin"}}, - "client": {"address": "localhost", "port": 22222}, - } - assert not self.object._conflicting_fields - - def test_add_field_with_conflicts(self): - event = {"host": "localhost"} - self.object._add_field(event, "host.user.name", "admin") - assert self.object._conflicting_fields == ["host.user.name"] - - def test_normalization_from_specific_rules(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 1111, - "event_data": { - "param1": "Do not normalize me!", - "test1": "Normalize me!", - }, - } - } - - self.object.process(event) - - assert event["winlog"]["event_data"]["param1"] == "Do not normalize me!" - assert event["test_normalized"]["test1"] == "Normalize me!" - - def test_normalization_from_specific_rule_with_multiple_matching_fields(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 2222, - "event_data": { - "param1": "Do not normalize me!", - "Test1": "Normalize me.", - "Test2": "Normalize me!", - }, - } - } - - self.object.process(event) - - assert event["winlog"]["event_data"]["param1"] == "Do not normalize me!" - assert event["test_normalized"]["test"]["field1"] == "Normalize me." - assert event["test_normalized"]["test"]["field2"] == "Normalize me!" - - def test_normalization_from_generic_rules(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 1234, - "event_data": { - "param1": "Do not normalize me!", - "test1": "Normalize me!", - }, - } - } - - self.object.process(event) - - assert event["winlog"]["event_data"]["param1"] == "Do not normalize me!" - assert event["test_normalized"]["something"] == "Normalize me!" 
- - def test_normalize_with_invalid_list_fails(self): - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": {"winlog.event_data.invalid_normalization": ["I am normalized!", ""]}, - } - - with pytest.raises(InvalidNormalizationDefinition): - self._load_specific_rule(rule) - - def test_normalize_full_field_with_regex_succeeds(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "Source value"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": [ - "I am normalized!", - "RE_FULL_CAP", - r"\g", - ] - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event["I am normalized!"] == "Source value" - - def test_normalize_full_field_with_regex_extraction_succeeds(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "xyz Only this! xyz"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": [ - "I am normalized!", - "RE_ONLY_THIS_CAP", - r"\g", - ] - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event["I am normalized!"] == "Only this!" - - def test_normalize_full_field_with_non_matching_regex(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "Keep it as is!"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": [ - "I am normalized!", - r"no match", - r"does not matter", - ] - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event["I am normalized!"] == "Keep it as is!" 
- - def test_normalize_full_field_with_regex_rearrange_succeeds(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "Second comes not before First"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": [ - "I am normalized!", - "RE_SWITCH_CAP", - r"\g comes before \g", - ] - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event["I am normalized!"] == "First comes before Second" - - def test_normalization_from_grok(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:some_ip} %{NUMBER:port:int}"} - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") == "123.123.123.123" - assert event.get("port") == 1234 - - def test_normalization_from_grok_match_only_exact(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "foo 123.123.123.123 1234 bar"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:some_ip} %{NUMBER:port:int}"} - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") is None - assert event.get("port") is None - - def test_normalization_from_grok_does_not_match(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:some_ip} %{NUMBER:port:int}"} - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") is None - assert event.get("port") is None - - def test_normalization_from_grok_list_match_first_matching(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int}", - "%{IP:some_ip_2} %{NUMBER:port_2:int}", - ] - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") == "123.123.123.123" - assert event.get("port_1") == 1234 - assert event.get("some_ip_2") is None - assert event.get("port_2") is None - - def test_normalization_from_grok_list_match_first_matching_after_skipping_non_matching( - self, - ): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234 bar"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int} foo", - "%{IP:some_ip_2} %{NUMBER:port_2:int} bar", - ] - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") is None - assert event.get("port_1") is None - assert event.get("some_ip_2") == "123.123.123.123" - assert event.get("port_2") == 1234 - - def test_normalization_from_grok_list_match_none(self): - event = { - "winlog": { - "api": "wineventlog", - 
"event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int} foo", - "%{IP:some_ip_2} %{NUMBER:port_2:int} bar", - ] - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") is None - assert event.get("port_1") is None - assert event.get("some_ip_2") is None - assert event.get("port_2") is None - - def test_normalization_from_nested_grok(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 555 1234 %ttss 11"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": r"%{IP:[parent][some_ip]} \w+ %{NUMBER:[parent][port]:int} %[ts]+ %{NUMBER:test:int}" - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("test") == 11 - assert event.get("parent") - assert event["parent"].get("some_ip") == "123.123.123.123" - assert event["parent"].get("port") == 1234 - - def test_normalization_from_grok_with_custom_patterns(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123456 Test other file!"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": "%{CUSTOM_PATTERN_123456:custom_123456} %{CUSTOM_PATTERN_Test:custom_Test} %{CUSTOM_PATTERN_OTHER_FILE:custom_other_file}" - } - }, - } - - with pytest.raises(InvalidGrokDefinition): - self._load_specific_rule(rule) - - NormalizerRule.additional_grok_patterns = ( - "tests/testdata/unit/normalizer/additional_grok_patterns" - ) - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("custom_123456") == "123456" - assert event.get("custom_Test") == "Test" - assert event.get("custom_other_file") == "other file!" 
- - def test_normalization_from_grok_and_norm_result(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:some_ip} %{NUMBER:port:int}"}, - "some_ip": "some.ip", - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") == "123.123.123.123" - assert event.get("port") == 1234 - assert event.get("some") - assert event["some"].get("ip") == "123.123.123.123" - - def test_normalization_from_grok_writes_grok_failure_if_no_grok_pattern_matches_and_if_configured( - self, - ): - event = {"grok_me": "123.123.123.123 1234"} - - rule = { - "filter": "grok_me", - "normalize": { - "grok_me": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int} foo", - "%{IP:some_ip_2} %{NUMBER:port_2:int} bar", - ], - "failure_target_field": "grok_failure", - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") is None - assert event.get("port_1") is None - assert event.get("some_ip_2") is None - assert event.get("port_2") is None - assert event.get("grok_failure") == {"grok_me": "123.123.123.123 1234"} - - def test_normalization_from_grok_writes_grok_failure_for_nested_fields( - self, - ): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int} foo", - "%{IP:some_ip_2} %{NUMBER:port_2:int} bar", - ], - "failure_target_field": "grok_failure", - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") is None - assert event.get("port_1") is None - assert event.get("some_ip_2") is None - assert event.get("port_2") is None - assert event.get("grok_failure") == { - "winlog>event_data>normalize me!": "123.123.123.123 1234" - } - - def test_normalization_from_grok_writes_grok_failure_to_dotted_subfield( - self, - ): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip_1} %{NUMBER:port_1:int} foo", - "%{IP:some_ip_2} %{NUMBER:port_2:int} bar", - ], - "failure_target_field": "winlog.event_data.grok_failure", - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip_1") is None - assert event.get("port_1") is None - assert event.get("some_ip_2") is None - assert event.get("port_2") is None - assert event.get("winlog", {}).get("event_data", {}).get("grok_failure") == { - "winlog>event_data>normalize me!": "123.123.123.123 1234" - } - - def test_normalization_from_grok_onto_existing(self, caplog): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:winlog} %{NUMBER:port:int}"} - }, - } - - self._load_specific_rule(rule) - - with caplog.at_level(logging.WARNING): - self.object.process(event) - assert re.match(".*FieldExistsWarning.*", 
caplog.text) - - def test_incorrect_grok_identifier_definition(self): - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"groks": "%{IP:some_ip} %{NUMBER:port:int}"} - }, - } - - with pytest.raises(InvalidNormalizationDefinition): - self._load_specific_rule(rule) - - def test_incorrect_grok_definition(self): - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": {"grok": "%{IP:some_ip} %{NUMBA:port:int}"} - }, - } - - with pytest.raises(InvalidGrokDefinition): - self._load_specific_rule(rule) - - def test_normalization_from_timestamp_berlin_to_utc(self): - expected = { - "@timestamp": "1999-12-12T11:12:22Z", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "Europe/Berlin", - "destination_timezone": "UTC", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_grok_with_timestamp_normalization(self): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234 1999 12 12 - 12:12:22"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": "%{IP:some_ip} %{NUMBER:port:int} %{CUSTOM_TIMESTAMP:some_timestamp_utc}" - }, - "some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "UTC", - } - }, - }, - } - - NormalizerRule.additional_grok_patterns = ( - "tests/testdata/unit/normalizer/additional_grok_patterns" - ) - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") == "123.123.123.123" - assert event.get("port") == 1234 - assert event.get("@timestamp") == "1999-12-12T12:12:22Z" - - def test_normalization_from_grok_with_timestamp_normalization_and_timestamp_does_not_exist( - self, - ): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": [ - "%{IP:some_ip} %{NUMBER:port:int} %{CUSTOM_TIMESTAMP:some_timestamp_utc}", - "%{IP:some_ip} %{NUMBER:port:int}", - ] - }, - "some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "UTC", - } - }, - }, - } - - NormalizerRule.additional_grok_patterns = ( - "tests/testdata/unit/normalizer/additional_grok_patterns" - ) - self._load_specific_rule(rule) - self.object.process(event) - - assert event.get("some_ip") == "123.123.123.123" - assert event.get("port") == 1234 - assert event.get("@timestamp") is None - - def test_normalization_from_timestamp_same_timezones(self): - expected = { - "@timestamp": "1999-12-12T12:12:22Z", - "winlog": { - "api": "wineventlog", - "event_id": 
123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "UTC", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_timestamp_utc_to_berlin(self): - expected = { - "@timestamp": "1999-12-12T13:12:22+01:00", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_iso8601_timestamp(self): - expected = { - "@timestamp": "2020-01-03T14:04:05.879000Z", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "2020-01-03T14:04:05.879Z"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "2020-01-03T14:04:05.879Z"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "ISO8601"], - "source_timezone": "Europe/Berlin", - "destination_timezone": "UTC", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_unix_with_millis_timestamp(self): - expected = { - "@timestamp": "2022-01-14T12:40:49.843000+01:00", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1642160449843"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1642160449843"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["UNIX"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_unix_with_seconds_timestamp(self): - expected = { - "@timestamp": "2022-01-14T12:40:49+01:00", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1642160449"}, - }, - } - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1642160449"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": 
"@timestamp", - "source_formats": ["UNIX"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_timestamp_with_non_matching_patterns(self, caplog): - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22 UTC"}, - } - } - - expected = copy.deepcopy(event) - expected.update({"tags": ["_normalizer_failure"]}) - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["a%Y", "a%Y %m", "ISO8601"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - } - } - }, - } - - self._load_specific_rule(rule) - with caplog.at_level(logging.WARNING): - self.object.process(event) - assert re.match(".*NormalizerError.*", caplog.text) - assert event == expected - - def test_normalization_from_timestamp_with_collision(self): - expected = { - "@timestamp": "1999-12-12T11:12:22Z", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_berlin": "1999 12 12 - 12:12:22"}, - }, - } - - event = { - "@timestamp": "2200-02-01T16:19:22Z", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_berlin": "1999 12 12 - 12:12:22"}, - }, - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_berlin": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "Europe/Berlin", - "destination_timezone": "UTC", - } - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_from_timestamp_with_collision_without_allow_override_fails(self, caplog): - event = { - "@timestamp": "2200-02-01T16:19:22Z", - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"some_timestamp_utc": "1999 12 12 - 12:12:22"}, - }, - } - - expected = copy.deepcopy(event) - expected.update({"tags": ["_normalizer_failure"]}) - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.some_timestamp_utc": { - "timestamp": { - "destination": "@timestamp", - "source_formats": ["%Y", "%Y %m %d - %H:%M:%S"], - "source_timezone": "Europe/Berlin", - "destination_timezone": "UTC", - "allow_override": False, - } - } - }, - } - - self._load_specific_rule(rule) - with caplog.at_level(logging.WARNING): - self.object.process(event) - assert re.match(".*FieldExistsWarning.*", caplog.text) - assert event == expected - - def test_normalization_with_replace_html_entity(self): - event = { - "tags": ["testtag"], - "message": "replace=MAX+€MORITZ&dont_replace=FOO+BAR&id=5", - } - - expected = { - "tags": ["testtag"], - "message": "replace=MAX+€MORITZ&dont_replace=FOO+BAR&id=5", - "test": { - "id": "5", - "dont_replace": "FOO+BAR", - "replace": "MAX+€MORITZ", - "replace_decodiert": "MAX+€MORITZ", - }, - } - - rule = { - "filter": "tags: testtag", - "normalize": { - "message": { - "grok": "replace=%{DATA:[test][replace]}" - "&dont_replace=%{DATA:[test][dont_replace]}&id=%{INT:[test][id]}" - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - assert event == expected - - def test_normalization_with_grok_pattern_count(self): - temp_path = 
tempfile.mkdtemp() - config = deepcopy(self.CONFIG) - - config.update( - {"count_grok_pattern_matches": {"count_directory_path": temp_path, "write_period": 0}} - ) - processor_config = {"Test Normalizer Name": config} - self.object = Factory.create(processor_config) - - event = { - "winlog": { - "api": "wineventlog", - "event_id": 123456789, - "event_data": {"normalize me!": "123.123.123.123 1234"}, - } - } - - rule = { - "filter": "winlog.event_id: 123456789", - "normalize": { - "winlog.event_data.normalize me!": { - "grok": ["%{IP:some_ip} %{NUMBER:port:int}", "NO MATCH"] - } - }, - } - - self._load_specific_rule(rule) - self.object.process(event) - - match_cnt_path = self.object._grok_matches_path - match_cnt_files = os.listdir(match_cnt_path) - - assert len(match_cnt_files) == 1 - - now = TimeParser.now() - date = now.date() - match_file_name = match_cnt_files[0] - - assert match_file_name.endswith(".json") - - file_date, file_weekday = match_cnt_files[0][:-5].split("_") - - assert date.isoformat() == file_date - assert calendar.day_name[date.weekday()].lower() == file_weekday - - with open( - os.path.join(match_cnt_path, match_file_name), "r", encoding="utf8" - ) as match_file: - match_json = json.load(match_file) - - assert "^%{IP:some_ip} %{NUMBER:port:int}$" in match_json - assert "^NO MATCH$" in match_json - assert match_json["^%{IP:some_ip} %{NUMBER:port:int}$"] == 1 - assert match_json["^NO MATCH$"] == 0 - - self.object.process(event) - - with open( - os.path.join(match_cnt_path, match_file_name), "r", encoding="utf8" - ) as match_file: - match_json = json.load(match_file) - - assert match_json["^%{IP:some_ip} %{NUMBER:port:int}$"] == 2 - assert match_json["^NO MATCH$"] == 0 - - assert event.get("some_ip") == "123.123.123.123" - assert event.get("port") == 1234 diff --git a/tests/unit/processor/normalizer/test_normalizer_rule.py b/tests/unit/processor/normalizer/test_normalizer_rule.py deleted file mode 100644 index a9470f4a8..000000000 --- a/tests/unit/processor/normalizer/test_normalizer_rule.py +++ /dev/null @@ -1,279 +0,0 @@ -# pylint: disable=missing-docstring -# pylint: disable=protected-access - -import pytest - -from logprep.filter.lucene_filter import LuceneFilter -from logprep.processor.normalizer.rule import NormalizerRule - - -@pytest.fixture(name="specific_rule_definition") -def fixture_specific_rule_definition(): - return { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - "description": "insert a description text", - } - - -class TestNormalizerRule: - @pytest.mark.parametrize( - "testcase, other_rule_definition, is_equal", - [ - ( - "Should be equal cause the same", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - True, - ), - ( - "Should be not equal cause of other filter", - { - "filter": "other_message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { 
- "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of other substitution_field", - { - "filter": "message", - "normalize": { - "substitution_field": "bar", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of no substitution_field", - { - "filter": "message", - "normalize": { - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of no grok_field", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of other grok_field", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_bar} %{NUMBER:port_bar:int} bar"]}, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of additional grok_field", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": { - "grok": [ - "%{IP:ip_foo} %{NUMBER:port_foo:int} foo", - "%{IP:ip_bar} %{NUMBER:port_bar:int} bar", - ] - }, - "timestamp_field": { - "timestamp": { - "destination": "timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of other timestamp_field", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - "timestamp_field": { - "timestamp": { - "destination": "other_timestamp_field", - "source_formats": ["%Y %m %d - %H:%M:%S"], - "source_timezone": "UTC", - "destination_timezone": "Europe/Berlin", - }, - }, - }, - }, - False, - ), - ( - "Should be not equal cause of no timestamp_field", - { - "filter": "message", - "normalize": { - "substitution_field": "foo", - "grok_field": {"grok": ["%{IP:ip_foo} %{NUMBER:port_foo:int} foo"]}, - }, - }, - False, - ), - ], - ) - def test_rules_equality( - self, specific_rule_definition, testcase, other_rule_definition, is_equal - ): - rule1 = NormalizerRule( - LuceneFilter.create(specific_rule_definition["filter"]), - specific_rule_definition["normalize"], - ) - rule2 = NormalizerRule( - LuceneFilter.create(other_rule_definition["filter"]), - other_rule_definition["normalize"], - ) - assert (rule1 == rule2) == is_equal, testcase - - def test_grok_loads_one_pattern(self): - grok_rule = { - "filter": "message", - "normalize": { - "some_grok_field": {"grok": "%{IP:ip_foo}"}, - }, - } - - rule = 
NormalizerRule(LuceneFilter.create(grok_rule["filter"]), grok_rule["normalize"]) - assert len(rule.grok) == 1 - assert rule.grok.get("some_grok_field")._grok_list[0].pattern == "^%{IP:ip_foo}$" - - def test_grok_loads_one_pattern_from_list(self): - grok_rule = { - "filter": "message", - "normalize": { - "some_grok_field": {"grok": ["%{IP:ip_foo}"]}, - }, - } - - rule = NormalizerRule(LuceneFilter.create(grok_rule["filter"]), grok_rule["normalize"]) - assert len(rule.grok) == 1 - patterns = [grok.pattern for grok in rule.grok.get("some_grok_field")._grok_list] - assert len(patterns) == 1 - assert patterns[0] == "^%{IP:ip_foo}$" - - def test_grok_loads_multiple_patterns_from_one_list(self): - grok_patterns = ["%{IP:ip_foo}", "%{IP:ip_bar}", "%{IP:ip_baz}"] - grok_rule = { - "filter": "message", - "normalize": { - "some_grok_field": {"grok": grok_patterns}, - }, - } - - rule = NormalizerRule(LuceneFilter.create(grok_rule["filter"]), grok_rule["normalize"]) - assert len(rule.grok) == 1 - patterns = [grok.pattern for grok in rule.grok.get("some_grok_field")._grok_list] - assert len(patterns) == 3 - assert patterns == ["^%{IP:ip_foo}$", "^%{IP:ip_bar}$", "^%{IP:ip_baz}$"] - - def test_grok_loads_multiple_patterns_from_multiple_lists(self): - grok_rule = { - "filter": "message", - "normalize": { - "some_grok_field_1": {"grok": ["%{IP:ip_foo}"]}, - "some_grok_field_2": {"grok": ["%{IP:ip_bar}"]}, - "some_grok_field_3": {"grok": ["%{IP:ip_baz}"]}, - }, - } - - rule = NormalizerRule(LuceneFilter.create(grok_rule["filter"]), grok_rule["normalize"]) - - assert len(rule.grok) == 3 - patterns = [grok.pattern for grok in rule.grok.get("some_grok_field_1")._grok_list] - assert len(patterns) == 1 - assert patterns[0] == "^%{IP:ip_foo}$" - - patterns = [grok.pattern for grok in rule.grok.get("some_grok_field_2")._grok_list] - assert len(patterns) == 1 - assert patterns[0] == "^%{IP:ip_bar}$" - - patterns = [grok.pattern for grok in rule.grok.get("some_grok_field_3")._grok_list] - assert len(patterns) == 1 - assert patterns[0] == "^%{IP:ip_baz}$" diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index f3476825b..e31e00605 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -38,20 +38,20 @@ def teardown_method(self): def test_reads_test_config(self): test_config = { "type": "mock_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "optional_attribute": "I am optional", } config = Configuration.create("dummy name", test_config) assert config.type == "mock_processor" assert config.mandatory_attribute == "I am mandatory" - assert config.generic_rules == ["tests/testdata/unit/normalizer/rules/generic/"] + assert config.generic_rules == ["tests/testdata/unit/dissector/rules/generic/"] def test_raises_on_missing_type(self): test_config = { - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "optional_attribute": "I am optional", } @@ -61,8 +61,8 @@ def test_raises_on_missing_type(self): def 
test_raises_on_unknown_processor(self): test_config = { "type": "unknown_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "optional_attribute": "I am optional", } @@ -72,8 +72,8 @@ def test_raises_on_unknown_processor(self): def test_raises_if_one_mandatory_field_is_missing(self): test_config = { "type": "mock_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "optional_attribute": "I am optional", } with pytest.raises( @@ -84,7 +84,7 @@ def test_raises_if_one_mandatory_field_is_missing(self): def test_raises_if_mandatory_attribute_from_base_is_missing(self): test_config = { "type": "mock_processor", - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "does not matter", } with pytest.raises( @@ -96,7 +96,7 @@ def test_raises_if_mandatory_attribute_from_base_is_missing(self): def test_raises_if_multiple_mandatory_field_are_missing(self): test_config = { "type": "mock_processor", - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], } with pytest.raises( TypeError, @@ -107,8 +107,8 @@ def test_raises_if_multiple_mandatory_field_are_missing(self): def test_raises_on_unknown_field(self): test_config = { "type": "mock_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "optional_attribute": "I am optional", "i_shoul_not_be_here": "does not matter", @@ -119,8 +119,8 @@ def test_raises_on_unknown_field(self): def test_init_non_mandatory_fields_with_default(self): test_config = { "type": "mock_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", } config = Configuration.create("dummy name", test_config) @@ -130,8 +130,8 @@ def test_init_non_mandatory_fields_with_default(self): def test_init_optional_field_in_sub_class(self): test_config = { "type": "mock_processor", - "specific_rules": ["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "optional_attribute": "I am optional", } @@ -141,8 +141,8 @@ def test_init_optional_field_in_sub_class(self): def test_init_optional_field_in_base_class(self): test_config = { "type": "mock_processor", - "specific_rules": 
["tests/testdata/unit/normalizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/normalizer/rules/generic/"], + "specific_rules": ["tests/testdata/unit/dissector/rules/specific/"], + "generic_rules": ["tests/testdata/unit/dissector/rules/generic/"], "mandatory_attribute": "I am mandatory", "tree_config": "tests/testdata/unit/tree_config.json", } diff --git a/tests/unit/util/test_helper.py b/tests/unit/util/test_helper.py index 5c6292607..a059a8930 100644 --- a/tests/unit/util/test_helper.py +++ b/tests/unit/util/test_helper.py @@ -42,7 +42,7 @@ class TestSnakeToCamel: ("IPhoneHysteria", "i_phone_hysteria"), ("DatetimeExtractor", "datetime_extractor"), ("GenericAdder", "generic_adder"), - ("Normalizer", "normalizer"), + ("Dissector", "dissector"), ("GeoipEnricher", "geoip_enricher"), ], )