diff --git a/CHANGELOG.md b/CHANGELOG.md index 978483f68..7a151f76e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,10 +24,12 @@ - removed `max_caching_days` config option - add `max_cached_pseudonymized_urls` config option which defaults to 1000 - add lru caching for peudonymizatin of urls +* improve loading times for the rule tree by optimizing the rule segmentation and sorting * add support for python 3.12 and remove support for python 3.9 ### Bugfix +* fix the rule tree parsing some rules incorrectly, potentially resulting in more matches ## v8.0.0 ### Breaking @@ -54,6 +56,7 @@ ### Bugfix + ## v7.0.0 ### Breaking diff --git a/logprep/framework/rule_tree/rule_parser.py b/logprep/framework/rule_tree/rule_parser.py index 8645830ad..16ca96e48 100644 --- a/logprep/framework/rule_tree/rule_parser.py +++ b/logprep/framework/rule_tree/rule_parser.py @@ -4,7 +4,6 @@ behavior, allowing a simpler construction of the rule tree. """ - from typing import TYPE_CHECKING from logprep.filter.expression.filter_expression import ( @@ -106,7 +105,6 @@ def parse_rule(self, rule: "Rule", priority_dict: dict) -> list: RuleSorter.sort_rule_segments(dnf_rule_segments, priority_dict) self._add_exists_filter(dnf_rule_segments) self._rule_tagger.add(dnf_rule_segments) - return dnf_rule_segments @staticmethod diff --git a/logprep/framework/rule_tree/rule_segmenter.py b/logprep/framework/rule_tree/rule_segmenter.py index 09555c0ad..86a9a6b06 100644 --- a/logprep/framework/rule_tree/rule_segmenter.py +++ b/logprep/framework/rule_tree/rule_segmenter.py @@ -191,11 +191,10 @@ def _segment_conjunctive_expression(expression: FilterExpression) -> list: class CnfToDnfConverter: """Converts simplified rules from the conjunctive normal form to the disjunctive normal form""" - @staticmethod - def convert_cnf_to_dnf(cnf: list): + @classmethod + def convert_cnf_to_dnf(cls, cnf: list) -> list: """Convert rule from conjunctive normal form into disjunctive normal form. - This function handles the parsing of OR-subexpressions in AND-expression filters in a recursive manner. It converts an input in conjunctive normal form into the disjunctive normal form. @@ -214,22 +213,27 @@ def convert_cnf_to_dnf(cnf: list): result_list: list Given input list with resolved OR-subexpressions. + Raises + ------ + RuleSegmenterException + Raises if converting the rule requires too much time, since the complexity of the + transformation to the disjunctive normal form is exponential. + """ dnf = [] - - or_segment = CnfToDnfConverter._pop_disjunctive_segment(cnf) - - CnfToDnfConverter._resolve_disjunctive_segment(or_segment, cnf, dnf) - - for parsed_expression in dnf.copy(): - for segment in parsed_expression: - if isinstance(segment, list): - if parsed_expression in dnf: - dnf.remove(parsed_expression) - resolved_expressions = CnfToDnfConverter.convert_cnf_to_dnf(parsed_expression) - - for resolved_expression in resolved_expressions: - dnf.append(resolved_expression) + or_segments = CnfToDnfConverter._pop_disjunctive_segment(cnf) + CnfToDnfConverter._resolve_disjunctive_segment(or_segments, cnf, dnf) + dnf_len = range(len(dnf)) + for idx in dnf_len: + parsed_expression = dnf[idx] + if any(isinstance(segment, list) for segment in parsed_expression): + if parsed_expression in dnf: + dnf[idx] = None + resolved_expressions = CnfToDnfConverter.convert_cnf_to_dnf(parsed_expression) + + for resolved_expression in resolved_expressions: + dnf.append(resolved_expression) + dnf = [item for item in dnf if item is not None] return dnf @staticmethod diff --git a/logprep/framework/rule_tree/rule_sorter.py b/logprep/framework/rule_tree/rule_sorter.py index 06e3d8479..9b6de7942 100644 --- a/logprep/framework/rule_tree/rule_sorter.py +++ b/logprep/framework/rule_tree/rule_sorter.py @@ -37,16 +37,22 @@ def sort_rule_segments(parsed_rule_list: list, priority_dict: dict): Dictionary with sorting priority information (key -> field name; value -> priority). """ + sorting_keys = {} for parsed_rule in parsed_rule_list: - parsed_rule.sort( - key=lambda expression: RuleSorter._get_sorting_key(expression, priority_dict) - ) + for parsed_expression in parsed_rule: + expression_repr = repr(parsed_expression) + if expression_repr not in sorting_keys: + sorting_keys[expression_repr] = RuleSorter._get_sorting_key( + parsed_expression, priority_dict + ) + for parsed_rule in parsed_rule_list: + parsed_rule.sort(key=lambda expression: sorting_keys.get(repr(expression))) @staticmethod def _get_sorting_key( expression: FilterExpression, priority_dict: dict ) -> Union[dict, str, None]: - """Get the sorting key for an expression with a priority dict.. + """Get the sorting key for an expression with a priority dict. This function is used by the _sort_rule_segments() function in the sorting key. It includes various cases to cover all the different expression classes. For every class it diff --git a/logprep/framework/rule_tree/rule_tree.py b/logprep/framework/rule_tree/rule_tree.py index 708d4a9a1..670a75ee7 100644 --- a/logprep/framework/rule_tree/rule_tree.py +++ b/logprep/framework/rule_tree/rule_tree.py @@ -123,8 +123,8 @@ def add_rule(self, rule: "Rule", logger: Logger = None): parsed_rule = self.rule_parser.parse_rule(rule, self.priority_dict) except Exception as error: # pylint: disable=broad-except logger.warning( - f'Error parsing rule "{rule.filter}": {type(error).__name__}: {error}.' - f"\nIgnore and continue with next rule." + f'Error parsing rule "{rule.file_name}.yml": {type(error).__name__}: {error}. ' + f"Ignore and continue with next rule." ) return for rule_segment in parsed_rule: diff --git a/tests/unit/framework/rule_tree/test_rule_parser.py b/tests/unit/framework/rule_tree/test_rule_parser.py index d42b8222c..f5a7c6ad0 100644 --- a/tests/unit/framework/rule_tree/test_rule_parser.py +++ b/tests/unit/framework/rule_tree/test_rule_parser.py @@ -2,7 +2,6 @@ # pylint: disable=missing-docstring # pylint: disable=line-too-long # pylint: disable=too-many-statements - import pytest from logprep.filter.expression.filter_expression import StringFilterExpression, Not, Exists @@ -439,10 +438,6 @@ class TestRuleParser: [Exists(["A1"]), Exists(["B2"]), Exists(["C1"]), Exists(["D2"])], [Exists(["A1"]), Exists(["B2"]), Exists(["C2"]), Exists(["D1"])], [Exists(["A1"]), Exists(["B2"]), Exists(["C2"]), Exists(["D2"])], - [Exists(["A1"]), Exists(["C1"]), Exists(["D1"])], - [Exists(["A1"]), Exists(["C1"]), Exists(["D2"])], - [Exists(["A1"]), Exists(["C2"]), Exists(["D1"])], - [Exists(["A1"]), Exists(["C2"]), Exists(["D2"])], [Exists(["A2"]), Exists(["B1"]), Exists(["C1"]), Exists(["D1"])], [Exists(["A2"]), Exists(["B1"]), Exists(["C1"]), Exists(["D2"])], [Exists(["A2"]), Exists(["B1"]), Exists(["C2"]), Exists(["D1"])], @@ -451,10 +446,50 @@ class TestRuleParser: [Exists(["A2"]), Exists(["B2"]), Exists(["C1"]), Exists(["D2"])], [Exists(["A2"]), Exists(["B2"]), Exists(["C2"]), Exists(["D1"])], [Exists(["A2"]), Exists(["B2"]), Exists(["C2"]), Exists(["D2"])], - [Exists(["A2"]), Exists(["C1"]), Exists(["D1"])], - [Exists(["A2"]), Exists(["C1"]), Exists(["D2"])], - [Exists(["A2"]), Exists(["C2"]), Exists(["D1"])], - [Exists(["A2"]), Exists(["C2"]), Exists(["D2"])], + ], + ), + ( + PreDetectorRule._create_from_dict( + { + "filter": "(A1 OR A2) AND (B1 OR (C1 AND C2))", + "pre_detector": { + "id": 1, + "title": "1", + "severity": "0", + "case_condition": "directly", + "mitre": [], + }, + } + ), + {}, + {}, + [ + [Exists(["A1"]), Exists(["B1"])], + [Exists(["A1"]), Exists(["C1"]), Exists(["C2"])], + [Exists(["A2"]), Exists(["B1"])], + [Exists(["A2"]), Exists(["C1"]), Exists(["C2"])], + ], + ), + ( + PreDetectorRule._create_from_dict( + { + "filter": "((A1 OR A2) AND (B1 OR B2)) AND C1", + "pre_detector": { + "id": 1, + "title": "1", + "severity": "0", + "case_condition": "directly", + "mitre": [], + }, + } + ), + {}, + {}, + [ + [Exists(["A1"]), Exists(["B1"]), Exists(["C1"])], + [Exists(["A1"]), Exists(["B2"]), Exists(["C1"])], + [Exists(["A2"]), Exists(["B1"]), Exists(["C1"])], + [Exists(["A2"]), Exists(["B2"]), Exists(["C1"])], ], ), ( diff --git a/tests/unit/framework/rule_tree/test_rule_tree.py b/tests/unit/framework/rule_tree/test_rule_tree.py index 99d4fcfc8..8820da815 100644 --- a/tests/unit/framework/rule_tree/test_rule_tree.py +++ b/tests/unit/framework/rule_tree/test_rule_tree.py @@ -93,8 +93,8 @@ def test_add_rule_fails(self, rule_dict): ): rule_tree.add_rule(rule, logger=mocked_logger) expected_call = mock.call.warning( - 'Error parsing rule "winlog:"123"": Exception: mocked error.' - "\nIgnore and continue with next rule." + 'Error parsing rule "None.yml": Exception: mocked error. ' + "Ignore and continue with next rule." ) assert expected_call in mocked_logger.mock_calls