From c59242d0a058b3f6a3c589f4becf7653a1c6faca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Zimmermann?= <101292599+ekneg54@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:34:23 +0200 Subject: [PATCH] Remove autorulecorpustester (#665) * Remove autorulecorpustester --- CHANGELOG.md | 6 + doc/source/user_manual/testing_rules.rst | 1 - logprep/run_logprep.py | 18 - .../auto_rule_corpus_tester.py | 419 -------------- logprep/util/rule_dry_runner.py | 15 +- tests/unit/test_run_logprep.py | 17 - .../unit/util/test_auto_rule_corpus_tester.py | 511 ------------------ 7 files changed, 18 insertions(+), 969 deletions(-) delete mode 100644 logprep/util/auto_rule_tester/auto_rule_corpus_tester.py delete mode 100644 tests/unit/util/test_auto_rule_corpus_tester.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a125e2507..4f43db15a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,14 @@ ## next release ### Breaking + +* remove AutoRuleCorpusTester + ### Features ### Improvements + +* remove AutoRuleCorpusTester + ### Bugfix * ensure `logprep.abc.Component.Config` is immutable and can be applied multiple times diff --git a/doc/source/user_manual/testing_rules.rst b/doc/source/user_manual/testing_rules.rst index 117f99ab3..a2c98493a 100644 --- a/doc/source/user_manual/testing_rules.rst +++ b/doc/source/user_manual/testing_rules.rst @@ -3,7 +3,6 @@ Testing Rules .. automodule:: logprep.util.rule_dry_runner .. automodule:: logprep.util.auto_rule_tester.auto_rule_tester -.. automodule:: logprep.util.auto_rule_tester.auto_rule_corpus_tester Custom Tests diff --git a/logprep/run_logprep.py b/logprep/run_logprep.py index e4255d5b5..03542be38 100644 --- a/logprep/run_logprep.py +++ b/logprep/run_logprep.py @@ -13,7 +13,6 @@ from logprep.generator.http.controller import Controller from logprep.generator.kafka.run_load_tester import LoadTester from logprep.runner import Runner -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import RuleCorpusTester from logprep.util.auto_rule_tester.auto_rule_tester import AutoRuleTester from logprep.util.configuration import Configuration, InvalidConfigurationError from logprep.util.defaults import DEFAULT_LOG_CONFIG, EXITCODES @@ -160,23 +159,6 @@ def test_rules(configs: tuple[str]) -> None: tester.run() -@test.command( - short_help="Run the rule corpus tester against a given configuration", name="integration" -) -@click.argument("configs", nargs=-1, required=False) -@click.argument("testdata") -def test_ruleset(configs: tuple[str], testdata: str): - """Test the given ruleset against specified test data - - \b - CONFIG is a path to configuration file (filepath or URL). - TESTDATA is a path to a set of test files. - """ - _ = _get_configuration(configs) - tester = RuleCorpusTester(configs, testdata) - tester.run() - - @cli.group(short_help="Generate load for a running logprep instance") def generate(): """ diff --git a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py b/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py deleted file mode 100644 index 6e0692025..000000000 --- a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py +++ /dev/null @@ -1,419 +0,0 @@ -# pylint: disable=anomalous-backslash-in-string -""" -Rule Corpus Tests ------------------ - -The rule corpus tester can be used to test a full logprep pipeline and configuration against -a set of expected outputs. - -To start the tester call: - -.. 
code-block:: bash - :caption: Run rule corpus test - - logprep test integration $CONFIG $CORPUS_TEST_DATA - -Where in the parameter :code:`CONFIG` should point to a valid logprep configuration and -:code:`CORPUS_TEST_DATA` to a directory containing the test data with the different test cases. -The test cases can be organized into subdirectories. -Each test case should contain one input event (\*_in.json), one expected output event (\*_out.json) -and an expected extra outputs like predetections or pseudonyms (\*_out_extra.json). -The expected extra data is optional though, but if given, it is a single json file, where each -output has a root key of the expected target. -All files belonging to the same test case have to start with the same name, like the following -example: - -.. code-block:: bash - :caption: Test data setup - - - test_one_in.json - - test_one_out.json - - test_one_out_extra.json - - test_two_in.json - - test_two_out.json - -.. code-block:: json - :caption: Content of test_one_in.json - Logprep input - - { - "test": "event" - } - -.. code-block:: json - :caption: Content of test_one_out.json - Expected Logprep Output - - { - "processed": ["test", "event"] - "with": "" - } - -.. code-block:: json - :caption: Content of test_one_out_extra.json - Expected Logprep Extra Output - - [ - { - "predetection_target": { - "id": "..." - } - } - ] - -As sometimes test could have cases where you don't want to test for a specific value of a key it is -possible to test only for the key and ignore the value. -In order to achieve this just set a field in an expected output as :code:``, with that -the value won't be considered during the testing. -Furthermore, it is possible to set an entire field as optional with :code:``. -This way fields can be testet for their presents when they exist, and will be ignored when they do -not exist. -This can for example be the case for the geo ip enricher, which sometimes finds city information -about an ip and sometimes not. - -While executing the tests report print statements are collected which will be printed to the console -after the test run is completed. -During the run itself only a short summary is given for each case. - -If during the test run logprep has an error or warning it logs it to the console as well, which will -be printed inside the test cases summary and before the summary result of the test, which created -the log message. - -If one or more test cases fail this tester ends with an exit code of 1, otherwise 0. -""" -import itertools - -# pylint: enable=anomalous-backslash-in-string -# pylint: disable=protected-access -import json -import logging -import os -import re -import shutil -import sys -import tempfile -from functools import cached_property -from json import JSONDecodeError -from pathlib import Path -from pprint import pprint -from typing import Dict, List - -from attr import Factory, define, field, validators -from colorama import Fore, Style -from deepdiff import DeepDiff, grep - -from logprep.framework.pipeline import Pipeline, PipelineResult -from logprep.util.configuration import Configuration -from logprep.util.helper import get_dotted_field_value -from logprep.util.json_handling import parse_json - -logger = logging.getLogger("corpustester") - - -def convert_extra_data_format(extra_outputs) -> List[Dict]: - """ - Converts the format of the extra data outputs such that it is a list of dicts, where the - output target is the key and the values are the actual outputs. 
- """ - reformatted_extra_outputs = [] - for value, key in extra_outputs: - reformatted_extra_outputs.append({str(key): value}) - return reformatted_extra_outputs - - -class RuleCorpusTester: - """This class can test a rule corpus against expected outputs""" - - _tmp_dir: str - """ Temporary directory where test files will be saved temporarily """ - - _original_config_paths: tuple[str] - """ Path to the original configuration that should be tested """ - - _input_test_data_path: str - """ Path to the directory that contains the test data (in, out, extra_outs) """ - - _test_cases: dict - """ Dictionary that contains the test cases, their input data and their results """ - - @define(kw_only=True) - class TestCase: - input_document: dict = field(validator=validators.instance_of(dict), default={}) - expected_output: dict = field(validator=validators.instance_of(dict), default={}) - expected_extra_output: dict = field(validator=validators.instance_of(list), default=[]) - generated_output: dict = field(validator=validators.instance_of(dict), default={}) - generated_extra_output: dict = field(validator=validators.instance_of(list), default=[]) - failed: bool = field(validator=validators.instance_of(bool), default=False) - report: List = Factory(list) - warnings: str = field(default="") - - @cached_property - def _tmp_dir(self): - return tempfile.mkdtemp() - - @cached_property - def _test_cases(self): - file_paths = [] - for root, _, files in os.walk(self._input_test_data_path): - for filename in files: - file_paths.append(os.path.abspath(os.path.join(root, filename))) - test_cases = self._group_path_by_test_case(self._input_test_data_path, file_paths) - parsing_errors = [case.report for case in test_cases.values() if case.report] - if parsing_errors: - raise ValueError(f"Following parsing errors were found: {parsing_errors}") - no_input_files = [case for case in test_cases if not test_cases[case].input_document] - if no_input_files: - raise ValueError(f"The following TestCases have no input documents: {no_input_files}") - return dict(sorted(test_cases.items())) - - @cached_property - def _pipeline(self): - merged_input_file_path = Path(self._tmp_dir) / "input.json" - inputs = [test_case.input_document for test_case in self._test_cases.values()] - merged_input_file_path.write_text(json.dumps(inputs), encoding="utf8") - patched_config = Configuration() - patched_config.input = { - "patched_input": {"type": "json_input", "documents_path": str(merged_input_file_path)} - } - config = Configuration.from_sources(self._original_config_paths) - input_config = config.input - connector_name = list(input_config.keys())[0] - if "preprocessing" in input_config[connector_name]: - patched_config.input["patched_input"] |= { - "preprocessing": input_config[connector_name]["preprocessing"] - } - patched_config.pipeline = config.pipeline - pipeline = Pipeline(config=patched_config) - return pipeline - - def __init__(self, config_paths: tuple[str], input_test_data_path: str): - self._original_config_paths = config_paths - self._input_test_data_path = input_test_data_path - - def run(self): - """ - Starts the test routine by reading all input files, patching the logprep pipline, executing - the pipeline for each input event, comparing the generated output with the expected output - and printing out the test results. 
- """ - self._run_pipeline_per_test_case() - self._print_test_reports() - self._print_test_summary() - shutil.rmtree(self._tmp_dir) - if any(case.failed for case in self._test_cases.values()): - sys.exit(1) - else: - sys.exit(0) - - def _run_pipeline_per_test_case(self): - """ - For each test case the logprep connector files are rewritten (only the current test case - will be added to the input file), the pipline is run and the outputs are compared. - """ - print(Style.BRIGHT + "# Test Cases Summary:" + Style.RESET_ALL) - for test_case_id, test_case in self._test_cases.items(): - _ = [processor.setup() for processor in self._pipeline._pipeline] - result: PipelineResult = self._pipeline.process_pipeline() - parsed_event = result.event - extra_outputs = convert_extra_data_format(result.data) - test_case.generated_output = parsed_event - test_case.generated_extra_output = extra_outputs - test_case.warnings = result.warnings - self._compare_logprep_outputs(test_case_id, parsed_event) - self._compare_extra_data_output(test_case_id, extra_outputs) - self._print_pass_fail_statements(test_case_id) - - def _compare_logprep_outputs(self, test_case_id, logprep_output): - test_case = self._test_cases.get(test_case_id, {}) - if test_case.expected_output: - diff = self._compare_events(logprep_output, test_case.expected_output) - self._extract_print_statements_from_diff(test_case_id, diff) - - def _compare_extra_data_output(self, test_case_id, logprep_extra_outputs): - test_case = self._test_cases.get(test_case_id, {}) - prints = [] - if len(logprep_extra_outputs) > len(test_case.expected_extra_output): - prints.append( - f"{Fore.RED}There is at least one generated extra output that is unexpected" - ) - if len(logprep_extra_outputs) < len(test_case.expected_extra_output): - prints.append(f"{Fore.RED}There is at least one expected extra output missing") - for expected_extra_output in test_case.expected_extra_output: - expected_extra_output_key = list(expected_extra_output.keys())[0] - has_matching_output = self._has_matching_logprep_output( - expected_extra_output, - expected_extra_output_key, - logprep_extra_outputs, - ) - if not has_matching_output: - prints.append( - f"{Fore.RED}For the following extra output, " - "no matching extra output was generated by logprep", - ) - prints.append(expected_extra_output) - if prints: - self._test_cases[test_case_id].failed = True - self._test_cases[test_case_id].report.extend(prints) - - def _compare_events(self, generated, expected): - ignore_value_search_results = expected | grep("") - optional_keys_search_results = expected | grep("") - missing_keys = self._check_keys_of_ignored_values( - generated, ignore_value_search_results.get("matched_values") - ) - ignore_paths = [] - if "matched_values" in ignore_value_search_results: - path = list(ignore_value_search_results["matched_values"]) - ignore_paths.extend([re.escape(path) for path in path]) - if "matched_values" in optional_keys_search_results: - path = list(optional_keys_search_results["matched_values"]) - ignore_paths.extend([re.escape(path) for path in path]) - diff = DeepDiff( - expected, - generated, - ignore_order=True, - report_repetition=True, - exclude_regex_paths=ignore_paths, - ) - if missing_keys: - diff.update({"dictionary_item_removed": missing_keys}) - return diff - - def _extract_print_statements_from_diff(self, test_case_id, diff): - if not diff: - return - prints = [] - if "dictionary_item_removed" in diff: - prints.append( - f"{Fore.RED}Following expected items are missing in the generated 
logprep output:", - ) - for item in diff["dictionary_item_removed"]: - prints.append(f" - {item}") - if "dictionary_item_added" in diff: - prints.append(f"{Fore.RED}Following unexpected values were generated by logprep") - for item in diff["dictionary_item_added"]: - prints.append(f" - {item}") - if "values_changed" in diff: - prints.append( - f"{Fore.RED}Following values differ between generated and expected output", - ) - for key, value in diff["values_changed"].items(): - prints.append(f" - {key}: {self._rewrite_output(str(value))}") - if prints: - self._test_cases[test_case_id].failed = True - self._test_cases[test_case_id].report.extend(prints) - - def _has_matching_logprep_output( - self, expected_extra_output, expected_extra_output_key, logprep_extra_outputs - ): - """ - Iterate over all logprep extra outputs and search for an output that matches the - expected output - """ - has_matching_output = False - for logprep_extra_output in logprep_extra_outputs: - logprep_extra_output_key = list(logprep_extra_output.keys())[0] - if expected_extra_output_key == logprep_extra_output_key: - diff = self._compare_events( - logprep_extra_output[logprep_extra_output_key], - expected_extra_output[expected_extra_output_key], - ) - if diff is not None: - has_matching_output = True - return has_matching_output - - def _print_pass_fail_statements(self, test_case_id): - test_case = self._test_cases.get(test_case_id, {}) - status = f"{Style.BRIGHT}{Fore.GREEN} PASSED" - if not test_case.expected_output: - status = f"{Style.BRIGHT}{Fore.RESET} SKIPPED - (no expected output given)" - elif len(test_case.report) > 0: - status = f"{Style.BRIGHT}{Fore.RED} FAILED" - elif test_case.warnings: - status = f"{Style.BRIGHT}{Fore.YELLOW} PASSED - (with warnings)" - print(f"{Fore.BLUE} Test Case: {Fore.CYAN}{test_case_id} {status}{Style.RESET_ALL}") - - def _print_test_reports(self): - if not any(case.failed for case in self._test_cases.values()): - return - print(Style.BRIGHT + "# Test Cases Detailed Reports:" + Style.RESET_ALL) - for test_case_id, test_case in self._test_cases.items(): - if (test_case.warnings or test_case.report) and test_case.expected_output: - self._print_long_test_result(test_case_id, test_case) - print() - - def _print_long_test_result(self, test_case_id, test_case): - report_title = f"test report for '{test_case_id}'" - print(f"{Fore.RED}{Style.BRIGHT}↓ {report_title} ↓ {Style.RESET_ALL}") - print_logprep_output = True - if test_case.warnings and not test_case.report: - print(Fore.GREEN + "Test passed, but with following warnings:" + Fore.RESET) - print(test_case.warnings) - print_logprep_output = False - if test_case.warnings and test_case.report: - print(Fore.RED + "Logprep Warnings:" + Fore.RESET) - for warning in test_case.warnings: - print(warning) - for statement in test_case.report: - if isinstance(statement, (dict, list)): - pprint(statement) - else: - print(statement) - if print_logprep_output: - print(Fore.RED + "Logprep Event Output:" + Fore.RESET) - pprint(test_case.generated_output) - print(Fore.RED + "Logprep Extra Data Output:" + Fore.RESET) - pprint(test_case.generated_extra_output) - print(f"{Fore.RED}{Style.BRIGHT}↑ {report_title} ↑ {Style.RESET_ALL}") - - def _print_test_summary(self): - print(Fore.RESET + Style.BRIGHT + "# Test Overview" + Style.RESET_ALL) - total_cases = len(self._test_cases) - failed_cases = sum(case.failed for case in self._test_cases.values()) - print(f"Failed tests: {failed_cases}") - print(f"Total test cases: {total_cases}") - if total_cases: - 
success_rate = (total_cases - failed_cases) / total_cases * 100 - print(f"Success rate: {success_rate:.2f}%") - - def _check_keys_of_ignored_values(self, logprep_output, field_paths) -> list: - if not field_paths: - return [] - missing_keys = [] - for path in field_paths: - dotted_path = ".".join(re.findall(r"\['([^'|.*]*)'\]", path)) - field_value = get_dotted_field_value(logprep_output, dotted_path) - if field_value is None: - missing_keys.append(path) - return missing_keys - - def _group_path_by_test_case(self, data_directory, file_paths): - test_cases = {} - for filename in file_paths: - test_case_id = self._strip_input_file_type(filename) - if test_case_id not in test_cases: - test_cases[test_case_id] = self.TestCase() - document = [{}] - try: - document = parse_json(os.path.join(data_directory, filename)) - except JSONDecodeError as error: - test_cases[test_case_id].failed = True - error_print = f"Json-Error decoding file {filename}: {error}" - test_cases[test_case_id].report.append(error_print) - if "_in.json" in filename: - test_cases[test_case_id].input_document = document[0] - if "_out.json" in filename: - test_cases[test_case_id].expected_output = document[0] - if "_out_extra.json" in filename: - test_cases[test_case_id].expected_extra_output = document - return test_cases - - def _strip_input_file_type(self, filename): - """Remove the input file suffix to identify the case name""" - filename = filename.replace("_in", "") - filename = filename.replace("_out_extra", "") - filename = filename.replace("_out", "") - filename = filename.replace(".json", "*") - return filename - - def _rewrite_output(self, statement): - statement = statement.replace("new_value", "generated") - statement = statement.replace("old_value", "expected") - return statement diff --git a/logprep/util/rule_dry_runner.py b/logprep/util/rule_dry_runner.py index 0c6782fd6..07e5e2514 100644 --- a/logprep/util/rule_dry_runner.py +++ b/logprep/util/rule_dry_runner.py @@ -41,14 +41,12 @@ from copy import deepcopy from difflib import ndiff from functools import cached_property +from typing import Dict, List from colorama import Back, Fore from ruamel.yaml import YAML from logprep.framework.pipeline import Pipeline, PipelineResult -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import ( - convert_extra_data_format, -) from logprep.util.configuration import Configuration from logprep.util.getter import GetterFactory from logprep.util.helper import color_print_line, color_print_title, recursive_compare @@ -56,6 +54,17 @@ yaml = YAML(typ="safe", pure=True) +def convert_extra_data_format(extra_outputs) -> List[Dict]: + """ + Converts the format of the extra data outputs such that it is a list of dicts, where the + output target is the key and the values are the actual outputs. 
+ """ + reformatted_extra_outputs = [] + for value, key in extra_outputs: + reformatted_extra_outputs.append({str(key): value}) + return reformatted_extra_outputs + + class DryRunner: """Used to run pipeline with given events and show changes made by processing.""" diff --git a/tests/unit/test_run_logprep.py b/tests/unit/test_run_logprep.py index 41c6eae28..b5b419a0e 100644 --- a/tests/unit/test_run_logprep.py +++ b/tests/unit/test_run_logprep.py @@ -58,18 +58,10 @@ def setup_method(self): "test dry-run tests/testdata/config/config.yml examples/exampledata/input_logdata/test_input.jsonl", "logprep.util.rule_dry_runner.DryRunner.run", ), - ( - "test integration tests/testdata/config/config.yml path/to/testset", - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", - ), ( "test dry-run tests/testdata/config/config.yml tests/testdata/config/config.yml asdfsdv", "logprep.util.rule_dry_runner.DryRunner.run", ), - ( - "test integration tests/testdata/config/config.yml tests/testdata/config/config.yml path/to/testset", - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", - ), ], ) def test_cli_commands_with_configs(self, command: str, target: str): @@ -85,7 +77,6 @@ def test_cli_commands_with_configs(self, command: str, target: str): ("test", "config"), ("test", "unit"), ("test", "dry-run", "input_data"), - ("test", "integration", "testdata"), ], ) def test_cli_invokes_default_config_location(self, command): @@ -271,14 +262,6 @@ def test_test_rules_starts_auto_rule_tester(self, mock_tester): logger = logging.getLogger() logger.disabled = False - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run") - def test_test_ruleset_starts_rule_corpus_tester(self, mock_tester): - config_path = "tests/testdata/config/config.yml" - test_data_path = "path/to/testset" - result = self.cli_runner.invoke(cli, ["test", "integration", config_path, test_data_path]) - assert result.exit_code == 0 - mock_tester.assert_called() - @mock.patch("logging.Logger.info") def test_run_logprep_logs_log_level(self, mock_info): config = Configuration.from_sources(("tests/testdata/config/config.yml",)) diff --git a/tests/unit/util/test_auto_rule_corpus_tester.py b/tests/unit/util/test_auto_rule_corpus_tester.py deleted file mode 100644 index b0576678d..000000000 --- a/tests/unit/util/test_auto_rule_corpus_tester.py +++ /dev/null @@ -1,511 +0,0 @@ -# pylint: disable=missing-docstring -# pylint: disable=protected-access -# pylint: disable=too-many-arguments -import json -import os -import re -from json import JSONDecodeError -from logging.config import dictConfig -from unittest import mock - -import pytest - -from logprep.abc.processor import ProcessorResult -from logprep.framework.pipeline import PipelineResult -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import RuleCorpusTester -from logprep.util.defaults import DEFAULT_LOG_CONFIG -from logprep.util.getter import GetterFactory - - -@pytest.fixture(name="corpus_tester") -def fixture_auto_rule_corpus_tester(): - config_path = ("tests/testdata/config/config.yml",) - data_dir = "will be overwritten in test cases" - corpus_tester = RuleCorpusTester(config_path, data_dir) - return corpus_tester - - -def write_test_case_data_tmp_files(test_data_dir, test_case_name, test_data): - input_data_path = test_data_dir / f"{test_case_name}_in.json" - input_data_path.write_text(json.dumps(test_data.get("input"))) - expected_output_data_path = test_data_dir / f"{test_case_name}_out.json" - 
expected_output_data_path.write_text(json.dumps(test_data.get("expected_output"))) - expected_extra_output_data_path = test_data_dir / f"{test_case_name}_out_extra.json" - expected_extra_output_data_path.write_text(json.dumps(test_data.get("expected_extra_output"))) - - -def prepare_corpus_tester(corpus_tester, tmp_path, test_data): - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - write_test_case_data_tmp_files(test_data_dir, "rule_auto_corpus_test", test_data) - corpus_tester._input_test_data_path = test_data_dir - corpus_tester._tmp_dir = tmp_path - - -class TestAutoRuleTester: - def setup_method(self): - dictConfig(DEFAULT_LOG_CONFIG) - - @pytest.mark.parametrize( - "test_case, test_data, mock_output, expected_prints, exit_code", - [ - ( - "One successful test", - { - "input": {"message": "A B"}, - "expected_output": {"message": "A B", "source": "A", "target": "B"}, - "expected_extra_output": [], - }, - None, - ["PASSED", "Success rate: 100.00%"], - 0, - ), - ( - "Unknown field in logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "unexpected values were generated", - "root['winlog']['event_data']['Test2']", - ], - 1, - ), - ( - "Failed test with changed value only", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 4}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "values differ between generated and expected output", - "- root['winlog']['event_data']['Test2']: {'generated': 2, 'expected': 4}", - ], - 1, - ), - ( - "One successful test with extra output", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [ - { - "({'kafka_output': 'pseudonyms'},)": { - "pseudonym": "", - "origin": "", - } - } - ], - }, - None, - ["PASSED", "Success rate: 100.00%"], - 0, - ), - ( - "Failed test with unexpected extra output", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "There is at least one generated extra output that is unexpected", - "Logprep Event Output", - "Logprep Extra Data Output", - "pseudonyms", - ], - 1, - ), - ( - "Failed test with expected extra output, not generated by logprep", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [ - { - "pseudonyms": { - "pseudonym": "", - "origin": "", - } - }, - {"some_random_extra": {"foo": "bar"}}, - ], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "There is at least one expected extra output missing", - "For the following extra output, no matching extra output was generated by", - "Logprep Event Output", - "Logprep Extra Data Output", - "pseudonyms", - ], - 1, - ), - ( - "Failed test with 
unexpected field generated by logprep", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1}}, - "test_normalized": {"test": {"field1": 1}}, - }, - [], - ], - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "expected items are missing in the generated logprep output", - "root['winlog']['event_data']['Test2']", - "root['test_normalized']['test']['field2']", - ], - 1, - ), - ( - "Successful test with ignored value in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": "SOME_RANDOM_CONTENT"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ( - "Failed test if key of is missing in generated logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "FAILED", - "expected items are missing in the generated logprep output", - "root['winlog']['event_data']", - "Success rate: 0.00%", - ], - 1, - ), - ( - "Successful test with optional key missing in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ( - "Successful test with optional key present in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": "something"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ], - ) - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_prints_expected_outputs_to_console( - self, - mock_exit, - tmp_path, - corpus_tester, - test_case, - test_data, - mock_output, - expected_prints, - exit_code, - capsys, - ): - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - if mock_output is not None: - with mock.patch( - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.Pipeline.process_pipeline" - ) as mock_process_pipeline: - mock_process_pipeline.return_value = PipelineResult( - results=[], - event=mock_output[0], - 
event_received=mock_output[0], - pipeline=[], - ) - mock_process_pipeline.return_value.results = [ - ProcessorResult(processor_name="test", data=test_data["expected_extra_output"]) - ] - corpus_tester.run() - else: - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output, test_case - mock_exit.assert_called_with(exit_code) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.parse_json") - def test_run_logs_json_decoding_error(self, mock_parse_json, tmp_path, corpus_tester): - test_data = {"input": {}, "expected_output": {}, "expected_extra_output": []} - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - mock_parse_json.side_effect = JSONDecodeError("Some Error", "in doc", 0) - with pytest.raises(ValueError, match="Following parsing errors were found"): - corpus_tester.run() - - def test_run_raises_if_case_misses_input_file(self, tmp_path, corpus_tester): - expected_output_data_path = tmp_path / "rule_auto_corpus_test_out.json" - expected_output_data_path.write_text('{"json":"file"}') - corpus_tester._input_test_data_path = tmp_path - with pytest.raises(ValueError, match="The following TestCases have no input document"): - corpus_tester.run() - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_skips_test_if_expected_output_is_missing( - self, mock_exit, tmp_path, corpus_tester, capsys - ): - test_data = { - "input": {"winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}}}, - "expected_output": {}, - "expected_extra_output": [], - } - expected_prints = [ - "SKIPPED", - "no expected output given", - "Total test cases: 1", - "Success rate: 100.00%", - ] - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - os.remove(tmp_path / "test_data" / "rule_auto_corpus_test_out.json") - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.shutil.rmtree") - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_removes_test_tmp_dir(self, _, mock_shutil, corpus_tester): - corpus_tester.run() - mock_shutil.assert_called_with(corpus_tester._tmp_dir) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_with_two_processors_that_have_different_extra_outputs( - self, mock_exit, tmp_path, capsys - ): - config_path = "tests/testdata/config/config.yml" - config = GetterFactory.from_string(config_path).get_yaml() - config["pipeline"].append( - { - "selective_extractor": { - "type": "selective_extractor", - "specific_rules": ["tests/testdata/unit/selective_extractor/rules/specific"], - "generic_rules": [], - } - } - ) - test_config_path = tmp_path / "test_config.yml" - test_config_path.write_text(json.dumps(config), encoding="utf8") - corpus_tester = RuleCorpusTester([str(test_config_path)], "") - test_data = { - "input": { - "message": "A B", - "field1": "field 1 value", - "winlog": {"event_id": "2222", "event_data": {"IpAddress": "1.2.3.4"}}, - }, - "expected_output": { - "message": "A B", - "source": "A", - "target": "B", - "field1": "field 1 value", - "winlog": {"event_id": "2222", "event_data": {"IpAddress": ""}}, - }, - "expected_extra_output": [ - {"({'kafka': 
'topic'},)": {"field1": "field 1 value"}}, - {"({'kafka': 'topic'},)": {"message": "something"}}, - { - "({'kafka_output': 'pseudonyms'},)": { - "origin": "", - "pseudonym": "", - } - }, - ], - } - expected_prints = [ - "PASSED", - "Total test cases: 1", - "Success rate: 100.00%", - ] - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_corpus_tests_dont_share_cache_between_runs_by_resetting_processors( - self, mock_exit, tmp_path, capsys - ): - test_case_data = { - "input": { - "winlog": {"event_id": "2222", "event_data": {"IpAddress": "1.2.3.4"}}, - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"IpAddress": ""}}, - }, - "expected_extra_output": [ - { - "({'kafka_output': 'pseudonyms'},)": { - "origin": "", - "pseudonym": "", - } - }, - ], - } - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - # run one test case two times to trigger the pseudonymizer cache. - # Without reinitializing the processors the second test wouldn't create an extra output, as - # the cache realizes it as an existing pseudonym already. - write_test_case_data_tmp_files(test_data_dir, "test_case_one", test_case_data) - write_test_case_data_tmp_files(test_data_dir, "test_case_two", test_case_data) - config_path = ["tests/testdata/config/config.yml"] - corpus_tester = RuleCorpusTester(config_path, test_data_dir) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - expected_prints = [ - "PASSED", - "Total test cases: 2", - "Success rate: 100.00%", - ] - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_warnings_are_printed_inside_the_detailed_reports(self, mock_exit, tmp_path, capsys): - test_case_data = { - "input": { - "field1": 2, - "field2": 2, - "new_field": "exists already", - }, - "expected_output": { - "field1": 2, - "field2": 2, - "new_field": "exists already", - }, - "expected_extra_output": [], - } - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - write_test_case_data_tmp_files(test_data_dir, "test_case_one", test_case_data) - config_path = ["tests/testdata/config/config.yml"] - corpus_tester = RuleCorpusTester(config_path, test_data_dir) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - warnings_inside_details_pattern = ( - r".*Test Cases Detailed Reports.*test_case_one.*" - r"Logprep Warnings.*FieldExistsWarning.*test_case_one.*" - r"Test Overview" - ) - assert re.match(warnings_inside_details_pattern, console_output, flags=re.DOTALL) - mock_exit.assert_called_with(1)
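
Note on the relocated helper: this patch does not drop convert_extra_data_format entirely; the function is inlined into logprep/util/rule_dry_runner.py so the dry runner no longer imports it from the removed corpus tester module. Below is a minimal sketch of the behavior shown in the rule_dry_runner.py hunk above, using hypothetical sample data for illustration only (the pseudonym values and output target are made up, the function body is taken from the hunk):

.. code-block:: python

    from typing import Dict, List

    def convert_extra_data_format(extra_outputs) -> List[Dict]:
        """Turn (value, output_target) pairs into [{str(output_target): value}, ...]."""
        reformatted_extra_outputs = []
        for value, key in extra_outputs:
            reformatted_extra_outputs.append({str(key): value})
        return reformatted_extra_outputs

    # Hypothetical extra output as a (value, output_target) tuple:
    extra_outputs = [({"pseudonym": "abc", "origin": "xyz"}, ({"kafka_output": "pseudonyms"},))]
    print(convert_extra_data_format(extra_outputs))
    # [{"({'kafka_output': 'pseudonyms'},)": {'pseudonym': 'abc', 'origin': 'xyz'}}]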