From c59242d0a058b3f6a3c589f4becf7653a1c6faca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Zimmermann?= <101292599+ekneg54@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:34:23 +0200 Subject: [PATCH] Remove autorulecorpustester (#665) * Remove autorulecorpustester --- CHANGELOG.md | 6 + doc/source/user_manual/testing_rules.rst | 1 - logprep/run_logprep.py | 18 - .../auto_rule_corpus_tester.py | 419 -------------- logprep/util/rule_dry_runner.py | 15 +- tests/unit/test_run_logprep.py | 17 - .../unit/util/test_auto_rule_corpus_tester.py | 511 ------------------ 7 files changed, 18 insertions(+), 969 deletions(-) delete mode 100644 logprep/util/auto_rule_tester/auto_rule_corpus_tester.py delete mode 100644 tests/unit/util/test_auto_rule_corpus_tester.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a125e2507..4f43db15a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,14 @@ ## next release ### Breaking + +* remove AutoRuleCorpusTester + ### Features ### Improvements + +* remove AutoRuleCorpusTester + ### Bugfix * ensure `logprep.abc.Component.Config` is immutable and can be applied multiple times diff --git a/doc/source/user_manual/testing_rules.rst b/doc/source/user_manual/testing_rules.rst index 117f99ab3..a2c98493a 100644 --- a/doc/source/user_manual/testing_rules.rst +++ b/doc/source/user_manual/testing_rules.rst @@ -3,7 +3,6 @@ Testing Rules .. automodule:: logprep.util.rule_dry_runner .. automodule:: logprep.util.auto_rule_tester.auto_rule_tester -.. automodule:: logprep.util.auto_rule_tester.auto_rule_corpus_tester Custom Tests diff --git a/logprep/run_logprep.py b/logprep/run_logprep.py index e4255d5b5..03542be38 100644 --- a/logprep/run_logprep.py +++ b/logprep/run_logprep.py @@ -13,7 +13,6 @@ from logprep.generator.http.controller import Controller from logprep.generator.kafka.run_load_tester import LoadTester from logprep.runner import Runner -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import RuleCorpusTester from logprep.util.auto_rule_tester.auto_rule_tester import AutoRuleTester from logprep.util.configuration import Configuration, InvalidConfigurationError from logprep.util.defaults import DEFAULT_LOG_CONFIG, EXITCODES @@ -160,23 +159,6 @@ def test_rules(configs: tuple[str]) -> None: tester.run() -@test.command( - short_help="Run the rule corpus tester against a given configuration", name="integration" -) -@click.argument("configs", nargs=-1, required=False) -@click.argument("testdata") -def test_ruleset(configs: tuple[str], testdata: str): - """Test the given ruleset against specified test data - - \b - CONFIG is a path to configuration file (filepath or URL). - TESTDATA is a path to a set of test files. - """ - _ = _get_configuration(configs) - tester = RuleCorpusTester(configs, testdata) - tester.run() - - @cli.group(short_help="Generate load for a running logprep instance") def generate(): """ diff --git a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py b/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py deleted file mode 100644 index 6e0692025..000000000 --- a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py +++ /dev/null @@ -1,419 +0,0 @@ -# pylint: disable=anomalous-backslash-in-string -""" -Rule Corpus Tests ------------------ - -The rule corpus tester can be used to test a full logprep pipeline and configuration against -a set of expected outputs. - -To start the tester call: - -.. 
code-block:: bash - :caption: Run rule corpus test - - logprep test integration $CONFIG $CORPUS_TEST_DATA - -Where in the parameter :code:`CONFIG` should point to a valid logprep configuration and -:code:`CORPUS_TEST_DATA` to a directory containing the test data with the different test cases. -The test cases can be organized into subdirectories. -Each test case should contain one input event (\*_in.json), one expected output event (\*_out.json) -and an expected extra outputs like predetections or pseudonyms (\*_out_extra.json). -The expected extra data is optional though, but if given, it is a single json file, where each -output has a root key of the expected target. -All files belonging to the same test case have to start with the same name, like the following -example: - -.. code-block:: bash - :caption: Test data setup - - - test_one_in.json - - test_one_out.json - - test_one_out_extra.json - - test_two_in.json - - test_two_out.json - -.. code-block:: json - :caption: Content of test_one_in.json - Logprep input - - { - "test": "event" - } - -.. code-block:: json - :caption: Content of test_one_out.json - Expected Logprep Output - - { - "processed": ["test", "event"] - "with": "" - } - -.. code-block:: json - :caption: Content of test_one_out_extra.json - Expected Logprep Extra Output - - [ - { - "predetection_target": { - "id": "..." - } - } - ] - -As sometimes test could have cases where you don't want to test for a specific value of a key it is -possible to test only for the key and ignore the value. -In order to achieve this just set a field in an expected output as :code:``, with that -the value won't be considered during the testing. -Furthermore, it is possible to set an entire field as optional with :code:``. -This way fields can be testet for their presents when they exist, and will be ignored when they do -not exist. -This can for example be the case for the geo ip enricher, which sometimes finds city information -about an ip and sometimes not. - -While executing the tests report print statements are collected which will be printed to the console -after the test run is completed. -During the run itself only a short summary is given for each case. - -If during the test run logprep has an error or warning it logs it to the console as well, which will -be printed inside the test cases summary and before the summary result of the test, which created -the log message. - -If one or more test cases fail this tester ends with an exit code of 1, otherwise 0. -""" -import itertools - -# pylint: enable=anomalous-backslash-in-string -# pylint: disable=protected-access -import json -import logging -import os -import re -import shutil -import sys -import tempfile -from functools import cached_property -from json import JSONDecodeError -from pathlib import Path -from pprint import pprint -from typing import Dict, List - -from attr import Factory, define, field, validators -from colorama import Fore, Style -from deepdiff import DeepDiff, grep - -from logprep.framework.pipeline import Pipeline, PipelineResult -from logprep.util.configuration import Configuration -from logprep.util.helper import get_dotted_field_value -from logprep.util.json_handling import parse_json - -logger = logging.getLogger("corpustester") - - -def convert_extra_data_format(extra_outputs) -> List[Dict]: - """ - Converts the format of the extra data outputs such that it is a list of dicts, where the - output target is the key and the values are the actual outputs. 
- """ - reformatted_extra_outputs = [] - for value, key in extra_outputs: - reformatted_extra_outputs.append({str(key): value}) - return reformatted_extra_outputs - - -class RuleCorpusTester: - """This class can test a rule corpus against expected outputs""" - - _tmp_dir: str - """ Temporary directory where test files will be saved temporarily """ - - _original_config_paths: tuple[str] - """ Path to the original configuration that should be tested """ - - _input_test_data_path: str - """ Path to the directory that contains the test data (in, out, extra_outs) """ - - _test_cases: dict - """ Dictionary that contains the test cases, their input data and their results """ - - @define(kw_only=True) - class TestCase: - input_document: dict = field(validator=validators.instance_of(dict), default={}) - expected_output: dict = field(validator=validators.instance_of(dict), default={}) - expected_extra_output: dict = field(validator=validators.instance_of(list), default=[]) - generated_output: dict = field(validator=validators.instance_of(dict), default={}) - generated_extra_output: dict = field(validator=validators.instance_of(list), default=[]) - failed: bool = field(validator=validators.instance_of(bool), default=False) - report: List = Factory(list) - warnings: str = field(default="") - - @cached_property - def _tmp_dir(self): - return tempfile.mkdtemp() - - @cached_property - def _test_cases(self): - file_paths = [] - for root, _, files in os.walk(self._input_test_data_path): - for filename in files: - file_paths.append(os.path.abspath(os.path.join(root, filename))) - test_cases = self._group_path_by_test_case(self._input_test_data_path, file_paths) - parsing_errors = [case.report for case in test_cases.values() if case.report] - if parsing_errors: - raise ValueError(f"Following parsing errors were found: {parsing_errors}") - no_input_files = [case for case in test_cases if not test_cases[case].input_document] - if no_input_files: - raise ValueError(f"The following TestCases have no input documents: {no_input_files}") - return dict(sorted(test_cases.items())) - - @cached_property - def _pipeline(self): - merged_input_file_path = Path(self._tmp_dir) / "input.json" - inputs = [test_case.input_document for test_case in self._test_cases.values()] - merged_input_file_path.write_text(json.dumps(inputs), encoding="utf8") - patched_config = Configuration() - patched_config.input = { - "patched_input": {"type": "json_input", "documents_path": str(merged_input_file_path)} - } - config = Configuration.from_sources(self._original_config_paths) - input_config = config.input - connector_name = list(input_config.keys())[0] - if "preprocessing" in input_config[connector_name]: - patched_config.input["patched_input"] |= { - "preprocessing": input_config[connector_name]["preprocessing"] - } - patched_config.pipeline = config.pipeline - pipeline = Pipeline(config=patched_config) - return pipeline - - def __init__(self, config_paths: tuple[str], input_test_data_path: str): - self._original_config_paths = config_paths - self._input_test_data_path = input_test_data_path - - def run(self): - """ - Starts the test routine by reading all input files, patching the logprep pipline, executing - the pipeline for each input event, comparing the generated output with the expected output - and printing out the test results. 
- """ - self._run_pipeline_per_test_case() - self._print_test_reports() - self._print_test_summary() - shutil.rmtree(self._tmp_dir) - if any(case.failed for case in self._test_cases.values()): - sys.exit(1) - else: - sys.exit(0) - - def _run_pipeline_per_test_case(self): - """ - For each test case the logprep connector files are rewritten (only the current test case - will be added to the input file), the pipline is run and the outputs are compared. - """ - print(Style.BRIGHT + "# Test Cases Summary:" + Style.RESET_ALL) - for test_case_id, test_case in self._test_cases.items(): - _ = [processor.setup() for processor in self._pipeline._pipeline] - result: PipelineResult = self._pipeline.process_pipeline() - parsed_event = result.event - extra_outputs = convert_extra_data_format(result.data) - test_case.generated_output = parsed_event - test_case.generated_extra_output = extra_outputs - test_case.warnings = result.warnings - self._compare_logprep_outputs(test_case_id, parsed_event) - self._compare_extra_data_output(test_case_id, extra_outputs) - self._print_pass_fail_statements(test_case_id) - - def _compare_logprep_outputs(self, test_case_id, logprep_output): - test_case = self._test_cases.get(test_case_id, {}) - if test_case.expected_output: - diff = self._compare_events(logprep_output, test_case.expected_output) - self._extract_print_statements_from_diff(test_case_id, diff) - - def _compare_extra_data_output(self, test_case_id, logprep_extra_outputs): - test_case = self._test_cases.get(test_case_id, {}) - prints = [] - if len(logprep_extra_outputs) > len(test_case.expected_extra_output): - prints.append( - f"{Fore.RED}There is at least one generated extra output that is unexpected" - ) - if len(logprep_extra_outputs) < len(test_case.expected_extra_output): - prints.append(f"{Fore.RED}There is at least one expected extra output missing") - for expected_extra_output in test_case.expected_extra_output: - expected_extra_output_key = list(expected_extra_output.keys())[0] - has_matching_output = self._has_matching_logprep_output( - expected_extra_output, - expected_extra_output_key, - logprep_extra_outputs, - ) - if not has_matching_output: - prints.append( - f"{Fore.RED}For the following extra output, " - "no matching extra output was generated by logprep", - ) - prints.append(expected_extra_output) - if prints: - self._test_cases[test_case_id].failed = True - self._test_cases[test_case_id].report.extend(prints) - - def _compare_events(self, generated, expected): - ignore_value_search_results = expected | grep("") - optional_keys_search_results = expected | grep("") - missing_keys = self._check_keys_of_ignored_values( - generated, ignore_value_search_results.get("matched_values") - ) - ignore_paths = [] - if "matched_values" in ignore_value_search_results: - path = list(ignore_value_search_results["matched_values"]) - ignore_paths.extend([re.escape(path) for path in path]) - if "matched_values" in optional_keys_search_results: - path = list(optional_keys_search_results["matched_values"]) - ignore_paths.extend([re.escape(path) for path in path]) - diff = DeepDiff( - expected, - generated, - ignore_order=True, - report_repetition=True, - exclude_regex_paths=ignore_paths, - ) - if missing_keys: - diff.update({"dictionary_item_removed": missing_keys}) - return diff - - def _extract_print_statements_from_diff(self, test_case_id, diff): - if not diff: - return - prints = [] - if "dictionary_item_removed" in diff: - prints.append( - f"{Fore.RED}Following expected items are missing in the generated 
logprep output:", - ) - for item in diff["dictionary_item_removed"]: - prints.append(f" - {item}") - if "dictionary_item_added" in diff: - prints.append(f"{Fore.RED}Following unexpected values were generated by logprep") - for item in diff["dictionary_item_added"]: - prints.append(f" - {item}") - if "values_changed" in diff: - prints.append( - f"{Fore.RED}Following values differ between generated and expected output", - ) - for key, value in diff["values_changed"].items(): - prints.append(f" - {key}: {self._rewrite_output(str(value))}") - if prints: - self._test_cases[test_case_id].failed = True - self._test_cases[test_case_id].report.extend(prints) - - def _has_matching_logprep_output( - self, expected_extra_output, expected_extra_output_key, logprep_extra_outputs - ): - """ - Iterate over all logprep extra outputs and search for an output that matches the - expected output - """ - has_matching_output = False - for logprep_extra_output in logprep_extra_outputs: - logprep_extra_output_key = list(logprep_extra_output.keys())[0] - if expected_extra_output_key == logprep_extra_output_key: - diff = self._compare_events( - logprep_extra_output[logprep_extra_output_key], - expected_extra_output[expected_extra_output_key], - ) - if diff is not None: - has_matching_output = True - return has_matching_output - - def _print_pass_fail_statements(self, test_case_id): - test_case = self._test_cases.get(test_case_id, {}) - status = f"{Style.BRIGHT}{Fore.GREEN} PASSED" - if not test_case.expected_output: - status = f"{Style.BRIGHT}{Fore.RESET} SKIPPED - (no expected output given)" - elif len(test_case.report) > 0: - status = f"{Style.BRIGHT}{Fore.RED} FAILED" - elif test_case.warnings: - status = f"{Style.BRIGHT}{Fore.YELLOW} PASSED - (with warnings)" - print(f"{Fore.BLUE} Test Case: {Fore.CYAN}{test_case_id} {status}{Style.RESET_ALL}") - - def _print_test_reports(self): - if not any(case.failed for case in self._test_cases.values()): - return - print(Style.BRIGHT + "# Test Cases Detailed Reports:" + Style.RESET_ALL) - for test_case_id, test_case in self._test_cases.items(): - if (test_case.warnings or test_case.report) and test_case.expected_output: - self._print_long_test_result(test_case_id, test_case) - print() - - def _print_long_test_result(self, test_case_id, test_case): - report_title = f"test report for '{test_case_id}'" - print(f"{Fore.RED}{Style.BRIGHT}↓ {report_title} ↓ {Style.RESET_ALL}") - print_logprep_output = True - if test_case.warnings and not test_case.report: - print(Fore.GREEN + "Test passed, but with following warnings:" + Fore.RESET) - print(test_case.warnings) - print_logprep_output = False - if test_case.warnings and test_case.report: - print(Fore.RED + "Logprep Warnings:" + Fore.RESET) - for warning in test_case.warnings: - print(warning) - for statement in test_case.report: - if isinstance(statement, (dict, list)): - pprint(statement) - else: - print(statement) - if print_logprep_output: - print(Fore.RED + "Logprep Event Output:" + Fore.RESET) - pprint(test_case.generated_output) - print(Fore.RED + "Logprep Extra Data Output:" + Fore.RESET) - pprint(test_case.generated_extra_output) - print(f"{Fore.RED}{Style.BRIGHT}↑ {report_title} ↑ {Style.RESET_ALL}") - - def _print_test_summary(self): - print(Fore.RESET + Style.BRIGHT + "# Test Overview" + Style.RESET_ALL) - total_cases = len(self._test_cases) - failed_cases = sum(case.failed for case in self._test_cases.values()) - print(f"Failed tests: {failed_cases}") - print(f"Total test cases: {total_cases}") - if total_cases: - 
success_rate = (total_cases - failed_cases) / total_cases * 100 - print(f"Success rate: {success_rate:.2f}%") - - def _check_keys_of_ignored_values(self, logprep_output, field_paths) -> list: - if not field_paths: - return [] - missing_keys = [] - for path in field_paths: - dotted_path = ".".join(re.findall(r"\['([^'|.*]*)'\]", path)) - field_value = get_dotted_field_value(logprep_output, dotted_path) - if field_value is None: - missing_keys.append(path) - return missing_keys - - def _group_path_by_test_case(self, data_directory, file_paths): - test_cases = {} - for filename in file_paths: - test_case_id = self._strip_input_file_type(filename) - if test_case_id not in test_cases: - test_cases[test_case_id] = self.TestCase() - document = [{}] - try: - document = parse_json(os.path.join(data_directory, filename)) - except JSONDecodeError as error: - test_cases[test_case_id].failed = True - error_print = f"Json-Error decoding file {filename}: {error}" - test_cases[test_case_id].report.append(error_print) - if "_in.json" in filename: - test_cases[test_case_id].input_document = document[0] - if "_out.json" in filename: - test_cases[test_case_id].expected_output = document[0] - if "_out_extra.json" in filename: - test_cases[test_case_id].expected_extra_output = document - return test_cases - - def _strip_input_file_type(self, filename): - """Remove the input file suffix to identify the case name""" - filename = filename.replace("_in", "") - filename = filename.replace("_out_extra", "") - filename = filename.replace("_out", "") - filename = filename.replace(".json", "*") - return filename - - def _rewrite_output(self, statement): - statement = statement.replace("new_value", "generated") - statement = statement.replace("old_value", "expected") - return statement diff --git a/logprep/util/rule_dry_runner.py b/logprep/util/rule_dry_runner.py index 0c6782fd6..07e5e2514 100644 --- a/logprep/util/rule_dry_runner.py +++ b/logprep/util/rule_dry_runner.py @@ -41,14 +41,12 @@ from copy import deepcopy from difflib import ndiff from functools import cached_property +from typing import Dict, List from colorama import Back, Fore from ruamel.yaml import YAML from logprep.framework.pipeline import Pipeline, PipelineResult -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import ( - convert_extra_data_format, -) from logprep.util.configuration import Configuration from logprep.util.getter import GetterFactory from logprep.util.helper import color_print_line, color_print_title, recursive_compare @@ -56,6 +54,17 @@ yaml = YAML(typ="safe", pure=True) +def convert_extra_data_format(extra_outputs) -> List[Dict]: + """ + Converts the format of the extra data outputs such that it is a list of dicts, where the + output target is the key and the values are the actual outputs. 
+ """ + reformatted_extra_outputs = [] + for value, key in extra_outputs: + reformatted_extra_outputs.append({str(key): value}) + return reformatted_extra_outputs + + class DryRunner: """Used to run pipeline with given events and show changes made by processing.""" diff --git a/tests/unit/test_run_logprep.py b/tests/unit/test_run_logprep.py index 41c6eae28..b5b419a0e 100644 --- a/tests/unit/test_run_logprep.py +++ b/tests/unit/test_run_logprep.py @@ -58,18 +58,10 @@ def setup_method(self): "test dry-run tests/testdata/config/config.yml examples/exampledata/input_logdata/test_input.jsonl", "logprep.util.rule_dry_runner.DryRunner.run", ), - ( - "test integration tests/testdata/config/config.yml path/to/testset", - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", - ), ( "test dry-run tests/testdata/config/config.yml tests/testdata/config/config.yml asdfsdv", "logprep.util.rule_dry_runner.DryRunner.run", ), - ( - "test integration tests/testdata/config/config.yml tests/testdata/config/config.yml path/to/testset", - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", - ), ], ) def test_cli_commands_with_configs(self, command: str, target: str): @@ -85,7 +77,6 @@ def test_cli_commands_with_configs(self, command: str, target: str): ("test", "config"), ("test", "unit"), ("test", "dry-run", "input_data"), - ("test", "integration", "testdata"), ], ) def test_cli_invokes_default_config_location(self, command): @@ -271,14 +262,6 @@ def test_test_rules_starts_auto_rule_tester(self, mock_tester): logger = logging.getLogger() logger.disabled = False - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run") - def test_test_ruleset_starts_rule_corpus_tester(self, mock_tester): - config_path = "tests/testdata/config/config.yml" - test_data_path = "path/to/testset" - result = self.cli_runner.invoke(cli, ["test", "integration", config_path, test_data_path]) - assert result.exit_code == 0 - mock_tester.assert_called() - @mock.patch("logging.Logger.info") def test_run_logprep_logs_log_level(self, mock_info): config = Configuration.from_sources(("tests/testdata/config/config.yml",)) diff --git a/tests/unit/util/test_auto_rule_corpus_tester.py b/tests/unit/util/test_auto_rule_corpus_tester.py deleted file mode 100644 index b0576678d..000000000 --- a/tests/unit/util/test_auto_rule_corpus_tester.py +++ /dev/null @@ -1,511 +0,0 @@ -# pylint: disable=missing-docstring -# pylint: disable=protected-access -# pylint: disable=too-many-arguments -import json -import os -import re -from json import JSONDecodeError -from logging.config import dictConfig -from unittest import mock - -import pytest - -from logprep.abc.processor import ProcessorResult -from logprep.framework.pipeline import PipelineResult -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import RuleCorpusTester -from logprep.util.defaults import DEFAULT_LOG_CONFIG -from logprep.util.getter import GetterFactory - - -@pytest.fixture(name="corpus_tester") -def fixture_auto_rule_corpus_tester(): - config_path = ("tests/testdata/config/config.yml",) - data_dir = "will be overwritten in test cases" - corpus_tester = RuleCorpusTester(config_path, data_dir) - return corpus_tester - - -def write_test_case_data_tmp_files(test_data_dir, test_case_name, test_data): - input_data_path = test_data_dir / f"{test_case_name}_in.json" - input_data_path.write_text(json.dumps(test_data.get("input"))) - expected_output_data_path = test_data_dir / f"{test_case_name}_out.json" - 
expected_output_data_path.write_text(json.dumps(test_data.get("expected_output"))) - expected_extra_output_data_path = test_data_dir / f"{test_case_name}_out_extra.json" - expected_extra_output_data_path.write_text(json.dumps(test_data.get("expected_extra_output"))) - - -def prepare_corpus_tester(corpus_tester, tmp_path, test_data): - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - write_test_case_data_tmp_files(test_data_dir, "rule_auto_corpus_test", test_data) - corpus_tester._input_test_data_path = test_data_dir - corpus_tester._tmp_dir = tmp_path - - -class TestAutoRuleTester: - def setup_method(self): - dictConfig(DEFAULT_LOG_CONFIG) - - @pytest.mark.parametrize( - "test_case, test_data, mock_output, expected_prints, exit_code", - [ - ( - "One successful test", - { - "input": {"message": "A B"}, - "expected_output": {"message": "A B", "source": "A", "target": "B"}, - "expected_extra_output": [], - }, - None, - ["PASSED", "Success rate: 100.00%"], - 0, - ), - ( - "Unknown field in logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "unexpected values were generated", - "root['winlog']['event_data']['Test2']", - ], - 1, - ), - ( - "Failed test with changed value only", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 4}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "values differ between generated and expected output", - "- root['winlog']['event_data']['Test2']: {'generated': 2, 'expected': 4}", - ], - 1, - ), - ( - "One successful test with extra output", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [ - { - "({'kafka_output': 'pseudonyms'},)": { - "pseudonym": "", - "origin": "", - } - } - ], - }, - None, - ["PASSED", "Success rate: 100.00%"], - 0, - ), - ( - "Failed test with unexpected extra output", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "There is at least one generated extra output that is unexpected", - "Logprep Event Output", - "Logprep Extra Data Output", - "pseudonyms", - ], - 1, - ), - ( - "Failed test with expected extra output, not generated by logprep", - { - "input": {"winlog": {"event_data": {"IpAddress": "1.2.3.4"}}}, - "expected_output": {"winlog": {"event_data": {"IpAddress": ""}}}, - "expected_extra_output": [ - { - "pseudonyms": { - "pseudonym": "", - "origin": "", - } - }, - {"some_random_extra": {"foo": "bar"}}, - ], - }, - None, - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "There is at least one expected extra output missing", - "For the following extra output, no matching extra output was generated by", - "Logprep Event Output", - "Logprep Extra Data Output", - "pseudonyms", - ], - 1, - ), - ( - "Failed test with 
unexpected field generated by logprep", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1}}, - "test_normalized": {"test": {"field1": 1}}, - }, - [], - ], - [ - "FAILED", - "Success rate: 0.00%", - "Detailed Reports", - "expected items are missing in the generated logprep output", - "root['winlog']['event_data']['Test2']", - "root['test_normalized']['test']['field2']", - ], - 1, - ), - ( - "Successful test with ignored value in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": "SOME_RANDOM_CONTENT"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ( - "Failed test if key of is missing in generated logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "FAILED", - "expected items are missing in the generated logprep output", - "root['winlog']['event_data']", - "Success rate: 0.00%", - ], - 1, - ), - ( - "Successful test with optional key missing in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ( - "Successful test with optional key present in generated by logprep output", - { - "input": { - "winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}} - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": ""}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - "expected_extra_output": [], - }, - [ - { - "winlog": {"event_id": "2222", "event_data": "something"}, - "test_normalized": {"test": {"field1": 1, "field2": 2}}, - }, - [], - ], - [ - "PASSED", - "Success rate: 100.00%", - ], - 0, - ), - ], - ) - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_prints_expected_outputs_to_console( - self, - mock_exit, - tmp_path, - corpus_tester, - test_case, - test_data, - mock_output, - expected_prints, - exit_code, - capsys, - ): - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - if mock_output is not None: - with mock.patch( - "logprep.util.auto_rule_tester.auto_rule_corpus_tester.Pipeline.process_pipeline" - ) as mock_process_pipeline: - mock_process_pipeline.return_value = PipelineResult( - results=[], - event=mock_output[0], - 
event_received=mock_output[0], - pipeline=[], - ) - mock_process_pipeline.return_value.results = [ - ProcessorResult(processor_name="test", data=test_data["expected_extra_output"]) - ] - corpus_tester.run() - else: - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output, test_case - mock_exit.assert_called_with(exit_code) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.parse_json") - def test_run_logs_json_decoding_error(self, mock_parse_json, tmp_path, corpus_tester): - test_data = {"input": {}, "expected_output": {}, "expected_extra_output": []} - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - mock_parse_json.side_effect = JSONDecodeError("Some Error", "in doc", 0) - with pytest.raises(ValueError, match="Following parsing errors were found"): - corpus_tester.run() - - def test_run_raises_if_case_misses_input_file(self, tmp_path, corpus_tester): - expected_output_data_path = tmp_path / "rule_auto_corpus_test_out.json" - expected_output_data_path.write_text('{"json":"file"}') - corpus_tester._input_test_data_path = tmp_path - with pytest.raises(ValueError, match="The following TestCases have no input document"): - corpus_tester.run() - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_skips_test_if_expected_output_is_missing( - self, mock_exit, tmp_path, corpus_tester, capsys - ): - test_data = { - "input": {"winlog": {"event_id": "2222", "event_data": {"Test1": 1, "Test2": 2}}}, - "expected_output": {}, - "expected_extra_output": [], - } - expected_prints = [ - "SKIPPED", - "no expected output given", - "Total test cases: 1", - "Success rate: 100.00%", - ] - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - os.remove(tmp_path / "test_data" / "rule_auto_corpus_test_out.json") - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.shutil.rmtree") - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_removes_test_tmp_dir(self, _, mock_shutil, corpus_tester): - corpus_tester.run() - mock_shutil.assert_called_with(corpus_tester._tmp_dir) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_run_with_two_processors_that_have_different_extra_outputs( - self, mock_exit, tmp_path, capsys - ): - config_path = "tests/testdata/config/config.yml" - config = GetterFactory.from_string(config_path).get_yaml() - config["pipeline"].append( - { - "selective_extractor": { - "type": "selective_extractor", - "specific_rules": ["tests/testdata/unit/selective_extractor/rules/specific"], - "generic_rules": [], - } - } - ) - test_config_path = tmp_path / "test_config.yml" - test_config_path.write_text(json.dumps(config), encoding="utf8") - corpus_tester = RuleCorpusTester([str(test_config_path)], "") - test_data = { - "input": { - "message": "A B", - "field1": "field 1 value", - "winlog": {"event_id": "2222", "event_data": {"IpAddress": "1.2.3.4"}}, - }, - "expected_output": { - "message": "A B", - "source": "A", - "target": "B", - "field1": "field 1 value", - "winlog": {"event_id": "2222", "event_data": {"IpAddress": ""}}, - }, - "expected_extra_output": [ - {"({'kafka': 
'topic'},)": {"field1": "field 1 value"}}, - {"({'kafka': 'topic'},)": {"message": "something"}}, - { - "({'kafka_output': 'pseudonyms'},)": { - "origin": "", - "pseudonym": "", - } - }, - ], - } - expected_prints = [ - "PASSED", - "Total test cases: 1", - "Success rate: 100.00%", - ] - prepare_corpus_tester(corpus_tester, tmp_path, test_data) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_corpus_tests_dont_share_cache_between_runs_by_resetting_processors( - self, mock_exit, tmp_path, capsys - ): - test_case_data = { - "input": { - "winlog": {"event_id": "2222", "event_data": {"IpAddress": "1.2.3.4"}}, - }, - "expected_output": { - "winlog": {"event_id": "2222", "event_data": {"IpAddress": ""}}, - }, - "expected_extra_output": [ - { - "({'kafka_output': 'pseudonyms'},)": { - "origin": "", - "pseudonym": "", - } - }, - ], - } - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - # run one test case two times to trigger the pseudonymizer cache. - # Without reinitializing the processors the second test wouldn't create an extra output, as - # the cache realizes it as an existing pseudonym already. - write_test_case_data_tmp_files(test_data_dir, "test_case_one", test_case_data) - write_test_case_data_tmp_files(test_data_dir, "test_case_two", test_case_data) - config_path = ["tests/testdata/config/config.yml"] - corpus_tester = RuleCorpusTester(config_path, test_data_dir) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - expected_prints = [ - "PASSED", - "Total test cases: 2", - "Success rate: 100.00%", - ] - for expected_print in expected_prints: - assert expected_print in console_output - mock_exit.assert_called_with(0) - - @mock.patch("logprep.util.auto_rule_tester.auto_rule_corpus_tester.sys.exit") - def test_warnings_are_printed_inside_the_detailed_reports(self, mock_exit, tmp_path, capsys): - test_case_data = { - "input": { - "field1": 2, - "field2": 2, - "new_field": "exists already", - }, - "expected_output": { - "field1": 2, - "field2": 2, - "new_field": "exists already", - }, - "expected_extra_output": [], - } - test_data_dir = tmp_path / "test_data" - os.makedirs(test_data_dir, exist_ok=True) - write_test_case_data_tmp_files(test_data_dir, "test_case_one", test_case_data) - config_path = ["tests/testdata/config/config.yml"] - corpus_tester = RuleCorpusTester(config_path, test_data_dir) - corpus_tester.run() - console_output, console_error = capsys.readouterr() - assert console_error == "" - warnings_inside_details_pattern = ( - r".*Test Cases Detailed Reports.*test_case_one.*" - r"Logprep Warnings.*FieldExistsWarning.*test_case_one.*" - r"Test Overview" - ) - assert re.match(warnings_inside_details_pattern, console_output, flags=re.DOTALL) - mock_exit.assert_called_with(1)
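
Note on the relocated helper: this patch does not drop convert_extra_data_format entirely; the function is inlined into logprep/util/rule_dry_runner.py so the dry runner no longer imports it from the removed corpus tester module. Below is a minimal sketch of the behavior shown in the rule_dry_runner.py hunk above, using hypothetical sample data for illustration only (the pseudonym values and output target are made up, the function body is taken from the hunk):

.. code-block:: python

    from typing import Dict, List

    def convert_extra_data_format(extra_outputs) -> List[Dict]:
        """Turn (value, output_target) pairs into [{str(output_target): value}, ...]."""
        reformatted_extra_outputs = []
        for value, key in extra_outputs:
            reformatted_extra_outputs.append({str(key): value})
        return reformatted_extra_outputs

    # Hypothetical extra output as a (value, output_target) tuple:
    extra_outputs = [({"pseudonym": "abc", "origin": "xyz"}, ({"kafka_output": "pseudonyms"},))]
    print(convert_extra_data_format(extra_outputs))
    # [{"({'kafka_output': 'pseudonyms'},)": {'pseudonym': 'abc', 'origin': 'xyz'}}]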