From 3601d6d4b1a1ead0cb9d8aa6ce50f22b780de662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Zimmermann?= <101292599+ekneg54@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:56:15 +0100 Subject: [PATCH] config from multiple sources (#507) * remove schema and rule checker * add defaults module to store all logprep defaults * refactor get_versions_string and DEFAULT_LOCATION_CONFIG and move them to util module * write tests for get_versions_string which was only tested implicit * remove rule validation * reimplement logprep.util.configuration module * reimplement logprep.runner module * add reload method to configuration * add commandline option to print config as json or yaml * add reload successful an failure metric to runner * implement configuration equality -> equal version == equal configuration * remove MultiprocessingPipeline * update changelog * update architecture visualizations * move exception handling to configuration module to handle exceptions where they occur, the exception handling of the exceptions where moved to the util/configuration module. In `run_logprep` only the InvalidConfiguration has to be handled. The Configuration only raises the `InvalidConfigurationError` exception * add LogprepException and ensure InvalidConfigurationErrors adds errors only once --------- Co-authored-by: dtrai2 Co-authored-by: djkhl --- CHANGELOG.md | 10 +- README.md | 31 +- .../diagramms/logprep_start.drawio | 252 +-- .../diagramms/logprep_start.drawio.html | 2 +- .../diagramms/multiprocessing.drawio | 216 ++- .../diagramms/multiprocessing.drawio.html | 2 +- .../diagramms/pipelineManager.drawio | 132 +- .../diagramms/pipelineManager.drawio.html | 2 +- doc/source/development/architecture/index.rst | 7 +- doc/source/getting_started.rst | 4 +- .../configuration/configurationdata.rst | 90 - .../user_manual/configuration/index.rst | 9 +- .../user_manual/configuration/logprep.rst | 80 - doc/source/user_manual/execution.rst | 4 +- doc/source/user_manual/index.rst | 1 - doc/source/user_manual/validation.rst | 19 - doc/source/user_manual/verification.rst | 6 +- logprep/_version.py | 6 +- logprep/abc/exceptions.py | 8 + logprep/abc/getter.py | 1 + logprep/abc/input.py | 5 +- logprep/abc/output.py | 5 +- logprep/configuration.py | 8 +- logprep/factory.py | 8 +- logprep/factory_error.py | 8 +- logprep/framework/pipeline.py | 123 +- logprep/framework/pipeline_manager.py | 67 +- .../framework/rule_tree/demorgan_resolver.py | 10 +- logprep/framework/rule_tree/rule_parser.py | 13 +- logprep/framework/rule_tree/rule_segmenter.py | 3 +- logprep/framework/rule_tree/rule_sorter.py | 7 +- logprep/metrics/exporter.py | 12 +- logprep/metrics/metrics.py | 9 +- logprep/processor/base/exceptions.py | 5 +- logprep/processor/labeler/processor.py | 5 +- logprep/processor/pseudonymizer/processor.py | 28 +- logprep/run_logprep.py | 145 +- logprep/runner.py | 295 +-- .../auto_rule_corpus_tester.py | 24 +- .../util/auto_rule_tester/auto_rule_tester.py | 6 +- logprep/util/configuration.py | 757 ++++---- logprep/util/defaults.py | 2 + logprep/util/getter.py | 4 +- logprep/util/helper.py | 28 +- logprep/util/json_handling.py | 18 - logprep/util/pipeline_profiler.py | 2 +- logprep/util/rule_dry_runner.py | 40 +- logprep/util/schema_and_rule_checker.py | 216 --- logprep/util/time.py | 5 +- .../exampledata/config/dummy-output.yml | 6 + tests/acceptance/test_amides.py | 15 +- tests/acceptance/test_config_refresh.py | 23 +- tests/acceptance/test_file_input.py | 26 +- tests/acceptance/test_full_configuration.py | 67 +- 
tests/acceptance/test_http_input.py | 31 +- tests/acceptance/test_multiple_outputs.py | 156 +- tests/acceptance/test_pre_detection.py | 14 +- tests/acceptance/test_preprocessing.py | 15 +- ..._selective_extractor_full_pipeline_pass.py | 22 +- .../acceptance/test_wineventlog_processing.py | 31 +- .../test_wineventlog_pseudonymization.py | 17 +- tests/acceptance/util.py | 20 +- .../rules/specific/template_replacer.json | 12 +- tests/testdata/config/config-auto-tests.yml | 13 +- tests/testdata/config/config-only-output.yml | 3 + tests/testdata/config/config.yml | 10 + tests/testdata/config/config2.yml | 2 +- tests/testdata/metadata.py | 3 +- tests/unit/framework/test_pipeline.py | 207 +-- tests/unit/framework/test_pipeline_manager.py | 206 +-- tests/unit/metrics/test_exporter.py | 15 +- tests/unit/metrics/test_metrics.py | 22 + tests/unit/processor/labeler/test_labeler.py | 5 +- .../pseudonymizer/test_pseudonymizer.py | 3 +- tests/unit/processor/test_process.py | 19 +- tests/unit/test_quickstart.py | 8 +- tests/unit/test_run_logprep.py | 142 +- tests/unit/test_runner.py | 580 +++--- tests/unit/util/test_auto_rule_tester.py | 3 +- tests/unit/util/test_configuration.py | 1595 ++++++++++------- tests/unit/util/test_getter.py | 1 - tests/unit/util/test_helper.py | 52 +- tests/unit/util/test_rule_dry_runner.py | 42 +- .../unit/util/test_schema_and_rule_checker.py | 9 - versioneer.py | 8 +- 85 files changed, 3034 insertions(+), 3109 deletions(-) delete mode 100644 doc/source/user_manual/configuration/configurationdata.rst delete mode 100644 doc/source/user_manual/configuration/logprep.rst delete mode 100644 doc/source/user_manual/validation.rst create mode 100644 logprep/abc/exceptions.py create mode 100644 logprep/util/defaults.py delete mode 100644 logprep/util/schema_and_rule_checker.py create mode 100644 quickstart/exampledata/config/dummy-output.yml create mode 100644 tests/testdata/config/config-only-output.yml delete mode 100644 tests/unit/util/test_schema_and_rule_checker.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c675b8312..ce358774f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,16 @@ ### Breaking * reimplement the logprep CLI, see `logprep --help` for more information. 
+* remove feature to reload configuration by sending signal `SIGUSR1` +* remove feature to validate rules because it is already included in `logprep test config` ### Features - * add a `number_of_successful_writes` metric to the s3 connector, which counts how many events were successfully written to s3 * make the s3 connector work with the new `_write_backlog` method introduced by the `confluent_kafka` commit bugfix in v9.0.0 * add option to Opensearch Output Connector to use parallel bulk implementation (default is True) - +* add feature to logprep to load config from multiple sources (files or uris) +* add feature to logprep to print the resulting configruation with `logprep print json|yaml ` in json or yaml ### Improvements @@ -20,6 +22,10 @@ * make the s3 connector blocking by removing threading * revert the change from v9.0.0 to always check the existence of a field for negated key-value based lucene filter expressions * make store_custom in s3, opensearch and elasticsearch connector not call `batch_finished_callback` to prevent data loss that could be caused by partially processed events +* remove the `schema_and_rule_checker` module +* rewrite Logprep Configuration object see documentation for more details +* rewrite Runner +* delete MultiProcessingPipeline class to simplify multiprocesing ### Bugfix diff --git a/README.md b/README.md index 3934ed8f0..f364effc4 100644 --- a/README.md +++ b/README.md @@ -289,13 +289,13 @@ Depending on how you have installed Logprep you have different choices to run Lo If you have installed it via PyPI or the Github Development release just run: ``` -logprep $CONFIG +logprep run $CONFIG ``` If you have installed Logprep via cloning the repository then you should run it via: ``` -PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG +PYTHONPATH="." python3 logprep/run_logprep.py run $CONFIG ``` Where `$CONFIG` is the path or uri to a configuration file (see the documentation about the @@ -307,37 +307,12 @@ The next sections all assume an installation via pip The following command can be executed to verify the configuration file without having to run Logprep: ``` -logprep --verify-config $CONFIG +logprep test config $CONFIG ``` Where `$CONFIG` is the path or uri to a configuration file (see the documentation about the [configuration](https://logprep.readthedocs.io/en/latest/user_manual/configuration/index.html)). -### Validating Labeling-Schema and Rules - -The following command can be executed to validate the schema and the rules: - -``` -logprep --validate-rules $CONFIG -``` - -Where `$CONFIG` is the path or uri to a configuration file (see the documentation about the -[configuration](https://logprep.readthedocs.io/en/latest/user_manual/configuration/index.html)). - -Alternatively, the validation can be performed directly. Assuming you have cloned the repository -from git. - -``` -PYTHONPATH="." python3 logprep/util/schema_and_rule_checker.py --labeling-schema $LABELING_SCHEMA --labeling-rules $LABELING_RULES -``` - -Where `$LABELING_SCHEMA` is the path to a labeling-schema (JSON file) and `$LABELING_RULES` is -the path to a directory with rule files (JSON/YML files, see Rules.md, subdirectories -are permitted) - -Analogously, `--normalization-rules` and `--pseudonymizer-rules` can be used. - -Validation does also perform a verification of the pipeline section of the Logprep configuration. 
### Reload the Configuration
diff --git a/doc/source/development/architecture/diagramms/logprep_start.drawio b/doc/source/development/architecture/diagramms/logprep_start.drawio
index 88aaec243..e21739756 100644
[updated drawio XML of the logprep_start architecture diagram; markup omitted]
diff --git a/doc/source/development/architecture/diagramms/logprep_start.drawio.html b/doc/source/development/architecture/diagramms/logprep_start.drawio.html
index 31248c6ef..24535751a 100644
[updated HTML export of the logprep_start diagram; markup omitted]
diff --git a/doc/source/development/architecture/diagramms/multiprocessing.drawio b/doc/source/development/architecture/diagramms/multiprocessing.drawio
index 04018a05e..68ff14097 100644
[updated drawio XML of the multiprocessing architecture diagram; markup omitted]
diff --git a/doc/source/development/architecture/diagramms/multiprocessing.drawio.html b/doc/source/development/architecture/diagramms/multiprocessing.drawio.html
index 1dbcfb3b0..438bec68e 100644
[updated HTML export of the multiprocessing diagram; markup omitted]
diff --git a/doc/source/development/architecture/diagramms/pipelineManager.drawio b/doc/source/development/architecture/diagramms/pipelineManager.drawio
index 1a97026f3..aad671d74 100644
[updated drawio XML of the pipelineManager architecture diagram; markup omitted]
diff --git a/doc/source/development/architecture/diagramms/pipelineManager.drawio.html b/doc/source/development/architecture/diagramms/pipelineManager.drawio.html
index 3d2178d6c..1f6f953f7 100644
[updated HTML export of the pipelineManager diagram; markup omitted]
\ No newline at end of file diff --git a/doc/source/development/architecture/index.rst b/doc/source/development/architecture/index.rst index 469a77069..51a622bb9 100644 --- a/doc/source/development/architecture/index.rst +++ b/doc/source/development/architecture/index.rst @@ -93,10 +93,11 @@ The following diagrams illustrate the flow of a single event to make it more com :file: ../../development/architecture/diagramms/event.drawio.html -Shared ressources within Multiprocessing -======================================== +Multiprocessing +=============== -This diagram shows what ressources are shared within the multiprocessing processes. +This diagram shows what ressources are shared within the multiprocessing processes and how the +processes are started and stopped. .. raw:: html :file: ../../development/architecture/diagramms/multiprocessing.drawio.html diff --git a/doc/source/getting_started.rst b/doc/source/getting_started.rst index 5f5a63afe..cd1629aea 100644 --- a/doc/source/getting_started.rst +++ b/doc/source/getting_started.rst @@ -62,13 +62,13 @@ If you have installed it via PyPI or the Github Development release just run: .. code-block:: bash - logprep $CONFIG + logprep run $CONFIG If you have installed Logprep via cloning the repository then you should run it via: .. code-block:: bash - PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG + PYTHONPATH="." python3 logprep/run_logprep.py run $CONFIG Where :code:`$CONFIG` is the path to a configuration file. For more information see the :ref:`configuration` section. diff --git a/doc/source/user_manual/configuration/configurationdata.rst b/doc/source/user_manual/configuration/configurationdata.rst deleted file mode 100644 index d83444e2b..000000000 --- a/doc/source/user_manual/configuration/configurationdata.rst +++ /dev/null @@ -1,90 +0,0 @@ -Configuration File -================== - -Configuration is done via a YAML-File. -Logprep searches for the file :code:`/etc/logprep/pipeline.yml` if no configuration file is passed. -You can pass a different configuration file via a valid file path or url. - -.. code-block:: bash - - logprep /different/path/file.yml - -or - -.. code-block:: bash - - logprep http://url-to-our-yaml-file-or-api - - -The options under :code:`input`, :code:`output` and :code:`pipeline` are passed to factories in Logprep. -They contain settings for each separate processor and connector. -Details for configuring connectors are described in :ref:`output` and :ref:`input` and for processors in :ref:`processors` . -General information about the configuration of the pipeline can be found in :ref:`pipeline_config` . - -It is possible to use environment variables in all configuration and rules files in all places. -Environment variables have to be set in uppercase and prefixed -with :code:`LOGPREP_`, :code:`GITHUB_`, :code:`PYTEST_` or :code:`CI_`. Lowercase variables are ignored. Forbidden -variable names are: :code:`["LOGPREP_LIST"]` - -The following config file will be valid by setting the given environment variables: - -.. code-block:: yaml - :caption: pipeline.yml config file - - version: $LOGPREP_VERSION - process_count: $LOGPREP_PROCESS_COUNT - timeout: 0.1 - logger: - level: $LOGPREP_LOG_LEVEL - $LOGPREP_PIPELINE - $LOGPREP_INPUT - $LOGPREP_OUTPUT - - -.. 
code-block:: bash - :caption: setting the bash environment variables - - export LOGPREP_VERSION="1" - export LOGPREP_PROCESS_COUNT="1" - export LOGPREP_LOG_LEVEL="DEBUG" - export LOGPREP_PIPELINE=" - pipeline: - - labelername: - type: labeler - schema: quickstart/exampledata/rules/labeler/schema.json - include_parent_labels: true - specific_rules: - - quickstart/exampledata/rules/labeler/specific - generic_rules: - - quickstart/exampledata/rules/labeler/generic" - export LOGPREP_OUTPUT=" - output: - kafka: - type: confluentkafka_output - topic: producer - error_topic: producer_error - flush_timeout: 30 - send_timeout: 2 - kafka_config: - bootstrap.servers: localhost:9092" - export LOGPREP_INPUT=" - input: - kafka: - type: confluentkafka_input - topic: consumer - offset_reset_policy: smallest - kafka_config: - bootstrap.servers: localhost:9092 - group.id: test" - - -This section explains the possible configuration parameters. - -Reading the Configuration -------------------------- - -Logprep can be "issued" to reload the configuration by sending the signal `SIGUSR1` to the Logprep process or by defining the -configuration option :code:`config_refresh_interval`. - -An error message is thrown if the configuration does not pass a consistency check, and the processor proceeds to run with its old configuration. -Then the configuration should be checked and corrected according to the error message. diff --git a/doc/source/user_manual/configuration/index.rst b/doc/source/user_manual/configuration/index.rst index 719cfe14c..9ee760a78 100644 --- a/doc/source/user_manual/configuration/index.rst +++ b/doc/source/user_manual/configuration/index.rst @@ -4,11 +4,16 @@ Configuration ============= +.. automodule:: logprep.util.configuration + :no-index: + +.. autoclass:: logprep.util.configuration.Configuration + :members: version, config_refresh_interval, process_count, timeout, logger, input, output, pipeline, metrics, profile_pipelines, print_auto_test_stack_trace + :no-index: + .. toctree:: :maxdepth: 2 - configurationdata - logprep input output pipeline diff --git a/doc/source/user_manual/configuration/logprep.rst b/doc/source/user_manual/configuration/logprep.rst deleted file mode 100644 index 666c7f81e..000000000 --- a/doc/source/user_manual/configuration/logprep.rst +++ /dev/null @@ -1,80 +0,0 @@ -======= -Logprep -======= - -version -======= - -It is optionally possible to set a version to your configuration file which can be printed via -:code:`logprep --version config/pipeline.yml`. -This has no effect on the execution of logprep and is merely used for documentation purposes. - -process_count -============= - -Integer, value >= 1 - -Count of worker processes that should be started. -The maximum performance can be probably reached by setting `process_count = Count of physical cores`. - -timeout -======= - -Float, value > 0.0 - -Logprep tries to react to signals (like sent by CTRL+C) within the given time. -The time taken for some processing steps is not always predictable, thus it is not possible to ensure that this time will be adhered to. -However, Logprep reacts quickly for small values (< 1.0), but this requires more processing power. -This can be useful for testing and debugging. -Larger values (like 5.0) slow the reaction time down, but this requires less processing power, which makes in preferable for continuous operation. - -logger -====== - -The logger writes log messages into the journal. -Duplicate log messages are being aggregated if specific conditions are met. 
-This can be configured with the following sub parameters: - -.. note:: - Logging for individual processors can be deactivated in their configuration in the pipeline by setting :code:`logging: false`. - -level ------ - -Configures the level of logs that should be logged. -Possible values are the Python-logging log levels: -CRITICAL, FATAL, ERROR, WARN, WARNING, INFO und DEBUG. - -INFO is being used by default. -DEBUG should be only temporarily activated for debugging, since it creates a large amount of log messages. - -aggregation_threshold ---------------------- - -Defines the amount after which duplicate log messages are being aggregated. - -aggregation_period ------------------- - -Defines after how many seconds an aggregation of log messages will be performed. - -Example -------- - -.. code-block:: yaml - :linenos: - - logger: - level: INFO - aggregation_threshold: 4 - aggregation_period: 10 - - -config_refresh_interval ------------------------ - -Configures the interval in seconds on which logprep should try to reload the configuration. -This config key is optional. If not configured, logprep won't reload the configuration automatically. -If configured the configuration will only be reloaded if the configuration version changes. -If http errors occurs on configuration reload `config_refresh_interval` is set to a quarter -of the current `config_refresh_interval` until a minimum of 5 seconds is reached. diff --git a/doc/source/user_manual/execution.rst b/doc/source/user_manual/execution.rst index 76699fd87..43633a740 100644 --- a/doc/source/user_manual/execution.rst +++ b/doc/source/user_manual/execution.rst @@ -5,9 +5,9 @@ To execute Logprep the following command can be executed in the root directory o .. code-block:: bash - logprep $CONFIG + logprep run $CONFIG -Where :code:`$CONFIG` is the path or a url to a configuration file (see :doc:`configuration/configurationdata`). +Where :code:`$CONFIG` is the path or a url to a configuration file (see :ref:`configuration`). To get help on the different parameters use: diff --git a/doc/source/user_manual/index.rst b/doc/source/user_manual/index.rst index b641a8ebe..bcf40898d 100644 --- a/doc/source/user_manual/index.rst +++ b/doc/source/user_manual/index.rst @@ -8,6 +8,5 @@ User Manual introduction execution verification - validation testing_rules configuration/index diff --git a/doc/source/user_manual/validation.rst b/doc/source/user_manual/validation.rst deleted file mode 100644 index 9a76e3e4a..000000000 --- a/doc/source/user_manual/validation.rst +++ /dev/null @@ -1,19 +0,0 @@ -Validating Rules -================ - -The following command can be used to validate the rules: - -.. code-block:: bash - :caption: Directly with Python - - PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG --validate-rules - -.. code-block:: bash - :caption: With PEX file - - logprep.pex $CONFIG --validate-rules - -The paths to the rules that will be validated are found in :code:`$CONFIG` under each processor entry. -Where :code:`$CONFIG` is the path to a configuration file (see :doc:`configuration/configurationdata`). - -Validation does also perform a verification of the pipeline section of the Logprep configuration. 
\ No newline at end of file diff --git a/doc/source/user_manual/verification.rst b/doc/source/user_manual/verification.rst index 9fe828a16..3c6c739d9 100644 --- a/doc/source/user_manual/verification.rst +++ b/doc/source/user_manual/verification.rst @@ -7,11 +7,11 @@ The following command can be used to verify the configuration without running Lo .. code-block:: bash :caption: Directly with Python - PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG --verify-config + PYTHONPATH="." python3 logprep/run_logprep.py test config $CONFIG .. code-block:: bash :caption: With PEX file - logprep.pex $CONFIG --verify-config + logprep.pex test config $CONFIG -Where :code:`$CONFIG` is the path to a configuration file (see :doc:`configuration/configurationdata`). +Where :code:`$CONFIG` is the path to a configuration file (see :ref:`configuration`). diff --git a/logprep/_version.py b/logprep/_version.py index 10ebadd2a..a4a4f0e6a 100644 --- a/logprep/_version.py +++ b/logprep/_version.py @@ -10,12 +10,14 @@ """Git implementation of _version.py.""" import errno +import functools import os import re import subprocess import sys from typing import Callable, Dict -import functools + +from logprep.abc.exceptions import LogprepException def get_keywords(): @@ -49,7 +51,7 @@ def get_config(): return cfg -class NotThisMethod(Exception): +class NotThisMethod(LogprepException): """Exception raised if a method is not valid for the current scenario.""" diff --git a/logprep/abc/exceptions.py b/logprep/abc/exceptions.py new file mode 100644 index 000000000..593e84244 --- /dev/null +++ b/logprep/abc/exceptions.py @@ -0,0 +1,8 @@ +""" abstract module for exceptions""" + + +class LogprepException(Exception): + """Base class for Logprep related exceptions.""" + + def __eq__(self, __value: object) -> bool: + return type(self) is type(__value) and self.args == __value.args diff --git a/logprep/abc/getter.py b/logprep/abc/getter.py index d7d141928..5bb633454 100644 --- a/logprep/abc/getter.py +++ b/logprep/abc/getter.py @@ -49,6 +49,7 @@ class EnvTemplate(Template): validators.deep_iterable(member_validator=validators.instance_of(str)), ], factory=list, + repr=False, ) """used variables in content but not set in environment""" diff --git a/logprep/abc/input.py b/logprep/abc/input.py index 706c6ec5d..e3fcd58e1 100644 --- a/logprep/abc/input.py +++ b/logprep/abc/input.py @@ -14,6 +14,7 @@ from attrs import define, field, validators from logprep.abc.connector import Connector +from logprep.abc.exceptions import LogprepException from logprep.metrics.metrics import Metric from logprep.util.helper import add_field_to, get_dotted_field_value from logprep.util.time import UTC, TimeParser @@ -23,7 +24,7 @@ from logprep.abc.output import Output -class InputError(Exception): +class InputError(LogprepException): """Base class for Input related exceptions.""" def __init__(self, input_connector: "Input", message: str) -> None: @@ -47,7 +48,7 @@ class FatalInputError(InputError): """Must not be catched.""" -class InputWarning(Exception): +class InputWarning(LogprepException): """May be catched but must be displayed to the user/logged.""" def __init__(self, input_connector: "Input", message: str) -> None: diff --git a/logprep/abc/output.py b/logprep/abc/output.py index 18d771f4c..f91c63c01 100644 --- a/logprep/abc/output.py +++ b/logprep/abc/output.py @@ -9,10 +9,11 @@ from attrs import define, field, validators from logprep.abc.connector import Connector +from logprep.abc.exceptions import LogprepException from logprep.abc.input import 
Input -class OutputError(Exception): +class OutputError(LogprepException): """Base class for Output related exceptions.""" def __init__(self, output: "Output", message: str) -> None: @@ -20,7 +21,7 @@ def __init__(self, output: "Output", message: str) -> None: super().__init__(f"{self.__class__.__name__} in {output.describe()}: {message}") -class OutputWarning(Exception): +class OutputWarning(LogprepException): """Base class for Output related warnings.""" def __init__(self, output: "Output", message: str) -> None: diff --git a/logprep/configuration.py b/logprep/configuration.py index 08847d0ca..b2af1c2fe 100644 --- a/logprep/configuration.py +++ b/logprep/configuration.py @@ -1,11 +1,9 @@ """module for component configuration """ + from typing import TYPE_CHECKING, Any, Mapping +from logprep.factory_error import NoTypeSpecifiedError, UnknownComponentTypeError from logprep.registry import Registry -from logprep.factory_error import ( - NoTypeSpecifiedError, - UnknownComponentTypeError, -) if TYPE_CHECKING: # pragma: no cover from logprep.abc.component import Component @@ -61,5 +59,5 @@ def get_class(name: str, config_: Mapping[str, Any]): components = Registry.mapping component_type = config_.get("type") if component_type not in components: - raise UnknownComponentTypeError(component_type) + raise UnknownComponentTypeError(name, component_type) return Registry.get_class(component_type) diff --git a/logprep/factory.py b/logprep/factory.py index 11e783738..9ed6c5969 100644 --- a/logprep/factory.py +++ b/logprep/factory.py @@ -1,8 +1,8 @@ """This module contains a factory to create connectors and processors.""" import copy +import logging from typing import TYPE_CHECKING -from logprep.abc.component import Component from logprep.configuration import Configuration from logprep.factory_error import ( InvalidConfigSpecificationError, @@ -12,12 +12,16 @@ if TYPE_CHECKING: # pragma: no cover from logging import Logger + from logprep.abc.component import Component + class Factory: """Create components for logprep.""" + _logger: "Logger" = logging.getLogger(__name__) + @classmethod - def create(cls, configuration: dict, logger: "Logger") -> Component: + def create(cls, configuration: dict, logger: "Logger") -> "Component": """Create component.""" if configuration == {} or configuration is None: raise InvalidConfigurationError("The component definition is empty.") diff --git a/logprep/factory_error.py b/logprep/factory_error.py index e8be998f7..27c22e441 100644 --- a/logprep/factory_error.py +++ b/logprep/factory_error.py @@ -1,7 +1,9 @@ """This module contains errors related to ProcessorFactory.""" +from logprep.abc.exceptions import LogprepException -class FactoryError(BaseException): + +class FactoryError(LogprepException): """Base class for ProcessorFactory related exceptions.""" @@ -34,5 +36,5 @@ def __init__(self, name=None): class UnknownComponentTypeError(FactoryError): """Raise if the type is unknown.""" - def __init__(self, processor_type): - super().__init__(f"Unknown type '{processor_type}'") + def __init__(self, component_name, component_type): + super().__init__(f"Unknown type '{component_type}' for '{component_name}'") diff --git a/logprep/framework/pipeline.py b/logprep/framework/pipeline.py index 9b16f964e..f80b0d282 100644 --- a/logprep/framework/pipeline.py +++ b/logprep/framework/pipeline.py @@ -13,8 +13,8 @@ import queue import warnings from ctypes import c_bool -from functools import cached_property -from multiprocessing import Lock, Process, Value, current_process +from 
functools import cached_property, partial +from multiprocessing import Lock, Value, current_process from typing import Any, List, Tuple import attrs @@ -40,6 +40,7 @@ from logprep.factory import Factory from logprep.metrics.metrics import HistogramMetric, Metric from logprep.processor.base.exceptions import ProcessingCriticalError, ProcessingWarning +from logprep.util.configuration import Configuration from logprep.util.pipeline_profiler import PipelineProfiler @@ -86,7 +87,7 @@ class Metrics(Component.Metrics): ) """Time in seconds that it took to process an event""" - _logprep_config: dict + _logprep_config: Configuration """ the logprep configuration dict """ _log_queue: multiprocessing.Queue @@ -104,27 +105,6 @@ class Metrics(Component.Metrics): pipeline_index: int """ the index of this pipeline """ - def __init__( - self, - config: dict, - pipeline_index: int = None, - log_queue: multiprocessing.Queue = None, - lock: Lock = None, - used_server_ports: dict = None, - ) -> None: - self._log_queue = log_queue - self.logger = logging.getLogger(f"Logprep Pipeline {pipeline_index}") - self.logger.addHandler(logging.handlers.QueueHandler(log_queue)) - self._logprep_config = config - self._timeout = config.get("timeout") - self._continue_iterating = Value(c_bool) - - self._lock = lock - self._used_server_ports = used_server_ports - self.pipeline_index = pipeline_index - self._encoder = msgspec.msgpack.Encoder() - self._decoder = msgspec.msgpack.Decoder() - @cached_property def metrics(self): """create and return metrics object""" @@ -138,7 +118,7 @@ def _process_name(self) -> str: def _event_version_information(self) -> dict: return { "logprep": get_versions().get("version"), - "configuration": self._logprep_config.get("version", "unset"), + "configuration": self._logprep_config.version, } @property @@ -154,13 +134,13 @@ def metric_labels(self) -> dict: @cached_property def _pipeline(self) -> tuple: self.logger.debug(f"Building '{self._process_name}'") - pipeline = [self._create_processor(entry) for entry in self._logprep_config.get("pipeline")] + pipeline = [self._create_processor(entry) for entry in self._logprep_config.pipeline] self.logger.debug("Finished building pipeline") return pipeline @cached_property def _output(self) -> dict[str, Output]: - output_configs = self._logprep_config.get("output") + output_configs = self._logprep_config.output if not output_configs: return None output_names = list(output_configs.keys()) @@ -172,8 +152,8 @@ def _output(self) -> dict[str, Output]: @cached_property def _input(self) -> Input: - input_connector_config = self._logprep_config.get("input") - if input_connector_config is None: + input_connector_config = self._logprep_config.input + if not input_connector_config: return None connector_name = list(input_connector_config.keys())[0] input_connector_config[connector_name].update( @@ -181,6 +161,29 @@ def _input(self) -> Input: ) return Factory.create(input_connector_config, self.logger) + def __init__( + self, + config: Configuration, + pipeline_index: int = None, + log_queue: multiprocessing.Queue = None, + lock: Lock = None, + used_server_ports: dict = None, + ) -> None: + self._log_queue = log_queue + self.logger = logging.getLogger(f"Logprep Pipeline {pipeline_index}") + self.logger.addHandler(logging.handlers.QueueHandler(log_queue)) + self._logprep_config = config + self._timeout = config.timeout + self._continue_iterating = Value(c_bool) + + self._lock = lock + self._used_server_ports = used_server_ports + self.pipeline_index = pipeline_index + 
self._encoder = msgspec.msgpack.Encoder() + self._decoder = msgspec.msgpack.Decoder() + if self._logprep_config.profile_pipelines: + self.run = partial(PipelineProfiler.profile_function, self.run) + @_handle_pipeline_error def _setup(self): self.logger.debug("Creating connectors") @@ -214,7 +217,8 @@ def _create_processor(self, entry: dict) -> "Processor": def run(self) -> None: """Start processing processors in the Pipeline.""" - self._enable_iteration() + with self._continue_iterating.get_lock(): + self._continue_iterating.value = True assert self._input, "Pipeline should not be run without input connector" assert self._output, "Pipeline should not be run without output connector" with self._lock: @@ -224,24 +228,16 @@ def run(self) -> None: self.logger.debug("Start iterating") if hasattr(self._input, "server"): with self._input.server.run_in_thread(): - while self._iterate(): + while self._continue_iterating.value: self.process_pipeline() else: - while self._iterate(): + while self._continue_iterating.value: self.process_pipeline() self._shut_down() - def _iterate(self) -> bool: - return self._continue_iterating.value - - def _enable_iteration(self) -> None: - with self._continue_iterating.get_lock(): - self._continue_iterating.value = True - @_handle_pipeline_error def process_pipeline(self) -> Tuple[dict, list]: """Retrieve next event, process event with full pipeline and store or return results""" - assert self._input, "Run process_pipeline only with an valid input connector" Component.run_pending_tasks() extra_outputs = [] event = None @@ -335,50 +331,3 @@ def stop(self) -> None: self.logger.debug(f"Stopping pipeline ({self._process_name})") with self._continue_iterating.get_lock(): self._continue_iterating.value = False - - -class MultiprocessingPipeline(Process, Pipeline): - """A thread-safe Pipeline for multiprocessing.""" - - def __init__( - self, - pipeline_index: int, - config: dict, - log_queue: multiprocessing.Queue, - lock: Lock, - used_server_ports: dict, - ) -> None: - self._profile = config.get("profile_pipelines", False) - - Pipeline.__init__( - self, - pipeline_index=pipeline_index, - config=config, - log_queue=log_queue, - lock=lock, - used_server_ports=used_server_ports, - ) - - self._continue_iterating = Value(c_bool) - self.stop() - Process.__init__(self) - - def run(self) -> None: - """Start processing the Pipeline.""" - if self._profile: - PipelineProfiler.profile_function(Pipeline.run, self) - else: - Pipeline.run(self) - - def _enable_iteration(self) -> None: - with self._continue_iterating.get_lock(): - self._continue_iterating.value = True - - def _iterate(self) -> Value: - with self._continue_iterating.get_lock(): - return self._continue_iterating.value - - def stop(self) -> None: - """Stop processing the Pipeline.""" - with self._continue_iterating.get_lock(): - self._continue_iterating.value = False diff --git a/logprep/framework/pipeline_manager.py b/logprep/framework/pipeline_manager.py index ffc04e210..e4ffb9c67 100644 --- a/logprep/framework/pipeline_manager.py +++ b/logprep/framework/pipeline_manager.py @@ -8,23 +8,12 @@ from attr import define, field from logprep.abc.component import Component -from logprep.framework.pipeline import MultiprocessingPipeline +from logprep.framework.pipeline import Pipeline from logprep.metrics.exporter import PrometheusExporter from logprep.metrics.metrics import CounterMetric from logprep.util.configuration import Configuration -class PipelineManagerError(Exception): - """Base class for pipeline related exceptions.""" 
- - -class MustSetConfigurationFirstError(PipelineManagerError): - """Raise if configuration was not set.""" - - def __init__(self, what_failed: str): - super().__init__(f"Failed to {what_failed}: Configuration is unset") - - class PipelineManager: """Manage pipelines via multi-processing.""" @@ -56,29 +45,25 @@ class Metrics(Component.Metrics): ) """Number of failed pipelines""" - def __init__(self): - self.prometheus_exporter = None + def __init__(self, configuration: Configuration): self.metrics = self.Metrics(labels={"component": "manager"}) self._logger = logging.getLogger("Logprep PipelineManager") self.log_queue = multiprocessing.Queue(-1) self._queue_listener = logging.handlers.QueueListener(self.log_queue) self._queue_listener.start() - self._pipelines = [] - self._configuration = None + self._pipelines: list[multiprocessing.Process] = [] + self._configuration = configuration self._lock = multiprocessing.Lock() self._used_server_ports = None - - def set_configuration(self, configuration: Configuration): - """set the verified config""" - self._configuration = configuration - + prometheus_config = self._configuration.metrics + if prometheus_config.enabled: + self.prometheus_exporter = PrometheusExporter(prometheus_config) + else: + self.prometheus_exporter = None manager = multiprocessing.Manager() self._used_server_ports = manager.dict() - prometheus_config = configuration.get("metrics", {}) - if prometheus_config.get("enabled", False) and not self.prometheus_exporter: - self.prometheus_exporter = PrometheusExporter(prometheus_config) def get_count(self) -> int: """Get the pipeline count. @@ -110,14 +95,13 @@ def _increase_to_count(self, count: int): while len(self._pipelines) < count: new_pipeline_index = len(self._pipelines) + 1 self._pipelines.append(self._create_pipeline(new_pipeline_index)) - self._pipelines[-1].start() self.metrics.number_of_pipeline_starts += 1 def _decrease_to_count(self, count: int): while len(self._pipelines) > count: - pipeline = self._pipelines.pop() - pipeline.stop() - pipeline.join() + pipeline_process = self._pipelines.pop() + pipeline_process.stop() + pipeline_process.join() self.metrics.number_of_pipeline_stops += 1 def restart_failed_pipeline(self): @@ -130,7 +114,7 @@ def restart_failed_pipeline(self): self.prometheus_exporter.mark_process_dead(failed_pipeline.pid) if failed_pipelines: - self.set_count(self._configuration.get("process_count")) + self.set_count(self._configuration.process_count) exit_codes = [pipeline.exitcode for pipeline in failed_pipelines] self._logger.warning( f"Restarted {len(failed_pipelines)} failed pipeline(s), " @@ -142,21 +126,28 @@ def stop(self): self._decrease_to_count(0) if self.prometheus_exporter: self.prometheus_exporter.cleanup_prometheus_multiprocess_dir() + self._queue_listener.stop() + self.log_queue.close() def restart(self): """Restarts all pipelines""" - self._decrease_to_count(0) - self._increase_to_count(self._configuration.get("process_count")) - - def _create_pipeline(self, index) -> MultiprocessingPipeline: - if self._configuration is None: - raise MustSetConfigurationFirstError("create new pipeline") - - self._logger.info("Created new pipeline") - return MultiprocessingPipeline( + self.set_count(0) + self.set_count(self._configuration.process_count) + if not self.prometheus_exporter: + return + if not self.prometheus_exporter.is_running: + self.prometheus_exporter.run() + + def _create_pipeline(self, index) -> multiprocessing.Process: + pipeline = Pipeline( pipeline_index=index, 
config=self._configuration, log_queue=self.log_queue, lock=self._lock, used_server_ports=self._used_server_ports, ) + self._logger.info("Created new pipeline") + process = multiprocessing.Process(target=pipeline.run, daemon=True) + process.stop = pipeline.stop + process.start() + return process diff --git a/logprep/framework/rule_tree/demorgan_resolver.py b/logprep/framework/rule_tree/demorgan_resolver.py index 62c760426..6675eb4f3 100644 --- a/logprep/framework/rule_tree/demorgan_resolver.py +++ b/logprep/framework/rule_tree/demorgan_resolver.py @@ -1,14 +1,16 @@ """Module implements functionality to apply De Morgan's law on rule filter expressions""" + +from logprep.abc.exceptions import LogprepException from logprep.filter.expression.filter_expression import ( - Not, And, - Or, - FilterExpression, CompoundFilterExpression, + FilterExpression, + Not, + Or, ) -class DeMorganResolverException(Exception): +class DeMorganResolverException(LogprepException): """Raise if demorgan resolver encounters a problem.""" diff --git a/logprep/framework/rule_tree/rule_parser.py b/logprep/framework/rule_tree/rule_parser.py index 3c3384dc5..0ed3890a3 100644 --- a/logprep/framework/rule_tree/rule_parser.py +++ b/logprep/framework/rule_tree/rule_parser.py @@ -4,24 +4,21 @@ behavior, allowing a simpler construction of the rule tree. """ -from typing import TYPE_CHECKING -from logprep.filter.expression.filter_expression import ( - Always, - Exists, - Not, -) +from typing import TYPE_CHECKING +from logprep.abc.exceptions import LogprepException +from logprep.filter.expression.filter_expression import Always, Exists, Not from logprep.framework.rule_tree.demorgan_resolver import DeMorganResolver +from logprep.framework.rule_tree.rule_segmenter import RuleSegmenter from logprep.framework.rule_tree.rule_sorter import RuleSorter from logprep.framework.rule_tree.rule_tagger import RuleTagger -from logprep.framework.rule_tree.rule_segmenter import RuleSegmenter if TYPE_CHECKING: from logprep.processor.base.rule import Rule -class RuleParserException(Exception): +class RuleParserException(LogprepException): """Raise if rule parser encounters a problem.""" diff --git a/logprep/framework/rule_tree/rule_segmenter.py b/logprep/framework/rule_tree/rule_segmenter.py index 86a9a6b06..f30e4877b 100644 --- a/logprep/framework/rule_tree/rule_segmenter.py +++ b/logprep/framework/rule_tree/rule_segmenter.py @@ -2,6 +2,7 @@ from typing import Union +from logprep.abc.exceptions import LogprepException from logprep.filter.expression.filter_expression import ( And, CompoundFilterExpression, @@ -11,7 +12,7 @@ ) -class RuleSegmenterException(Exception): +class RuleSegmenterException(LogprepException): """Raise if rule segmenter encounters a problem.""" diff --git a/logprep/framework/rule_tree/rule_sorter.py b/logprep/framework/rule_tree/rule_sorter.py index 9b6de7942..d805d50d9 100644 --- a/logprep/framework/rule_tree/rule_sorter.py +++ b/logprep/framework/rule_tree/rule_sorter.py @@ -2,15 +2,16 @@ from typing import Union +from logprep.abc.exceptions import LogprepException from logprep.filter.expression.filter_expression import ( Always, - Not, - KeyBasedFilterExpression, FilterExpression, + KeyBasedFilterExpression, + Not, ) -class RuleSorterException(Exception): +class RuleSorterException(LogprepException): """Raise if rule sorter encounters a problem.""" diff --git a/logprep/metrics/exporter.py b/logprep/metrics/exporter.py index 27b5aae6f..086e492e4 100644 --- a/logprep/metrics/exporter.py +++ b/logprep/metrics/exporter.py @@ 
-1,19 +1,23 @@ """This module contains functionality to start a prometheus exporter and expose metrics with it""" + import os import shutil from logging import getLogger from prometheus_client import REGISTRY, multiprocess, start_http_server +from logprep.util.configuration import MetricsConfig + class PrometheusExporter: """Used to control the prometheus exporter and to manage the metrics""" - def __init__(self, status_logger_config): + def __init__(self, status_logger_config: MetricsConfig): + self.is_running = False self._logger = getLogger("Prometheus Exporter") + self._logger.debug("Initializing Prometheus Exporter") self.configuration = status_logger_config - self._port = status_logger_config.get("port", 8000) - self._prepare_multiprocessing() + self._port = status_logger_config.port def _prepare_multiprocessing(self): """ @@ -48,5 +52,7 @@ def mark_process_dead(self, pid): def run(self): """Starts the default prometheus http endpoint""" + self._prepare_multiprocessing() start_http_server(self._port) self._logger.info(f"Prometheus Exporter started on port {self._port}") + self.is_running = True diff --git a/logprep/metrics/metrics.py b/logprep/metrics/metrics.py index e73726168..99ae80813 100644 --- a/logprep/metrics/metrics.py +++ b/logprep/metrics/metrics.py @@ -114,12 +114,13 @@ :private-members: :inherited-members: """ + import os import time -from _socket import gethostname from abc import ABC, abstractmethod -from typing import Union +from typing import Any, Union +from _socket import gethostname from attrs import define, field, validators from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram @@ -238,10 +239,10 @@ def inner(self, *args, **kwargs): # nosemgrep class CounterMetric(Metric): """Wrapper for prometheus Counter metric""" - def __add__(self, other): + def __add__(self, other: Any) -> "CounterMetric": return self.add_with_labels(other, self.labels) - def add_with_labels(self, other, labels): + def add_with_labels(self, other: Any, labels: dict) -> "CounterMetric": """Add with labels""" labels = self.labels | labels self.tracker.labels(**labels).inc(other) diff --git a/logprep/processor/base/exceptions.py b/logprep/processor/base/exceptions.py index 868489484..953671d78 100644 --- a/logprep/processor/base/exceptions.py +++ b/logprep/processor/base/exceptions.py @@ -2,13 +2,14 @@ from typing import TYPE_CHECKING, Any, List +from logprep.abc.exceptions import LogprepException from logprep.factory_error import FactoryError if TYPE_CHECKING: # pragma: no cover from logprep.processor.base.rule import Rule -class RuleError(BaseException): +class RuleError(LogprepException): """Base class for Rule related exceptions.""" @@ -48,7 +49,7 @@ def __init__(self, processor_type=None): # pragma: no cover super().__init__("Processor can't be imported") -class ProcessingError(Exception): +class ProcessingError(LogprepException): """Base class for exceptions related to processing events.""" def __init__(self, message: str, rule: "Rule"): diff --git a/logprep/processor/labeler/processor.py b/logprep/processor/labeler/processor.py index 9de41b300..8046b851a 100644 --- a/logprep/processor/labeler/processor.py +++ b/logprep/processor/labeler/processor.py @@ -39,7 +39,7 @@ """ from logging import Logger -from typing import Optional +from typing import List, Optional from attr import define, field, validators @@ -80,6 +80,9 @@ def __init__( ): self._schema = LabelingSchema.create_from_file(configuration.schema) super().__init__(name, configuration=configuration, 
logger=logger) + + def setup(self): + super().setup() for rule in self._generic_rules + self._specific_rules: if self._config.include_parent_labels: rule.add_parent_labels_from_schema(self._schema) diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 1132958d6..93f778584 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -213,10 +213,24 @@ def __init__(self, name: str, configuration: Processor.Config, logger: Logger): super().__init__(name=name, configuration=configuration, logger=logger) self.pseudonyms = [] - def load_rules(self, specific_rules_targets: List[str], generic_rules_targets: List[str]): - super().load_rules(specific_rules_targets, generic_rules_targets) + def setup(self): + super().setup() self._replace_regex_keywords_by_regex_expression() + def _replace_regex_keywords_by_regex_expression(self): + for rule_dict in self._specific_rules: + for dotted_field, regex_keyword in rule_dict.pseudonyms.items(): + if regex_keyword in self._regex_mapping: + rule_dict.pseudonyms[dotted_field] = re.compile( + self._regex_mapping[regex_keyword] + ) + for rule_dict in self._generic_rules: + for dotted_field, regex_keyword in rule_dict.pseudonyms.items(): + if regex_keyword in self._regex_mapping: + rule_dict.pseudonyms[dotted_field] = re.compile( + self._regex_mapping[regex_keyword] + ) + def process(self, event: dict): self.pseudonyms = [] super().process(event) @@ -309,16 +323,6 @@ def _pseudonymize_url(self, url_string: str) -> str: self.metrics.pseudonymized_urls += 1 return url_string - def _replace_regex_keywords_by_regex_expression(self): - for rule in self._specific_rules: - for dotted_field, regex_keyword in rule.pseudonyms.items(): - if regex_keyword in self._regex_mapping: - rule.pseudonyms[dotted_field] = re.compile(self._regex_mapping[regex_keyword]) - for rule in self._generic_rules: - for dotted_field, regex_keyword in rule.pseudonyms.items(): - if regex_keyword in self._regex_mapping: - rule.pseudonyms[dotted_field] = re.compile(self._regex_mapping[regex_keyword]) - def _wrap_hash(self, hash_string: str) -> str: return self.HASH_PREFIX + hash_string + self.HASH_SUFFIX diff --git a/logprep/run_logprep.py b/logprep/run_logprep.py index 875df7442..6b5a3998c 100644 --- a/logprep/run_logprep.py +++ b/logprep/run_logprep.py @@ -2,26 +2,25 @@ """This module can be used to start the logprep.""" import logging import os +import signal import sys +import tempfile import warnings +from pathlib import Path import click -import requests from colorama import Fore -from logprep._version import get_versions from logprep.runner import Runner from logprep.util.auto_rule_tester.auto_rule_corpus_tester import RuleCorpusTester from logprep.util.auto_rule_tester.auto_rule_tester import AutoRuleTester from logprep.util.configuration import Configuration, InvalidConfigurationError -from logprep.util.getter import GetterNotFoundError -from logprep.util.helper import print_fcolor +from logprep.util.helper import get_versions_string, print_fcolor from logprep.util.rule_dry_runner import DryRunner warnings.simplefilter("always", DeprecationWarning) logging.captureWarnings(True) -DEFAULT_LOCATION_CONFIG = "file:///etc/logprep/pipeline.yml" logging.getLogger("filelock").setLevel(logging.ERROR) logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR) logging.getLogger("elasticsearch").setLevel(logging.ERROR) @@ -30,66 +29,32 @@ EPILOG_STR = "Check out our docs at 
https://logprep.readthedocs.io/en/latest/" -def get_versions_string(config=None) -> str: - """ - Prints the version and exists. If a configuration was found then it's version - is printed as well - """ - versions = get_versions() - padding = 25 - version_string = f"{'logprep version:'.ljust(padding)}{versions['version']}" - version_string += f"\n{'python version:'.ljust(padding)}{sys.version.split()[0]}" - if config: - config_version = f"{config.get('version', 'unset')}, {config.path}" - else: - config_version = "no configuration found" - version_string += f"\n{'configuration version:'.ljust(padding)}{config_version}" - return version_string - - -def print_version_and_exit(config): +def _print_version(config: "Configuration") -> None: print(get_versions_string(config)) sys.exit(0) -def _setup_logger(config: Configuration): - try: - log_config = config.get("logger", {}) - log_level = log_config.get("level", "INFO") - logging.basicConfig( - level=log_level, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s" - ) - logger = logging.getLogger("Logprep") - logger.info(f"Log level set to '{log_level}'") - for version in get_versions_string(config).split("\n"): - logger.info(version) - except BaseException as error: # pylint: disable=broad-except - logging.getLogger("Logprep").exception(error) - sys.exit(1) +def _get_logger(logger_config: dict) -> logging.Logger: + log_level = logger_config.get("level", "INFO") + logging.basicConfig( + level=log_level, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s" + ) + logger = logging.getLogger("Logprep") + logger.setLevel(log_level) return logger -def _load_configuration(config): +def _get_configuration(config_paths: list[str]) -> Configuration: try: - config = Configuration().create_from_yaml(config) - except FileNotFoundError: - print(f"The given config file does not exist: {config}", file=sys.stderr) - print( - "Create the configuration or change the path. Use '--help' for more information.", - file=sys.stderr, - ) - sys.exit(1) - except GetterNotFoundError as error: - print(f"{error}", file=sys.stderr) - except requests.RequestException as error: - print(f"{error}", file=sys.stderr) + return Configuration.from_sources(config_paths) + except InvalidConfigurationError as error: + print(f"InvalidConfigurationError: {error}", file=sys.stderr) sys.exit(1) - return config @click.group(name="logprep") @click.version_option(version=get_versions_string(), message="%(version)s") -def cli(): +def cli() -> None: """ Logprep allows to collect, process and forward log messages from various data sources. Log messages are being read and written by so-called connectors. @@ -97,33 +62,35 @@ def cli(): @cli.command(short_help="Run logprep to process log messages", epilog=EPILOG_STR) -@click.argument("config") +@click.argument("configs", nargs=-1, required=False) @click.option( "--version", is_flag=True, default=False, help="Print version and exit (includes also congfig version)", ) -def run(config: str, version=None): +def run(configs: tuple[str], version=None) -> None: """ Run Logprep with the given configuration. CONFIG is a path to configuration file (filepath or URL). 
""" - config_obj = _load_configuration(config) + configuration = _get_configuration(configs) if version: - print_version_and_exit(config_obj) - logger = _setup_logger(config_obj) - logger.debug(f'Metric export enabled: {config_obj.get("metrics", {}).get("enabled", False)}') - logger.debug(f"Config path: {config}") + _print_version(configuration) + logger = _get_logger(configuration.logger) + logger.info(f"Log level set to '{logging.getLevelName(logger.level)}'") + for version in get_versions_string(configuration).split("\n"): + logger.info(version) + logger.debug(f"Metric export enabled: {configuration.metrics.enabled}") + logger.debug(f"Config path: {configs}") runner = None try: - runner = Runner.get_runner() - runner.load_configuration(config) + runner = Runner.get_runner(configuration) logger.debug("Configuration loaded") runner.start() # pylint: disable=broad-except - except BaseException as error: + except Exception as error: if os.environ.get("DEBUG", False): logger.exception(f"A critical error occurred: {error}") # pragma: no cover else: @@ -135,32 +102,26 @@ def run(config: str, version=None): @cli.group(name="test", short_help="Execute tests against a given configuration") -def test(): +def test() -> None: """ Execute certain tests like unit and integration tests. Can also verify the configuration. """ @test.command(name="config") -@click.argument("config") -def test_config(config): +@click.argument("configs", nargs=-1) +def test_config(configs: tuple[str]) -> None: """ Verify the configuration file CONFIG is a path to configuration file (filepath or URL). """ - config = _load_configuration(config) - logger = _setup_logger(config) - try: - config.verify(logger=logger) - except InvalidConfigurationError as error: - logger.critical(error) - sys.exit(1) + _get_configuration(configs) print_fcolor(Fore.GREEN, "The verification of the configuration was successful") @test.command(short_help="Execute a dry run against a configuration and selected events") -@click.argument("config") +@click.argument("configs", nargs=-1) @click.argument("events") @click.option( "--input-type", @@ -176,7 +137,7 @@ def test_config(config): type=click.BOOL, show_default=True, ) -def dry_run(config, events, input_type, full_output): +def dry_run(configs: tuple[str], events: str, input_type: str, full_output: bool) -> None: """ Execute a logprep dry run with the given configuration against a set of events. The results of the processing will be printed in the terminal. @@ -185,41 +146,46 @@ def dry_run(config, events, input_type, full_output): CONFIG is a path to configuration file (filepath or URL). EVENTS is a path to a 'json' or 'jsonl' file. """ + config = _get_configuration(configs) json_input = input_type == "json" dry_runner = DryRunner(events, config, full_output, json_input, logging.getLogger("DryRunner")) dry_runner.run() @test.command(short_help="Run the rule tests of the given configuration", name="unit") -@click.argument("config") -def test_rules(config): +@click.argument("configs", nargs=-1) +def test_rules(configs: tuple[str]) -> None: """ Test rules against their respective test files CONFIG is a path to configuration file (filepath or URL). 
""" - tester = AutoRuleTester(config) + config_obj = _get_configuration(configs) + config_path = Path(tempfile.gettempdir(), "auto-rule-test") + config_path.write_text(config_obj.as_yaml(), encoding="utf-8") + tester = AutoRuleTester(config_path) tester.run() @test.command( short_help="Run the rule corpus tester against a given configuration", name="integration" ) -@click.argument("config") +@click.argument("configs", nargs=-1, required=False) @click.argument("testdata") -def test_ruleset(config, testdata): +def test_ruleset(configs: tuple[str], testdata: str): """Test the given ruleset against specified test data \b CONFIG is a path to configuration file (filepath or URL). TESTDATA is a path to a set of test files. """ - tester = RuleCorpusTester(config, testdata) + _ = _get_configuration(configs) + tester = RuleCorpusTester(configs, testdata) tester.run() @cli.command(short_help="Generate load for a running logprep instance [Not Yet Implemented]") -def generate(): +def generate() -> None: """ Generate load offers two different options to create sample events for a running logprep instance. @@ -228,22 +194,33 @@ def generate(): @cli.command(short_help="Print a complete configuration file [Not Yet Implemented]", name="print") -@click.argument("config") +@click.argument("configs", nargs=-1, required=True) @click.option( "--output", type=click.Choice(["json", "yaml"]), default="yaml", help="What output format to use", ) -def print_config(config, output): +def print_config(configs: tuple[str], output) -> None: """ Prints the given configuration as a combined yaml or json file, with all rules and options included. CONFIG is a path to configuration file (filepath or URL). """ - raise NotImplementedError + config = _get_configuration(configs) + if output == "json": + print(config.as_json(indent=2)) + else: + print(config.as_yaml()) + + +def signal_handler(__: int, _) -> None: + """Handle signals for stopping the runner and reloading the configuration.""" + Runner.get_runner(Configuration()).stop() if __name__ == "__main__": + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) cli() diff --git a/logprep/runner.py b/logprep/runner.py index 5d8496661..eb89b148a 100644 --- a/logprep/runner.py +++ b/logprep/runner.py @@ -1,12 +1,10 @@ """This module contains the logprep runner and is responsible for signal handling.""" + # pylint: disable=logging-fstring-interpolation import logging -import signal -from ctypes import c_bool -from multiprocessing import Value, current_process +from typing import Generator -import requests from attrs import define, field from schedule import Scheduler @@ -14,39 +12,12 @@ from logprep.abc.component import Component from logprep.framework.pipeline_manager import PipelineManager from logprep.metrics.metrics import CounterMetric, GaugeMetric -from logprep.util.configuration import Configuration, InvalidConfigurationError - - -class RunnerError(Exception): - """Base class for Runner related exceptions.""" - - -class MustNotConfigureTwiceError(RunnerError): - """Raise if the configuration has been set more than once.""" - - -class NotALoggerError(RunnerError): - """Raise if the logger was assigned a non-logger object .""" - - -class MustConfigureALoggerError(RunnerError): - """Raise if no logger has been configured.""" - - -class MustConfigureBeforeRunningError(RunnerError): - """Raise if the runner has been started before it has been configured.""" - - -class MustNotCreateMoreThanOneManagerError(RunnerError): - """Raise if more than 
once managers have been created."""
-
-
-class CannotReloadWhenConfigIsUnsetError(RunnerError):
-    """Raise if the configuration was reloaded but not set."""
-
-
-class UseGetRunnerToCreateRunnerSingleton(RunnerError):
-    """ "Raise if the runner was not created as a singleton."""
+from logprep.util.configuration import (
+    ConfigGetterException,
+    Configuration,
+    ConfigVersionDidNotChangeError,
+    InvalidConfigurationError,
+)
 
 
 class Runner:
@@ -59,24 +30,27 @@ class Runner:
     configuration (a YAML file, see documentation for details).
     Finally, call the start method to start processing.
 
-    The Runner should only raise exceptions derived from RunnerError but other components may raise
-    exceptions that are not catched by it. Hence, we recommend to simply catch Exception and
-    log it as an unhandled exception.
-
     Example
     -------
     For a complete example take a Look at run_logprep.py - for simply getting a Runner started
     this should suffice:
 
-    >>> runner = Runner.get_runner()
-    >>> runner.set_logger(logging.getLogger())
-    >>> runner.load_configuration(path_to_configuration)
+    >>> configuration = Configuration.from_sources(["path/to/config.yml"])
+    >>> runner = Runner.get_runner(configuration)
     >>> runner.start()
 
     """
 
+    scheduler: Scheduler
+
+    _runner = None
+    _configuration: Configuration
+
+    _metrics: "Runner.Metrics"
+
+    _exit_received: bool = False
 
     scheduler: Scheduler
 
     @define(kw_only=True)
@@ -102,175 +76,119 @@ class Metrics(Component.Metrics):
         """Indicates the configuration refresh interval in seconds."""
         number_of_config_refreshes: CounterMetric = field(
             factory=lambda: CounterMetric(
-                description="Logprep config refresh interval",
+                description="Indicates how often the logprep configuration was updated.",
                 name="number_of_config_refreshes",
                 labels={"from": "unset", "config": "unset"},
             )
         )
         """Indicates how often the logprep configuration was updated."""
+        number_of_config_refresh_failures: CounterMetric = field(
+            factory=lambda: CounterMetric(
+                description=(
+                    "Indicates how often the logprep configuration "
+                    "could not be updated due to failures during the update."
+                ),
+                name="number_of_config_refresh_failures",
+                labels={"from": "unset", "config": "unset"},
+            )
+        )
+        """Indicates how often the logprep configuration could not be updated
+        due to failures during the update."""
 
     @property
     def _metric_labels(self) -> dict[str, str]:
         versions = get_versions()
         labels = {
             "logprep": f"{versions.get('version')}",
-            "config": f"{self._configuration.get('version', 'unset')}",
+            "config": f"{self._configuration.version}",
         }
         return labels
 
+    @property
+    def _config_refresh_interval(self) -> int:
+        """Indicates the configuration refresh interval in seconds."""
+        return self._configuration.config_refresh_interval
+
+    @_config_refresh_interval.setter
+    def _config_refresh_interval(self, value: int | None) -> None:
+        """Set the configuration refresh interval in seconds."""
+        if value is None:
+            self._configuration.config_refresh_interval = None
+        elif value <= 5:
+            self._configuration.config_refresh_interval = 5
+        else:
+            self._configuration.config_refresh_interval = value
+
     # Use this method to obtain a runner singleton for production
     @staticmethod
-    def get_runner():
+    def get_runner(configuration: Configuration) -> "Runner":
         """Create a Runner singleton."""
         if Runner._runner is None:
-            Runner._runner = Runner(bypass_check_to_obtain_non_singleton_instance=True)
+            Runner._runner = Runner(configuration)
         return Runner._runner
 
     # For production, use the get_runner method to create/get access to a singleton!
- def __init__(self, bypass_check_to_obtain_non_singleton_instance=False): - self._configuration = None - self._yaml_path = None + def __init__(self, configuration: Configuration) -> None: + self._configuration = configuration self.metrics = self.Metrics(labels={"logprep": "unset", "config": "unset"}) self._logger = logging.getLogger("Logprep Runner") - self._config_refresh_interval = None - self._manager = None + self._manager = PipelineManager(configuration) self.scheduler = Scheduler() - # noinspection PyTypeChecker - self._continue_iterating = Value(c_bool) - self._continue_iterating.value = False - - if not bypass_check_to_obtain_non_singleton_instance: - raise UseGetRunnerToCreateRunnerSingleton - - def load_configuration(self, yaml_file: str): - """Load the configuration from a YAML file (cf. documentation). - - This will raise an exception if the configuration is not valid. - - Parameters - ---------- - yaml_file: str - Path to a configuration YAML file. - - Raises - ------ - MustNotConfigureTwiceError - If '_configuration' was already set. - - """ - if self._configuration is not None: - raise MustNotConfigureTwiceError - - configuration = Configuration.create_from_yaml(yaml_file) - configuration.verify(self._logger) - - self._yaml_path = yaml_file - self._configuration = configuration - self._config_refresh_interval = configuration.get("config_refresh_interval") - self.metrics.version_info.add_with_labels(1, self._metric_labels) - def start(self): """Start processing. This runs until an SIGTERM, SIGINT or KeyboardInterrupt signal is received, or an unhandled error occurs. - - Raises - ------ - MustConfigureBeforeRunningError - If '_configuration' was not set before starting the Runner. - MustConfigureALoggerError - If '_logger' was not set before reloading the configuration. - """ - if self._configuration is None: - raise MustConfigureBeforeRunningError - if self._logger is None: - raise MustConfigureALoggerError - - self._create_manager() - if self._config_refresh_interval is not None: - self.metrics.config_refresh_interval += self._config_refresh_interval - self._manager.set_configuration(self._configuration) - self._manager.set_count(self._configuration["process_count"]) - self._logger.debug("Pipeline manager initiated") - - with self._continue_iterating.get_lock(): - self._continue_iterating.value = True + self._set_version_info_metric() self._schedule_config_refresh_job() - if self._manager.prometheus_exporter: - self._manager.prometheus_exporter.run() + self._manager.restart() self._logger.info("Startup complete") self._logger.debug("Runner iterating") for _ in self._keep_iterating(): - self._loop() - self.stop() - + if self._exit_received: + break + self.scheduler.run_pending() + self._manager.restart_failed_pipeline() + self._logger.info("Shutting down") self._logger.info("Initiated shutdown") self._manager.stop() self._logger.info("Shutdown complete") - def _loop(self): - self.scheduler.run_pending() - self._manager.restart_failed_pipeline() - - def reload_configuration(self, refresh=False): - """Reload the configuration from the configured yaml path. - - Raises - ------ - CannotReloadWhenConfigIsUnsetError - If '_configuration' was never set before reloading the configuration. 
- - """ - if self._configuration is None: - raise CannotReloadWhenConfigIsUnsetError + def reload_configuration(self): + """Reloads the configuration""" try: - new_configuration = Configuration.create_from_yaml(self._yaml_path) - self._config_refresh_interval = new_configuration.get("config_refresh_interval") + self._configuration.reload() + self._logger.info("Successfully reloaded configuration") + self.metrics.number_of_config_refreshes += 1 + self._manager.restart() self._schedule_config_refresh_job() - except (requests.RequestException, FileNotFoundError) as error: + self._logger.info(f"Configuration version: {self._configuration.version}") + self._set_version_info_metric() + except ConfigGetterException as error: self._logger.warning(f"Failed to load configuration: {error}") - current_refresh_interval = self._config_refresh_interval - if isinstance(current_refresh_interval, (float, int)): - new_refresh_interval = current_refresh_interval / 4 - self._config_refresh_interval = new_refresh_interval - self.metrics.config_refresh_interval += new_refresh_interval + self.metrics.number_of_config_refresh_failures += 1 + self._config_refresh_interval = int(self._config_refresh_interval / 4) self._schedule_config_refresh_job() - return - if refresh: - version_differ = new_configuration.get("version") != self._configuration.get("version") - if not version_differ: - self._logger.info( - "Configuration version didn't change. Continue running with current version." - ) - self._logger.info( - f"Configuration version: {self._configuration.get('version', 'unset')}" - ) - return - try: - new_configuration.verify(self._logger) - - # Only reached when configuration is verified successfully - self._configuration = new_configuration - self._schedule_config_refresh_job() - self._manager.set_configuration(self._configuration) - self._manager.restart() - self._logger.info("Successfully reloaded configuration") - config_version = self._configuration.get("version", "unset") - self._logger.info(f"Configuration version: {config_version}") - self.metrics.version_info.add_with_labels(1, self._metric_labels) - self.metrics.number_of_config_refreshes += 1 - if self._config_refresh_interval is not None: - self.metrics.config_refresh_interval += self._config_refresh_interval + except ConfigVersionDidNotChangeError as error: + self._logger.info(str(error)) except InvalidConfigurationError as error: - self._logger.error( - "Invalid configuration, leaving old" - f" configuration in place: {self._yaml_path}: {str(error)}" - ) + self._logger.error(str(error)) + self.metrics.number_of_config_refresh_failures += 1 + + def _set_version_info_metric(self): + self.metrics.version_info.add_with_labels( + 1, + {"logprep": f"{get_versions()['version']}", "config": self._configuration.version}, + ) + + def stop(self): + """Stop the logprep runner. 
Is called by the signal handler + in run_logprep.py.""" + self._exit_received = True def _schedule_config_refresh_job(self): refresh_interval = self._config_refresh_interval @@ -278,43 +196,12 @@ def _schedule_config_refresh_job(self): if scheduler.jobs: scheduler.cancel_job(scheduler.jobs[0]) if isinstance(refresh_interval, (float, int)): - refresh_interval = 5 if refresh_interval < 5 else refresh_interval - scheduler.every(refresh_interval).seconds.do(self.reload_configuration, refresh=True) + self.metrics.config_refresh_interval += refresh_interval + scheduler.every(refresh_interval).seconds.do(self.reload_configuration) self._logger.info(f"Config refresh interval is set to: {refresh_interval} seconds") - def _create_manager(self): - if self._manager is not None: - raise MustNotCreateMoreThanOneManagerError - self._manager = PipelineManager() + def _keep_iterating(self) -> Generator: + """Indicates whether the runner should keep iterating.""" - def stop(self): - """Stop the current process""" - if current_process().name == "MainProcess": - if self._logger is not None: - self._logger.info("Shutting down") - with self._continue_iterating.get_lock(): - self._continue_iterating.value = False - - def _keep_iterating(self): - """generator function""" - while True: - with self._continue_iterating.get_lock(): - iterate = self._continue_iterating.value - if not iterate: - return - yield iterate - - -def signal_handler(signal_number: int, _): - """Handle signals for stopping the runner and reloading the configuration.""" - if signal_number == signal.SIGUSR1: - print("Info: Reloading config") - Runner.get_runner().reload_configuration() - else: - Runner.get_runner().stop() - - -# Register signals -signal.signal(signal.SIGTERM, signal_handler) -signal.signal(signal.SIGINT, signal_handler) -signal.signal(signal.SIGUSR1, signal_handler) + while 1: # pragma: no cover + yield 1 diff --git a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py b/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py index 11c898aa4..ec8014038 100644 --- a/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py +++ b/logprep/util/auto_rule_tester/auto_rule_corpus_tester.py @@ -11,7 +11,7 @@ .. code-block:: bash :caption: Run rule corpus test - logprep $CONFIG --auto-corpus-test --corpus-testdata $CORPUS_TEST_DATA + logprep test integration $CONFIG $CORPUS_TEST_DATA Where in the parameter :code:`CONFIG` should point to a valid logprep configuration and :code:`CORPUS_TEST_DATA` to a directory containing the test data with the different test cases. 
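+
+As a sketch, a concrete invocation could look like the following; both paths are
+placeholders and have to be replaced with an existing configuration file and a
+directory with corpus test data:
+
+.. code-block:: bash
+    :caption: Example invocation of the rule corpus tester
+
+    logprep test integration /path/to/pipeline.yml /path/to/corpus_testdata/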
@@ -96,7 +96,7 @@
 from pprint import pprint
 from typing import List
 
-from attr import define, validators, field, Factory
+from attr import Factory, define, field, validators
 from colorama import Fore, Style
 from deepdiff import DeepDiff, grep
 
@@ -189,13 +189,19 @@ def _pipeline(self):
         merged_input_file_path = Path(self._tmp_dir) / "input.json"
         inputs = [test_case.input_document for test_case in self._test_cases.values()]
         merged_input_file_path.write_text(json.dumps(inputs), encoding="utf8")
-        path_to_patched_config = Configuration.patch_yaml_with_json_connectors(
-            self._original_config_path, self._tmp_dir, str(merged_input_file_path)
-        )
-        config = Configuration.create_from_yaml(path_to_patched_config)
-        config.verify_pipeline_without_processor_outputs(getLogger("logprep"))
-        del config["output"]
-        pipeline = Pipeline(config=config)
+        patched_config = Configuration()
+        patched_config.input = {
+            "patched_input": {"type": "json_input", "documents_path": str(merged_input_file_path)}
+        }
+        config = Configuration.from_sources([self._original_config_path])
+        input_config = config.input
+        connector_name = list(input_config.keys())[0]
+        if "preprocessing" in input_config[connector_name]:
+            patched_config.input["patched_input"] |= {
+                "preprocessing": input_config[connector_name]["preprocessing"]
+            }
+        patched_config.pipeline = config.pipeline
+        pipeline = Pipeline(config=patched_config)
         pipeline.logger = self._logprep_logger
         return pipeline
 
diff --git a/logprep/util/auto_rule_tester/auto_rule_tester.py b/logprep/util/auto_rule_tester/auto_rule_tester.py
index 13f0b71ff..563f579df 100644
--- a/logprep/util/auto_rule_tester/auto_rule_tester.py
+++ b/logprep/util/auto_rule_tester/auto_rule_tester.py
@@ -37,15 +37,15 @@
 .. code-block:: bash
     :caption: Directly with Python
 
-    PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG --auto-test
+    PYTHONPATH="." python3 logprep/run_logprep.py test unit $CONFIG
 
 .. code-block:: bash
     :caption: With PEX file
 
-    logprep.pex $CONFIG --auto-test
+    logprep.pex test unit $CONFIG
 
 Where :code:`$CONFIG` is the path to a configuration file
-(see :doc:`configuration/configurationdata`).
+(see :ref:`configuration`).
 
 Auto-testing does also perform a verification of the pipeline section of the Logprep
 configuration.
 """
diff --git a/logprep/util/configuration.py b/logprep/util/configuration.py
index 689c10f18..ff25e704a 100644
--- a/logprep/util/configuration.py
+++ b/logprep/util/configuration.py
@@ -1,75 +1,181 @@
-"""This module is used to create the configuration for the runner."""
-
-import re
-import sys
+"""
+Configuration is done via YAML or JSON files or HTTP API resources.
+Logprep searches for the file :code:`/etc/logprep/pipeline.yml` if no
+configuration file is passed.
+
+You can pass multiple configuration files via valid file paths or URLs.
+
+.. code-block:: bash
+
+    logprep run /different/path/file.yml
+
+or
+
+.. code-block:: bash
+
+    logprep run http://url-to-our-yaml-file-or-api
+
+or
+
+.. code-block:: bash
+
+    logprep run http://api/v1/pipeline http://api/v1/addition_processor_pipeline /path/to/connector.yaml
+
+The options under :code:`input`, :code:`output` and :code:`pipeline` are passed
+to factories in Logprep.
+They contain settings for each separate processor and connector.
+Details for configuring connectors are described in
+:ref:`output` and :ref:`input` and for processors in :ref:`processors`.
+General information about the configuration of the pipeline can be found
+in :ref:`pipeline_config`.
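+
+Because all given sources are merged into one effective configuration, the combined
+result can be inspected with the :code:`print` subcommand introduced in this change.
+A minimal sketch; the file names below are placeholders:
+
+.. code-block:: bash
+
+    logprep print /path/to/pipeline.yml /path/to/connectors.yml --output json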
+ +It is possible to use environment variables in all configuration +and rule files in all places. +Environment variables have to be set in uppercase and prefixed +with :code:`LOGPREP_`, :code:`GITHUB_`, :code:`PYTEST_` or +:code:`CI_`. Lowercase variables are ignored. Forbidden +variable names are: :code:`["LOGPREP_LIST"]`, as it is already used internally. + +The following config file will be valid by setting the given environment variables: + +.. code-block:: yaml + :caption: pipeline.yml config file + + version: $LOGPREP_VERSION + process_count: $LOGPREP_PROCESS_COUNT + timeout: 0.1 + logger: + level: $LOGPREP_LOG_LEVEL + $LOGPREP_PIPELINE + $LOGPREP_INPUT + $LOGPREP_OUTPUT + + +.. code-block:: bash + :caption: setting the bash environment variables + + export LOGPREP_VERSION="1" + export LOGPREP_PROCESS_COUNT="1" + export LOGPREP_LOG_LEVEL="DEBUG" + export LOGPREP_PIPELINE=" + pipeline: + - labelername: + type: labeler + schema: quickstart/exampledata/rules/labeler/schema.json + include_parent_labels: true + specific_rules: + - quickstart/exampledata/rules/labeler/specific + generic_rules: + - quickstart/exampledata/rules/labeler/generic" + export LOGPREP_OUTPUT=" + output: + kafka: + type: confluentkafka_output + topic: producer + error_topic: producer_error + flush_timeout: 30 + send_timeout: 2 + kafka_config: + bootstrap.servers: localhost:9092" + export LOGPREP_INPUT=" + input: + kafka: + type: confluentkafka_input + topic: consumer + offset_reset_policy: smallest + kafka_config: + bootstrap.servers: localhost:9092 + group.id: test" +""" + +import json +import os from copy import deepcopy -from logging import Logger +from itertools import chain +from logging import getLogger from pathlib import Path -from typing import List +from typing import Any, Iterable, List, Optional -from colorama import Fore +from attrs import asdict, define, field, validators +from requests import RequestException +from ruamel.yaml import YAML +from ruamel.yaml.compat import StringIO from ruamel.yaml.scanner import ScannerError from logprep.abc.getter import Getter from logprep.abc.processor import Processor from logprep.factory import Factory -from logprep.factory_error import FactoryError -from logprep.factory_error import ( - InvalidConfigurationError as FactoryInvalidConfigurationError, -) -from logprep.factory_error import UnknownComponentTypeError +from logprep.factory_error import FactoryError, InvalidConfigurationError from logprep.processor.base.exceptions import InvalidRuleDefinitionError -from logprep.util.getter import GetterFactory -from logprep.util.helper import print_fcolor -from logprep.util.json_handling import dump_config_as_file +from logprep.util import getter +from logprep.util.defaults import DEFAULT_CONFIG_LOCATION +from logprep.util.getter import GetterFactory, GetterNotFoundError +from logprep.util.json_handling import list_json_files_in_directory -class InvalidConfigurationError(BaseException): - """Base class for Configuration related exceptions.""" +class MyYAML(YAML): + """helper class to dump yaml with ruamel.yaml""" + + def dump(self, data, stream=None, **kw): + inefficient = False + if stream is None: + inefficient = True + stream = StringIO() + YAML.dump(self, data, stream, **kw) + if inefficient: + return stream.getvalue() - def __init__(self, unprefixed_message: str = None, message: str = None): - if unprefixed_message is not None: - super().__init__(unprefixed_message) - elif message is not None: - super().__init__(f"Invalid Configuration: {message}") - else: - 
super().__init__("Invalid Configuration.") + +yaml = MyYAML(pure=True) class InvalidConfigurationErrors(InvalidConfigurationError): """Raise for multiple Configuration related exceptions.""" - def __init__(self, errors: List[InvalidConfigurationError]): - self.errors = errors + errors: List[InvalidConfigurationError] + + def __init__(self, errors: List[Exception]): + unique_errors = [] + for error in errors: + if not isinstance(error, InvalidConfigurationError): + error = InvalidConfigurationError(*error.args) + if error not in unique_errors: + unique_errors.append(error) + else: + if error not in unique_errors: + unique_errors.append(error) + self.errors = unique_errors super().__init__("\n".join([str(error) for error in self.errors])) -class RequiredConfigurationKeyMissingError(InvalidConfigurationError): - """Raise if required option is missing in configuration.""" +class ConfigVersionDidNotChangeError(InvalidConfigurationError): + """Raise if configuration version did not change.""" - def __init__(self, key: str): - super().__init__(f"Required option is missing: {key}") + def __init__(self): + super().__init__( + "Configuration version didn't change. Continue running with current version." + ) -class InvalidProcessorConfigurationError(InvalidConfigurationError): - """Raise if processor configuration is invalid.""" +class ConfigGetterException(InvalidConfigurationError): + """Raise if configuration getter fails.""" def __init__(self, message: str): - super().__init__(f"Invalid processor configuration: {message}") + super().__init__(message) -class InvalidInputConnectorConfigurationError(InvalidConfigurationError): - """Raise if input connector configuration is invalid.""" +class RequiredConfigurationKeyMissingError(InvalidConfigurationError): + """Raise if required option is missing in configuration.""" - def __init__(self, message: str): - super().__init__(f"Invalid input connector configuration: {message}") + def __init__(self, key: str): + super().__init__(f"Required option is missing: {key}") -class InvalidOutputConnectorConfigurationError(InvalidConfigurationError): - """Raise if output connector configuration is invalid.""" +class InvalidProcessorConfigurationError(InvalidConfigurationError): + """Raise if processor configuration is invalid.""" def __init__(self, message: str): - super().__init__(f"Invalid output connector configuration: {message}") + super().__init__(f"Invalid processor configuration: {message}") class MissingEnvironmentError(InvalidConfigurationError): @@ -79,295 +185,362 @@ def __init__(self, message: str): super().__init__(f"Environment variable(s) used, but not set: {message}") -class Configuration(dict): - """Used to create and verify a configuration dict parsed from a YAML file.""" - - _getter: Getter +@define(kw_only=True, frozen=True) +class MetricsConfig: + """the metrics config class used in Configuration""" + + enabled: bool = field(validator=validators.instance_of(bool), default=False) + port: int = field(validator=validators.instance_of(int), default=8000) + + +@define(kw_only=True) +class Configuration: + """the configuration class""" + + version: str = field( + validator=validators.instance_of(str), converter=str, default="unset", eq=True + ) + """It is optionally possible to set a version to your configuration file which + can be printed via :code:`logprep run --version config/pipeline.yml`. + This has no effect on the execution of logprep and is merely used for documentation purposes. 
+    Defaults to :code:`unset`."""
+    config_refresh_interval: Optional[int] = field(
+        validator=validators.instance_of((int, type(None))), default=None, eq=False
+    )
+    """Configures the interval in seconds on which logprep should try to reload the configuration.
+    If not configured, logprep won't reload the configuration automatically.
+    If configured, the configuration will only be reloaded if the configuration version changes.
+    If HTTP errors occur on configuration reload, `config_refresh_interval` is set to a quarter
+    of the current `config_refresh_interval` until a minimum of 5 seconds is reached.
+    Defaults to :code:`None`, which means that the configuration will not be refreshed."""
+    process_count: int = field(
+        validator=[validators.instance_of(int), validators.ge(1)], default=1, eq=False
+    )
+    """Number of logprep processes to start. Defaults to :code:`1`."""
+    timeout: float = field(
+        validator=[validators.instance_of(float), validators.gt(0)], default=5.0, eq=False
+    )
+    """Logprep tries to react to signals (like those sent by CTRL+C) within the given time.
+    The time taken for some processing steps is not always predictable, thus it is not possible to
+    ensure that this time will be adhered to.
+    However, Logprep reacts quickly for small values (< 1.0), but this requires more
+    processing power. This can be useful for testing and debugging.
+    Larger values (like 5.0) slow the reaction time down, but this requires less processing power,
+    which makes it preferable for continuous operation. Defaults to :code:`5.0`."""
+    logger: dict = field(
+        validator=validators.instance_of(dict), default={"level": "INFO"}, eq=False
+    )
+    """Logger configuration. Defaults to :code:`{"level": "INFO"}`."""
+    input: dict = field(validator=validators.instance_of(dict), factory=dict, eq=False)
+    """Input connector configuration. Defaults to :code:`{}`."""
+    output: dict = field(validator=validators.instance_of(dict), factory=dict, eq=False)
+    """Output connector configuration. Defaults to :code:`{}`."""
+    pipeline: list[dict] = field(validator=validators.instance_of(list), factory=list, eq=False)
+    """Pipeline configuration. Defaults to :code:`[]`."""
+    metrics: MetricsConfig = field(
+        validator=validators.instance_of(MetricsConfig),
+        factory=MetricsConfig,
+        converter=lambda x: MetricsConfig(**x) if isinstance(x, dict) else x,
+        eq=False,
+    )
+    """Metrics configuration. Defaults to :code:`{"enabled": False, "port": 8000}`."""
+    profile_pipelines: bool = field(default=False, eq=False)
+    """Start the profiler to profile the pipeline. Defaults to :code:`False`."""
+    print_auto_test_stack_trace: bool = field(default=False, eq=False)
+    """Print stack trace when auto test fails.
Defaults to :code:`False`.""" + + _getter: Getter = field( + validator=validators.instance_of(Getter), + default=GetterFactory.from_string(DEFAULT_CONFIG_LOCATION), + repr=False, + eq=False, + ) + + _configs: tuple["Configuration"] = field( + validator=validators.instance_of(tuple), factory=tuple, repr=False, eq=False + ) @property - def path(self): - """returns the path of the configuration""" - return f"{self._getter.protocol}://{self._getter.target}" - - @path.setter - def path(self, path): - """sets the path and getter""" - self._getter = GetterFactory.from_string(path) + def config_paths(self) -> list[str]: + """Paths of the configuration files.""" + # pylint: disable=protected-access + targets = ( + (config._getter.protocol, config._getter.target) + for config in self._configs + if config._getter + ) + # pylint: enable=protected-access + return [f"{protocol}://{target}" for protocol, target in targets] @classmethod - def create_from_yaml(cls, path: str) -> "Configuration": - """Create configuration from a YAML file. + def from_source(cls, config_path: str) -> "Configuration": + """Create configuration from an uri source. Parameters ---------- - path : str - Path of file to create configuration from. + config_path : str + uri of file to create configuration from. Returns ------- config : Configuration - Configuration object based on dictionary. + Configuration object attrs class. """ - config_getter = GetterFactory.from_string(path) try: - config_dict = config_getter.get_json() - except ValueError: + config_getter = GetterFactory.from_string(config_path) try: + config_dict = config_getter.get_json() + except (json.JSONDecodeError, ValueError): config_dict = config_getter.get_yaml() - except ScannerError as error: - print_fcolor(Fore.RED, f"Error parsing YAML file: {path}\n{error}") - sys.exit(1) - config = Configuration() - config._getter = config_getter - config.update(config_dict) + config = Configuration(**config_dict, getter=config_getter) + except TypeError as error: + raise InvalidConfigurationError( + f"Invalid configuration file: {config_path} {error.args[0]}" + ) from error + except ValueError as error: + raise InvalidConfigurationError( + f"Invalid configuration file: {config_path} {str(error)}" + ) from error + config._configs = (config,) return config - @staticmethod - def patch_yaml_with_json_connectors( - original_config_path: str, output_dir: str, input_file_path: str = None - ) -> str: - """ - Patches a given configuration file with jsonl input and output connectors, while - maintaining the input preprocessors. Additionally, the process_count is set to one and the - metrics configuration are removed, if present. + @classmethod + def from_sources(cls, config_paths: Iterable[str] = None) -> "Configuration": + """Creates configuration from a list of configuration sources. Parameters ---------- - original_config_path : str - Path to the original configuration file that should be patched - output_dir : str - Path where the patched configuration file should be saved to. That is the same - path where the jsonl connectors read and write the input/output files. - input_file_path : Optional[str] - If a concrete input file is given, then it is used in the patched input connector + config_paths : list[str] + List of configuration sources (URI) to create configuration from. 
Returns ------- - patched_config_path : str - The path to the patched configuration file - """ - configuration = GetterFactory.from_string(original_config_path).get_yaml() - configured_input = configuration.get("input", {}) - input_file_path = input_file_path if input_file_path else f"{output_dir}/input.json" - input_type = "jsonl_input" if input_file_path.endswith(".jsonl") else "json_input" - configuration["input"] = { - "patched_input": { - "type": input_type, - "documents_path": input_file_path, - } - } - if configured_input: - input_name = list(configured_input.keys())[0] - preprocessors = configured_input.get(input_name, {}).get("preprocessing", {}) - if preprocessors: - configuration["input"]["patched_input"]["preprocessing"] = preprocessors - configuration["output"] = { - "patched_output": { - "type": "jsonl_output", - "output_file": f"{output_dir}/output.json", - "output_file_custom": f"{output_dir}/output_custom.json", - "output_file_error": f"{output_dir}/output_error.json", - } - } - configuration["process_count"] = 1 - if "metrics" in configuration: - del configuration["metrics"] - patched_config_path = Path(output_dir) / "patched_config.yml" - dump_config_as_file(str(patched_config_path), configuration) - return str(patched_config_path) - - def verify(self, logger: Logger): - """Verify the configuration.""" - errors = self._check_for_errors(logger) - self._print_and_raise_errors(errors) - for error in errors: - raise error - - def verify_pipeline_only(self, logger: Logger): - """Verify the configuration only for the pipeline. - - This is used to check rules where it is not necessary to start the whole framework. - """ - errors = [] - try: - self._verify_pipeline(logger) - except InvalidConfigurationError as error: - errors.append(error) - self._print_and_raise_errors(errors) + config : Configuration + resulting configuration object. - def verify_pipeline_without_processor_outputs(self, logger: Logger): - """Verify the configuration only for the pipeline, but ignore processor output errors. - This is used to check if the configuration is valid inside the auto rule tester and the - rule corpus tester, as the configuration does not have an output there. 
""" + if not config_paths: + config_paths = [DEFAULT_CONFIG_LOCATION] errors = [] - try: - self._verify_pipeline_without_processor_outputs(logger) - except InvalidConfigurationError as error: - errors.append(error) - self._print_and_raise_errors(errors) - - def _check_for_errors(self, logger: Logger) -> List[InvalidConfigurationError]: - errors = [] - try: - self._verify_environment() - except MissingEnvironmentError as error: - errors.append(error) - try: - self._verify_required_keys_exist() - except InvalidConfigurationError as error: - errors.append(error) - try: - self._verify_values_make_sense() - except InvalidConfigurationError as error: - errors.append(error) - try: - self._verify_input(logger) - except InvalidConfigurationError as error: - errors.append(error) - try: - self._verify_output(logger) - except InvalidConfigurationError as error: - errors.append(error) - try: - self._verify_pipeline(logger) - except InvalidConfigurationError as error: - errors.append(error) - if self.get("metrics", {}): + configs = [] + for config_path in config_paths: try: - self._verify_metrics_config() + config = Configuration.from_source(config_path) + configs.append(config) + except (GetterNotFoundError, RequestException) as error: + raise ConfigGetterException(f"{config_path} {error}") from error + except FileNotFoundError as error: + raise ConfigGetterException( + f"One or more of the given config file(s) does not exist: {error.filename}\n", + ) from error + except ScannerError as error: + raise ConfigGetterException( + f"Invalid yaml or json file: {config_path} {error.problem}\n" + ) from error except InvalidConfigurationError as error: errors.append(error) - return errors - - def _verify_environment(self): - if self._getter.missing_env_vars: - missing_env_error = MissingEnvironmentError(", ".join(self._getter.missing_env_vars)) - raise InvalidConfigurationErrors([missing_env_error]) - - def _verify_required_keys_exist(self): - required_keys = ["process_count", "timeout"] - + configuration = Configuration() + configuration._configs = tuple(configs) + configuration._set_attributes_from_configs() + try: + configuration._build_merged_pipeline() + except InvalidConfigurationErrors as error: + errors = [*errors, *error.errors] + try: + configuration._verify() + except InvalidConfigurationErrors as error: + errors = [*errors, *error.errors] + if errors: + raise InvalidConfigurationErrors(errors) + return configuration + + def as_dict(self) -> dict: + """Return the configuration as dict.""" + return asdict( + self, + filter=lambda attribute, _: attribute.name not in ("_getter", "_configs"), + recurse=True, + ) + + def as_json(self, indent=None) -> str: + """Return the configuration as json string.""" + return json.dumps(self.as_dict(), indent=indent) + + def as_yaml(self) -> str: + """Return the configuration as yaml string.""" + return yaml.dump(self.as_dict()) + + def reload(self) -> None: + """Reload the configuration.""" errors = [] - for key in required_keys: - if key not in self: - errors.append(RequiredConfigurationKeyMissingError(key)) + try: + new_config = Configuration.from_sources(self.config_paths) + if new_config == self: + raise ConfigVersionDidNotChangeError() + self._configs = new_config._configs # pylint: disable=protected-access + self._set_attributes_from_configs() + self.pipeline = new_config.pipeline + except InvalidConfigurationErrors as error: + errors = [*errors, *error.errors] if errors: raise InvalidConfigurationErrors(errors) - def _verify_values_make_sense(self): - errors = 
[] - if "process_count" in self and self["process_count"] < 1: - errors.append( - InvalidConfigurationError( - message=f"Process count must be an integer of one or larger, not: " - f'{self["process_count"]}' - ) - ) - if "pipeline" in self and not self["pipeline"]: - errors.append( - InvalidConfigurationError(message='"pipeline" must contain at least one item!') + def _set_attributes_from_configs(self) -> None: + for attribute in filter(lambda x: x.repr, self.__attrs_attrs__): + setattr( + self, + attribute.name, + self._get_last_non_falsy_value(self._configs, attribute.name), ) + def _build_merged_pipeline(self): + pipelines = (config.pipeline for config in self._configs if config.pipeline) + pipeline = list(chain(*pipelines)) + errors = [] + pipeline_with_loaded_rules = [] + for processor_definition in pipeline: + try: + processor_definition_with_rules = self._load_rule_definitions(processor_definition) + pipeline_with_loaded_rules.append(processor_definition_with_rules) + except (FactoryError, TypeError, ValueError, InvalidRuleDefinitionError) as error: + errors.append(error) if errors: raise InvalidConfigurationErrors(errors) + self.pipeline = pipeline_with_loaded_rules + + def _load_rule_definitions(self, processor_definition: dict) -> dict: + processor_definition = deepcopy(processor_definition) + _ = Factory.create(processor_definition, logger=getLogger(__name__)) + processor_name, processor_config = processor_definition.popitem() + for rule_tree_name in ("specific_rules", "generic_rules"): + rules_targets = self._resolve_directories(processor_config.get(rule_tree_name, [])) + rules_definitions = list( + chain(*[self._get_dict_list_from_target(target) for target in rules_targets]) + ) + processor_config[rule_tree_name] = rules_definitions + return {processor_name: processor_config} - def _verify_input(self, logger): + @staticmethod + def _get_dict_list_from_target(rule_target: str | dict) -> list[dict]: + """Create a rule from a file.""" + if isinstance(rule_target, dict): + return [rule_target] + content = GetterFactory.from_string(rule_target).get() try: - _ = Factory.create(self["input"], logger) - except FactoryError as error: - raise InvalidInputConnectorConfigurationError(str(error)) from error - except TypeError as error: - msg = self._format_type_error(error) - raise InvalidInputConnectorConfigurationError(msg) from error - except KeyError as error: - raise RequiredConfigurationKeyMissingError("input") from error + rule_data = json.loads(content) + except ValueError: + rule_data = yaml.load_all(content) + if isinstance(rule_data, dict): + return [rule_data] # pragma: no cover + return list(rule_data) - def _verify_output(self, logger): - try: - output_configs = self.get("output") - output_names = list(output_configs.keys()) - for output_name in output_names: - output_config = output_configs.get(output_name) - Factory.create({output_name: output_config}, logger) - except FactoryError as error: - raise InvalidOutputConnectorConfigurationError(str(error)) from error - except TypeError as error: - msg = self._format_type_error(error) - raise InvalidOutputConnectorConfigurationError(msg) from error - except (AttributeError, KeyError) as error: - raise RequiredConfigurationKeyMissingError("output") from error + @staticmethod + def _resolve_directories(rule_sources: list) -> list: + """resolves directories to a list of files or rule definitions + + Parameters + ---------- + rule_sources : list + a list of files, directories or rule definitions + + Returns + ------- + list + a list 
of files and rule definitions + """ + resolved_sources = [] + for rule_source in rule_sources: + if isinstance(rule_source, dict): + resolved_sources.append(rule_source) + continue + getter_instance = getter.GetterFactory.from_string(rule_source) + if getter_instance.protocol == "file": + if Path(getter_instance.target).is_dir(): + paths = list_json_files_in_directory(getter_instance.target) + for file_path in paths: + resolved_sources.append(file_path) + else: + resolved_sources.append(rule_source) + else: + resolved_sources.append(rule_source) + return resolved_sources @staticmethod - def _format_type_error(error: TypeError) -> str: - msg = str(error) - if "missing" in str(error): - parameters = re.split(r"argument(s)?:", str(error))[-1].strip() - msg = f"Required option(s) are missing: {parameters}." - elif "unexpected" in str(error): - parameter = str(error).rsplit("argument ", maxsplit=1)[-1].strip() - msg = f"Unknown option: {parameter}." - return msg - - def _verify_pipeline(self, logger: Logger): - self._verify_pipeline_key() + def _get_last_non_falsy_value(configs: list["Configuration"], attribute: str) -> Any: + if configs: + values = [getattr(config, attribute) for config in configs] + for value in reversed(values): + if value: + return value + return values[-1] + return getattr(Configuration(), attribute) + + def _verify(self): + """Verify the configuration.""" errors = [] - for processor_config in self["pipeline"]: - processor = self._verify_processor(errors, logger, processor_config) + try: + self._verify_environment() + except MissingEnvironmentError as error: + errors.append(error) + try: + if not self.input: + raise RequiredConfigurationKeyMissingError("input") + Factory.create(self.input, logger=getLogger(__name__)) + except Exception as error: # pylint: disable=broad-except + errors.append(error) + if not self.output: + errors.append(RequiredConfigurationKeyMissingError("output")) + else: + for output_name, output_config in self.output.items(): + try: + Factory.create({output_name: output_config}, logger=getLogger(__name__)) + except Exception as error: # pylint: disable=broad-except + errors.append(error) + for processor_config in self.pipeline: try: + processor = Factory.create(deepcopy(processor_config), logger=getLogger(__name__)) self._verify_rules(processor) - except InvalidRuleDefinitionError as error: + except (FactoryError, TypeError, ValueError, InvalidRuleDefinitionError) as error: errors.append(error) try: self._verify_processor_outputs(processor_config) - except InvalidProcessorConfigurationError as error: + except Exception as error: # pylint: disable=broad-except errors.append(error) if errors: raise InvalidConfigurationErrors(errors) - def _verify_pipeline_without_processor_outputs(self, logger: Logger): - self._verify_pipeline_key() - errors = [] - for processor_config in self["pipeline"]: - self._verify_processor(errors, logger, processor_config) - if errors: - raise InvalidConfigurationErrors(errors) - - def _verify_pipeline_key(self): - if not self.get("pipeline"): - raise RequiredConfigurationKeyMissingError("pipeline") - if not isinstance(self["pipeline"], list): - error = InvalidConfigurationError( - '"pipeline" must be a list of processor dictionary configurations!' 
- ) - raise InvalidConfigurationErrors([error]) + def _verify_processor_outputs(self, processor_config): + processor_config = deepcopy(processor_config) + processor_name, processor_config = processor_config.popitem() + if "outputs" not in processor_config: + return + outputs = processor_config.get("outputs") + for output in outputs: + for output_name, _ in output.items(): + if output_name not in self.output: + raise InvalidProcessorConfigurationError( + f"{processor_name}: output '{output_name}' does not exist in logprep outputs" # pylint: disable=line-too-long + ) - def _verify_processor(self, errors, logger, processor_config): - processor = None - try: - processor = Factory.create(processor_config, logger) - except (FactoryInvalidConfigurationError, UnknownComponentTypeError) as error: - errors.append( - InvalidProcessorConfigurationError(f"{list(processor_config.keys())[0]} - {error}") - ) - except TypeError as error: - msg = self._format_type_error(error) - errors.append( - InvalidProcessorConfigurationError(f"{list(processor_config.keys())[0]} - {msg}") - ) - except InvalidRuleDefinitionError: - errors.append( - InvalidConfigurationError( - "Could not verify configuration for processor instance " - f"'{list(processor_config.keys())[0]}', because it has invalid rules." + def _verify_environment(self): + # pylint: disable=protected-access + getters = (config._getter for config in self._configs if config._getter) + # pylint: enable=protected-access + missing_env_vars = tuple(chain(*[getter.missing_env_vars for getter in getters])) + if missing_env_vars: + raise MissingEnvironmentError(", ".join(missing_env_vars)) + if "PROMETHEUS_MULTIPROC_DIR" in os.environ: + prometheus_multiproc_path = os.environ["PROMETHEUS_MULTIPROC_DIR"] + if not Path(prometheus_multiproc_path).exists(): + raise InvalidConfigurationError( + ( + "PROMETHEUS_MULTIPROC_DIR is set, but " + f"'{prometheus_multiproc_path}' does not exist" + ) ) - ) - return processor - def _verify_rules(self, processor: Processor): - if not processor: - return + def _verify_rules(self, processor: Processor) -> None: rule_ids = [] for rule in processor.rules: if rule.id in rule_ids: @@ -376,46 +549,12 @@ def _verify_rules(self, processor: Processor): if not hasattr(processor.rule_class, "outputs"): continue self._verify_outputs(processor, rule) - duplicates = [item for item in rule_ids if rule_ids.count(item) > 1] - if duplicates: - raise InvalidRuleDefinitionError(f"Duplicate rule ids: {duplicates}") - def _verify_outputs(self, processor, rule): + def _verify_outputs(self, processor: Processor, rule) -> None: for output in rule.outputs: for output_name, _ in output.items(): - if output_name not in self["output"]: + if output_name not in self.output: raise InvalidRuleDefinitionError( f"{processor.describe()}: output" f" '{output_name}' does not exist in logprep outputs" ) - - def _verify_processor_outputs(self, processor_config): - processor_config = deepcopy(processor_config) - processor_name, processor_config = processor_config.popitem() - if "outputs" not in processor_config: - return - if "output" not in self: - return - outputs = processor_config.get("outputs") - for output in outputs: - for output_name, _ in output.items(): - if output_name not in self["output"]: - raise InvalidProcessorConfigurationError( - f"{processor_name}: output '{output_name}' does not exist in logprep outputs" - ) - - def _verify_metrics_config(self): - metrics_config = self.get("metrics") - if metrics_config: - errors = [] - if "enabled" not in 
metrics_config:
-                errors.append(RequiredConfigurationKeyMissingError("metrics > enabled"))
-            if errors:
-                raise InvalidConfigurationErrors(errors)
-
-    @staticmethod
-    def _print_and_raise_errors(errors: List[BaseException]):
-        for error in errors:
-            print_fcolor(Fore.RED, str(error))
-        for error in errors:
-            raise error
diff --git a/logprep/util/defaults.py b/logprep/util/defaults.py
new file mode 100644
index 000000000..d0af78353
--- /dev/null
+++ b/logprep/util/defaults.py
@@ -0,0 +1,2 @@
+"""Default values for logprep."""
+DEFAULT_CONFIG_LOCATION = "file:///etc/logprep/pipeline.yml"
diff --git a/logprep/util/getter.py b/logprep/util/getter.py
index 02077dd9a..9b88178db 100644
--- a/logprep/util/getter.py
+++ b/logprep/util/getter.py
@@ -1,6 +1,7 @@
 """Content getters provide a shared interface to get content from targets.
 They are returned by the GetterFactory.
 """
+
 import os
 import re
 from collections import defaultdict
@@ -14,10 +15,11 @@
 from requests.auth import HTTPBasicAuth
 
 from logprep._version import get_versions
+from logprep.abc.exceptions import LogprepException
 from logprep.abc.getter import Getter
 
 
-class GetterNotFoundError(BaseException):
+class GetterNotFoundError(LogprepException):
     """Is raised if getter is not found."""
 
     def __init__(self, message) -> None:
diff --git a/logprep/util/helper.py b/logprep/util/helper.py
index 04fd0f8d8..27aba6adc 100644
--- a/logprep/util/helper.py
+++ b/logprep/util/helper.py
@@ -1,13 +1,20 @@
 """This module contains helper functions that are shared by different modules."""
+
 import re
 import sys
 from functools import lru_cache, partial, reduce
 from os import remove
-from typing import Optional, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 from colorama import Back, Fore
 from colorama.ansi import AnsiBack, AnsiFore
 
+from logprep.util.defaults import DEFAULT_CONFIG_LOCATION
+from versioneer import get_versions
+
+if TYPE_CHECKING:  # pragma: no cover
+    from logprep.util.configuration import Configuration
+
 
 def color_print_line(
     back: Optional[Union[str, AnsiBack]], fore: Optional[Union[str, AnsiBack]], message: str
@@ -308,3 +315,22 @@ def get_dict_size_in_byte(dictionary: dict) -> int:
         elements_size = sum(map(get_dict_size_in_byte, dictionary))
         return size + elements_size
     return size
+
+
+def get_versions_string(config: "Configuration" = None) -> str:
+    """
+    Returns the version string. If a configuration was found, its version
+    is included as well.
+    """
+    versions = get_versions()
+    padding = 25
+    version_string = f"{'python version:'.ljust(padding)}{sys.version.split()[0]}"
+    version_string += f"\n{'logprep version:'.ljust(padding)}{versions['version']}"
+    if config:
+        config_version = (
+            f"{config.version}, {', '.join(config.config_paths) if config.config_paths else 'None'}"
+        )
+    else:
+        config_version = f"no configuration found in {', '.join([DEFAULT_CONFIG_LOCATION])}"
+    version_string += f"\n{'configuration version:'.ljust(padding)}{config_version}"
+    return version_string
diff --git a/logprep/util/json_handling.py b/logprep/util/json_handling.py
index 71c31886f..7b755df19 100644
--- a/logprep/util/json_handling.py
+++ b/logprep/util/json_handling.py
@@ -3,8 +3,6 @@
 import os
 from typing import List
 
-from yaml import safe_dump
-
 
 def list_json_files_in_directory(directory: str) -> List[str]:
     """
@@ -34,22 +32,6 @@ def list_json_files_in_directory(directory: str) -> List[str]:
     return valid_file_paths
 
 
-def dump_config_as_file(config_path, config):
-    """
-    Saves a config file based on the given config dictionary.
- - Parameters - ---------- - config_path: str - The path were the File should be saved - config: dict - The configuration that should be saved - """ - - with open(config_path, "w", encoding="utf8") as generated_config_file: - safe_dump(config, generated_config_file) - - def parse_jsonl(jsonl_path): """ Read and parse all json events from a given jsonl file. diff --git a/logprep/util/pipeline_profiler.py b/logprep/util/pipeline_profiler.py index 4f0e9301a..b81a3938f 100644 --- a/logprep/util/pipeline_profiler.py +++ b/logprep/util/pipeline_profiler.py @@ -1,9 +1,9 @@ """This module implements a pipeline profiler that can be activated in the config.""" import cProfile +import os import pstats from datetime import datetime -import os class PipelineProfiler: diff --git a/logprep/util/rule_dry_runner.py b/logprep/util/rule_dry_runner.py index 27bc9c297..774ce2818 100644 --- a/logprep/util/rule_dry_runner.py +++ b/logprep/util/rule_dry_runner.py @@ -11,15 +11,15 @@ .. code-block:: bash :caption: Directly with Python - PYTHONPATH="." python3 logprep/run_logprep.py $CONFIG --dry-run $EVENTS + PYTHONPATH="." python3 logprep/run_logprep.py test dry-run $CONFIG $EVENTS .. code-block:: bash :caption: With a PEX file - logprep.pex $CONFIG --dry-run $EVENTS + logprep.pex test dry-run $CONFIG $EVENTS Where :code:`$CONFIG` is the path to a configuration file -(see :doc:`configuration/configurationdata`). +(see :ref:`configuration`). The only required section in the configuration is :code:`pipeline` (see tests/testdata/config/config-dry-run.yml for an example). The remaining options are set internally or are being ignored. @@ -45,12 +45,15 @@ from copy import deepcopy from difflib import ndiff from functools import cached_property +from pathlib import Path from colorama import Back, Fore from ruamel.yaml import YAML from logprep.framework.pipeline import Pipeline -from logprep.util.auto_rule_tester.auto_rule_corpus_tester import align_extra_output_formats +from logprep.util.auto_rule_tester.auto_rule_corpus_tester import ( + align_extra_output_formats, +) from logprep.util.configuration import Configuration from logprep.util.getter import GetterFactory from logprep.util.helper import color_print_line, color_print_title, recursive_compare @@ -67,15 +70,22 @@ def _tmp_path(self): @cached_property def _pipeline(self): - patched_config_path = Configuration.patch_yaml_with_json_connectors( - original_config_path=self._config_path, - output_dir=self._tmp_path, - input_file_path=self._input_file_path, - ) - config = Configuration.create_from_yaml(patched_config_path) - config.verify_pipeline_without_processor_outputs(self._logger) - del config["output"] - return Pipeline(config=config) + patched_config = Configuration() + patched_config.input = { + "patched_input": { + "type": f"{'json' if self._use_json else 'jsonl'}_input", + "documents_path": str(self._input_file_path), + } + } + input_config = self._config.input + connector_name = list(input_config.keys())[0] + if "preprocessing" in input_config[connector_name]: + patched_config.input["patched_input"] |= { + "preprocessing": input_config[connector_name]["preprocessing"] + } + patched_config.pipeline = self._config.pipeline + pipeline = Pipeline(config=patched_config) + return pipeline @cached_property def _input_documents(self): @@ -85,10 +95,10 @@ def _input_documents(self): return document_getter.get_jsonl() def __init__( - self, input_file_path: str, config_path: str, full_output: bool, use_json: bool, logger + self, input_file_path: str, config: 
Configuration, full_output: bool, use_json: bool, logger ): self._input_file_path = input_file_path - self._config_path = config_path + self._config = config self._full_output = full_output self._use_json = use_json self._logger = logger diff --git a/logprep/util/schema_and_rule_checker.py b/logprep/util/schema_and_rule_checker.py deleted file mode 100644 index 50d1015b0..000000000 --- a/logprep/util/schema_and_rule_checker.py +++ /dev/null @@ -1,216 +0,0 @@ -# !/usr/bin/python3 - -"""Runner for testing schemas and rules""" - -from argparse import ArgumentParser -from collections.abc import Iterable -from json.decoder import JSONDecodeError -from logging import Logger -from os import walk -from os.path import join -from typing import List, Optional - -from colorama import Fore - -from logprep.abc.processor import Processor -from logprep.filter.lucene_filter import LuceneFilterError -from logprep.processor.base.exceptions import ( - InvalidRuleDefinitionError, - MismatchedRuleDefinitionError, -) -from logprep.processor.base.rule import Rule -from logprep.processor.labeler.labeling_schema import ( - InvalidLabelingSchemaFileError, - LabelingSchema, -) -from logprep.util.configuration import Configuration - - -class SchemaAndRuleChecker: - """Check validity of schema and rules.""" - - def __init__(self): - self.errors = [] - - @staticmethod - def _parse_command_line_arguments(): - argument_parser = ArgumentParser() - argument_parser.add_argument("--labeling-schema", help="Path to labeling schema file") - argument_parser.add_argument("--labeling-rules", help="Path to labeling rule directory") - argument_parser.add_argument( - "--normalization-rules", help="Path to normalizer rule directory" - ) - argument_parser.add_argument( - "--pseudonymization-rules", help="Path to pseudonymizer rule directory" - ) - - arguments = argument_parser.parse_args() - return arguments - - def _print_valid(self, msg: str): - if not self.errors: - print(Fore.GREEN + msg) - print(Fore.RESET, end="") - - def _print_errors(self): - for error in self.errors: - print(Fore.RED + error) - print(Fore.RESET, end="") - - @staticmethod - def init_additional_grok_patterns(rule_class: Rule, config: dict): - if isinstance(config, dict) and config.get("grok_patterns"): - rule_class.additional_grok_patterns = config.get("grok_patterns") - - @staticmethod - def _get_pipeline(config_path: str) -> Iterable: - config_path = Configuration().create_from_yaml(config_path) - pipeline = config_path["pipeline"] - return pipeline - - def _get_rule_and_schema_paths_from_config(self, config_path: str, processor_type: Processor): - pipeline = self._get_pipeline(config_path) - for processor in pipeline: - options = next(iter(processor.values())) - if options["type"] == processor_type: - rules = [] - if options.get("rules") is not None: - rules = options["rules"] - elif None not in (options.get("specific_rules"), options.get("generic_rules")): - rules = options["specific_rules"] + options["generic_rules"] - yield options.get("schema"), rules - - def _get_config_values(self, config_path, processor_type): - pipeline = self._get_pipeline(config_path) - for processor in pipeline: - options = next(iter(processor.values())) - if options["type"] == processor_type: - return options - - @staticmethod - def _log_error_message(error: KeyError, logger: Logger): - logger.critical( - f"Key {error} does not exist in configuration file! Rules can't be " f"validated!" 
- ) - - def validate_rules( - self, config_path: str, processor_type: Processor, rule_class: Rule, logger: Logger - ) -> bool: - """Validate rule for processor. - - Parameters - ---------- - config_path : dict - Path to configuration file - processor_type : Processor - Type of processor to validate rules for. - rule_class : Rule - Type of rule to validate rules for. - logger : Logger - Logger to use. - - Returns - ------- - valid : bool - Signifies if rule is valid or not. - - """ - try: - options = self._get_config_values(config_path, processor_type) - self.init_additional_grok_patterns(rule_class, options) - - valid = True - for schema_path, rules_paths in self._get_rule_and_schema_paths_from_config( - config_path, processor_type - ): - for rules_path in rules_paths: - valid = valid and self._validate_rules_in_path( - rules_path, processor_type, rule_class, schema_path - ) - return valid - except KeyError as error: - self._log_error_message(error, logger) - - def _validate_rules_in_path( - self, - path_rules: str, - processor_type: Processor, - rule_class: Rule, - path_schema: str = None, - ): - number_of_checked_rules = 0 - if isinstance(path_rules, dict): - self.check_rule_creation_errors(rule_class, path_rules) - else: - for root, _, files in walk(path_rules): - for file in files: - number_of_checked_rules += 1 - rule_path = join(root, file) - - multi_rule = self.check_rule_creation_errors(rule_class, rule_path) - self._validate_schema(multi_rule, path_schema, rule_path) - self._print_schema_check_results(path_schema) - if not self.errors: - self._print_valid( - f"Valid {processor_type} rules in {path_rules} " - f"({number_of_checked_rules} rules checked)." - ) - - self._print_errors() - return False if self.errors else True - - def _print_schema_check_results(self, path_schema: str): - if path_schema: - self._print_valid(f"Valid labeler schema in {path_schema}.") - - def _validate_schema(self, multi_rule: list, path_schema: str, rule_path: str): - if path_schema: - schema = self._validate_schema_definition(path_schema) - if schema and multi_rule: - for rule in multi_rule: - try: - rule.conforms_to_schema(schema) - except MismatchedRuleDefinitionError as error: - self.errors.append( - f"Mismatch of rule definition in {rule_path} with schema in " - f"{path_schema}: {str(error)}" - ) - - def _validate_schema_definition(self, path_schema: str) -> LabelingSchema: - try: - schema = LabelingSchema.create_from_file(path_schema) - except InvalidLabelingSchemaFileError as error: - self.errors.append(str(error)) - else: - return schema - - def check_rule_creation_errors(self, rule_class: Rule, rule_path: str) -> Optional[List[Rule]]: - """Check for error on rule creation. - - Parameters - ---------- - rule_class : Rule - Class of rule to be tested. - rule_path : str - Path to rule to be tested. - - Returns - ------- - rule : Rule - Rule object. 
- - """ - rule = None - try: - if isinstance(rule_path, dict): - rule = rule_class.create_rules_from_target(rule_path) - elif rule_path.endswith(".json") or rule_path.endswith(".yml"): - if not rule_path.endswith("_test.json"): - rule = rule_class.create_rules_from_target(rule_path) - except InvalidRuleDefinitionError as error: - self.errors.append("Invalid rule definition in {}: {}".format(rule_path, str(error))) - except JSONDecodeError as error: - self.errors.append("JSON decoder Error in {}: {}".format(rule_path, str(error))) - except LuceneFilterError as error: - self.errors.append("Lucene Filter Error in {}: {}".format(rule_path, str(error))) - return rule diff --git a/logprep/util/time.py b/logprep/util/time.py index 148a03980..32a209289 100644 --- a/logprep/util/time.py +++ b/logprep/util/time.py @@ -1,14 +1,17 @@ """logprep time helpers module""" + from datetime import datetime, tzinfo from typing import Union from zoneinfo import ZoneInfo import ciso8601 +from logprep.abc.exceptions import LogprepException + UTC = ZoneInfo("UTC") -class TimeParserException(Exception): +class TimeParserException(LogprepException): """exception class for time parsing""" diff --git a/quickstart/exampledata/config/dummy-output.yml b/quickstart/exampledata/config/dummy-output.yml new file mode 100644 index 000000000..297d30220 --- /dev/null +++ b/quickstart/exampledata/config/dummy-output.yml @@ -0,0 +1,6 @@ +output: + opensearch: + type: dummy_output + kafka: + type: dummy_output + do_nothing: true diff --git a/tests/acceptance/test_amides.py b/tests/acceptance/test_amides.py index 83d2e7df3..02e856bdd 100644 --- a/tests/acceptance/test_amides.py +++ b/tests/acceptance/test_amides.py @@ -3,10 +3,11 @@ # pylint: disable=missing-docstring from logging import DEBUG, basicConfig, getLogger +from pathlib import Path import pytest -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import get_test_output basicConfig(level=DEBUG, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s") @@ -15,7 +16,7 @@ @pytest.fixture def config(): - config_yml = { + config_dict = { "process_count": 1, "timeout": 0.1, "profile_pipelines": True, @@ -46,14 +47,14 @@ def config(): }, } - return config_yml + return Configuration(**config_dict) -def test_amides(tmp_path, config): - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) +def test_amides(tmp_path: Path, config: Configuration): + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) - test_output = get_test_output(config_path) + test_output = get_test_output(str(config_path)) test_output_documents = [event for event in test_output[0] if event.get("amides")] attributed_documents = [ event for event in test_output_documents if event.get("amides").get("attributions") diff --git a/tests/acceptance/test_config_refresh.py b/tests/acceptance/test_config_refresh.py index 9de605a48..5d5969886 100644 --- a/tests/acceptance/test_config_refresh.py +++ b/tests/acceptance/test_config_refresh.py @@ -1,5 +1,4 @@ # pylint: disable=missing-docstring -import json from pathlib import Path from ruamel.yaml import YAML @@ -16,25 +15,27 @@ def teardown_function(): def test_two_times_config_refresh_after_5_seconds(tmp_path): - config = Configuration.create_from_yaml("tests/testdata/config/config.yml") - config.update({"config_refresh_interval": 5, "metrics": {"enabled": False}}) + config = 
Configuration.from_sources(["tests/testdata/config/config.yml"]) + config.config_refresh_interval = 5 + config.metrics = {"enabled": False} config_path = tmp_path / "generated_config.yml" - config_path.write_text(json.dumps(config)) + config_path.write_text(config.as_json()) proc = start_logprep(config_path) wait_for_output(proc, "Config refresh interval is set to: 5 seconds", test_timeout=5) - config.update({"version": 2}) - config_path.write_text(json.dumps(config)) + config.version = "2" + config_path.write_text(config.as_json()) wait_for_output(proc, "Successfully reloaded configuration", test_timeout=7) - config.update({"version": "other version"}) - config_path.write_text(json.dumps(config)) + config.version = "other version" + config_path.write_text(config.as_json()) wait_for_output(proc, "Successfully reloaded configuration", test_timeout=6) def test_no_config_refresh_after_5_seconds(tmp_path): - config = Configuration.create_from_yaml("tests/testdata/config/config.yml") - config.update({"config_refresh_interval": 5, "metrics": {"enabled": False}}) + config = Configuration.from_sources(["tests/testdata/config/config.yml"]) + config.config_refresh_interval = 5 + config.metrics = {"enabled": False} config_path = tmp_path / "generated_config.yml" - config_path.write_text(json.dumps(config)) + config_path.write_text(config.as_json()) proc = start_logprep(config_path) wait_for_output(proc, "Config refresh interval is set to: 5 seconds", test_timeout=5) wait_for_output( diff --git a/tests/acceptance/test_file_input.py b/tests/acceptance/test_file_input.py index 7f1e18a54..3e3ab59a9 100644 --- a/tests/acceptance/test_file_input.py +++ b/tests/acceptance/test_file_input.py @@ -5,7 +5,7 @@ import pytest -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import ( get_default_logprep_config, start_logprep, @@ -53,7 +53,7 @@ def config_fixture(): } ] config = get_default_logprep_config(pipeline, with_hmac=False) - config["input"] = { + config.input = { "testinput": { "type": "file_input", "logfile_path": "", @@ -74,13 +74,13 @@ def teardown_function(): stop_logprep() -def test_file_input_accepts_message_for_single_pipeline(tmp_path, config): +def test_file_input_accepts_message_for_single_pipeline(tmp_path, config: Configuration): output_path = tmp_path / "output.jsonl" input_path = tmp_path / "input.log" - config["input"]["testinput"]["logfile_path"] = str(input_path) - config["output"] = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.input["testinput"]["logfile_path"] = str(input_path) + config.output = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) write_file(str(input_path), test_initial_log_data) proc = start_logprep(config_path) wait_for_output(proc, "Logprep INFO : Log level set to 'INFO'") @@ -88,14 +88,14 @@ def test_file_input_accepts_message_for_single_pipeline(tmp_path, config): assert test_initial_log_data[0] in output_path.read_text() -def test_file_input_accepts_message_for_two_pipelines(tmp_path, config): - config["process_count"] = 2 +def test_file_input_accepts_message_for_two_pipelines(tmp_path, config: Configuration): + config.process_count = 2 output_path = tmp_path / "output.jsonl" input_path = tmp_path / "input.log" - 
config["input"]["testinput"]["logfile_path"] = str(input_path) - config["output"] = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.input["testinput"]["logfile_path"] = str(input_path) + config.output = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) write_file(str(input_path), test_initial_log_data) proc = start_logprep(config_path) wait_for_output(proc, "Logprep INFO : Log level set to 'INFO'") diff --git a/tests/acceptance/test_full_configuration.py b/tests/acceptance/test_full_configuration.py index e4a621d5e..8466c79d4 100644 --- a/tests/acceptance/test_full_configuration.py +++ b/tests/acceptance/test_full_configuration.py @@ -6,7 +6,6 @@ import requests -from logprep.util.json_handling import dump_config_as_file from tests.acceptance.util import ( HTTPServerForTesting, convert_to_http_config, @@ -25,9 +24,9 @@ def teardown_function(): def test_start_of_logprep_with_full_configuration_from_file(tmp_path): pipeline = get_full_pipeline(exclude=["normalizer"]) config = get_default_logprep_config(pipeline, with_hmac=False) - config.get("output").update({"kafka": {"type": "dummy_output", "default": False}}) - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.output.update({"kafka": {"type": "dummy_output", "default": False}}) + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml(), encoding="utf-8") proc = start_logprep(config_path) output = proc.stdout.readline().decode("utf8") while True: @@ -44,20 +43,20 @@ def test_start_of_logprep_with_full_configuration_from_file(tmp_path): def test_start_of_logprep_with_full_configuration_http(): pipeline = get_full_pipeline(exclude=["normalizer"]) config = get_default_logprep_config(pipeline, with_hmac=False) - config.get("output").update({"kafka": {"type": "dummy_output", "default": False}}) + config.output.update({"kafka": {"type": "dummy_output", "default": False}}) endpoint = "http://localhost:32000" config = convert_to_http_config(config, endpoint) - config_path = "generated_config.yml" - dump_config_as_file(config_path, config) + config_path = Path("generated_config.yml") + config_path.write_text(config.as_yaml(), encoding="utf-8") with HTTPServerForTesting.run_in_thread(): - proc = start_logprep(f"{endpoint}/{config_path}") + proc = start_logprep(f"{endpoint}/{str(config_path)}") output = proc.stdout.readline().decode("utf8") while True: - assert not re.search("Invalid", output) - assert not re.search("Exception", output) - assert not re.search("critical", output) - assert not re.search("Error", output) - assert not re.search("ERROR", output) + assert not re.search("Invalid", output), output + assert not re.search("Exception", output), output + assert not re.search("critical", output), output + assert not re.search("Error", output), output + assert not re.search("ERROR", output), output if re.search("Startup complete", output): break output = proc.stdout.readline().decode("utf8") @@ -124,31 +123,29 @@ def test_logprep_exposes_prometheus_metrics(tmp_path): # normalizer is excluded because of deprecation pipeline = get_full_pipeline(exclude=["requester", "selective_extractor", "normalizer"]) config = get_default_logprep_config(pipeline, with_hmac=False) - config |= { - "version": "my_custom_version", - 
"config_refresh_interval": 300, - "metrics": {"enabled": True, "port": 8000}, - "input": { - "fileinput": { - "type": "file_input", - "logfile_path": str(input_file_path), - "start": "begin", - "interval": 1, - "watch_file": True, - } + config.version = "my_custom_version" + config.config_refresh_interval = 300 + config.metrics = {"enabled": True, "port": 8000} + config.input = { + "fileinput": { + "type": "file_input", + "logfile_path": str(input_file_path), + "start": "begin", + "interval": 1, + "watch_file": True, + } + } + config.output = { + "kafka": { # the name has to be kafka for some default rules + "type": "console_output", }, - "output": { - "kafka": { # the name has to be kafka for some default rules - "type": "console_output", - }, - "second_output": { - "type": "console_output", - }, + "second_output": { + "type": "console_output", }, } - config_path = str(tmp_path / "generated_config.yml") + config_path = tmp_path / "generated_config.yml" # duplicate one processor to test that rule metrics are separated by processor names and not by type - config["pipeline"].append( + config.pipeline.append( { "calculator2": { "generic_rules": ["tests/testdata/unit/calculator/generic_rules"], @@ -157,7 +154,7 @@ def test_logprep_exposes_prometheus_metrics(tmp_path): } } ) - dump_config_as_file(config_path, config) + config_path.write_text(config.as_yaml(), encoding="utf-8") proc = start_logprep(config_path, env={"PROMETHEUS_MULTIPROC_DIR": tmp_path}) input_file_path.write_text("user root logged in\n", encoding="utf8") while True: diff --git a/tests/acceptance/test_http_input.py b/tests/acceptance/test_http_input.py index 58e89f8f8..4be0690e6 100644 --- a/tests/acceptance/test_http_input.py +++ b/tests/acceptance/test_http_input.py @@ -3,11 +3,12 @@ import os import time from logging import DEBUG, basicConfig, getLogger +from pathlib import Path import pytest import requests -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import ( get_default_logprep_config, start_logprep, @@ -31,7 +32,7 @@ def config_fixture(): } ] config = get_default_logprep_config(pipeline, with_hmac=False) - config["input"] = { + config.input = { "testinput": { "type": "http_input", "uvicorn_config": { @@ -55,11 +56,11 @@ def teardown_function(): @pytest.mark.filterwarnings("ignore:Unverified HTTPS request is being made to host '127.0.0.1'") -def test_http_input_accepts_message_for_single_pipeline(tmp_path, config): +def test_http_input_accepts_message_for_single_pipeline(tmp_path: Path, config: Configuration): output_path = tmp_path / "output.jsonl" - config["output"] = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.output = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) proc = start_logprep(config_path) wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9000", test_timeout=15) # nosemgrep @@ -70,11 +71,11 @@ def test_http_input_accepts_message_for_single_pipeline(tmp_path, config): @pytest.mark.filterwarnings("ignore:Unverified HTTPS request is being made to host '127.0.0.1'") def test_http_input_accepts_message_for_two_pipelines(tmp_path, config): - config["process_count"] = 2 + config.process_count = 2 output_path = tmp_path / "output.jsonl" - config["output"] = 
{"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.output = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) proc = start_logprep(config_path) wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9001", test_timeout=15) # nosemgrep @@ -99,12 +100,12 @@ def test_http_input_accepts_message_for_two_pipelines(tmp_path, config): @pytest.mark.skipif(os.environ.get("GITHUB_ACTIONS") == "true", reason="sometimes fails on CI") @pytest.mark.filterwarnings("ignore:Unverified HTTPS request is being made to host '127.0.0.1'") -def test_http_input_accepts_message_for_three_pipelines(tmp_path, config): - config["process_count"] = 3 +def test_http_input_accepts_message_for_three_pipelines(tmp_path: Path, config: Configuration): + config.process_count = 3 output_path = tmp_path / "output.jsonl" - config["output"] = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.output = {"testoutput": {"type": "jsonl_output", "output_file": str(output_path)}} + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) proc = start_logprep(config_path) wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9002", test_timeout=15) # nosemgrep diff --git a/tests/acceptance/test_multiple_outputs.py b/tests/acceptance/test_multiple_outputs.py index 35463f2a2..942757b2f 100644 --- a/tests/acceptance/test_multiple_outputs.py +++ b/tests/acceptance/test_multiple_outputs.py @@ -1,10 +1,11 @@ # pylint: disable=missing-docstring # pylint: disable=line-too-long import time +from pathlib import Path import pytest -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import start_logprep, stop_logprep, wait_for_output CHECK_INTERVAL = 0.1 @@ -16,65 +17,67 @@ def wait_for_interval(interval): @pytest.fixture(name="config") def get_config(): - return { - "version": "1", - "logger": {"level": "DEBUG"}, - "process_count": 1, - "timeout": 0.1, - "profile_pipelines": False, - "pipeline": [ - { - "dissector": { - "type": "dissector", - "specific_rules": ["tests/testdata/acceptance/dissector/rules/specific"], - "generic_rules": ["tests/testdata/acceptance/dissector/rules/generic"], + return Configuration( + **{ + "version": "1", + "logger": {"level": "DEBUG"}, + "process_count": 1, + "timeout": 0.1, + "profile_pipelines": False, + "pipeline": [ + { + "dissector": { + "type": "dissector", + "specific_rules": ["tests/testdata/acceptance/dissector/rules/specific"], + "generic_rules": ["tests/testdata/acceptance/dissector/rules/generic"], + } + }, + { + "selective_extractor": { + "type": "selective_extractor", + "specific_rules": [ + "tests/testdata/acceptance/selective_extractor/rules/specific" + ], + "generic_rules": [ + "tests/testdata/acceptance/selective_extractor/rules/generic" + ], + } + }, + { + "pseudonymizer": { + "type": "pseudonymizer", + "pubkey_analyst": "tests/testdata/acceptance/pseudonymizer/example_analyst_pub.pem", + "pubkey_depseudo": "tests/testdata/acceptance/pseudonymizer/example_depseudo_pub.pem", + "hash_salt": "a_secret_tasty_ingredient", + "outputs": [{"jsonl": "pseudonyms"}], + "specific_rules": [ + 
"tests/testdata/acceptance/pseudonymizer/rules_static/specific" + ], + "generic_rules": [ + "tests/testdata/acceptance/pseudonymizer/rules_static/generic" + ], + "regex_mapping": "tests/testdata/acceptance/pseudonymizer/rules_static/regex_mapping.yml", + "max_cached_pseudonyms": 1000000, + } + }, + { + "pre_detector": { + "type": "pre_detector", + "outputs": [{"jsonl": "pre_detector_topic"}], + "generic_rules": ["tests/testdata/acceptance/pre_detector/rules/generic"], + "specific_rules": ["tests/testdata/acceptance/pre_detector/rules/specific"], + "tree_config": "tests/testdata/acceptance/pre_detector/tree_config.json", + } + }, + ], + "input": { + "jsonl": { + "type": "jsonl_input", + "documents_path": "tests/testdata/input_logdata/kafka_raw_event_for_pre_detector.jsonl", } }, - { - "selective_extractor": { - "type": "selective_extractor", - "specific_rules": [ - "tests/testdata/acceptance/selective_extractor/rules/specific" - ], - "generic_rules": [ - "tests/testdata/acceptance/selective_extractor/rules/generic" - ], - } - }, - { - "pseudonymizer": { - "type": "pseudonymizer", - "pubkey_analyst": "tests/testdata/acceptance/pseudonymizer/example_analyst_pub.pem", - "pubkey_depseudo": "tests/testdata/acceptance/pseudonymizer/example_depseudo_pub.pem", - "hash_salt": "a_secret_tasty_ingredient", - "outputs": [{"jsonl": "pseudonyms"}], - "specific_rules": [ - "tests/testdata/acceptance/pseudonymizer/rules_static/specific" - ], - "generic_rules": [ - "tests/testdata/acceptance/pseudonymizer/rules_static/generic" - ], - "regex_mapping": "tests/testdata/acceptance/pseudonymizer/rules_static/regex_mapping.yml", - "max_cached_pseudonyms": 1000000, - } - }, - { - "pre_detector": { - "type": "pre_detector", - "outputs": [{"jsonl": "pre_detector_topic"}], - "generic_rules": ["tests/testdata/acceptance/pre_detector/rules/generic"], - "specific_rules": ["tests/testdata/acceptance/pre_detector/rules/specific"], - "tree_config": "tests/testdata/acceptance/pre_detector/tree_config.json", - } - }, - ], - "input": { - "jsonl": { - "type": "jsonl_input", - "documents_path": "tests/testdata/input_logdata/kafka_raw_event_for_pre_detector.jsonl", - } - }, - } + } + ) def setup_function(): @@ -85,36 +88,33 @@ def teardown_function(): stop_logprep() -def test_full_pipeline_run_with_two_outputs(tmp_path, config): +def test_full_pipeline_run_with_two_outputs(tmp_path: Path, config: Configuration): output_path1 = tmp_path / "output1.jsonl" output_path_custom1 = tmp_path / "output_custom1.jsonl" output_path_error1 = tmp_path / "output_error1.jsonl" output_path2 = tmp_path / "output2.jsonl" output_path_custom2 = tmp_path / "output_custom2.jsonl" output_path_error2 = tmp_path / "output_error2.jsonl" - config["input"]["jsonl"][ + config.input["jsonl"][ "documents_path" ] = "tests/testdata/input_logdata/selective_extractor_events.jsonl" - output = { - "output": { - "jsonl": { - "type": "jsonl_output", - "output_file": f"{str(output_path1)}", - "output_file_custom": f"{str(output_path_custom1)}", - "output_file_error": f"{str(output_path_error1)}", - }, - "second_output": { - "type": "jsonl_output", - "output_file": f"{str(output_path2)}", - "output_file_custom": f"{str(output_path_custom2)}", - "output_file_error": f"{str(output_path_error2)}", - }, + config.output = { + "jsonl": { + "type": "jsonl_output", + "output_file": f"{str(output_path1)}", + "output_file_custom": f"{str(output_path_custom1)}", + "output_file_error": f"{str(output_path_error1)}", + }, + "second_output": { + "type": "jsonl_output", + 
"output_file": f"{str(output_path2)}", + "output_file_custom": f"{str(output_path_custom2)}", + "output_file_error": f"{str(output_path_error2)}", }, } - config |= output - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) - proc = start_logprep(config_path) + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) + proc = start_logprep(str(config_path)) wait_for_output(proc, "no documents left") stop_logprep(proc) assert output_path1.read_text(), "output is not empty" diff --git a/tests/acceptance/test_pre_detection.py b/tests/acceptance/test_pre_detection.py index 3b029db93..c0dbdfb18 100644 --- a/tests/acceptance/test_pre_detection.py +++ b/tests/acceptance/test_pre_detection.py @@ -3,12 +3,12 @@ # pylint: disable=too-many-locals import json import re -from logging import basicConfig, DEBUG, getLogger +from logging import DEBUG, basicConfig, getLogger +from pathlib import Path import pytest from deepdiff import DeepDiff -from logprep.util.json_handling import dump_config_as_file from tests.acceptance.util import get_default_logprep_config, get_test_output basicConfig(level=DEBUG, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s") @@ -67,15 +67,15 @@ ) # fmt: on def test_events_pre_detected_correctly( - tmp_path, input_event, expected_output_event, expected_extra_output + tmp_path: Path, input_event, expected_output_event, expected_extra_output ): input_file_path = tmp_path / "input.json" input_file_path.write_text(json.dumps(input_event)) config = get_default_logprep_config(pipeline_config=pipeline, with_hmac=False) - config["input"]["jsonl"]["documents_path"] = str(input_file_path) - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) - logprep_output, logprep_extra_output, logprep_error_output = get_test_output(config_path) + config.input["jsonl"]["documents_path"] = str(input_file_path) + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) + logprep_output, logprep_extra_output, logprep_error_output = get_test_output(str(config_path)) assert not logprep_error_output diff = DeepDiff( expected_output_event, diff --git a/tests/acceptance/test_preprocessing.py b/tests/acceptance/test_preprocessing.py index 615fb8bf8..15c855545 100644 --- a/tests/acceptance/test_preprocessing.py +++ b/tests/acceptance/test_preprocessing.py @@ -1,10 +1,11 @@ # pylint: disable=missing-docstring # pylint: disable=no-self-use from logging import DEBUG, basicConfig, getLogger +from pathlib import Path import pytest -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import get_default_logprep_config, get_test_output basicConfig(level=DEBUG, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s") @@ -12,7 +13,7 @@ @pytest.fixture(name="config") -def get_config(): +def get_config() -> Configuration: pipeline = [ { "dissector": { @@ -26,17 +27,17 @@ def get_config(): class TestVersionInfoTargetField: - def test_preprocessor_adds_version_information(self, tmp_path, config): - config["input"]["jsonl"].update( + def test_preprocessor_adds_version_information(self, tmp_path: Path, config: Configuration): + config.input["jsonl"].update( { "documents_path": "tests/testdata/input_logdata/selective_extractor_events.jsonl", "preprocessing": {"version_info_target_field": "version_info"}, } ) - config_path = str(tmp_path / "generated_config.yml") - 
dump_config_as_file(config_path, config) - test_output, _, __ = get_test_output(config_path) + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) + test_output, _, __ = get_test_output(str(config_path)) assert test_output, "should not be empty" processed_event = test_output[0] assert processed_event.get("version_info", {}).get( diff --git a/tests/acceptance/test_selective_extractor_full_pipeline_pass.py b/tests/acceptance/test_selective_extractor_full_pipeline_pass.py index 927d05690..707e1418c 100644 --- a/tests/acceptance/test_selective_extractor_full_pipeline_pass.py +++ b/tests/acceptance/test_selective_extractor_full_pipeline_pass.py @@ -2,7 +2,7 @@ # pylint: disable=line-too-long import pytest -from logprep.util.json_handling import dump_config_as_file +from logprep.util.configuration import Configuration from tests.acceptance.util import get_default_logprep_config, get_test_output @@ -28,13 +28,13 @@ def config_fixture(): class TestSelectiveExtractor: - def test_selective_extractor_full_pipeline_pass(self, tmp_path, config): - config_path = str(tmp_path / "generated_config.yml") - config["input"]["jsonl"][ + def test_selective_extractor_full_pipeline_pass(self, tmp_path, config: Configuration): + config_path = tmp_path / "generated_config.yml" + config.input["jsonl"][ "documents_path" ] = "tests/testdata/input_logdata/selective_extractor_events.jsonl" - dump_config_as_file(config_path, config) - test_output, test_custom, _ = get_test_output(config_path) + config_path.write_text(config.as_yaml()) + test_output, test_custom, _ = get_test_output(str(config_path)) assert test_output, "should not be empty" assert test_custom, "should not be empty" assert len(test_custom) == 2, "2 events extracted" @@ -45,14 +45,14 @@ def test_selective_extractor_full_pipeline_pass(self, tmp_path, config): "event": {"action": "less_evil_action"}, } in test_output - def test_extraction_field_not_in_event(self, tmp_path, config): + def test_extraction_field_not_in_event(self, tmp_path, config: Configuration): # tests behaviour in case a field from the extraction list is not in the provided event - config_path = str(tmp_path / "generated_config.yml") - config["input"]["jsonl"][ + config_path = tmp_path / "generated_config.yml" + config.input["jsonl"][ "documents_path" ] = "tests/testdata/input_logdata/selective_extractor_events_2.jsonl" - dump_config_as_file(config_path, config) - test_output, test_custom, _ = get_test_output(config_path) + config_path.write_text(config.as_yaml()) + test_output, test_custom, _ = get_test_output(str(config_path)) assert test_output, "should not be empty" assert test_custom, "should not be empty" assert len(test_custom) == 1, "one extracted event" diff --git a/tests/acceptance/test_wineventlog_processing.py b/tests/acceptance/test_wineventlog_processing.py index 4d239cdd1..179065b02 100644 --- a/tests/acceptance/test_wineventlog_processing.py +++ b/tests/acceptance/test_wineventlog_processing.py @@ -4,7 +4,8 @@ import pytest -from logprep.util.json_handling import dump_config_as_file, parse_jsonl +from logprep.util.configuration import Configuration +from logprep.util.json_handling import parse_jsonl from tests.acceptance.util import ( get_default_logprep_config, get_difference, @@ -18,7 +19,7 @@ logger = logging.getLogger("Logprep-Test") -@pytest.fixture(name="config_template") +@pytest.fixture(name="config") def fixture_config_template(): pipeline = [ { @@ -58,37 +59,31 @@ def fixture_config_template(): ], ) def 
test_events_labeled_correctly( - tmp_path, config_template, specific_rules, generic_rules, schema, expected_output + tmp_path, config: Configuration, specific_rules, generic_rules, schema, expected_output ): # pylint: disable=too-many-arguments expected_output_path = os.path.join( "tests/testdata/acceptance/expected_result", expected_output ) + set_config(config, specific_rules, generic_rules, schema) + config.input["jsonl"]["documents_path"] = "tests/testdata/input_logdata/wineventlog_raw.jsonl" + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) - set_config(config_template, specific_rules, generic_rules, schema) - config_template["input"]["jsonl"][ - "documents_path" - ] = "tests/testdata/input_logdata/wineventlog_raw.jsonl" - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config_template) - - test_output, _, _ = get_test_output(config_path) + test_output, _, _ = get_test_output(str(config_path)) assert test_output, "should not be empty" store_latest_test_output(expected_output, test_output) - expected_output = parse_jsonl(expected_output_path) - result = get_difference(test_output, expected_output) - assert ( result["difference"][0] == result["difference"][1] ), f"Missmatch in event at line {result['event_line_no']}!" -def set_config(config_template, specific_rules, generic_rules, schema): - config_template["pipeline"][0]["labelername"]["schema"] = os.path.join("tests/testdata", schema) - config_template["pipeline"][0]["labelername"]["specific_rules"] = [ +def set_config(config: Configuration, specific_rules, generic_rules, schema): + config.pipeline[0]["labelername"]["schema"] = os.path.join("tests/testdata", schema) + config.pipeline[0]["labelername"]["specific_rules"] = [ os.path.join("tests/testdata", rule) for rule in specific_rules ] - config_template["pipeline"][0]["labelername"]["generic_rules"] = [ + config.pipeline[0]["labelername"]["generic_rules"] = [ os.path.join("tests/testdata", rule) for rule in generic_rules ] diff --git a/tests/acceptance/test_wineventlog_pseudonymization.py b/tests/acceptance/test_wineventlog_pseudonymization.py index 6a6f1dfc1..e4f25dcda 100644 --- a/tests/acceptance/test_wineventlog_pseudonymization.py +++ b/tests/acceptance/test_wineventlog_pseudonymization.py @@ -4,7 +4,8 @@ import pytest -from logprep.util.json_handling import dump_config_as_file, parse_jsonl +from logprep.util.configuration import Configuration +from logprep.util.json_handling import parse_jsonl from tests.acceptance.util import ( get_default_logprep_config, get_difference, @@ -35,7 +36,7 @@ def get_config(): return get_default_logprep_config(pipeline, with_hmac=False) -def test_events_pseudonymized_correctly(tmp_path, config): +def test_events_pseudonymized_correctly(tmp_path, config: Configuration): expected_output_file_name = "pseudonymized_win_event_log.jsonl" expected_output_path = path.join( "tests/testdata/acceptance/expected_result", expected_output_file_name @@ -48,19 +49,17 @@ def test_events_pseudonymized_correctly(tmp_path, config): event for event in expected_output if "pseudonym" in event.keys() ] - config["input"]["jsonl"][ - "documents_path" - ] = "tests/testdata/input_logdata/wineventlog_raw.jsonl" - config_path = str(tmp_path / "generated_config.yml") - dump_config_as_file(config_path, config) + config.input["jsonl"]["documents_path"] = "tests/testdata/input_logdata/wineventlog_raw.jsonl" + config_path = tmp_path / "generated_config.yml" + config_path.write_text(config.as_yaml()) 
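
The acceptance tests in this patch share one setup pattern: load a ``Configuration`` from its sources, adjust attributes directly, and serialize it back with ``as_yaml()`` (or ``as_json()``) instead of calling the removed ``dump_config_as_file`` helper. A minimal sketch of that pattern, assuming the pytest ``tmp_path`` fixture and the test configuration shipped in the repository:

.. code-block:: python

    from pathlib import Path

    from logprep.util.configuration import Configuration

    def write_test_config(tmp_path: Path) -> Path:
        # load the shared test configuration from its source file
        config = Configuration.from_sources(["tests/testdata/config/config.yml"])
        config.config_refresh_interval = 5   # attribute access replaces dict-style updates
        config.metrics = {"enabled": False}
        config_path = tmp_path / "generated_config.yml"
        # as_yaml() serializes the configuration; write_text replaces dump_config_as_file
        config_path.write_text(config.as_yaml(), encoding="utf-8")
        return config_path
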
- logprep_output, logprep_extra_output, logprep_error_output = get_test_output(config_path) + logprep_output, logprep_extra_output, logprep_error_output = get_test_output(str(config_path)) assert logprep_output, "should not be empty" assert len(logprep_error_output) == 0, "There shouldn't be any logprep errors" result = get_difference(logprep_output, expected_logprep_outputs) assert ( result["difference"][0] == result["difference"][1] - ), "Missmatch in event at line {}!".format(result["event_line_no"]) + ), f"Missmatch in event at line {result['event_line_no']}!" # FIXME: Test is only testing for the logprep outputs with the pseudonym inside, but not the # extra outputs. diff --git a/tests/acceptance/util.py b/tests/acceptance/util.py index 7ba5fc87c..ba4615ffc 100644 --- a/tests/acceptance/util.py +++ b/tests/acceptance/util.py @@ -22,6 +22,7 @@ from logprep.abc.processor import Processor from logprep.registry import Registry from logprep.runner import Runner +from logprep.util.configuration import Configuration from logprep.util.decorators import timeout from logprep.util.helper import recursive_compare, remove_file_if_exists from logprep.util.json_handling import parse_jsonl @@ -113,7 +114,7 @@ def store_latest_test_output(target_output_identifier, output_of_test): latest_output.write(json.dumps(test_output_line) + "\n") -def get_runner_outputs(patched_runner) -> list: +def get_runner_outputs(patched_runner: Runner) -> list: # pylint: disable=protected-access """ Extracts the outputs of a patched logprep runner. @@ -130,7 +131,7 @@ def get_runner_outputs(patched_runner) -> list: and errors """ parsed_outputs = [None, None, None] - output_config = list(patched_runner._configuration.get("output").values())[0] + output_config = list(patched_runner._configuration.output.values())[0] output_paths = [ output_path for key, output_path in output_config.items() if "output_file" in key ] @@ -164,13 +165,12 @@ def get_patched_runner(config_path): runner : Runner The patched logprep runner """ - runner = Runner(bypass_check_to_obtain_non_singleton_instance=True) - runner.load_configuration(config_path) + runner = Runner(Configuration.from_sources([config_path])) # patch runner to stop on empty pipeline def keep_iterating(): """generator that stops on first iteration""" - return # nosemgrep + return yield runner._keep_iterating = keep_iterating # pylint: disable=protected-access @@ -178,7 +178,7 @@ def keep_iterating(): return runner -def get_test_output(config_path): +def get_test_output(config_path: str) -> list[dict]: patched_runner = get_patched_runner(config_path) return get_runner_outputs(patched_runner=patched_runner) @@ -206,7 +206,7 @@ def poll(self, _): ... 
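
``get_default_logprep_config`` directly below now returns a ``Configuration`` built from keyword arguments rather than a plain dict. A minimal sketch of that construction path; the dissector rule directories are taken from the test data, while the input and output file paths are illustrative:

.. code-block:: python

    from logprep.util.configuration import Configuration

    config = Configuration(
        **{
            "process_count": 1,
            "timeout": 0.1,
            "pipeline": [
                {
                    "dissector": {
                        "type": "dissector",
                        "specific_rules": ["tests/testdata/acceptance/dissector/rules/specific"],
                        "generic_rules": ["tests/testdata/acceptance/dissector/rules/generic"],
                    }
                }
            ],
            "input": {"jsonl": {"type": "jsonl_input", "documents_path": "input.jsonl"}},
            "output": {"jsonl": {"type": "jsonl_output", "output_file": "output.jsonl"}},
        }
    )
    assert config.process_count == 1  # values are exposed as attributes, not dictionary keys
    print(config.as_yaml())           # and can be serialized back for start_logprep
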
-def get_default_logprep_config(pipeline_config, with_hmac=True): +def get_default_logprep_config(pipeline_config, with_hmac=True) -> Configuration: config_yml = { "version": "1", "process_count": 1, @@ -239,7 +239,7 @@ def get_default_logprep_config(pipeline_config, with_hmac=True): } } - return config_yml + return Configuration(**config_yml) def start_logprep(config_path: str, env: dict = None) -> subprocess.Popen: @@ -312,7 +312,7 @@ def get_full_pipeline(exclude=None): return [{processor_name: config} for processor_name, config in processor_configs if config] -def convert_to_http_config(config: dict, endpoint) -> dict: +def convert_to_http_config(config: Configuration, endpoint) -> dict: config = deepcopy(config) http_fields = [ "regex_mapping", @@ -324,7 +324,7 @@ def convert_to_http_config(config: dict, endpoint) -> dict: "schema", "template", ] - for processor_config in config.get("pipeline"): + for processor_config in config.pipeline: name, value = processor_config.popitem() for rule_kind in ("specific_rules", "generic_rules"): rules = Processor.resolve_directories(value.get(rule_kind)) diff --git a/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json b/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json index 051249872..9723abc3e 100644 --- a/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json +++ b/tests/testdata/auto_tests/template_replacer/rules/specific/template_replacer.json @@ -1,5 +1,7 @@ -[{ - "filter": "winlog.provider_name: \"the provider\" AND winlog.event_id: 123", - "template_replacer": {}, - "description": "" -}] \ No newline at end of file +[ + { + "filter": "winlog.provider_name: \"the provider\" AND winlog.event_id: 123", + "template_replacer": {}, + "description": "" + } +] \ No newline at end of file diff --git a/tests/testdata/config/config-auto-tests.yml b/tests/testdata/config/config-auto-tests.yml index ac85a1e66..d228b0a15 100644 --- a/tests/testdata/config/config-auto-tests.yml +++ b/tests/testdata/config/config-auto-tests.yml @@ -1,6 +1,15 @@ process_count: 1 timeout: 0.1 +input: + autorule_input: + type: dummy_input + documents: [] + +output: + dummy_output: + type: dummy_output + pipeline: - labelername: type: labeler @@ -29,7 +38,7 @@ pipeline: generic_rules: - tests/testdata/auto_tests/pre_detector/rules/generic/ outputs: - - jsonl: sre + - dummy_output: sre - pseudonymizer: type: pseudonymizer pubkey_analyst: tests/testdata/auto_tests/pseudonymizer/example_analyst_pub.pem @@ -37,7 +46,7 @@ pipeline: regex_mapping: tests/testdata/auto_tests/pseudonymizer/regex_mapping.yml hash_salt: a_secret_tasty_ingredient outputs: - - jsonl: pseudonyms + - dummy_output: pseudonyms specific_rules: - tests/testdata/auto_tests/pseudonymizer/rules/generic/ generic_rules: diff --git a/tests/testdata/config/config-only-output.yml b/tests/testdata/config/config-only-output.yml new file mode 100644 index 000000000..173c0fe09 --- /dev/null +++ b/tests/testdata/config/config-only-output.yml @@ -0,0 +1,3 @@ +output: + kafka_output: + type: dummy_output diff --git a/tests/testdata/config/config.yml b/tests/testdata/config/config.yml index b91157ba2..556f724d7 100644 --- a/tests/testdata/config/config.yml +++ b/tests/testdata/config/config.yml @@ -59,3 +59,13 @@ output: bootstrap.servers: "127.0.0.1:9092" acks: "-1" compression.type: none + kafka: + type: confluentkafka_output + topic: producer + error_topic: producer_error + flush_timeout: 30 + send_timeout: 2 + kafka_config: + 
bootstrap.servers: "127.0.0.1:9092" + acks: "-1" + compression.type: none diff --git a/tests/testdata/config/config2.yml b/tests/testdata/config/config2.yml index 8ba1980e2..d11a57b47 100644 --- a/tests/testdata/config/config2.yml +++ b/tests/testdata/config/config2.yml @@ -1,6 +1,6 @@ process_count: 2 timeout: 0.1 - +version: alternative pipeline: - labelername: type: labeler diff --git a/tests/testdata/metadata.py b/tests/testdata/metadata.py index 65e7f1cbe..460aa0a8f 100644 --- a/tests/testdata/metadata.py +++ b/tests/testdata/metadata.py @@ -1,4 +1,4 @@ -from os.path import split, join +from os.path import join, split path_to_testdata = split(__file__)[0] @@ -14,3 +14,4 @@ path_to_alternative_config = join(path_to_testdata, "config/config2.yml") path_to_invalid_config = join(path_to_testdata, "config/config-invalid.yml") path_to_invalid_yml_config = join(path_to_testdata, "config/config-yml-invalid.yml") +path_to_only_output_config = join(path_to_testdata, "config/config-only-output.yml") diff --git a/tests/unit/framework/test_pipeline.py b/tests/unit/framework/test_pipeline.py index 2e7061228..e992317e5 100644 --- a/tests/unit/framework/test_pipeline.py +++ b/tests/unit/framework/test_pipeline.py @@ -3,7 +3,7 @@ # pylint: disable=attribute-defined-outside-init from copy import deepcopy from logging import DEBUG, getLogger -from multiprocessing import Lock, active_children +from multiprocessing import Lock from unittest import mock import pytest @@ -15,7 +15,6 @@ CriticalInputParsingError, FatalInputError, InputWarning, - SourceDisconnectedWarning, ) from logprep.abc.output import ( CriticalOutputError, @@ -24,23 +23,28 @@ OutputWarning, ) from logprep.factory import Factory -from logprep.framework.pipeline import MultiprocessingPipeline, Pipeline +from logprep.framework.pipeline import Pipeline from logprep.processor.base.exceptions import ProcessingCriticalError, ProcessingWarning from logprep.processor.deleter.rule import DeleterRule -from logprep.util.getter import GetterFactory +from logprep.util.configuration import Configuration original_create = Factory.create class ConfigurationForTests: - logprep_config = { - "version": 1, - "timeout": 0.001, - "input": {"dummy": {"type": "dummy_input", "documents": [{"test": "empty"}]}}, - "output": {"dummy": {"type": "dummy_output"}}, - "pipeline": [{"mock_processor1": {"proc": "conf"}}, {"mock_processor2": {"proc": "conf"}}], - "metrics": {"period": 300, "enabled": False}, - } + logprep_config = Configuration( + **{ + "version": 1, + "timeout": 0.001, + "input": {"dummy": {"type": "dummy_input", "documents": [{"test": "empty"}]}}, + "output": {"dummy": {"type": "dummy_output"}}, + "pipeline": [ + {"mock_processor1": {"proc": "conf"}}, + {"mock_processor2": {"proc": "conf"}}, + ], + "metrics": {"enabled": False}, + } + ) lock = Lock() @@ -85,7 +89,7 @@ def test_passes_timeout_parameter_to_inputs_get_next(self, _): self.pipeline._setup() self.pipeline._input.get_next.return_value = ({}, {}) self.pipeline.process_pipeline() - timeout = self.logprep_config.get("timeout") + timeout = self.logprep_config.timeout self.pipeline._input.get_next.assert_called_with(timeout) def test_empty_documents_are_not_forwarded_to_other_processors(self, _): @@ -157,15 +161,6 @@ def test_all_events_provided_by_input_arrive_at_output(self, _): self.pipeline.run() assert self.pipeline._output["dummy"].events == expected_output_data - def test_enable_iteration_sets_iterate_to_true_stop_to_false(self, _): - assert not self.pipeline._iterate() - - 
self.pipeline._enable_iteration() - assert self.pipeline._iterate() - - self.pipeline.stop() - assert not self.pipeline._iterate() - @mock.patch("logging.Logger.error") def test_critical_output_error_is_logged_and_stored_as_failed(self, mock_error, _): self.pipeline._setup() @@ -531,13 +526,13 @@ def test_pipeline_raises_http_error_from_factory_create(self, mock_create): def test_multiple_outputs(self, _): output_config = {"kafka_output": {}, "opensearch_output": {}} - self.pipeline._logprep_config.update({"output": output_config}) + self.pipeline._logprep_config.output = output_config self.pipeline._setup() assert isinstance(self.pipeline._output, dict) assert len(self.pipeline._output) == 2 def test_output_creates_real_outputs(self, _): - self.pipeline._logprep_config["output"] = { + self.pipeline._logprep_config.output = { "dummy1": {"type": "dummy_output", "default": False}, "dummy2": {"type": "dummy_output"}, } @@ -549,7 +544,7 @@ def test_output_creates_real_outputs(self, _): assert not self.pipeline._output["dummy1"].default def test_process_pipeline_runs_scheduled_tasks(self, _): - self.pipeline._logprep_config["output"] = { + self.pipeline._logprep_config.output = { "dummy": {"type": "dummy_output"}, } with mock.patch("logprep.factory.Factory.create", original_create): @@ -573,13 +568,28 @@ def test_event_with_critical_input_parsing_error_is_stored_in_error_output(self, self.pipeline.process_pipeline() self.pipeline._output["dummy"].store_failed.assert_called() + def test_stop_breaks_while_loop_and_shutdown_is_called(self, _): + iterations = [None, None, 1] + self.pipeline._shut_down = mock.MagicMock() + + def continue_iterating_mock(): + effect = iterations.pop(0) + if effect is None: + return True + self.pipeline.stop() + + self.pipeline.process_pipeline = mock.MagicMock() + self.pipeline.process_pipeline.side_effect = continue_iterating_mock + self.pipeline.run() + self.pipeline._shut_down.assert_called() + class TestPipelineWithActualInput: def setup_method(self): - self.config = GetterFactory.from_string("tests/testdata/config/config.yml").get_yaml() - del self.config["output"] - self.config["process_count"] = 1 - self.config["input"] = { + self.config = Configuration.from_sources(["tests/testdata/config/config.yml"]) + self.config.output = {} + self.config.process_count = 1 + self.config.input = { "test_input": { "type": "dummy_input", "documents": [], @@ -588,7 +598,7 @@ def setup_method(self): } def test_pipeline_without_output_connector_and_one_input_event_and_preprocessors(self): - self.config["input"]["test_input"]["documents"] = [{"applyrule": "yes"}] + self.config.input["test_input"]["documents"] = [{"applyrule": "yes"}] pipeline = Pipeline(config=self.config) assert pipeline._output is None event, extra_outputs = pipeline.process_pipeline() @@ -596,10 +606,10 @@ def test_pipeline_without_output_connector_and_one_input_event_and_preprocessors assert "arrival_time" in event assert extra_outputs == [] - def test_pipeline_without_connectors_and_with_that_no_preprocessors(self): + def test_process_event_processes_without_input_and_without_output(self): event = {"applyrule": "yes"} expected_event = {"applyrule": "yes", "label": {"reporter": ["windows"]}} - del self.config["input"] + self.config.input = {} pipeline = Pipeline(config=self.config) assert pipeline._output is None assert pipeline._input is None @@ -611,7 +621,7 @@ def test_pipeline_without_output_connector_and_two_input_events_and_preprocessor {"applyrule": "yes"}, {"winlog": {"event_data": {"IpAddress": 
"123.132.113.123"}}}, ] - self.config["input"]["test_input"]["documents"] = input_events + self.config.input["test_input"]["documents"] = input_events pipeline = Pipeline(config=self.config) assert pipeline._output is None event, extra_outputs = pipeline.process_pipeline() @@ -624,8 +634,8 @@ def test_pipeline_without_output_connector_and_two_input_events_and_preprocessor assert len(extra_outputs) == 1 def test_pipeline_hmac_error_message_without_output_connector(self): - self.config["input"]["test_input"]["documents"] = [{"applyrule": "yes"}] - self.config["input"]["test_input"]["preprocessing"] = { + self.config.input["test_input"]["documents"] = [{"applyrule": "yes"}] + self.config.input["test_input"]["preprocessing"] = { "hmac": {"target": "non_existing_field", "key": "secret", "output_field": "hmac"} } pipeline = Pipeline(config=self.config) @@ -634,7 +644,7 @@ def test_pipeline_hmac_error_message_without_output_connector(self): assert event["hmac"]["hmac"] == "error" def test_pipeline_run_raises_assertion_when_run_without_input(self): - del self.config["input"] + self.config.input = {} pipeline = Pipeline(config=self.config) with pytest.raises( AssertionError, match="Pipeline should not be run without input connector" @@ -648,127 +658,8 @@ def test_pipeline_run_raises_assertion_when_run_without_output(self): ): pipeline.run() - def test_process_pipeline_raises_assertion_when_no_input_connector_is_set(self): - del self.config["input"] + def test_stop_sets_continue_iterating_to_false(self): pipeline = Pipeline(config=self.config) - with pytest.raises( - AssertionError, match="Run process_pipeline only with an valid input connector" - ): - pipeline.process_pipeline() - - -class TestMultiprocessingPipeline(ConfigurationForTests): - def test_creates_a_new_process(self): - children_before = active_children() - children_running = self.start_and_stop_pipeline( - MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - ) - - assert len(children_running) == (len(children_before) + 1) - - def test_stop_terminates_the_process(self): - children_running = self.start_and_stop_pipeline( - MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - ) - children_after = active_children() - - assert len(children_after) == (len(children_running) - 1) - - def test_enable_iteration_sets_iterate_to_true_stop_to_false(self): - pipeline = MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - assert not pipeline._iterate() - - pipeline._enable_iteration() - assert pipeline._iterate() - + pipeline._continue_iterating.value = True pipeline.stop() - assert not pipeline._iterate() - - def test_graceful_shutdown_of_pipeline_on_source_disconnected_error(self, capfd): - pipeline = MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - pipeline._input = mock.MagicMock() - pipeline._input.get_next = mock.MagicMock() - - def raise_source_disconnected_error(_): - raise SourceDisconnectedWarning(pipeline._input, "source was disconnected") - - pipeline._input.get_next.side_effect = raise_source_disconnected_error - pipeline.start() - pipeline.stop() - pipeline.join() - _, err = 
capfd.readouterr() - assert "AttributeError: 'bool' object has no attribute 'get_lock'" not in err - - def test_graceful_shutdown_of_pipeline_on_fata_input_error(self, capfd): - pipeline = MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - pipeline._input = mock.MagicMock() - pipeline._input.get_next = mock.MagicMock() - - def raise_fatal_input_error(_): - raise FatalInputError(pipeline._input, "realy bad things happened") - - pipeline._input.get_next.side_effect = raise_fatal_input_error - pipeline.start() - pipeline.stop() - pipeline.join() - _, err = capfd.readouterr() - assert "AttributeError: 'bool' object has no attribute 'get_lock'" not in err - - def test_graceful_shutdown_of_pipeline_on_fatal_output_error(self, capfd): - pipeline = MultiprocessingPipeline( - pipeline_index=1, - config=self.logprep_config, - log_queue=mock.MagicMock(), - lock=self.lock, - used_server_ports=mock.MagicMock(), - ) - pipeline._output = mock.MagicMock() - pipeline._output.store = mock.MagicMock() - pipeline._output.store.side_effect = FatalOutputError( - pipeline._output, "bad things happened" - ) - pipeline.start() - pipeline.stop() - pipeline.join() - _, err = capfd.readouterr() - assert "AttributeError: 'bool' object has no attribute 'get_lock'" not in err - - @staticmethod - def start_and_stop_pipeline(wrapper): - wrapper.start() - children_running = active_children() - - wrapper.stop() - wrapper.join() - - return children_running + assert not pipeline._continue_iterating.value diff --git a/tests/unit/framework/test_pipeline_manager.py b/tests/unit/framework/test_pipeline_manager.py index 146409311..a731e370d 100644 --- a/tests/unit/framework/test_pipeline_manager.py +++ b/tests/unit/framework/test_pipeline_manager.py @@ -1,110 +1,33 @@ # pylint: disable=missing-docstring # pylint: disable=protected-access # pylint: disable=attribute-defined-outside-init -import os +from copy import deepcopy from logging import Logger from unittest import mock -from pytest import raises - -from logprep.framework.pipeline import MultiprocessingPipeline -from logprep.framework.pipeline_manager import ( - MustSetConfigurationFirstError, - PipelineManager, -) -from logprep.util.configuration import Configuration +from logprep.framework.pipeline_manager import PipelineManager +from logprep.metrics.exporter import PrometheusExporter +from logprep.util.configuration import Configuration, MetricsConfig from tests.testdata.metadata import path_to_config -class MultiprocessingPipelineMock(MultiprocessingPipeline): - process_count = 0 - exitcode = -1 - - def __init__(self): - self.was_started = False - self.was_stopped = False - - self.process_is_alive = False - self._id = MultiprocessingPipelineMock.process_count - - MultiprocessingPipelineMock.process_count += 1 - - def __repr__(self): - return f"MultiprocessingLogprepWrapperMock-{self._id}" - - def start(self): - self.was_started = True - self.process_is_alive = True - - def stop(self): - self.was_stopped = True - self.process_is_alive = False - - def is_alive(self): - return self.process_is_alive - - def join(self, timeout=None): - pass - - -class PipelineManagerForTesting(PipelineManager): - def _create_pipeline(self, index): - return MultiprocessingPipelineMock() - - +@mock.patch("multiprocessing.Process", new=mock.MagicMock()) class TestPipelineManager: def setup_class(self): - self.config = Configuration.create_from_yaml(path_to_config) + self.config = 
Configuration.from_sources([path_to_config]) self.logger = Logger("test") - self.manager = PipelineManagerForTesting() - self.manager.set_configuration(self.config) + self.manager = PipelineManager(self.config) def teardown_method(self): self.manager._pipelines = [] - def test_create_pipeline_fails_if_config_is_unset(self): - manager = PipelineManager() - - with raises( - MustSetConfigurationFirstError, - match="Failed to create new pipeline: Configuration is unset", - ): - pipeline_index = 1 - manager._create_pipeline(pipeline_index) - def test_get_count_returns_count_of_pipelines(self): for count in range(5): self.manager.set_count(count) assert self.manager.get_count() == count - def test_increase_to_count_adds_required_number_of_pipelines(self): - process_count = MultiprocessingPipelineMock.process_count - self.manager.set_count(0) - self.manager._increase_to_count(4) - - assert MultiprocessingPipelineMock.process_count == (4 + process_count) - - def test_increase_to_count_does_nothing_if_count_is_equal_or_less_to_current_count(self): - current_pipelines = list(self.manager._pipelines) - - for count in range(len(current_pipelines) + 1): - self.manager._increase_to_count(count) - - assert self.manager._pipelines == current_pipelines - - def test_increase_to_count_increases_number_of_pipeline_starts_metric(self): - self.manager.metrics.number_of_pipeline_starts = 0 - self.manager._increase_to_count(2) - assert self.manager.metrics.number_of_pipeline_starts == 2 - - def test_processes_created_by_run_are_started(self): - self.manager.set_count(3) - - for processor in self.manager._pipelines: - assert processor.was_started - def test_decrease_to_count_removes_required_number_of_pipelines(self): self.manager._increase_to_count(3) @@ -114,6 +37,13 @@ def test_decrease_to_count_removes_required_number_of_pipelines(self): self.manager._decrease_to_count(1) assert len(self.manager._pipelines) == 1 + def test_set_count_calls_multiprocessing_process(self): + self.manager._pipelines = [] + with mock.patch("multiprocessing.Process") as process_mock: + self.manager.set_count(2) + process_mock.assert_called() + assert len(self.manager._pipelines) == 2 + def test_decrease_to_count_does_nothing_if_count_is_equal_or_more_than_current_count(self): current_pipelines = list(self.manager._pipelines) @@ -144,19 +74,23 @@ def test_set_count_does_nothing_if_count_is_equal_to_current_count_of_pipelines( assert self.manager._pipelines == current_pipelines def test_remove_failed_pipelines_removes_terminated_pipelines(self): - self.manager.set_count(2) - failed_pipeline = self.manager._pipelines[-1] - failed_pipeline.process_is_alive = False - + failed_pipeline = mock.MagicMock() + failed_pipeline.is_alive = mock.MagicMock(return_value=False) + ok_pipeline = mock.MagicMock() + ok_pipeline.is_alive = mock.MagicMock(return_value=True) + self.manager._pipelines = [failed_pipeline, ok_pipeline] self.manager.restart_failed_pipeline() assert not failed_pipeline in self.manager._pipelines @mock.patch("logging.Logger.warning") def test_remove_failed_pipelines_logs_warning_for_removed_failed_pipelines(self, logger_mock): - self.manager.set_count(2) - failed_pipeline = self.manager._pipelines[-1] - failed_pipeline.process_is_alive = False + failed_pipeline = mock.MagicMock() + failed_pipeline.is_alive = mock.MagicMock(return_value=False) + failed_pipeline.exitcode = -1 + ok_pipeline = mock.MagicMock() + ok_pipeline.is_alive = mock.MagicMock(return_value=True) + self.manager._pipelines = [failed_pipeline, ok_pipeline] 
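
``PipelineManager`` now receives the ``Configuration`` in its constructor instead of via ``set_configuration``, and the metrics block is represented by the ``MetricsConfig`` object. A minimal sketch of the wiring these tests exercise; the port matches the one used in the tests, and running the exporter for real additionally expects ``PROMETHEUS_MULTIPROC_DIR`` to be set in the environment:

.. code-block:: python

    from logprep.framework.pipeline_manager import PipelineManager
    from logprep.util.configuration import Configuration, MetricsConfig

    config = Configuration.from_sources(["tests/testdata/config/config.yml"])
    config.metrics = MetricsConfig(enabled=True, port=8000)  # enables the PrometheusExporter
    config.process_count = 2

    manager = PipelineManager(config)
    manager.set_count(config.process_count)  # spawns one multiprocessing.Process per pipeline
    manager.restart_failed_pipeline()        # replaces pipelines whose process is no longer alive
    manager.stop()                           # stops all pipelines and the logging queue listener
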
self.manager.restart_failed_pipeline() logger_mock.assert_called_with("Restarted 1 failed pipeline(s), with exit code(s): [-1]") @@ -172,36 +106,78 @@ def test_stop_terminates_processes_created(self): assert logprep_instance.was_started and logprep_instance.was_stopped def test_restart_failed_pipelines_calls_prometheus_cleanup_method(self, tmpdir): - os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(tmpdir) - failed_pipeline = mock.MagicMock() - failed_pipeline.is_alive = mock.MagicMock() # nosemgrep - failed_pipeline.is_alive.return_value = False # nosemgrep - failed_pipeline.pid = 42 - manager = PipelineManager() - manager.set_configuration({"metrics": {"enabled": True}, "process_count": 2}) - prometheus_exporter_mock = mock.MagicMock() - manager.prometheus_exporter = prometheus_exporter_mock - manager._pipelines = [failed_pipeline] - manager.restart_failed_pipeline() - prometheus_exporter_mock.mark_process_dead.assert_called() - prometheus_exporter_mock.mark_process_dead.assert_called_with(42) - del os.environ["PROMETHEUS_MULTIPROC_DIR"] + with mock.patch("os.environ", new={"PROMETHEUS_MULTIPROC_DIR": str(tmpdir)}): + failed_pipeline = mock.MagicMock() + failed_pipeline.is_alive = mock.MagicMock() + failed_pipeline.is_alive.return_value = False + failed_pipeline.pid = 42 + self.config.metrics = {"enabled": True, "port": 1234} + self.config.process_count = 2 + manager = PipelineManager(self.config) + prometheus_exporter_mock = mock.MagicMock() + manager.prometheus_exporter = prometheus_exporter_mock + manager._pipelines = [failed_pipeline] + manager.restart_failed_pipeline() + prometheus_exporter_mock.mark_process_dead.assert_called() + prometheus_exporter_mock.mark_process_dead.assert_called_with(42) def test_restart_failed_pipelines_increases_number_of_failed_pipelines_metrics(self): failed_pipeline = mock.MagicMock() - failed_pipeline.is_alive = mock.MagicMock() # nosemgrep - failed_pipeline.is_alive.return_value = False # nosemgrep + failed_pipeline.is_alive = mock.MagicMock() + failed_pipeline.is_alive.return_value = False self.manager._pipelines = [failed_pipeline] self.manager.metrics.number_of_failed_pipelines = 0 self.manager.restart_failed_pipeline() assert self.manager.metrics.number_of_failed_pipelines == 1 def test_stop_calls_prometheus_cleanup_method(self, tmpdir): - os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(tmpdir) - manager = PipelineManager() - manager.set_configuration({"metrics": {"enabled": True}, "process_count": 2}) - prometheus_exporter_mock = mock.MagicMock() - manager.prometheus_exporter = prometheus_exporter_mock - manager.stop() - prometheus_exporter_mock.cleanup_prometheus_multiprocess_dir.assert_called() - del os.environ["PROMETHEUS_MULTIPROC_DIR"] + with mock.patch("os.environ", new={"PROMETHEUS_MULTIPROC_DIR": str(tmpdir)}): + config = deepcopy(self.config) + config.metrics = {"enabled": True, "port": 1234} + self.config.process_count = 2 + manager = PipelineManager(config) + prometheus_exporter_mock = mock.MagicMock() + manager.prometheus_exporter = prometheus_exporter_mock + manager.stop() + prometheus_exporter_mock.cleanup_prometheus_multiprocess_dir.assert_called() + + def test_prometheus_exporter_is_instanciated_if_metrics_enabled(self): + self.config.metrics = MetricsConfig(enabled=True, port=8000) + manager = PipelineManager(self.config) + assert isinstance(manager.prometheus_exporter, PrometheusExporter) + + def test_stop_stops_queue_listener(self): + with mock.patch.object(self.manager, "_queue_listener") as _queue_listener_mock: + 
self.manager.stop() + _queue_listener_mock.stop.assert_called() + + def test_stop_closes_log_queue(self): + with mock.patch.object(self.manager, "log_queue") as log_queue_mock: + with mock.patch.object(self.manager, "_queue_listener"): + self.manager.stop() + log_queue_mock.close.assert_called() + + def test_set_count_increases_number_of_pipeline_starts_metric(self): + self.manager.metrics.number_of_pipeline_starts = 0 + self.manager.set_count(2) + assert self.manager.metrics.number_of_pipeline_starts == 2 + + def test_set_count_increases_number_of_pipeline_stops_metric(self): + self.manager.metrics.number_of_pipeline_stops = 0 + self.manager.set_count(2) + self.manager.set_count(0) + assert self.manager.metrics.number_of_pipeline_stops == 2 + + def test_restart_calls_set_count(self): + with mock.patch.object(self.manager, "set_count") as mock_set_count: + self.manager.restart() + mock_set_count.assert_called() + assert mock_set_count.call_count == 2 + + def test_restart_calls_prometheus_exporter_run(self): + self.config.metrics = MetricsConfig(enabled=True, port=666) + pipeline_manager = PipelineManager(self.config) + pipeline_manager.prometheus_exporter.is_running = False + with mock.patch.object(pipeline_manager.prometheus_exporter, "run") as mock_run: + pipeline_manager.restart() + mock_run.assert_called() diff --git a/tests/unit/metrics/test_exporter.py b/tests/unit/metrics/test_exporter.py index 242656382..0f6781268 100644 --- a/tests/unit/metrics/test_exporter.py +++ b/tests/unit/metrics/test_exporter.py @@ -9,6 +9,7 @@ from prometheus_client import REGISTRY from logprep.metrics.exporter import PrometheusExporter +from logprep.util.configuration import MetricsConfig @mock.patch( @@ -18,21 +19,15 @@ class TestPrometheusExporter: def setup_method(self): REGISTRY.__init__() - self.metrics_config = {"metrics": {"enabled": True, "port": 80}} + self.metrics_config = MetricsConfig(enabled=True, port=80) def test_correct_setup(self): - exporter = PrometheusExporter(self.metrics_config.get("metrics")) - assert exporter._port == self.metrics_config["metrics"]["port"] + exporter = PrometheusExporter(self.metrics_config) + assert exporter._port == self.metrics_config.port def test_default_port_if_missing_in_config(self): - metrics_config = { - "metrics": { - "period": 10, - "enabled": True, - } - } + metrics_config = MetricsConfig(enabled=True) exporter = PrometheusExporter(metrics_config) - assert exporter._port == 8000 @mock.patch("logprep.metrics.exporter.start_http_server") diff --git a/tests/unit/metrics/test_metrics.py b/tests/unit/metrics/test_metrics.py index 79d54e5c0..0ce5a2e85 100644 --- a/tests/unit/metrics/test_metrics.py +++ b/tests/unit/metrics/test_metrics.py @@ -169,6 +169,28 @@ def test_init_tracker_raises_on_try_to_overwrite_tracker_with_different_type(sel ) metric.init_tracker() + def test_add_with_labels_none_value_raises_typeerror(self): + metric = CounterMetric( + name="testmetric", + description="empty description", + labels={"A": "a"}, + registry=self.custom_registry, + ) + metric.init_tracker() + with pytest.raises(TypeError, match="not supported between instances of 'NoneType'"): + metric.add_with_labels(None, {"A": "a"}) + + def test_add_with_labels_none_labels_raises_typeerror(self): + metric = CounterMetric( + name="testmetric", + description="empty description", + labels={"A": "a"}, + registry=self.custom_registry, + ) + metric.init_tracker() + with pytest.raises(TypeError, match=" unsupported operand type(s) for |"): + metric.add_with_labels(1, None) + class 
TestGaugeMetric: def setup_method(self): diff --git a/tests/unit/processor/labeler/test_labeler.py b/tests/unit/processor/labeler/test_labeler.py index 3a5908b46..19ff4cee2 100644 --- a/tests/unit/processor/labeler/test_labeler.py +++ b/tests/unit/processor/labeler/test_labeler.py @@ -9,10 +9,10 @@ import pytest from pytest import raises +from logprep.factory import Factory from logprep.processor.base.exceptions import ValueDoesnotExistInSchemaError from logprep.processor.labeler.labeling_schema import LabelingSchema from logprep.processor.labeler.rule import LabelerRule -from logprep.factory import Factory from tests.testdata.metadata import path_to_schema, path_to_schema2 from tests.unit.processor.base import BaseProcessorTestCase @@ -247,7 +247,8 @@ def test_create_fails_when_rules_do_not_conform_to_labeling_schema(self): with raises( ValueDoesnotExistInSchemaError, match="Invalid value 'windows' for key 'reporter'." ): - Factory.create({"test instance": config}, self.logger) + labeler = Factory.create({"test instance": config}, self.logger) + labeler.setup() def test_create_loads_the_specified_labeling_schema(self): config = copy.deepcopy(self.CONFIG) diff --git a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py index 623de0c3d..85ee1c48f 100644 --- a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py +++ b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py @@ -770,9 +770,8 @@ def test_tld_extractor_uses_file(self): def _load_specific_rule(self, rule): self.object._config.regex_mapping = self.regex_mapping - del self.object.__dict__["_regex_mapping"] super()._load_specific_rule(rule) - self.object._replace_regex_keywords_by_regex_expression() + self.object.setup() def test_pseudonymize_url_fields_not_in_pseudonymize(self): pseudonym = "" diff --git a/tests/unit/processor/test_process.py b/tests/unit/processor/test_process.py index c94abf447..4b6d80e39 100644 --- a/tests/unit/processor/test_process.py +++ b/tests/unit/processor/test_process.py @@ -1,6 +1,5 @@ # pylint: disable=missing-docstring # pylint: disable=protected-access -import re from logging import getLogger from unittest import mock from unittest.mock import call @@ -11,9 +10,10 @@ from logprep.framework.pipeline import Pipeline from logprep.processor.dissector.rule import DissectorRule from logprep.processor.generic_adder.rule import GenericAdderRule +from logprep.util.configuration import Configuration -class TestSpecificGenericProcessStrategy: +class TestSpecificGenericProcessing: @mock.patch("logprep.abc.processor.Processor._process_rule_tree") def test_process(self, mock_process_rule_tree): processor = Factory.create( @@ -108,7 +108,7 @@ def test_apply_processor_multiple_times_not_enabled(self): assert expected_event == event @pytest.mark.parametrize("execution_number", range(5)) # repeat test to ensure determinism - def test_strategy_applies_rules_in_deterministic_order(self, execution_number): + def test_applies_rules_in_deterministic_order(self, execution_number): config = {"type": "generic_adder", "specific_rules": [], "generic_rules": []} processor = Factory.create({"custom_lister": config}, getLogger("test-logger")) rule_one_dict = {"filter": "val", "generic_adder": {"add": {"some": "value"}}} @@ -124,14 +124,11 @@ def test_strategy_applies_rules_in_deterministic_order(self, execution_number): mock_callback.assert_has_calls(expected_call_order, any_order=False) @mock.patch("logging.Logger.warning") - def 
test_strategy_processes_generic_rules_after_processor_error_in_specific_rules( - self, mock_warning - ): - config = { - "pipeline": [ - {"adder": {"type": "generic_adder", "specific_rules": [], "generic_rules": []}} - ] - } + def test_processes_generic_rules_after_processor_error_in_specific_rules(self, mock_warning): + config = Configuration() + config.pipeline = [ + {"adder": {"type": "generic_adder", "specific_rules": [], "generic_rules": []}} + ] specific_rule_one_dict = { "filter": "val", "generic_adder": {"add": {"first": "value", "second": "value"}}, diff --git a/tests/unit/test_quickstart.py b/tests/unit/test_quickstart.py index f143ff4e0..7f58c8d8e 100644 --- a/tests/unit/test_quickstart.py +++ b/tests/unit/test_quickstart.py @@ -1,20 +1,14 @@ -from logging import getLogger from unittest import mock import pytest from logprep import run_logprep -from logprep.util.configuration import Configuration class TestQuickstart: QUICKSTART_CONFIG_PATH = "quickstart/exampledata/config/pipeline.yml" - def test_validity_of_quickstart_config(self): - config = Configuration().create_from_yaml(self.QUICKSTART_CONFIG_PATH) - config.verify(getLogger("test-logger")) - - def test_quickstart_rules_are_valid(self): + def test_quickstart_setup_is_valid(self): """ensures the quickstart rules are valid""" with mock.patch( "sys.argv", diff --git a/tests/unit/test_run_logprep.py b/tests/unit/test_run_logprep.py index 1c0f2a955..aff6b0bc7 100644 --- a/tests/unit/test_run_logprep.py +++ b/tests/unit/test_run_logprep.py @@ -6,40 +6,116 @@ from pathlib import Path from unittest import mock +import pytest import requests import responses from click.testing import CliRunner +from logprep import run_logprep from logprep._version import get_versions from logprep.run_logprep import cli -from logprep.util.configuration import InvalidConfigurationError +from logprep.util.configuration import Configuration, InvalidConfigurationError +from logprep.util.defaults import DEFAULT_CONFIG_LOCATION class TestRunLogprepCli: def setup_method(self): self.cli_runner = CliRunner() + @pytest.mark.parametrize( + "command, target", + [ + ("run tests/testdata/config/config.yml", "logprep.run_logprep.Runner.start"), + ( + "test config tests/testdata/config/config.yml", + "logprep.run_logprep._get_configuration", + ), + ( + "test unit tests/testdata/config/config.yml", + "logprep.util.auto_rule_tester.auto_rule_tester.AutoRuleTester.run", + ), + ( + "print tests/testdata/config/config.yml", + "logprep.util.configuration.Configuration.as_yaml", + ), + ( + "run tests/testdata/config/config.yml tests/testdata/config/config.yml", + "logprep.run_logprep.Runner.start", + ), + ( + "test config tests/testdata/config/config.yml tests/testdata/config/config.yml", + "logprep.run_logprep._get_configuration", + ), + ( + "test unit tests/testdata/config/config.yml tests/testdata/config/config.yml", + "logprep.util.auto_rule_tester.auto_rule_tester.AutoRuleTester.run", + ), + ( + "print tests/testdata/config/config.yml tests/testdata/config/config.yml", + "logprep.util.configuration.Configuration.as_yaml", + ), + ( + "test dry-run tests/testdata/config/config.yml quickstart/exampledata/input_logdata/test_input.jsonl", + "logprep.util.rule_dry_runner.DryRunner.run", + ), + ( + "test integration tests/testdata/config/config.yml path/to/testset", + "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", + ), + ( + "test dry-run tests/testdata/config/config.yml tests/testdata/config/config.yml asdfsdv", + 
"logprep.util.rule_dry_runner.DryRunner.run", + ), + ( + "test integration tests/testdata/config/config.yml tests/testdata/config/config.yml path/to/testset", + "logprep.util.auto_rule_tester.auto_rule_corpus_tester.RuleCorpusTester.run", + ), + ], + ) + def test_cli_commands_with_configs(self, command: str, target: str): + with mock.patch(target) as mocked_target: + result = self.cli_runner.invoke(cli, command.split()) + mocked_target.assert_called() + assert result.exit_code == 0, f"{result.exc_info}" + + @pytest.mark.parametrize( + "command", + [ + ("run",), + ("test", "config"), + ("test", "unit"), + ("test", "dry-run", "input_data"), + ("test", "integration", "testdata"), + ], + ) + def test_cli_invokes_default_config_location(self, command): + result = self.cli_runner.invoke(cli, [*command]) + assert result.exit_code != 0 + assert "does not exist: /etc/logprep/pipeline.yml" in result.stdout + @mock.patch("logprep.run_logprep.Runner") def test_cli_run_starts_runner_with_config(self, mock_runner): runner_instance = mock.MagicMock() + config_file_path = ("tests/testdata/config/config.yml",) + expected_config = Configuration.from_sources(config_file_path) mock_runner.get_runner.return_value = runner_instance - args = ["run", "tests/testdata/config/config.yml"] + args = ["run", *config_file_path] result = self.cli_runner.invoke(cli, args) assert result.exit_code == 0 + mock_runner.get_runner.assert_called_with(expected_config) runner_instance.start.assert_called() - config_file_path = "tests/testdata/config/config.yml" - runner_instance.load_configuration.assert_called_with(config_file_path) @mock.patch("logprep.run_logprep.Runner") - def test_cli_run_uses_getter_to_get_config(self, mock_runner): + def test_cli_run_starts_runner_with_multiple_configs(self, mock_runner): runner_instance = mock.MagicMock() mock_runner.get_runner.return_value = runner_instance - args = ["run", "file://tests/testdata/config/config.yml"] + config_file_path = ("tests/testdata/config/config.yml", "tests/testdata/config/config.yml") + expected_config = Configuration.from_sources(config_file_path) + args = ["run", *config_file_path] result = self.cli_runner.invoke(cli, args) assert result.exit_code == 0 + mock_runner.get_runner.assert_called_with(expected_config) runner_instance.start.assert_called() - config_file_path = "file://tests/testdata/config/config.yml" - runner_instance.load_configuration.assert_called_with(config_file_path) def test_exits_after_getter_error_for_not_existing_protocol(self): args = ["run", "almighty_protocol://tests/testdata/config/config.yml"] @@ -47,7 +123,7 @@ def test_exits_after_getter_error_for_not_existing_protocol(self): assert result.exit_code == 1 assert "No getter for protocol 'almighty_protocol'" in result.output - @mock.patch("logprep.util.configuration.Configuration.verify") + @mock.patch("logprep.util.configuration.Configuration._verify") def test_test_config_verifies_configuration_successfully(self, mock_verify): args = ["test", "config", "tests/testdata/config/config.yml"] result = self.cli_runner.invoke(cli, args) @@ -55,7 +131,7 @@ def test_test_config_verifies_configuration_successfully(self, mock_verify): mock_verify.assert_called() assert "The verification of the configuration was successful" in result.stdout - @mock.patch("logprep.util.configuration.Configuration.verify") + @mock.patch("logprep.util.configuration.Configuration._verify") def test_test_config_verifies_configuration_unsuccessfully(self, mock_verify): mock_verify.side_effect = InvalidConfigurationError 
args = ["test", "config", "tests/testdata/config/config.yml"] @@ -77,7 +153,7 @@ def test_version_arg_prints_logprep_version(self): assert result.exit_code == 0 assert f"python version: {sys.version.split()[0]}" in result.output assert f"logprep version: {get_versions()['version']}" in result.output - assert f"configuration version: no configuration found" in result.output + assert "configuration version: no configuration found" in result.output def test_run_version_arg_prints_logprep_version_with_config_version(self): args = ["run", "--version", "tests/testdata/config/config.yml"] @@ -96,7 +172,7 @@ def test_run_version_arg_prints_logprep_version_without_config_value(self): assert f"python version: {sys.version.split()[0]}" in result.output assert f"logprep version: {get_versions()['version']}" in result.output assert ( - "configuration version: unset, file://tests/testdata/config/config2.yml" + "configuration version: alternative, file://tests/testdata/config/config2.yml" in result.output ) @@ -137,46 +213,33 @@ def test_run_version_arg_prints_with_http_config_without_exposing_secret_data(se assert "username" not in result.output assert "password" not in result.output - def test_run_no_config_error_is_printed_if_no_config_was_arg_was_given(self): - result = self.cli_runner.invoke(cli, ["run"]) - assert result.exit_code == 2 - assert "Usage: logprep run [OPTIONS] CONFIG\nTry 'logprep run --help' for help.\n\nError: Missing argument 'CONFIG'." - def test_run_no_config_error_is_printed_if_given_config_file_does_not_exist(self, capsys): non_existing_config_file = "/tmp/does/not/exist.yml" result = self.cli_runner.invoke(cli, ["run", non_existing_config_file]) assert result.exit_code == 1 expected_lines = ( - f"The given config file does not exist: {non_existing_config_file}\nCreate the " - f"configuration or change the path. Use '--help' for more information." 
+ f"One or more of the given config file(s) does not exist: " + f"{non_existing_config_file}\n" ) assert expected_lines in result.output - @mock.patch("logprep.runner.Runner.start") - @mock.patch("logprep.runner.Runner.stop") - def test_main_calls_runner_stop_on_any_exception(self, mock_stop, mock_start): - mock_start.side_effect = Exception + @mock.patch("logprep.runner.Runner._runner") + def test_main_calls_runner_stop_on_any_exception(self, mock_runner): + mock_runner.start.side_effect = Exception config_path = "tests/testdata/config/config.yml" result = self.cli_runner.invoke(cli, ["run", config_path]) assert result.exit_code == 1 - mock_stop.assert_called() - - def test_logprep_exits_if_logger_can_not_be_created(self): - with mock.patch("logprep.run_logprep.Configuration.get") as mock_create: - mock_create.side_effect = BaseException - config_path = "tests/testdata/config/config.yml" - result = self.cli_runner.invoke(cli, ["run", config_path]) - assert result.exit_code == 1 + mock_runner.stop.assert_called() def test_logprep_exits_on_invalid_configuration(self): - with mock.patch("logprep.util.configuration.Configuration.verify") as mock_verify: + with mock.patch("logprep.util.configuration.Configuration._verify") as mock_verify: mock_verify.side_effect = InvalidConfigurationError config_path = "tests/testdata/config/config.yml" result = self.cli_runner.invoke(cli, ["run", config_path]) assert result.exit_code == 1 def test_logprep_exits_on_any_exception_during_verify(self): - with mock.patch("logprep.util.configuration.Configuration.verify") as mock_verify: + with mock.patch("logprep.util.configuration.Configuration._verify") as mock_verify: mock_verify.side_effect = Exception config_path = "tests/testdata/config/config.yml" result = self.cli_runner.invoke(cli, ["run", config_path]) @@ -191,9 +254,9 @@ def test_logprep_exits_on_request_exception(self): @mock.patch("logprep.util.rule_dry_runner.DryRunner.run") def test_test_dry_run_starts_dry_runner(self, mock_dry_runner): - config_path = "tests/testdata/config/config.yml" + config_path = ("tests/testdata/config/config.yml",) events_path = "quickstart/exampledata/input_logdata/test_input.jsonl" - result = self.cli_runner.invoke(cli, ["test", "dry-run", config_path, events_path]) + result = self.cli_runner.invoke(cli, ["test", "dry-run", *config_path, events_path]) assert result.exit_code == 0 mock_dry_runner.assert_called() @@ -215,3 +278,12 @@ def test_test_ruleset_starts_rule_corpus_tester(self, mock_tester): result = self.cli_runner.invoke(cli, ["test", "integration", config_path, test_data_path]) assert result.exit_code == 0 mock_tester.assert_called() + + @mock.patch("logging.Logger.info") + def test_run_logprep_logs_log_level(self, mock_info): + config = Configuration.from_sources(("tests/testdata/config/config.yml",)) + assert config.logger.get("level") == "INFO" + with mock.patch("logprep.run_logprep.Runner"): + with pytest.raises(SystemExit): + run_logprep.run(("tests/testdata/config/config.yml",)) + mock_info.assert_has_calls([mock.call("Log level set to 'INFO'")]) diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index fe1da1674..4b0847adf 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -3,47 +3,19 @@ # pylint: disable=missing-class-docstring # pylint: disable=missing-function-docstring # pylint: disable=attribute-defined-outside-init -import json -import os -from copy import deepcopy +import re +import uuid from functools import partial -from logging import Logger +from pathlib 
import Path from unittest import mock -from pytest import raises +import pytest from requests.exceptions import HTTPError, SSLError from logprep._version import get_versions -from logprep.runner import ( - CannotReloadWhenConfigIsUnsetError, - MustConfigureBeforeRunningError, - MustNotConfigureTwiceError, - MustNotCreateMoreThanOneManagerError, - Runner, - UseGetRunnerToCreateRunnerSingleton, -) -from tests.testdata.metadata import ( - path_to_alternative_config, - path_to_config, - path_to_invalid_config, -) -from tests.unit.framework.test_pipeline_manager import PipelineManagerForTesting - - -class RunnerForTesting(Runner): - def __init__(self): - super().__init__(bypass_check_to_obtain_non_singleton_instance=True) - - def _create_manager(self): - self._manager = PipelineManagerForTesting() - - -class LogprepRunnerTest: - def setup_method(self, _): - self.logger = Logger("test") - - self.runner = RunnerForTesting() - self.runner._create_manager() +from logprep.runner import Runner +from logprep.util.configuration import Configuration +from tests.testdata.metadata import path_to_config def mock_keep_iterating(iterations): @@ -51,364 +23,272 @@ def mock_keep_iterating(iterations): yield True -class TestRunnerExpectedFailures(LogprepRunnerTest): - def test_init_fails_when_bypass_check_flag_is_not_set(self): - with raises(UseGetRunnerToCreateRunnerSingleton): - Runner() - - def test_fails_when_calling_create_manager_more_than_once(self): - runner = Runner(bypass_check_to_obtain_non_singleton_instance=True) - runner.load_configuration(path_to_config) +@pytest.fixture(name="config_path", scope="function") +def fixture_config_path(tmp_path: Path) -> Path: + config_path = tmp_path / uuid.uuid4().hex + configuration = Configuration.from_sources([path_to_config]) + config_path.write_text(configuration.as_yaml()) + return config_path - runner._create_manager() - with raises(MustNotCreateMoreThanOneManagerError): - runner._create_manager() - def test_fails_when_calling_load_configuration_with_non_existing_path(self): - with raises(FileNotFoundError): - self.runner.load_configuration("non-existing-file") +@pytest.fixture(name="configuration") +def fixture_configuration(config_path: Path) -> Configuration: + return Configuration.from_sources([str(config_path)]) - def test_fails_when_calling_load_configuration_more_than_once(self): - self.runner.load_configuration(path_to_config) - with raises(MustNotConfigureTwiceError): - self.runner.load_configuration(path_to_config) +@pytest.fixture(name="runner") +def fixture_runner(configuration: Configuration) -> Runner: + runner = Runner(configuration) # we want to have a fresh runner for each test + return runner - def test_fails_when_called_without_configuring_first(self): - with raises(MustConfigureBeforeRunningError): - self.runner.start() - @mock.patch("logprep.util.configuration.Configuration.verify") - def test_load_configuration_calls_verify_on_config(self, mock_verify): - self.runner.load_configuration(path_to_config) - mock_verify.assert_called() - - def test_fails_when_calling_reload_configuration_when_config_is_unset(self): - with raises(CannotReloadWhenConfigIsUnsetError): - self.runner.reload_configuration() - - -class TestRunner(LogprepRunnerTest): - def setup_method(self, _): - self.logger = Logger("test") - - self.runner = RunnerForTesting() - self.runner.load_configuration(path_to_config) - self.runner._create_manager() +class TestRunner: + def test_runner_sets_configuration(self): + configuration = Configuration.from_sources([path_to_config]) 
+ runner = Runner.get_runner(configuration) + assert isinstance(runner._configuration, Configuration) + assert runner._configuration == configuration def test_get_runner_returns_the_same_runner_on_all_calls(self): - runner = Runner.get_runner() + configuration = Configuration.from_sources([path_to_config]) + runner = Runner.get_runner(configuration) for _ in range(10): - assert runner == Runner.get_runner() + assert runner is Runner.get_runner(configuration) @mock.patch("logging.Logger.info") - def test_reload_configuration_logs_info_when_reloading_config_was_successful(self, mock_info): - self.runner.reload_configuration() - mock_info.assert_has_calls([mock.call("Successfully reloaded configuration")]) - - def test_reload_configuration_reduces_logprep_instance_count_to_new_value(self): - self.runner._manager.set_count(3) - - self.runner._yaml_path = path_to_alternative_config - self.runner.reload_configuration() - assert self.runner._manager.get_count() == 2 - - def test_reload_configuration_counts_config_refreshes_if_successful(self): - self.runner.metrics.number_of_config_refreshes = 0 - self.runner._yaml_path = path_to_alternative_config - self.runner.reload_configuration() - assert self.runner.metrics.number_of_config_refreshes == 1 - - def test_reload_configuration_leaves_old_configuration_in_place_if_new_config_is_invalid(self): - old_configuration = deepcopy(self.runner._configuration) - - self.runner._yaml_path = path_to_invalid_config - self.runner.reload_configuration() - - assert self.runner._configuration == old_configuration + def test_reload_configuration_logs_info_when_reloading_config_was_successful( + self, mock_info, runner + ): + with mock.patch.object(runner._manager, "restart"): + runner.metrics.number_of_config_refreshes = 0 + runner._configuration.version = "very old version" + runner.reload_configuration() + mock_info.assert_has_calls([mock.call("Successfully reloaded configuration")]) + assert runner.metrics.number_of_config_refreshes == 1 - @mock.patch("logging.Logger.error") - def test_reload_configuration_logs_error_when_new_configuration_is_invalid(self, mock_error): - self.runner._yaml_path = path_to_invalid_config - self.runner.reload_configuration() - assert ( - "Invalid configuration, leaving old configuration in place:" - in mock_error.call_args[0][0] + @mock.patch("logging.Logger.info") + def test_reload_configuration_logs_info_if_config_does_not_change(self, mock_info, runner): + runner.metrics.number_of_config_refreshes = 0 + runner.metrics.number_of_config_refresh_failures = 0 + runner.reload_configuration() + mock_info.assert_has_calls( + [ + mock.call( + "Configuration version didn't change. Continue running with current version." 
+ ) + ] ) + assert runner.metrics.number_of_config_refreshes == 0 + assert runner.metrics.number_of_config_refresh_failures == 0 - def test_reload_configuration_does_not_count_config_refresh_if_new_configuration_is_invalid( - self, + @mock.patch("logging.Logger.error") + def test_reload_configuration_logs_error_on_invalid_config( + self, mock_error, runner, config_path ): - self.runner.metrics.number_of_config_refreshes = 0 - self.runner._yaml_path = path_to_invalid_config - self.runner.reload_configuration() - assert self.runner.metrics.number_of_config_refreshes == 0 - - def test_reload_configuration_creates_new_logprep_instances_with_new_configuration(self): - self.runner._manager.set_count(3) - old_logprep_instances = list(self.runner._manager._pipelines) - - self.runner.reload_configuration() - - assert set(old_logprep_instances).isdisjoint(set(self.runner._manager._pipelines)) - assert len(self.runner._manager._pipelines) == 3 - - def test_start_sets_config_refresh_interval_to_a_minimum_of_5_seconds(self): - self.runner._keep_iterating = partial(mock_keep_iterating, 1) - self.runner._config_refresh_interval = 0 - self.runner.start() - assert self.runner.scheduler.jobs[0].interval == 5 - - @mock.patch("schedule.Scheduler.run_pending") - def test_iteration_calls_run_pending(self, mock_run_pending): - self.runner._keep_iterating = partial(mock_keep_iterating, 1) - self.runner.start() - mock_run_pending.assert_called() - - @mock.patch("schedule.Scheduler.run_pending") - def test_iteration_calls_run_pending_on_every_iteration(self, mock_run_pending): - self.runner._keep_iterating = partial(mock_keep_iterating, 3) - self.runner.start() - assert mock_run_pending.call_count == 3 + runner.metrics.number_of_config_refreshes = 0 + runner.metrics.number_of_config_refresh_failures = 0 + config_path.write_text("invalid config") + runner.reload_configuration() + mock_error.assert_called() + assert runner.metrics.number_of_config_refreshes == 0 + assert runner.metrics.number_of_config_refresh_failures == 1 + + def test_reload_configuration_leaves_old_configuration_in_place_if_new_config_is_invalid( + self, runner, config_path + ): + assert runner._configuration.version == "1" + config_path.write_text("invalid config") + runner.reload_configuration() + assert runner._configuration.version == "1" + + def test_reload_invokes_manager_restart_on_config_change(self, runner: Runner): + runner._configuration.version = "very old version" + with mock.patch.object(runner._manager, "restart") as mock_restart: + runner.reload_configuration() + mock_restart.assert_called() + + @pytest.mark.parametrize( + "new_value, expected_value", + [(None, None), (0, 5), (1, 5), (2, 5), (3, 5), (10, 10), (42, 42)], + ) + def test_set_config_refresh_interval(self, new_value, expected_value, runner): + with mock.patch.object(runner, "_manager"): + runner._config_refresh_interval = new_value + runner._keep_iterating = partial(mock_keep_iterating, 1) + runner.start() + if expected_value is None: + assert len(runner.scheduler.jobs) == 0 + else: + assert runner.scheduler.jobs[0].interval == expected_value @mock.patch("schedule.Scheduler.run_pending") - def test_iteration_stops_if_continue_iterating_returns_false(self, mock_run_pending): - def patch_runner(runner): - def patch(): # nosemgrep - with runner._continue_iterating.get_lock(): - runner._continue_iterating.value = False - - return patch - - mock_run_pending.side_effect = patch_runner(self.runner) - self.runner.start() - assert mock_run_pending.call_count == 1 - - def 
test_reload_configuration_does_not_schedules_job_if_no_config_refresh_interval_is_set(self): - assert len(self.runner.scheduler.jobs) == 0 - if "config_refresh_interval" in self.runner._configuration: - self.runner._configuration.pop("config_refresh_interval") - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 0 - - def test_reload_configuration_schedules_job_if_config_refresh_interval_is_set(self, tmp_path): - self.runner.metrics.config_refresh_interval = 0 - assert len(self.runner.scheduler.jobs) == 0 - config_path = tmp_path / "config.yml" - config_update = {"config_refresh_interval": 5, "version": "current version"} - self.runner._configuration.update(config_update) - config_update = deepcopy(self.runner._configuration) - config_update.update({"config_refresh_interval": 5, "version": "new version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.metrics.config_refresh_interval == 5 - - def test_reload_configuration_reschedules_job_with_new_refresh_interval(self, tmp_path): - assert len(self.runner.scheduler.jobs) == 0 - config_path = tmp_path / "config.yml" - # set current state - config_update = deepcopy(self.runner._configuration) - config_update.update({"config_refresh_interval": 5, "version": "current version"}) - self.runner._configuration.update(config_update) + def test_iteration_calls_run_pending(self, mock_run_pending, runner): + with mock.patch.object(runner, "_manager"): + runner._keep_iterating = partial(mock_keep_iterating, 3) + runner.start() + mock_run_pending.call_count = 3 + + def test_reload_configuration_schedules_job_if_config_refresh_interval_is_set( + self, runner: Runner, configuration: Configuration, config_path: Path + ): + runner.metrics.config_refresh_interval = 0 + assert len(runner.scheduler.jobs) == 0 + configuration.config_refresh_interval = 60 + config_path.write_text(configuration.as_yaml()) + runner._configuration.version = "very old version" + with mock.patch.object(runner._manager, "restart"): + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 1 + assert runner.metrics.config_refresh_interval == 60 + + def test_reload_configuration_does_not_schedules_job_if_no_config_refresh_interval_is_set( + self, runner: Runner + ) -> None: + assert len(runner.scheduler.jobs) == 0 + if runner._configuration.config_refresh_interval is not None: + runner._configuration.config_refresh_interval = None + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 0 + + def test_reload_configuration_reschedules_job_with_new_refresh_interval( + self, runner: Runner, configuration: Configuration, config_path: Path + ) -> None: + assert len(runner.scheduler.jobs) == 0 # first refresh - config_update.update({"config_refresh_interval": 5, "version": "new version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 5 + configuration.config_refresh_interval = 5 + config_path.write_text(configuration.as_yaml()) + runner._configuration.version = "very old version" + with mock.patch.object(runner._manager, "restart"): + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 5 # second refresh with 
new refresh interval - config_update.update({"config_refresh_interval": 10, "version": "newer version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 10 - + configuration.config_refresh_interval = 10 + config_path.write_text(configuration.as_yaml()) + runner._configuration.version = "even older version" + with mock.patch.object(runner._manager, "restart"): + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 10 + + @pytest.mark.parametrize( + "exception, log_message", + [ + (HTTPError(404), "404"), + ( + FileNotFoundError("no such file or directory"), + "One or more of the given config file(s) does not exist", + ), + (SSLError("SSL context"), "SSL context"), + ], + ) @mock.patch("logprep.abc.getter.Getter.get") - def test_reload_configuration_logs_request_exception_and_schedules_new_refresh_with_a_quarter_the_time( - self, mock_get + def test_reload_configuration_logs_exception_and_schedules_new_refresh_with_a_quarter_the_time( + self, mock_get, runner: Runner, caplog, exception, log_message ): - mock_get.side_effect = HTTPError(404) - assert len(self.runner.scheduler.jobs) == 0 - self.runner._config_refresh_interval = 40 - with mock.patch("logging.Logger.warning") as mock_warning: - with mock.patch("logging.Logger.info") as mock_info: - self.runner.reload_configuration(refresh=True) - mock_warning.assert_called_with("Failed to load configuration: 404") - mock_info.assert_called_with("Config refresh interval is set to: 10.0 seconds") - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 10 + mock_get.side_effect = exception + assert len(runner.scheduler.jobs) == 0 + runner._config_refresh_interval = 40 + runner.reload_configuration() + assert log_message in caplog.text + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 10 @mock.patch("logprep.abc.getter.Getter.get") def test_reload_configuration_sets_config_refresh_interval_metric_with_a_quarter_of_the_time( - self, mock_get + self, mock_get, runner: Runner ): mock_get.side_effect = HTTPError(404) - assert len(self.runner.scheduler.jobs) == 0 - self.runner._config_refresh_interval = 40 - self.runner.metrics.config_refresh_interval = 0 - self.runner.reload_configuration(refresh=True) - assert self.runner.metrics.config_refresh_interval == 10 - - @mock.patch("logprep.abc.getter.Getter.get") - def test_reload_configuration_logs_filenotfounderror_and_schedules_new_refresh_with_a_quarter_the_time( - self, mock_get - ): - mock_get.side_effect = FileNotFoundError("no such file or directory") - assert len(self.runner.scheduler.jobs) == 0 - self.runner._config_refresh_interval = 40 - with mock.patch("logging.Logger.warning") as mock_warning: - with mock.patch("logging.Logger.info") as mock_info: - self.runner.reload_configuration(refresh=True) - mock_warning.assert_called_with("Failed to load configuration: no such file or directory") - mock_info.assert_called_with("Config refresh interval is set to: 10.0 seconds") - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 10 + assert len(runner.scheduler.jobs) == 0 + runner._config_refresh_interval = 40 + runner.metrics.config_refresh_interval = 0 + runner.reload_configuration() + assert 
runner.metrics.config_refresh_interval == 10 @mock.patch("logprep.abc.getter.Getter.get") - def test_reload_configuration_logs_sslerror_and_schedules_new_refresh_with_a_quarter_the_time( - self, mock_get + def test_reload_configuration_does_not_set_refresh_interval_below_5_seconds( + self, mock_get, caplog, runner: Runner ): - mock_get.side_effect = SSLError("SSL context") - assert len(self.runner.scheduler.jobs) == 0 - self.runner._config_refresh_interval = 40 - with mock.patch("logging.Logger.warning") as mock_warning: - with mock.patch("logging.Logger.info") as mock_info: - self.runner.reload_configuration(refresh=True) - mock_warning.assert_called_with("Failed to load configuration: SSL context") - mock_info.assert_called_with("Config refresh interval is set to: 10.0 seconds") - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 10 - - @mock.patch("logprep.abc.getter.Getter.get") - def test_reload_configuration_does_not_set_refresh_interval_below_5_seconds(self, mock_get): mock_get.side_effect = HTTPError(404) - assert len(self.runner.scheduler.jobs) == 0 - self.runner._config_refresh_interval = 12 - with mock.patch("logging.Logger.warning") as mock_warning: - with mock.patch("logging.Logger.info") as mock_info: - self.runner.reload_configuration(refresh=True) - mock_warning.assert_called_with("Failed to load configuration: 404") - mock_info.assert_called_with("Config refresh interval is set to: 5 seconds") - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 5 + assert len(runner.scheduler.jobs) == 0 + runner._config_refresh_interval = 12 + with caplog.at_level("INFO"): + runner.reload_configuration() + assert re.search(r"Failed to load configuration: .*404", caplog.text) + assert re.search("Config refresh interval is set to: 5 seconds", caplog.text) + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 5 def test_reload_configuration_sets_refresh_interval_on_successful_reload_after_request_exception( - self, tmp_path + self, runner: Runner, config_path: Path ): - self.runner._config_refresh_interval = 12 - config_path = tmp_path / "config.yml" - config_update = deepcopy(self.runner._configuration) - config_update.update({"config_refresh_interval": 60, "version": "new version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) + runner._config_refresh_interval = 12 + new_config = Configuration.from_sources([str(config_path)]) + new_config.config_refresh_interval = 60 + new_config.version = "new version" + config_path.write_text(new_config.as_yaml()) with mock.patch("logprep.abc.getter.Getter.get") as mock_get: mock_get.side_effect = HTTPError(404) - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 5 - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 60 - - def test_reload_configuration_sets_refresh_interval_after_request_exception_without_new_config( - self, tmp_path + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 5 + with mock.patch.object(runner._manager, "restart"): + runner.reload_configuration() + assert len(runner.scheduler.jobs) == 1 + assert runner.scheduler.jobs[0].interval == 60 + + def test_reload_configuration_logs_new_version_and_sets_metric( + self, 
runner: Runner, config_path: Path ): - config_update = {"config_refresh_interval": 12, "version": "current version"} - self.runner._config_refresh_interval = 12 - self.runner._configuration.update(config_update) - config_path = tmp_path / "config.yml" - config_update = deepcopy(self.runner._configuration) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) - with mock.patch("logprep.abc.getter.Getter.get") as mock_get: - mock_get.side_effect = HTTPError(404) - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 5 - self.runner.reload_configuration(refresh=True) - assert len(self.runner.scheduler.jobs) == 1 - assert self.runner.scheduler.jobs[0].interval == 12 - - def test_reload_configuration_logs_new_version_and_sets_metric(self, tmp_path): - assert len(self.runner.scheduler.jobs) == 0 - config_path = tmp_path / "config.yml" - config_update = {"config_refresh_interval": 5, "version": "current version"} - self.runner._configuration.update(config_update) - config_update = deepcopy(self.runner._configuration) - config_update.update({"config_refresh_interval": 5, "version": "new version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) + assert len(runner.scheduler.jobs) == 0 + new_config = Configuration.from_sources([str(config_path)]) + new_config.config_refresh_interval = 5 + version = str(uuid.uuid4().hex) + new_config.version = version + config_path.write_text(new_config.as_yaml()) with mock.patch("logging.Logger.info") as mock_info: with mock.patch("logprep.metrics.metrics.GaugeMetric.add_with_labels") as mock_add: - self.runner.reload_configuration(refresh=True) - mock_info.assert_called_with("Configuration version: new version") + with mock.patch.object(runner._manager, "restart"): + runner.reload_configuration() + mock_info.assert_called_with(f"Configuration version: {version}") mock_add.assert_called() mock_add.assert_has_calls( - (mock.call(1, {"logprep": f"{get_versions()['version']}", "config": "new version"}),) + (mock.call(1, {"logprep": f"{get_versions()['version']}", "config": version}),) ) - def test_reload_configuration_decreases_processes_after_increase(self, tmp_path): - self.runner._manager.set_configuration(self.runner._configuration) - self.runner._manager.set_count(self.runner._configuration["process_count"]) - assert self.runner._configuration.get("process_count") == 3 - assert len(self.runner._manager._pipelines) == 3 - config_update = { - "config_refresh_interval": 5, - "version": "current version", - } - self.runner._configuration.update(config_update) - self.runner.reload_configuration(refresh=True) - assert len(self.runner._manager._pipelines) == 3 - config_path = tmp_path / "config.yml" - self.runner._yaml_path = str(config_path) - config_update = deepcopy(self.runner._configuration) - config_update.update( - {"config_refresh_interval": 5, "version": "new version", "process_count": 4} - ) - config_path.write_text(json.dumps(config_update)) - self.runner.reload_configuration(refresh=True) - assert len(self.runner._manager._pipelines) == 4 - config_update.update( - {"config_refresh_interval": 5, "version": "newer version", "process_count": 1} + def test_stop_method(self, runner: Runner): + assert not runner._exit_received + runner.stop() + assert runner._exit_received + + @mock.patch("logprep.runner.Runner._keep_iterating", new=partial(mock_keep_iterating, 1)) + def 
test_start_sets_version_metric(self, runner: Runner): + runner._configuration.version = "very custom version" + with mock.patch("logprep.metrics.metrics.GaugeMetric.add_with_labels") as mock_add: + runner.start() + mock_add.assert_called() + mock_add.assert_has_calls( + ( + mock.call( + 1, + { + "logprep": f"{get_versions()['version']}", + "config": runner._configuration.version, + }, + ), + ) ) - config_path.write_text(json.dumps(config_update)) - self.runner.reload_configuration(refresh=True) - assert len(self.runner._manager._pipelines) == 1 - @mock.patch( - "logprep.framework.pipeline_manager.PrometheusExporter.cleanup_prometheus_multiprocess_dir" - ) - def test_reload_configuration_does_not_call_prometheus_clean_up_method( - self, prometheus, tmp_path, tmpdir - ): - os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(tmpdir) - config_path = tmp_path / "config.yml" - config_update = { - "config_refresh_interval": 5, - "version": "current version", - "metrics": {"enabled": True}, + def test_start_calls_manager_stop_after_breaking_the_loop(self, runner: Runner): + with mock.patch.object(runner, "_manager") as mock_manager: + runner._exit_received = True + runner.start() + mock_manager.stop.assert_called() + mock_manager.restart_failed_pipeline.assert_not_called() + + def test_metric_labels_returns_versions(self, runner: Runner): + assert runner._metric_labels == { + "logprep": f"{get_versions()['version']}", + "config": runner._configuration.version, } - self.runner._configuration.update(config_update) - config_update = deepcopy(self.runner._configuration) - config_update.update({"config_refresh_interval": 5, "version": "new version"}) - config_path.write_text(json.dumps(config_update)) - self.runner._yaml_path = str(config_path) - self.runner.reload_configuration(refresh=True) - prometheus.assert_not_called() - del os.environ["PROMETHEUS_MULTIPROC_DIR"] - - def test_loop_restarts_failed_pipelines(self): - self.runner._manager.set_configuration(self.runner._configuration) - self.runner._manager.set_count(self.runner._configuration["process_count"]) - assert len(self.runner._manager._pipelines) == 3 - self.runner._manager._pipelines[1].process_is_alive = False - with mock.patch("logging.Logger.warning") as mock_warning: - self.runner._loop() - mock_warning.assert_called_once_with( - "Restarted 1 failed pipeline(s), with exit code(s): [-1]" - ) diff --git a/tests/unit/util/test_auto_rule_tester.py b/tests/unit/util/test_auto_rule_tester.py index 1eeed8707..2faae2749 100644 --- a/tests/unit/util/test_auto_rule_tester.py +++ b/tests/unit/util/test_auto_rule_tester.py @@ -159,9 +159,8 @@ def test_pseudonymizer_specific_setup_called_on_load_rules( auto_rule_tester._reset_trees( processor ) # Called every time by auto tester before adding rules - mock_replace_regex_keywords_by_regex_expression.assert_called_once() auto_rule_tester._load_rules(processor, "specific_rules") - assert mock_replace_regex_keywords_by_regex_expression.call_count == 2 + assert mock_replace_regex_keywords_by_regex_expression.call_count == 1 @mock.patch("logprep.processor.list_comparison.processor.ListComparison.setup") def test_list_comparison_specific_setup_called_on_load_rules( diff --git a/tests/unit/util/test_configuration.py b/tests/unit/util/test_configuration.py index ca5f79554..a2706abb0 100644 --- a/tests/unit/util/test_configuration.py +++ b/tests/unit/util/test_configuration.py @@ -1,170 +1,327 @@ # pylint: disable=missing-docstring # pylint: disable=protected-access # pylint: disable=line-too-long -import os -import 
re -from copy import deepcopy +import json +import uuid from logging import getLogger from pathlib import Path from unittest import mock import pytest +import responses +from attrs import asdict +from requests.exceptions import HTTPError +from ruamel.yaml.scanner import ScannerError from logprep.util.configuration import ( Configuration, InvalidConfigurationError, InvalidConfigurationErrors, - InvalidInputConnectorConfigurationError, - InvalidProcessorConfigurationError, - RequiredConfigurationKeyMissingError, + MetricsConfig, +) +from logprep.util.getter import FileGetter, GetterNotFoundError +from tests.testdata.metadata import ( + path_to_config, + path_to_invalid_config, + path_to_only_output_config, ) -from logprep.util.getter import GetterFactory -from logprep.util.json_handling import dump_config_as_file -from tests.testdata.metadata import path_to_config logger = getLogger() +@pytest.fixture(name="config_path", scope="function") +def fixture_config_path(tmp_path: Path) -> Path: + config_path = tmp_path / uuid.uuid4().hex + configuration = Configuration.from_sources([path_to_config]) + config_path.write_text(configuration.as_yaml()) + return config_path + + class TestConfiguration: - config: dict - - def setup_method(self): - self.config = Configuration.create_from_yaml(path_to_config) - - def teardown_method(self): - if "LOGPREP_VERSION" in os.environ: - os.environ.pop("LOGPREP_VERSION") - if "LOGPREP_PROCESS_COUNT" in os.environ: - os.environ.pop("LOGPREP_PROCESS_COUNT") - if "LOGPREP_LOG_LEVEL" in os.environ: - os.environ.pop("LOGPREP_LOG_LEVEL") - if "LOGPREP_PIPELINE" in os.environ: - os.environ.pop("LOGPREP_PIPELINE") - if "LOGPREP_OUTPUT" in os.environ: - os.environ.pop("LOGPREP_OUTPUT") - if "LOGPREP_INPUT" in os.environ: - os.environ.pop("LOGPREP_INPUT") - - def assert_fails_when_replacing_key_with_value(self, key, value, expected_message): - config = Configuration.create_from_yaml(path_to_config) - - parent = config - if not isinstance(key, str): - key = list(key) - while len(key) > 1: - parent = parent[key.pop(0)] - key = key[0] - parent[key] = value - - with pytest.raises(InvalidConfigurationError, match=expected_message): - config.verify(logger) - - @mock.patch("logprep.util.configuration.print_fcolor") - def test_invalid_yml_prints_formatted_error(self, mock_print_fcolor, tmp_path): - broken_config_path = Path(tmp_path / "test_config") - broken_config_path.write_text("process_count: 5\ninvalid_yaml", encoding="utf8") - with pytest.raises(SystemExit, match="1"): - Configuration.create_from_yaml(str(broken_config_path)) - mock_print_fcolor.assert_called() - call_msg = str(mock_print_fcolor.call_args_list[0][0][1]) - assert call_msg.startswith("Error parsing YAML file") + @pytest.mark.parametrize( + "attribute, attribute_type, default", + [ + ("version", str, "unset"), + ("config_refresh_interval", type(None), None), + ("process_count", int, 1), + ("timeout", float, 5.0), + ("logger", dict, {"level": "INFO"}), + ("pipeline", list, []), + ("input", dict, {}), + ("output", dict, {}), + ("metrics", MetricsConfig, MetricsConfig(**{"enabled": False, "port": 8000})), + ], + ) + def test_configuration_init(self, attribute, attribute_type, default): + config = Configuration() + assert isinstance(getattr(config, attribute), attribute_type) + assert getattr(config, attribute) == default - def test_verify_passes_for_valid_configuration(self): - try: - self.config.verify(logger) - except InvalidConfigurationError as error: - pytest.fail(f"The verification should pass for a valid 
configuration.: {error}") + def test_create_from_source_creates_configuration(self): + config = Configuration.from_source(path_to_config) + assert isinstance(config, Configuration) - def test_verify_pipeline_only_passes_for_valid_configuration(self): - try: - self.config.verify_pipeline_only(logger) - except InvalidConfigurationError: - pytest.fail("The verification should pass for a valid configuration.") - - def test_verify_fails_on_missing_required_value(self): - not_required_keys = ["version"] - for key in list(self.config.keys()): - if key in not_required_keys: - continue - config = deepcopy(self.config) - del config[key] - - with pytest.raises(InvalidConfigurationError): - config.verify(logger) - - def test_verify_pipeline_only_fails_on_missing_pipeline_value(self): - for key in list(key for key in self.config.keys() if key != "pipeline"): - config = deepcopy(self.config) - del config[key] - config.verify_pipeline_only(logger) - - config = deepcopy(self.config) - del config["pipeline"] - with pytest.raises(InvalidConfigurationError): - config.verify(logger) + def test_create_from_source_adds_getter(self): + config = Configuration.from_source(path_to_config) + assert isinstance(config._getter, FileGetter) - def test_verify_fails_on_low_process_count(self): - for i in range(0, -10, -1): - self.assert_fails_when_replacing_key_with_value( - "process_count", i, "Process count must be an integer of one or larger, not:" - ) + def test_create_from_sources_adds_configs(self): + config = Configuration.from_sources([path_to_config, path_to_config]) + assert isinstance(config, Configuration) + assert isinstance(config._configs, tuple) + assert isinstance(config._configs[0], Configuration) - def test_verify_fails_on_empty_pipeline(self): - self.assert_fails_when_replacing_key_with_value( - "pipeline", [], '"pipeline" must contain at least one item!' 
+ @pytest.mark.parametrize( + "attribute, first_value, second_value", + [ + ("version", "1", "2"), + ("config_refresh_interval", 0, 900), + ("process_count", 1, 2), + ("timeout", 1.0, 2.0), + ("logger", {"level": "INFO"}, {"level": "DEBUG"}), + ( + "metrics", + {"enabled": False, "port": 8000}, + {"enabled": True, "port": 9000}, + ), + ( + "metrics", + {"enabled": False, "port": 8000}, + {"enabled": True, "port": 9000}, + ), + ], + ) + def test_get_last_value(self, tmp_path, attribute, first_value, second_value): + first_config = tmp_path / "pipeline.yml" + first_config.write_text( + f""" +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +{attribute}: {first_value} +""" ) - - def test_verify_verifies_input_config(self): - self.assert_fails_when_replacing_key_with_value( - "input", - {"random_name": {"type": "unknown"}}, - "Invalid input connector configuration: Unknown type 'unknown'", + second_config = tmp_path / "pipeline2.yml" + second_config.write_text( + f""" +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +{attribute}: {second_value} +""" ) - def test_verify_verifies_output_config(self): - self.assert_fails_when_replacing_key_with_value( - "output", - {"random_name": {"type": "unknown"}}, - "Invalid output connector configuration: Unknown type 'unknown'", - ) + config = Configuration.from_sources([str(first_config), str(second_config)]) + attribute_from_test = getattr(config, attribute) + if hasattr(attribute_from_test, "__attrs_attrs__"): + assert asdict(attribute_from_test) == second_value + else: + assert attribute_from_test == second_value @pytest.mark.parametrize( - "test_case, metrics_config_dict, raised_error", + "attribute, value, expected_error, expected_message", [ + ("process_count", -1, ValueError, "must be >= 1"), + ("pipeline", {}, TypeError, "must be "), + ("timeout", "foo", TypeError, "must be "), + ("timeout", -0.1, ValueError, "must be > 0"), ( - "valid configuration", - {"metrics": {"enabled": True, "port": 8000}}, + "output", + {"dummy1": {"type": "dummy_output"}, "dummy2": {"type": "dummy_output"}}, None, - ), - ( - "invalid datatype in port", - {"metrics": {"enabled": True, "port": "8000"}}, None, ), ], ) - def test_verify_metrics_config( - self, metrics_config_dict, raised_error, test_case - ): # pylint: disable=unused-argument - metrics_config = deepcopy(self.config) - metrics_config.update(metrics_config_dict) - if raised_error is not None: - try: - metrics_config._verify_metrics_config() - except InvalidConfigurationErrors as error: - assert any( - (isinstance(error, raised_error) for error in error.errors) - ), f"No '{raised_error.__name__}' raised for test case '{test_case}'!" 
+ def test_validation(self, attribute, value, expected_error, expected_message): + if expected_error is None: + Configuration(**{attribute: value}) else: - metrics_config._verify_metrics_config() + with pytest.raises(expected_error, match=expected_message): + Configuration(**{attribute: value}) + + def test_pipeline_property_is_merged_from_configs(self, tmp_path): + first_config = tmp_path / "pipeline.yml" + first_config.write_text( + """ +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +pipeline: + - labelername: + type: labeler + schema: quickstart/exampledata/rules/labeler/schema.json + include_parent_labels: true + specific_rules: [] + generic_rules: [] +""" + ) + second_config = tmp_path / "pipeline2.yml" + second_config.write_text( + """ +pipeline: + - dissectorname: + type: dissector + specific_rules: [] + generic_rules: [] +""" + ) + config = Configuration.from_sources([str(first_config), str(second_config)]) + assert isinstance(config.pipeline, list) + assert len(config.pipeline) == 2 + assert config.pipeline[0]["labelername"]["type"] == "labeler" + assert config.pipeline[1]["dissectorname"]["type"] == "dissector" + + def test_create_from_sources_with_incomplete_second_config(self): + config = Configuration.from_sources([path_to_config, path_to_only_output_config]) + assert config.output.get("kafka_output").get("type") == "dummy_output" + + def test_create_from_sources_collects_errors(self): + with pytest.raises(InvalidConfigurationErrors) as raised: + config = Configuration.from_sources([path_to_invalid_config, path_to_invalid_config]) + assert len(raised.value.errors) == 2 + assert isinstance(config, Configuration) + assert isinstance(config._configs, tuple) + + def test_create_from_sources_loads_processors(self): + config = Configuration.from_sources([path_to_config]) + labeler = config.pipeline[2] + assert isinstance(labeler, dict) + assert isinstance(labeler["labelername"], dict) + assert isinstance(labeler["labelername"]["type"], str) + assert labeler["labelername"]["type"] == "labeler" + + def test_create_from_sources_loads_rules(self): + config = Configuration.from_sources([path_to_config]) + labeler = config.pipeline[2] + assert isinstance(labeler, dict) + assert isinstance(labeler["labelername"], dict) + assert isinstance(labeler["labelername"]["specific_rules"], list) + assert isinstance(labeler["labelername"]["generic_rules"], list) + assert isinstance(labeler["labelername"]["specific_rules"][0], dict) + assert isinstance(labeler["labelername"]["generic_rules"][0], dict) + + def test_verify_passes_for_valid_configuration(self): + try: + Configuration.from_sources([path_to_config]) + except InvalidConfigurationError as error: + pytest.fail(f"The verification should pass for a valid configuration.: {error}") @pytest.mark.parametrize( - "test_case, config_dict, raised_errors", + "test_case, test_config, error_count", [ ( - "valid configuration", - {}, - None, + "str as processor definition", + {"pipeline": [{"processor_name": "SHOULD BE A DICT"}]}, + 1, + ), + ( + "unknown processor type", + {"pipeline": [{"processor_name": {"type": "UNKNOWN"}}]}, + 1, + ), + ( + "incomplete processor definition", + {"pipeline": [{"processor_name": {"type": "labeler"}}]}, + 1, + ), + ( + "failure in rule definition", + { + "pipeline": [ + { + "processor_name": { + "type": "dissector", + "specific_rules": [ + { + "filter": "message", + "dissector": { + "mapping": {"message": "%{source} %{target}"} + }, + "description": "do nothing rule for 
dissector", + } + ], + "generic_rules": [ + { + "filter": "message", + "dissector": "THIS SHOULD BE A DICT", + "description": "do nothing rule for dissector", + } + ], + } + } + ] + }, + 1, + ), + ( + "collects multiple errors", + { + "pipeline": [ + { + "error_processor": "THIS SHOULD BE A DICT", + }, + { + "processor_name": { + "type": "dissector", + "specific_rules": [ + { + "filter": "message", + "dissector": { + "mapping": {"message": "%{source} %{target}"} + }, + "description": "do nothing rule for dissector", + } + ], + "generic_rules": [ + { + "filter": "message", + "dissector": "THIS SHOULD BE A DICT", + "description": "do nothing rule for dissector", + } + ], + }, + }, + { + "another_error_processor": {"type": "UNKNOWN"}, + }, + ] + }, + 3, + ), + ("verifies input config", {"input": {"random_name": {"type": "UNKNOWN"}}}, 1), + ("verifies output config", {"output": {"kafka_output": {"type": "UNKNOWN"}}}, 1), + ( + "multiple outputs but one config failure", + { + "output": { + "dummy": {"type": "WRONG_TYPE"}, + "kafka_output": {"type": "dummy_output"}, + }, + }, + 1, + ), + ( + "multiple output configs success", + { + "output": { + "dummy": {"type": "dummy_output"}, + "kafka_output": {"type": "dummy_output"}, + } + }, + 0, ), ( "processor does not exist", @@ -172,17 +329,12 @@ def test_verify_metrics_config( "pipeline": [ { "some_processor_name": { - "type": "does_not_exist", + "type": "DOES_NOT_EXIST", } } ] }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: some_processor_name - Unknown type 'does_not_exist'", - ) - ], + 1, ), ( "generic_rules missing from processor", @@ -198,15 +350,7 @@ def test_verify_metrics_config( } ] }, - [ - ( - InvalidProcessorConfigurationError, - re.escape( - "Invalid processor configuration: labelername - Required option(s) are " - + "missing: 'generic_rules'." 
- ), - ) - ], + 1, ), ( "unknown option without spaces in processor", @@ -219,17 +363,12 @@ def test_verify_metrics_config( "include_parent_labels": "on", "specific_rules": ["quickstart/exampledata/rules/labeler/specific"], "generic_rules": ["quickstart/exampledata/rules/labeler/generic"], - "some_unknown_option": "foo", + "SOME_UNKNOWN_OPTION": "FOO", } } ] }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: labelername - Unknown option: 'some_unknown_option'.", - ) - ], + 1, ), ( "unknown option with spaces in processor", @@ -242,17 +381,12 @@ def test_verify_metrics_config( "include_parent_labels": "on", "specific_rules": ["quickstart/exampledata/rules/labeler/specific"], "generic_rules": ["quickstart/exampledata/rules/labeler/generic"], - "some unknown option": "foo", + "SOME UNKNOWN OPTION": "FOO", } } ] }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: labelername - Unknown option: 'some unknown option'.", - ) - ], + 1, ), ( "two processor do not exist", @@ -260,62 +394,13 @@ def test_verify_metrics_config( "pipeline": [ { "some_processor_name": { - "type": "does_not_exist", + "type": "DOES_NOT_EXIST", } }, - {"another_processor_name": {"type": "does_not_exist"}}, + {"another_processor_name": {"type": "DOES_NOT_EXIST"}}, ] }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: some_processor_name - Unknown type 'does_not_exist'", - ), - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: another_processor_name - Unknown type 'does_not_exist'", - ), - ], - ), - ( - "pipeline count invalid and processor type missing", - {"process_count": 0, "pipeline": [{"some_processor_name": {}}]}, - [ - ( - InvalidConfigurationError, - "Invalid Configuration: Process count must be an integer of one or larger, not: 0", - ), - ], - ), - ( - "pipeline is empty list", - {"pipeline": []}, - [ - ( - InvalidConfigurationError, - 'Invalid Configuration: "pipeline" must contain at least one item!', - ) - ], - ), - ( - "pipeline is empty dict", - {"pipeline": {}}, - [ - ( - InvalidConfigurationError, - 'Invalid Configuration: "pipeline" must contain at least one item!', - ) - ], - ), - ( - "pipeline is string", - {"pipeline": "foo"}, - [ - ( - InvalidConfigurationError, - '"pipeline" must be a list of processor dictionary configurations!', - ) - ], + 2, ), ( "processor error for config and output does not exists", @@ -329,13 +414,13 @@ def test_verify_metrics_config( "include_parent_labels": "on", "specific_rules": ["quickstart/exampledata/rules/labeler/specific"], "generic_rules": ["quickstart/exampledata/rules/labeler/generic"], - "some unknown option": "foo", + "SOME UNKNOWN OPTION": "FOO", } }, { "pseudo": { "type": "pseudonymizer", - "outputs": [{"kafka": "topic"}], + "outputs": [{"KAFKA": "topic"}], "pubkey_analyst": "tests/testdata/unit/pseudonymizer/example_analyst_pub.pem", "pubkey_depseudo": "tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem", "hash_salt": "a_secret_tasty_ingredient", @@ -351,285 +436,56 @@ def test_verify_metrics_config( }, ], }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: labelername - Unknown option: 'some unknown option'.", - ), - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: pseudo: output 'kafka' does not exist in logprep outputs", - ), - ], - ), - ], - ) - def test_verify_error(self, config_dict, raised_errors, test_case): - config = deepcopy(self.config) - 
config.update(config_dict) - if raised_errors is not None: - with pytest.raises(InvalidConfigurationErrors) as e_info: - config.verify(logger) - errors_set = [(type(err), str(err)) for err in e_info.value.errors] - assert len(raised_errors) == len(errors_set), test_case - zipped_errors = zip(raised_errors, errors_set) - for expected_error, raised_error in zipped_errors: - assert expected_error[0] == raised_error[0], "error class differ" - assert re.search(expected_error[1], raised_error[1]), "error message differ" - - @pytest.mark.parametrize( - "test_case, config_dict, raised_errors", - [ - ( - "valid configuration", - {}, - None, - ), - ( - "processor does not exist", - { - "pipeline": [ - { - "some_processor_name": { - "type": "does_not_exist", - } - } - ] - }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: some_processor_name - Unknown type 'does_not_exist'", - ) - ], + 2, ), ( - "generic_rules missing from processor", + "rule with not existent output", { + "output": {"kafka_output": {"type": "dummy_output"}}, "pipeline": [ { - "labelername": { - "type": "labeler", - "schema": "quickstart/exampledata/rules/labeler/schema.json", - "include_parent_labels": "on", - "specific_rules": ["quickstart/exampledata/rules/labeler/specific"], + "selective_extractor": { + "type": "selective_extractor", + "generic_rules": [], + "specific_rules": [ + { + "filter": "message", + "selective_extractor": { + "outputs": [{"DOES_NOT_EXIST": "FOO"}] + }, + "source_fields": ["field.extract", "field2", "field3"], + } + ], } } - ] - }, - [ - ( - InvalidProcessorConfigurationError, - re.escape( - "Invalid processor configuration: labelername - Required option(s) are " - + "missing: 'generic_rules'." - ), - ) - ], - ), - ( - "two processors do not exist", - { - "pipeline": [ - { - "some_processor_name": { - "type": "does_not_exist", - } - }, - {"another_processor_name": {"type": "does_not_exist"}}, - ] - }, - [ - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: some_processor_name - Unknown type 'does_not_exist'", - ), - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: another_processor_name - Unknown type 'does_not_exist'", - ), - ], - ), - ( - "pipeline count invalid and processor type missing", - {"process_count": 0, "pipeline": [{"some_processor_name": {}}]}, - [ - ( - InvalidConfigurationError, - "Invalid Configuration: Process count must be an integer of one or larger, not: 0", - ), - ( - InvalidProcessorConfigurationError, - "Invalid processor configuration: some_processor_name - The type specification is missing for element with name 'some_processor_name'", - ), - ], - ), - ( - "metrics configured without errors", - { - "metrics": { - "period": 10, - "enabled": True, - "port": 8000, - } - }, - [], - ), - ( - "metrics enabled", - { - "metrics": { - "port": 8000, - } + ], }, - [ - ( - RequiredConfigurationKeyMissingError, - "Required option is missing: metrics > enabled", - ) - ], + 1, ), ], ) - def test_verify_errors_get_collected(self, config_dict, raised_errors, test_case): - config = deepcopy(self.config) - config.update(config_dict) - if raised_errors is not None: - errors = config._check_for_errors(logger) - collected_errors = [] - for error in errors: - collected_errors += error.errors - errors_set = [(type(error), str(error)) for error in collected_errors] - assert len(raised_errors) == len(errors_set), test_case - zipped_errors = zip(raised_errors, errors_set) - for expected_error, raised_error in zipped_errors: 
- assert expected_error[0] == raised_error[0], "error class differ" - assert re.search(expected_error[1], raised_error[1]), "error message differ" + def test_verify_verifies_config(self, tmp_path, test_case, test_config, error_count): + test_config_path = tmp_path / "failure-config.yml" + test_config = Configuration(**test_config) + if not test_config.input: + test_config.input = {"dummy": {"type": "dummy_input", "documents": []}} + if not test_config.output: + test_config.output = {"dummy": {"type": "dummy_output"}} + test_config_path.write_text(test_config.as_yaml()) + if error_count: + with pytest.raises(InvalidConfigurationErrors) as raised: + Configuration.from_sources([str(test_config_path)]) + assert len(raised.value.errors) == error_count, test_case else: - config._verify_metrics_config() - - def test_verify_input_raises_missing_input_key(self): - config = deepcopy(self.config) - del config["input"] - with pytest.raises( - RequiredConfigurationKeyMissingError, match="Required option is missing: input" - ): - config._verify_input(logger) - - def test_verify_input_raises_type_error(self): - config = deepcopy(self.config) - del config["input"]["kafka_input"]["topic"] - with pytest.raises( - InvalidInputConnectorConfigurationError, - match=re.escape( - "Invalid input connector configuration: Required option(s) are missing: " - + "'topic'." - ), - ): - config._verify_input(logger) - - def test_verify_output_raises_missing_output_key(self): - config = deepcopy(self.config) - del config["output"] - with pytest.raises( - RequiredConfigurationKeyMissingError, match="Required option is missing: output" - ): - config._verify_output(logger) - - def test_patch_yaml_with_json_connectors_inserts_json_input_connector(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - assert ( - regular_config.get("input", {}).get("kafka_input", {}).get("type") - == "confluentkafka_input" - ) - patched_config_path = Configuration.patch_yaml_with_json_connectors( - path_to_config, str(tmp_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert patched_config.get("input", {}).get("patched_input", {}).get("type") == "json_input" - - def test_patch_yaml_with_json_connectors_inserts_jsonl_input_connector(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - assert ( - regular_config.get("input", {}).get("kafka_input", {}).get("type") - == "confluentkafka_input" - ) - input_file_path = tmp_path / "test.jsonl" - patched_config_path = Configuration.patch_yaml_with_json_connectors( - path_to_config, str(tmp_path), str(input_file_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert patched_config.get("input", {}).get("patched_input", {}).get("type") == "jsonl_input" - - def test_patch_yaml_with_json_connectors_keeps_preprocessors(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - regular_config["input"]["kafka_input"]["preprocessing"] = { - "log_arrival_time_target_field": "foo" - } - test_config_path = str(tmp_path / "test_config.yaml") - dump_config_as_file(test_config_path, regular_config) - patched_config_path = Configuration.patch_yaml_with_json_connectors( - test_config_path, str(tmp_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert patched_config.get("input", {}).get("patched_input", {}).get("preprocessing") == { - "log_arrival_time_target_field": "foo" - } + 
Configuration.from_sources([str(test_config_path)]) - def test_patch_yaml_with_json_connectors_inserts_jsonl_output_connector(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - assert ( - regular_config.get("output", {}).get("kafka_output", {}).get("type") - == "confluentkafka_output" - ) - patched_config_path = Configuration.patch_yaml_with_json_connectors( - path_to_config, str(tmp_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert ( - patched_config.get("output", {}).get("patched_output", {}).get("type") == "jsonl_output" - ) - - def test_patch_yaml_with_json_connectors_set_process_count_to_one(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - assert regular_config.get("process_count") == 3 - patched_config_path = Configuration.patch_yaml_with_json_connectors( - path_to_config, str(tmp_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert patched_config.get("process_count") == 1 - - def test_patch_yaml_with_json_connectors_drops_metrics_config(self, tmp_path): - regular_config = GetterFactory.from_string(path_to_config).get_yaml() - regular_config["metrics"] = {"enabled": "true"} - test_config_path = str(tmp_path / "test_config.yaml") - dump_config_as_file(test_config_path, regular_config) - patched_config_path = Configuration.patch_yaml_with_json_connectors( - test_config_path, str(tmp_path) - ) - patched_config = GetterFactory.from_string(patched_config_path).get_yaml() - assert patched_config.get("metrics") is None - - def test_config_gets_enriched_by_environment(self, tmp_path): - config_path = tmp_path / "pipeline.yml" - config_path.write_text( - """ -version: $LOGPREP_VERSION -process_count: $LOGPREP_PROCESS_COUNT -timeout: 0.1 -logger: - level: $LOGPREP_LOG_LEVEL -$LOGPREP_PIPELINE -$LOGPREP_INPUT -$LOGPREP_OUTPUT -""" - ) - os.environ["LOGPREP_VERSION"] = "1" - os.environ["LOGPREP_PROCESS_COUNT"] = "1" - os.environ["LOGPREP_LOG_LEVEL"] = "DEBUG" - os.environ[ - "LOGPREP_PIPELINE" - ] = """ + patch = mock.patch( + "os.environ", + { + "LOGPREP_VERSION": "1", + "LOGPREP_PROCESS_COUNT": "16", + "LOGPREP_LOG_LEVEL": "DEBUG", + "LOGPREP_PIPELINE": """ pipeline: - labelername: type: labeler @@ -639,10 +495,8 @@ def test_config_gets_enriched_by_environment(self, tmp_path): - quickstart/exampledata/rules/labeler/specific generic_rules: - quickstart/exampledata/rules/labeler/generic -""" - os.environ[ - "LOGPREP_OUTPUT" - ] = """ +""", + "LOGPREP_OUTPUT": """ output: kafka: type: confluentkafka_output @@ -654,13 +508,35 @@ def test_config_gets_enriched_by_environment(self, tmp_path): bootstrap.servers: "172.21.0.5:9092" acks: "-1" compression.type: "none" +""", + "LOGPREP_INPUT": "input:\n kafka:\n type: confluentkafka_input\n topic: consumer\n kafka_config:\n bootstrap.servers: localhost:9092\n group.id: testgroup\n", + }, + ) + + @patch + def test_config_gets_enriched_by_environment(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +version: $LOGPREP_VERSION +process_count: $LOGPREP_PROCESS_COUNT +timeout: 0.1 +logger: + level: $LOGPREP_LOG_LEVEL +$LOGPREP_PIPELINE +$LOGPREP_INPUT +$LOGPREP_OUTPUT """ - os.environ[ - "LOGPREP_INPUT" - ] = "input:\n kafka:\n type: confluentkafka_input\n topic: consumer\n kafka_config:\n bootstrap.servers: localhost:9092\n group.id: testgroup\n" - config = Configuration.create_from_yaml(str(config_path)) - config.verify(mock.MagicMock()) + ) + config 
= Configuration.from_sources([str(config_path)]) + config._verify() + assert config.version == "1" + assert config.process_count == 16 + assert config.output["kafka"]["topic"] == "producer" + assert config.input["kafka"]["topic"] == "consumer" + assert len(config.pipeline) == 1 + @patch def test_config_gets_enriched_by_environment_with_non_existent_variable(self, tmp_path): config_path = tmp_path / "pipeline.yml" config_path.write_text( @@ -676,169 +552,614 @@ def test_config_gets_enriched_by_environment_with_non_existent_variable(self, tm $LOGPREP_OUTPUT """ ) - os.environ["LOGPREP_VERSION"] = "1" - os.environ["LOGPREP_PROCESS_COUNT"] = "1" - os.environ["LOGPREP_LOG_LEVEL"] = "DEBUG" - os.environ[ - "LOGPREP_PIPELINE" - ] = """ -pipeline: - - labelername: - type: labeler - schema: quickstart/exampledata/rules/labeler/schema.json - include_parent_labels: true - specific_rules: - - quickstart/exampledata/rules/labeler/specific - generic_rules: - - quickstart/exampledata/rules/labeler/generic -""" - os.environ[ - "LOGPREP_OUTPUT" - ] = """ -output: - kafka: - type: confluentkafka_output - topic: producer - error_topic: producer_error - flush_timeout: 30 - send_timeout: 2 - kafka_config: - bootstrap.servers: "172.21.0.5:9092" - acks: "-1" - compression.type: "none" -""" - os.environ[ - "LOGPREP_INPUT" - ] = "input:\n kafka:\n type: confluentkafka_input\n topic: consumer\n" - config = Configuration.create_from_yaml(str(config_path)) with pytest.raises( InvalidConfigurationErrors, match=r"Environment variable\(s\) used, but not set: LOGPREP_I_DO_NOT_EXIST", ): - config.verify(mock.MagicMock()) + Configuration.from_sources([str(config_path)]) - def test_verifies_processor_configs_against_defined_outputs(self): - config = Configuration() - pipeline = [ - { - "se": { - "type": "selective_extractor", - "specific_rules": ["tests/testdata/unit/selective_extractor/rules/specific"], - "generic_rules": ["tests/testdata/unit/selective_extractor/rules/generic"], - } - }, - { - "pd": { - "type": "pre_detector", - "generic_rules": ["tests/testdata/unit/pre_detector/rules/generic"], - "specific_rules": ["tests/testdata/unit/pre_detector/rules/specific"], - "outputs": [{"kafka": "pre_detector_alerts"}], - "alert_ip_list_path": "tests/testdata/unit/pre_detector/alert_ips.yml", - } - }, - { - "pseudo": { - "type": "pseudonymizer", - "outputs": [{"kafka": "topic"}], - "pubkey_analyst": "tests/testdata/unit/pseudonymizer/example_analyst_pub.pem", - "pubkey_depseudo": "tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem", - "hash_salt": "a_secret_tasty_ingredient", - "specific_rules": ["tests/testdata/unit/pseudonymizer/rules/specific/"], - "generic_rules": ["tests/testdata/unit/pseudonymizer/rules/generic/"], - "regex_mapping": "tests/testdata/unit/pseudonymizer/rules/regex_mapping.yml", - "max_cached_pseudonyms": 1000000, - } - }, - ] - config.update({"pipeline": pipeline, "output": {}}) + def test_duplicate_rule_id_per_processor_raises(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +pipeline: + - my dissector: + type: dissector + specific_rules: + - filter: message + dissector: + id: same id + mapping: + message: "%{new_field} %{next_field}" + - filter: message + dissector: + id: same id + mapping: + message: "%{other_field} %{next_field}" + generic_rules: [] +""" + ) with pytest.raises(InvalidConfigurationErrors) as raised: - config._verify_pipeline(logger=logger) - 
assert len(raised.value.errors) == 3 + Configuration.from_sources([str(config_path)]) + assert len(raised.value.errors) == 1 for error in raised.value.errors: - assert "output 'kafka' does not exist in logprep outputs" in error.args[0] - - def test_verify_pipeline_without_processor_outputs_ignores_processor_output_errors(self): - config = Configuration() - pipeline = [ - { - "pd": { - "type": "pre_detector", - "generic_rules": ["tests/testdata/unit/pre_detector/rules/generic"], - "specific_rules": ["tests/testdata/unit/pre_detector/rules/specific"], - "outputs": [{"kafka": "pre_detector_alerts"}], - "alert_ip_list_path": "tests/testdata/unit/pre_detector/alert_ips.yml", - } - }, - ] - config.update({"pipeline": pipeline, "output": {}}) - try: - config.verify_pipeline_without_processor_outputs(logger=logger) - except InvalidConfigurationErrors as error: - assert False, f"Shouldn't raise output does not exist error: '{error}'" + assert "Duplicate rule id: same id" in error.args[0] - def test_duplicate_rule_id_per_processor_raises(self): - config = Configuration() - pipeline = [ - { - "my dissector": { - "type": "dissector", - "specific_rules": [ - { - "filter": "message", - "dissector": { - "id": "same id", - "mapping": {"message": "%{new_field} %{next_field}"}, - }, - }, - { - "filter": "message", - "dissector": { - "id": "same id", - "mapping": {"message": "%{other_field} %{next_field}"}, - }, - }, - ], - "generic_rules": [], - } - }, - ] - config.update({"pipeline": pipeline, "output": {}}) + def test_duplicate_rule_id_in_different_rule_trees_per_processor_raises(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +pipeline: + - my dissector: + type: dissector + specific_rules: + - filter: message + dissector: + id: same id + mapping: + message: "%{new_field} %{next_field}" + generic_rules: + - filter: message + dissector: + id: same id + mapping: + message: "%{other_field} %{next_field}" +""" + ) with pytest.raises(InvalidConfigurationErrors) as raised: - config._verify_pipeline(logger=logger) + Configuration.from_sources([str(config_path)]) assert len(raised.value.errors) == 1 for error in raised.value.errors: assert "Duplicate rule id: same id" in error.args[0] - def test_duplicate_rule_id_in_different_rule_trees_per_processor_raises(self): - config = Configuration() - pipeline = [ + @pytest.mark.parametrize( + "test_case, metrics_config_dict, raised_error", + [ + ( + "valid configuration", + {"enabled": True, "port": 8000}, + None, + ), + ( + "invalid datatype in port", + {"enabled": True, "port": "8000"}, + TypeError, + ), + ( + "unknown option", + {"enabled": True, "port": 8000, "UNKNOWN_OPTION": "FOO"}, + TypeError, + ), + ], + ) + def test_verify_metrics_config( + self, metrics_config_dict, raised_error, test_case + ): # pylint: disable=unused-argument + if raised_error is None: + _ = Configuration(**{"metrics": metrics_config_dict}) + else: + with pytest.raises(raised_error): + _ = Configuration(**{"metrics": metrics_config_dict}) + + def test_reload_reloads_complete_config(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +version: first_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert config.version == "first_version" + 
config_path.write_text( + """ +version: second_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config.reload() + assert config.version == "second_version" + + def test_reload_raises_on_invalid_config(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +version: first_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert config.version == "first_version" + config_path.write_text( + """ +version: second_version +process_count: THIS SHOULD BE AN INT +timeout: 0.1 +logger: + level: DEBUG +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + with pytest.raises(InvalidConfigurationError): + config.reload() + assert config.version == "first_version" + + def test_reload_raises_on_invalid_processor_config(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +version: first_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +pipeline: + - labelername: + type: labeler + schema: quickstart/exampledata/rules/labeler/schema.json + include_parent_labels: true + specific_rules: [] + generic_rules: [] +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert config.version == "first_version" + config_path.write_text( + """ +version: second_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +pipeline: + - labelername: + type: labeler + schema: quickstart/exampledata/rules/labeler/schema.json + include_parent_labels: true + specific_rules: [] + generic_rules: [] + - new_processor: + type: THIS SHOULD BE A VALID PROCESSOR +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + with pytest.raises(InvalidConfigurationError): + config.reload() + assert config.version == "first_version" + + def test_reload_raises_on_same_version(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +version: first_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert config.version == "first_version" + config_path.write_text( + """ +version: first_version +process_count: 2 +timeout: 0.1 +logger: + level: DEBUG +pipeline: + - labelername: + type: labeler + schema: quickstart/exampledata/rules/labeler/schema.json + include_parent_labels: true + specific_rules: [] + generic_rules: [] +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + with pytest.raises(InvalidConfigurationError, match="Configuration version didn't change."): + config.reload() + assert config.version == "first_version" + + def test_as_dict_returns_config(self): + config = Configuration.from_sources([path_to_config, path_to_only_output_config]) + config_dict = config.as_dict() + assert isinstance(config_dict, dict) + assert config_dict["output"]["kafka_output"]["type"] == "dummy_output" + assert len(config_dict["output"]) == 1, "only last output should be in config" + assert len(config_dict["pipeline"]) == 4, 
"all processors should be in config" + labeler = config_dict["pipeline"][2]["labelername"] + assert len(labeler["specific_rules"]) == 1 + assert isinstance(labeler["specific_rules"][0], dict) + + def test_as_json_returns_json(self): + config = Configuration.from_sources([path_to_config, path_to_only_output_config]) + assert isinstance(config.as_json(), str) + assert '"type": "dummy_output"' in config.as_json() + + def test_as_yaml_returns_yaml(self): + config = Configuration.from_sources([path_to_config, path_to_only_output_config]) + assert isinstance(config.as_yaml(), str) + assert "type: dummy_output" in config.as_yaml() + + def test_as_dict_returns_json_serializable_dict(self, config_path): + config = Configuration.from_sources([str(config_path)]) + config.version = "super_custom_version" + config_dict = config.as_dict() + assert isinstance(config_dict, dict) + for key in config_dict.get("pipeline"): + try: + assert json.dumps(key) + except Exception as error: + raise AssertionError(f"Value for key {key} is not json serializable") from error + assert json.dumps(config_dict), "Config dict is not json serializable" + assert config.as_json(), "Config json is not json serializable" + config_path.write_text(config.as_json()) + + def test_returned_json_is_valid_config(self, config_path): + config = Configuration.from_sources([str(config_path)]) + config.version = "super_custom_version" + config_path.write_text(config.as_json()) + newconfig = Configuration.from_sources([str(config_path)]) + assert newconfig.version == "super_custom_version" + + def test_returned_yaml_is_valid_config(self, config_path): + config = Configuration.from_sources([str(config_path)]) + config.version = "super_custom_version" + config_path.write_text(config.as_yaml()) + newconfig = Configuration.from_sources([str(config_path)]) + assert newconfig.version == "super_custom_version" + + def test_reload_loads_generated_config(self, config_path): + config = Configuration.from_sources([str(config_path)]) + config_path.write_text(config.as_yaml()) + config.version = "very old version" + config.reload() + assert config.version == "1" + + def test_reload_sets_pipeline(self, config_path): + config = Configuration.from_sources([str(config_path)]) + config.config_refresh_interval = 5 + config_path.write_text(config.as_yaml()) + config.version = "older version" + config.reload() + assert config.pipeline[2]["labelername"]["type"] == "labeler" + + def test_reload_sets_new_pipline(self, config_path): + config = Configuration.from_sources([str(config_path)]) + assert len(config.pipeline) == 4 + config.pipeline.append( { - "my dissector": { - "type": "dissector", - "specific_rules": [ - { - "filter": "message", - "dissector": { - "id": "same id", - "mapping": {"message": "%{new_field} %{next_field}"}, - }, - }, - ], - "generic_rules": [ - { - "filter": "message", - "dissector": { - "id": "same id", - "mapping": {"message": "%{other_field} %{next_field}"}, - }, - }, - ], + "new_processor": { + "type": "field_manager", + "generic_rules": [], + "specific_rules": [], } - }, - ] - config.update({"pipeline": pipeline, "output": {}}) - with pytest.raises(InvalidConfigurationErrors) as raised: - config._verify_pipeline(logger=logger) - assert len(raised.value.errors) == 1 - for error in raised.value.errors: - assert "Duplicate rule id: same id" in error.args[0] + } + ) + config_path.write_text(config.as_yaml()) + config.version = "older version" + config.reload() + assert len(config.pipeline) == 5 + assert 
config.pipeline[4]["new_processor"]["type"] == "field_manager" + + def test_configurations_are_equal_if_version_is_equal(self): + config = Configuration.from_sources([path_to_config]) + config2 = Configuration.from_sources([path_to_config]) + assert config is not config2 + assert config.version == config2.version + config.config_refresh_interval = 99 + assert config.config_refresh_interval != config2.config_refresh_interval + assert config == config2 + + @pytest.mark.parametrize( + "testcase, mocked, side_effect, expected_error_message", + [ + ( + "getter protocol does not exist", + "logprep.util.getter.GetterFactory.from_string", + GetterNotFoundError("No getter for protocol 'does_not_exist'"), + r"No getter for protocol 'does_not_exist'", + ), + ( + "getter raises FileNotFoundError", + "logprep.util.getter.FileGetter.get", + FileNotFoundError, + r"One or more of the given config file\(s\) does not exist:", + ), + ( + "document is not a valid json or yaml", + "logprep.util.getter.FileGetter.get_yaml", + ScannerError, + "Invalid yaml or json file:", + ), + ( + "url returns 404", + "logprep.util.getter.GetterFactory.from_string", + HTTPError("404 Client Error: Not Found for url: http://does_not_exist"), + "404 Client Error: Not Found for url: http://does_not_exist", + ), + ], + ) + def test_configuration_raises_invalidconfigurationerror( + self, testcase, mocked, side_effect, expected_error_message + ): + with mock.patch(mocked, side_effect=side_effect): + with pytest.raises(InvalidConfigurationError, match=expected_error_message): + Configuration.from_sources([path_to_config]) + + def test_valueerror_in_from_source(self, config_path): + config_path.write_text("process_count: -1") + with pytest.raises(InvalidConfigurationError, match=r"'process_count' must be >= 1"): + Configuration.from_sources([str(config_path)]) + + def test_from_sources_without_config_paths_attribute(self): + with pytest.raises( + InvalidConfigurationError, match=r"does not exist: \/etc\/logprep\/pipeline\.yml" + ): + Configuration.from_sources() + + def test_config_with_missing_environment_error(self): + with mock.patch("os.environ", {"PROMETHEUS_MULTIPROC_DIR": "DOES/NOT/EXIST"}): + with pytest.raises( + InvalidConfigurationError, + match=r"'DOES\/NOT\/EXIST' does not exist", + ): + Configuration.from_sources([path_to_config]) + + def test_config_with_single_json_rule(self, config_path): + config_path.write_text( + """ +{ +"input": { + "dummy": {"type": "dummy_input", "documents": []} +}, +"output": { + "dummy": {"type": "dummy_output"} +}, +"pipeline": [ + { + "my dissector": { + "type": "dissector", + "specific_rules": [], + "generic_rules": [ + { + "filter": "message", + "dissector": { + "id": "random id", + "mapping": { + "message": "%{new_field} %{next_field}" + } + } + } + ] + } + } + ] +} +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert len(config.pipeline) == 1 + + def test_config_with_missing_environment_variable_and_other_failure_raises(self, config_path): + config_path.write_text( + """ +version: $LOGPREP_VERSION +process_count: 1 +pipeline: + - labelername: + type: DOES_NOT_EXIST + generic_rules: [] + specific_rules: [] +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + with pytest.raises(InvalidConfigurationError) as raised: + Configuration.from_sources([str(config_path)]) + assert len(raised.value.errors) == 2 + + def test_processor_config_with_file_path(self, config_path): + config_path.write_text( + """ +pipeline: + - the 
almighty dissector: + type: dissector + generic_rules: + - tests/testdata/unit/dissector/generic_rules/dissector_rule.json + specific_rules: [] +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + config = Configuration.from_sources([str(config_path)]) + assert len(config.pipeline) == 1 + assert len(config.pipeline[0]["the almighty dissector"]["generic_rules"]) == 1 + + @responses.activate + def test_processor_config_with_url_path(self, tmp_path): + config_path = tmp_path / "pipeline.yml" + config_path.write_text( + """ +pipeline: + - the almighty dissector: + type: dissector + generic_rules: + - http://localhost/dissector_rule.json + specific_rules: [] +input: + dummy: + type: dummy_input + documents: [] +output: + dummy: + type: dummy_output +""" + ) + resp_text = json.dumps( + [ + { + "filter": "message", + "dissector": { + "id": "random id", + "mapping": {"message": "%{new_field} %{next_field}"}, + }, + } + ] + ) + responses.add( + responses.GET, + "http://localhost/dissector_rule.json", + resp_text, + ) + config = Configuration.from_sources([str(config_path)]) + assert len(config.pipeline) == 1 + assert len(config.pipeline[0]["the almighty dissector"]["generic_rules"]) == 1 + + +class TestInvalidConfigurationErrors: + @pytest.mark.parametrize( + "error_list, expected_error_list", + [ + ([], []), + ( + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("test"), + ], + [ + InvalidConfigurationError("test"), + ], + ), + ( + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("test"), + TypeError("typeerror"), + ], + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("typeerror"), + ], + ), + ( + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("test"), + TypeError("typeerror"), + ValueError("valueerror"), + ], + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("typeerror"), + InvalidConfigurationError("valueerror"), + ], + ), + ( + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("test"), + TypeError("typeerror"), + ValueError("valueerror"), + ], + [ + InvalidConfigurationError("test"), + InvalidConfigurationError("typeerror"), + InvalidConfigurationError("valueerror"), + ], + ), + ], + ) + def test_invalid_configuration_error_only_append_unique_errors( + self, error_list, expected_error_list + ): + error = InvalidConfigurationErrors(error_list) + assert len(error.errors) == len(expected_error_list) + assert error.errors == expected_error_list diff --git a/tests/unit/util/test_getter.py b/tests/unit/util/test_getter.py index 353aa8db7..d17a1e802 100644 --- a/tests/unit/util/test_getter.py +++ b/tests/unit/util/test_getter.py @@ -8,7 +8,6 @@ from unittest import mock import pytest -import requests import responses from requests.auth import HTTPBasicAuth from requests.exceptions import Timeout diff --git a/tests/unit/util/test_helper.py b/tests/unit/util/test_helper.py index 555d510ee..a02354857 100644 --- a/tests/unit/util/test_helper.py +++ b/tests/unit/util/test_helper.py @@ -1,16 +1,20 @@ # pylint: disable=missing-docstring # pylint: disable=no-self-use +import re from unittest import mock import pytest +from logprep.util.configuration import Configuration from logprep.util.helper import ( camel_to_snake, - snake_to_camel, get_dotted_field_value, + get_versions_string, pop_dotted_field_value, + snake_to_camel, ) from logprep.util.json_handling import is_json +from tests.testdata.metadata import path_to_alternative_config, path_to_config 
class TestCamelToSnake: @@ -221,3 +225,49 @@ def test_get_dotted_field_removes_source_field2(self): value = pop_dotted_field_value(event, dotted_field) assert value == {"field": "value"} assert not event + + +class TestGetVersionString: + def test_get_version_string(self): + config = Configuration() + config.version = "0.1.0" + expected_pattern = ( + r"python version:\s+3\.\d+\.\d+\n" + r"logprep version:\s+[^\s]+\n" + r"configuration version:\s+0\.1\.0, None" + ) + + result = get_versions_string(config) + assert re.search(expected_pattern, result) + + def test_get_version_string_with_config_source(self): + config = Configuration.from_sources([path_to_config]) + expected_pattern = ( + r"python version:\s+3\.\d+\.\d+\n" + r"logprep version:\s+[^\s]+\n" + r"configuration version:\s+1,\s+file://[^\s]+/config\.yml" + ) + + result = get_versions_string(config) + assert re.search(expected_pattern, result) + + def test_get_version_string_with_multiple_config_sources(self): + config = Configuration.from_sources([path_to_config, path_to_config]) + expected_pattern = ( + r"python version:\s+3\.\d+\.\d+\n" + r"logprep version:\s+[^\s]+\n" + r"configuration version:\s+1,\s+file://[^\s]+/config\.yml,\s+file://[^\s]+/config\.yml" + ) + + result = get_versions_string(config) + assert re.search(expected_pattern, result) + + def test_get_version_string_without_config(self): + expected_pattern = ( + r"python version:\s+3\.\d+\.\d+\n" + r"logprep version:\s+[^\s]+\n" + r"configuration version:\s+no configuration found in file:///etc/logprep/pipeline.yml" + ) + + result = get_versions_string(None) + assert re.search(expected_pattern, result) diff --git a/tests/unit/util/test_rule_dry_runner.py b/tests/unit/util/test_rule_dry_runner.py index e20a2a86a..b3bafeeb8 100644 --- a/tests/unit/util/test_rule_dry_runner.py +++ b/tests/unit/util/test_rule_dry_runner.py @@ -4,7 +4,9 @@ import logging import os import tempfile +from pathlib import Path +from logprep.util.configuration import Configuration from logprep.util.rule_dry_runner import DryRunner @@ -18,8 +20,7 @@ def setup_method(self): type: dissector specific_rules: - tests/testdata/unit/dissector/ - generic_rules: - - tests/testdata/unit/dissector/ + generic_rules: [] - labelername: type: labeler schema: tests/testdata/unit/labeler/schemas/schema3.json @@ -35,7 +36,7 @@ def setup_method(self): regex_mapping: tests/testdata/unit/pseudonymizer/rules/regex_mapping.yml hash_salt: a_secret_tasty_ingredient outputs: - - patched_output: pseudonyms + - kafka_output: pseudonyms specific_rules: - tests/testdata/unit/pseudonymizer/rules/specific/ generic_rules: @@ -48,17 +49,28 @@ def setup_method(self): generic_rules: - tests/testdata/unit/pre_detector/rules/generic/ outputs: - - patched_output: sre_topic + - kafka_output: sre_topic - selective_extractor: type: selective_extractor specific_rules: - - tests/testdata/unit/selective_extractor/rules/specific/ - generic_rules: - - tests/testdata/unit/selective_extractor/rules/generic/ + - filter: message + selective_extractor: + source_fields: ["field1", "field2"] + outputs: + - kafka_output: topic + description: my reference rule + generic_rules: [] + input: + kafka_output: + type: dummy_input + documents: [] + output: + kafka_output: + type: dummy_output """ - self.config_path = os.path.join(tempfile.gettempdir(), "dry-run-config.yml") - with open(self.config_path, "w", encoding="utf8") as config_file: - config_file.write(config) + self.config_path = Path(tempfile.gettempdir()) / "dry-run-config.yml" + 
self.config_path.write_text(config) + self.config = Configuration.from_sources([str(self.config_path)]) def teardown_method(self): os.remove(self.config_path) @@ -71,7 +83,7 @@ def test_dry_run_accepts_json_as_input(self, tmp_path, capsys): dry_runner = DryRunner( input_file_path=input_json_file, - config_path=self.config_path, + config=self.config, full_output=True, use_json=True, logger=logging.getLogger("test-logger"), @@ -91,7 +103,7 @@ def test_dry_run_accepts_json_in_list_as_input(self, tmp_path, capsys): dry_runner = DryRunner( input_file_path=input_json_file, - config_path=self.config_path, + config=self.config, full_output=True, use_json=True, logger=logging.getLogger("test-logger"), @@ -111,7 +123,7 @@ def test_dry_run_accepts_jsonl_as_input(self, tmp_path, capsys): dry_runner = DryRunner( input_file_path=input_jsonl_file, - config_path=self.config_path, + config=self.config, full_output=True, use_json=False, logger=logging.getLogger("test-logger"), @@ -140,7 +152,7 @@ def test_dry_run_print_custom_output(self, tmp_path, capsys): dry_runner = DryRunner( input_file_path=input_json_file, - config_path=self.config_path, + config=self.config, full_output=True, use_json=True, logger=logging.getLogger("test-logger"), @@ -166,7 +178,7 @@ def test_dry_run_prints_predetection(self, tmp_path, capsys): dry_runner = DryRunner( input_file_path=input_json_file, - config_path=self.config_path, + config=self.config, full_output=True, use_json=True, logger=logging.getLogger("test-logger"), diff --git a/tests/unit/util/test_schema_and_rule_checker.py b/tests/unit/util/test_schema_and_rule_checker.py deleted file mode 100644 index a9bf21b1b..000000000 --- a/tests/unit/util/test_schema_and_rule_checker.py +++ /dev/null @@ -1,9 +0,0 @@ -# pylint: disable=missing-docstring -from logprep.util.schema_and_rule_checker import SchemaAndRuleChecker - - -class TestSchemaAndRuleChecker: - def test_init(self): - rule_checker = SchemaAndRuleChecker() - assert isinstance(rule_checker.errors, list) - assert len(rule_checker.errors) == 0 diff --git a/versioneer.py b/versioneer.py index 0e28fb52d..27d91c655 100644 --- a/versioneer.py +++ b/versioneer.py @@ -290,6 +290,8 @@ import sys from typing import Callable, Dict +from logprep.abc.exceptions import LogprepException + class VersioneerConfig: """Container for Versioneer configuration parameters.""" @@ -366,7 +368,7 @@ def get_config_from_root(root): return cfg -class NotThisMethod(Exception): +class NotThisMethod(LogprepException): """Exception raised if a method is not valid for the current scenario.""" @@ -486,7 +488,7 @@ def get_config(): return cfg -class NotThisMethod(Exception): +class NotThisMethod(LogprepException): """Exception raised if a method is not valid for the current scenario.""" @@ -1688,7 +1690,7 @@ def render(pieces, style): } -class VersioneerBadRootError(Exception): +class VersioneerBadRootError(LogprepException): """The project root directory is unknown or missing key files."""