Skip to content

Commit

Permalink
241 feature load files from http is broken because of file validator (#243)
Browse files Browse the repository at this point in the history

* Refactor http input acceptance test
* add acceptance test for full pipeline run
* add convert_to_http_config
* remove file validation
Co-authored-by: dtrai2 <[email protected]>
  • Loading branch information
ekneg54 authored Dec 8, 2022
1 parent 68bcd21 commit d459e73
Show file tree
Hide file tree
Showing 13 changed files with 245 additions and 62 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ jobs:
pylint --rcfile=.pylintrc --fail-under 9.5 ${{ steps.changed-files.outputs.all_changed_files }}
- name: Run tests and collect coverage
run: pytest tests --cov=logprep --cov-report=xml
run: pytest tests/unit --cov=logprep --cov-report=xml

- name: Upload coverage reports to Codecov with GitHub Action
uses: codecov/codecov-action@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
pylint --rcfile=.pylintrc --fail-under 9.5 ${{ steps.changed-files.outputs.all_changed_files }}
- name: Run tests and collect coverage
run: pytest tests --cov=logprep --cov-report=xml
run: pytest tests/unit --cov=logprep --cov-report=xml

- name: Upload coverage reports to Codecov with GitHub Action
uses: codecov/codecov-action@v2
Expand Down
3 changes: 1 addition & 2 deletions logprep/processor/labeler/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
from logprep.abc import Processor
from logprep.processor.labeler.labeling_schema import LabelingSchema
from logprep.processor.labeler.rule import LabelerRule
from logprep.util.validators import file_validator, json_validator


class Labeler(Processor):
Expand All @@ -48,7 +47,7 @@ class Labeler(Processor):
class Config(Processor.Config):
"""Labeler Configurations"""

schema: str = field(validator=[file_validator, json_validator])
schema: str = field(validator=[validators.instance_of(str)])
"""Path to a labeling schema file. For string format see :ref:`getters`."""
include_parent_labels: Optional[bool] = field(
default=False, validator=validators.optional(validator=validators.instance_of(bool))
Expand Down
8 changes: 5 additions & 3 deletions logprep/processor/normalizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from logprep.processor.normalizer.rule import NormalizerRule
from logprep.util.getter import GetterFactory
from logprep.util.helper import add_field_to, get_dotted_field_value
from logprep.util.validators import file_validator, directory_validator
from logprep.util.validators import directory_validator


class Normalizer(Processor):
Expand All @@ -57,10 +57,12 @@ class Normalizer(Processor):
class Config(Processor.Config):
"""config description for Normalizer"""

regex_mapping: str = field(validator=file_validator)
regex_mapping: str = field(validator=validators.instance_of(str))
"""Path to regex mapping file with regex keywords that are replaced with regex expressions
by the normalizer. For string format see :ref:`getters`."""
html_replace_fields: Optional[str] = field(default=None, validator=file_validator)
html_replace_fields: Optional[str] = field(
default=None, validator=[validators.optional(validators.instance_of(str))]
)
"""Path to yaml file with html replace fields. For string format see :ref:`getters`"""
count_grok_pattern_matches: Optional[dict] = field(
default=None, validator=validators.optional(validators.instance_of(dict))
Expand Down
5 changes: 3 additions & 2 deletions logprep/processor/pre_detector/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from logprep.abc import Processor
from logprep.processor.pre_detector.ip_alerter import IPAlerter
from logprep.processor.pre_detector.rule import PreDetectorRule
from logprep.util.validators import file_validator


class PreDetectorError(BaseException):
Expand All @@ -55,7 +54,9 @@ class Config(Processor.Config):
A Kafka topic for the detection results of the Predetector.
Results in this topic can be linked to the original event via a `pre_detector_id`.
"""
alert_ip_list_path: str = field(default=None, validator=file_validator)
alert_ip_list_path: str = field(
default=None, validator=validators.optional(validators.instance_of(str))
)
"""
Path to a YML file or a list of paths to YML files with dictionaries of IPs.
For string format see :ref:`getters`.
Expand Down
8 changes: 4 additions & 4 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from logprep.util.cache import Cache
from logprep.util.getter import GetterFactory
from logprep.util.hasher import SHA256Hasher
from logprep.util.validators import file_validator, list_of_urls_validator
from logprep.util.validators import list_of_urls_validator


class Pseudonymizer(Processor):
Expand All @@ -60,20 +60,20 @@ class Config(Processor.Config):
These are not the pseudonymized events, but just the pseudonyms with the encrypted real
values.
"""
pubkey_analyst: str = field(validator=file_validator)
pubkey_analyst: str = field(validator=validators.instance_of(str))
"""
Path to the public key of an analyst. For string format see :ref:`getters`.
* /var/git/analyst_pub.pem"""
pubkey_depseudo: str = field(validator=file_validator)
pubkey_depseudo: str = field(validator=validators.instance_of(str))
"""
Path to the public key for depseudonymization. For string format see :ref:`getters`.
* /var/git/depseudo_pub.pem
"""
hash_salt: str = field(validator=validators.instance_of(str))
"""A salt that is used for hashing."""
regex_mapping: str = field(validator=file_validator)
regex_mapping: str = field(validator=validators.instance_of(str))
"""
Path to a file (for string format see :ref:`getters`) with a regex mapping for pseudonymization, i.e.:
Expand Down
3 changes: 1 addition & 2 deletions logprep/processor/template_replacer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from logprep.processor.template_replacer.rule import TemplateReplacerRule
from logprep.util.getter import GetterFactory
from logprep.util.helper import get_dotted_field_value
from logprep.util.validators import file_validator


class TemplateReplacerError(BaseException):
Expand All @@ -52,7 +51,7 @@ class TemplateReplacer(Processor):
class Config(Processor.Config):
"""TemplateReplacer config"""

template: str = field(validator=file_validator)
template: str = field(validator=validators.instance_of(str))
"""
Path to a YML file (for path format see :ref:`getters`) with a list of replacements in the
format `%{provider_name}-%{event_id}: %{new_message}`.
Expand Down
90 changes: 90 additions & 0 deletions tests/acceptance/test_full_configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# pylint: disable=missing-docstring
import contextlib
from pathlib import Path
import threading
import socketserver
import http.server
import re
from tests.acceptance.util import (
get_full_pipeline,
get_default_logprep_config,
start_logprep,
stop_logprep,
convert_to_http_config,
)
from logprep.util.json_handling import dump_config_as_file


class TestServer(socketserver.TCPServer):
    """Minimal HTTP file server used to serve the generated config over http."""

    # allow immediate rebinding of the port between test runs
    allow_reuse_address = True

    @classmethod
    def run_http_server(cls, port=32000):
        """Serve the current working directory on *port* until shutdown() is called."""
        with TestServer(("", port), http.server.SimpleHTTPRequestHandler) as httpd:
            try:
                cls.httpd = httpd
                cls.httpd.serve_forever()
            finally:
                cls.httpd.server_close()

    @classmethod
    @contextlib.contextmanager
    def run_in_thread(cls):
        """Context manager to run the server in a separate thread"""
        cls.thread = threading.Thread(target=cls.run_http_server)
        cls.thread.start()
        try:
            yield
        finally:
            # always stop the server and join the thread, even if the
            # with-body raised, so the thread cannot outlive the test
            cls.httpd.shutdown()
            cls.thread.join()

    @classmethod
    def stop(cls):
        """Idempotent cleanup; safe to call even if the server never started."""
        if hasattr(cls, "httpd"):
            cls.httpd.shutdown()
        if hasattr(cls, "thread"):
            cls.thread.join()


def teardown_function():
    """Remove the generated config and stop server/logprep after each test."""
    config_file = Path("generated_config.yml")
    config_file.unlink(missing_ok=True)
    TestServer.stop()
    stop_logprep()


def test_start_of_logprep_with_full_configuration_from_file(tmp_path):
    """Start logprep with every processor configured from a local file and
    assert that the startup log contains no error markers.
    """
    pipeline = get_full_pipeline()
    config = get_default_logprep_config(pipeline, with_hmac=False)
    config_path = str(tmp_path / "generated_config.yml")
    dump_config_as_file(config_path, config)
    proc = start_logprep(config_path)
    while True:
        output = proc.stdout.readline().decode("utf8")
        # readline returns "" once the process has died; fail loudly
        # instead of spinning on empty reads forever
        assert output, "logprep terminated before startup completed"
        assert not re.search("Invalid", output)
        assert not re.search("Exception", output)
        assert not re.search("critical", output)
        assert not re.search("Error", output)
        assert not re.search("ERROR", output)
        if re.search("Startup complete", output):
            break


def test_start_of_logprep_with_full_configuration_http():
    """Start logprep with every processor configured, loading the config via
    http from a local test server, and assert the startup log is error free.
    """
    pipeline = get_full_pipeline()
    config = get_default_logprep_config(pipeline, with_hmac=False)
    endpoint = "http://localhost:32000"
    config = convert_to_http_config(config, endpoint)
    config_path = "generated_config.yml"
    dump_config_as_file(config_path, config)
    with TestServer.run_in_thread():
        proc = start_logprep(f"{endpoint}/{config_path}")
        while True:
            output = proc.stdout.readline().decode("utf8")
            # readline returns "" once the process has died; fail loudly
            # instead of spinning on empty reads forever
            assert output, "logprep terminated before startup completed"
            assert not re.search("Invalid", output)
            assert not re.search("Exception", output)
            assert not re.search("critical", output)
            assert not re.search("Error", output)
            assert not re.search("ERROR", output)
            if re.search("Startup complete", output):
                break
84 changes: 39 additions & 45 deletions tests/acceptance/test_http_input.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,23 @@
# pylint: disable=missing-docstring
# pylint: disable=line-too-long
import os
import re
import signal
import subprocess
import sys
import time
from logging import DEBUG, basicConfig, getLogger

import pytest
import requests

from logprep.util.json_handling import dump_config_as_file
from tests.acceptance.util import get_default_logprep_config
from tests.acceptance.util import (
get_default_logprep_config,
start_logprep,
wait_for_output,
stop_logprep,
)

basicConfig(level=DEBUG, format="%(asctime)-15s %(name)-5s %(levelname)-8s: %(message)s")
logger = getLogger("Logprep-Test")


def start_logprep(config_path: str) -> subprocess.Popen:
    """Launch logprep with the given config path and return the process handle."""
    command = f"{sys.executable} logprep/run_logprep.py {config_path}"
    return subprocess.Popen(  # nosemgrep
        command,
        shell=True,
        env={"PYTHONPATH": "."},
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
    )


def wait_for_output(proc, expected_output):
    """Block until *expected_output* appears in a line of *proc*'s stdout.

    Fails with AssertionError if the process closes its stdout (i.e. it
    terminated) before the expected output was seen, instead of spinning
    forever on empty reads.
    """
    output = proc.stdout.readline()
    while expected_output not in output.decode("utf8"):
        # an empty read means EOF -> the process is gone
        assert output, f"process ended before printing: {expected_output}"
        output = proc.stdout.readline()
        time.sleep(0.1)  # nosemgrep


@pytest.fixture(name="config")
def config_fixture():
pipeline = [
Expand Down Expand Up @@ -74,14 +54,7 @@ def config_fixture():


def teardown_function():
# cleanup processes
output = subprocess.check_output("ps -x | grep run_logprep", shell=True) # nosemgrep
for line in output.decode("utf8").splitlines():
process_id = re.match(r"^\s+(\d+)\s.+", line).group(1)
try:
os.kill(int(process_id), signal.SIGKILL)
except ProcessLookupError:
pass
stop_logprep()


def test_http_input_accepts_message_for_single_pipeline(tmp_path, config):
Expand All @@ -91,7 +64,8 @@ def test_http_input_accepts_message_for_single_pipeline(tmp_path, config):
dump_config_as_file(config_path, config)
proc = start_logprep(config_path)
wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9000")
requests.post("https://127.0.0.1:9000/plaintext", data="my message", verify=False) # nosemgrep
# nosemgrep
requests.post("https://127.0.0.1:9000/plaintext", data="my message", verify=False, timeout=5)
time.sleep(0.5) # nosemgrep
assert "my message" in output_path.read_text()

Expand All @@ -104,11 +78,19 @@ def test_http_input_accepts_message_for_two_pipelines(tmp_path, config):
dump_config_as_file(config_path, config)
proc = start_logprep(config_path)
wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9001")
requests.post( # nosemgrep
"https://127.0.0.1:9000/plaintext", data="my first message", verify=False
# nosemgrep
requests.post(
"https://127.0.0.1:9000/plaintext",
data="my first message",
verify=False,
timeout=5,
)
requests.post( # nosemgrep
"https://127.0.0.1:9001/plaintext", data="my second message", verify=False
# nosemgrep
requests.post(
"https://127.0.0.1:9001/plaintext",
data="my second message",
verify=False,
timeout=5,
)
time.sleep(0.5) # nosemgrep
output_content = output_path.read_text()
Expand All @@ -124,14 +106,26 @@ def test_http_input_accepts_message_for_three_pipelines(tmp_path, config):
dump_config_as_file(config_path, config)
proc = start_logprep(config_path)
wait_for_output(proc, "Uvicorn running on https://127.0.0.1:9002")
requests.post( # nosemgrep
"https://127.0.0.1:9000/plaintext", data="my first message", verify=False
# nosemgrep
requests.post(
"https://127.0.0.1:9000/plaintext",
data="my first message",
verify=False,
timeout=5,
)
requests.post( # nosemgrep
"https://127.0.0.1:9001/plaintext", data="my second message", verify=False
# nosemgrep
requests.post(
"https://127.0.0.1:9001/plaintext",
data="my second message",
verify=False,
timeout=5,
)
requests.post( # nosemgrep
"https://127.0.0.1:9002/plaintext", data="my third message", verify=False
# nosemgrep
requests.post(
"https://127.0.0.1:9002/plaintext",
data="my third message",
verify=False,
timeout=5,
)
time.sleep(0.5) # nosemgrep
output_content = output_path.read_text()
Expand Down
Loading

0 comments on commit d459e73

Please sign in to comment.