From 1a42a12454da6c6a73efd67d0f15e7167687aba4 Mon Sep 17 00:00:00 2001
From: djkhl <49399649+djkhl@users.noreply.github.com>
Date: Tue, 29 Oct 2024 11:20:31 +0100
Subject: [PATCH] Add Lucene compliant regex filter expression (#675)

* Rebase

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Adding logger with deprecation warning for regex_fields

* Add comment and documentation for lucene regex filter  annotation

* Quickfix for lucene regex filter

* Adjusting Format

* Adjusting Format 2

* Adjusting Format 3

* Attempting to remove indices for regex filter string

* Adding notebook for lucene regex filter development

* WIP notebook for lucene regex filter development

* Adding Notebook for lucene regex filter testing.

* Adding Notebook for lucene regex filter testing same results as unit test

* Adding first running version of lucene regex filter

* Improving notebook for lucene conform regex filter.

* Improving notebook for lucene conform regex filter 2.

* Slight improve

* Bug fix in regex notebook.

* Adding Deprecated Warning

* Removing temporary test

* Adding rule tests for lucene compliance

* Black formatting

* Black formatting

* Remove prototypey

* add changelog entry and some prototypey things that actually do nothing yet

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Quickfix for lucene regex filter

* Adjusting Format 2

* Adding Deprecated Warning

* Black formatting

* Add documentation

* Delete prototypeclass

* add notebook to documentation

---------

Co-authored-by: FabianMoessner <fmmoessner@gmail.com>
Co-authored-by: MoessnerFabian(Group) <fabian.moessner@materna.group>
---
 CHANGELOG.md                                  |   1 +
 doc/source/development/coding_examples.rst    |   1 +
 .../notebooks/processor_examples/regex.ipynb  | 206 ++++++++++++++++++
 .../filter/expression/filter_expression.py    |   2 +-
 logprep/filter/lucene_filter.py               |  60 ++++-
 tests/unit/filter/test_lucene_filter.py       |  41 +++-
 .../processor/labeler/test_labeler_rule.py    |  54 +++++
 7 files changed, 348 insertions(+), 17 deletions(-)
 create mode 100644 doc/source/development/notebooks/processor_examples/regex.ipynb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76e3be40e..78e65bd06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
 * adds `desired_cluster_status` option to opensearch output to signal healthy cluster status
 * initially run health checks on setup for every configured component
 * make `imagePullPolicy` configurable for helm chart deployments
+* it is now possible to use Lucene compliant Filter Expressions
 * make `terminationGracePeriodSeconds` configurable in helm chart values
 
 
diff --git a/doc/source/development/coding_examples.rst b/doc/source/development/coding_examples.rst
index d3d3377d4..51ff96de3 100644
--- a/doc/source/development/coding_examples.rst
+++ b/doc/source/development/coding_examples.rst
@@ -4,6 +4,7 @@ Processor Case Examples
 .. toctree::
    :maxdepth: 1
 
+   notebooks/processor_examples/regex.ipynb
    notebooks/processor_examples/concatenator.ipynb
    notebooks/processor_examples/calculator.ipynb
    notebooks/processor_examples/dissector.ipynb
diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb
new file mode 100644
index 000000000..f933f9e62
--- /dev/null
+++ b/doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Lucene regex filter\n",
+    "This presentation contains an example of a filter with a Lucene-compliant regular expression. \n",
+    "A concatenator that merges different fields from an event is used as a processor for demonstrating the filter function. \n",
+    "\n",
+    "Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document = {\n",
+    "    'data_stream': {\n",
+    "        'dataset': 'windows', \n",
+    "        'namespace': 'devopslab', \n",
+    "        'type': 'logs'\n",
+    "        }, \n",
+    "    '_op_type': 'create'\n",
+    "    }\n",
+    "\n",
+    "expected = {\n",
+    "    'data_stream': {\n",
+    "        'dataset': 'windows', \n",
+    "        'namespace': 'devopslab', \n",
+    "        'type': 'logs'\n",
+    "        }, \n",
+    "    '_op_type': 'create', \n",
+    "    '_index': 'logs-windows-devopslab'\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0,\"../../../../../\")\n",
+    "import tempfile\n",
+    "from copy import deepcopy\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from unittest import mock\n",
+    "from logprep.factory import Factory\n",
+    "\n",
+    "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
+    "rule_path.mkdir(exist_ok=True)\n",
+    "rule_file = rule_path / \"data-stream.yml\"\n",
+    "\n",
+    "if rule_file.exists():\n",
+    "    rule_file.unlink()\n",
+    "\n",
+    "processor_config = {\n",
+    "    \"myconcatenator\":{   \n",
+    "        \"type\": \"concatenator\",\n",
+    "        \"specific_rules\": [str(rule_path)],\n",
+    "        \"generic_rules\": [\"/dev\"],\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "def concat_with_rule(rule_yaml):\n",
+    "    mydocument = deepcopy(document)\n",
+    "    if rule_file.exists():\n",
+    "        rule_file.unlink()\n",
+    "    rule_file.write_text(rule_yaml)\n",
+    "    concatenator = Factory.create(processor_config)\n",
+    "    print(f\"before: {mydocument}\")\n",
+    "    concatenator.process(mydocument)\n",
+    "    print(f\"after: {mydocument}\")\n",
+    "    print(mydocument == expected)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### regex_fields version"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "\n",
+      "[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
+      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
+      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "rule_yaml = \"\"\"---\n",
+    "filter: 'data_stream.type: \".*lo.*\"'     \n",
+    "regex_fields:\n",
+    "  - \"data_stream.type\"\n",
+    "concatenator:\n",
+    "  source_fields:\n",
+    "    - data_stream.type\n",
+    "    - data_stream.dataset\n",
+    "    - data_stream.namespace\n",
+    "  target_field: _index\n",
+    "  separator: \"-\"\n",
+    "  overwrite_target: false\n",
+    "  delete_source_fields: false\n",
+    "\"\"\"\n",
+    "\n",
+    "concat_with_rule(rule_yaml)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Lucene conform version without the need of regex_fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
+      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "rule_yaml = \"\"\"---\n",
+    "filter: 'data_stream.type: \"/.*lo.*/\"'    \n",
+    "concatenator:\n",
+    "  source_fields:\n",
+    "    - data_stream.type\n",
+    "    - data_stream.dataset\n",
+    "    - data_stream.namespace\n",
+    "  target_field: _index\n",
+    "  separator: \"-\"\n",
+    "  overwrite_target: false\n",
+    "  delete_source_fields: false\n",
+    "\"\"\"\n",
+    "concat_with_rule(rule_yaml)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/logprep/filter/expression/filter_expression.py b/logprep/filter/expression/filter_expression.py
index 738010374..9ef840494 100644
--- a/logprep/filter/expression/filter_expression.py
+++ b/logprep/filter/expression/filter_expression.py
@@ -3,7 +3,7 @@
 import re
 from abc import ABC, abstractmethod
 from itertools import chain, zip_longest
-from typing import List, Any
+from typing import Any, List
 
 
 class FilterExpressionError(BaseException):
diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py
index ee24d8d7e..18a5187ee 100644
--- a/logprep/filter/lucene_filter.py
+++ b/logprep/filter/lucene_filter.py
@@ -62,7 +62,18 @@
 ------------
 
 It is possible use regex expressions to match values.
-For this, the field with the regex pattern must be added to the optional field
+To be recognized as a regular expression, the filter value has to start and end with
+:code:`/`.
+
+
+..  code-block:: yaml
+    :linenos:
+    :caption: Example
+
+    filter: 'ip_address: "/192\.168\.0\..*/"'
+
+
+[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
 :code:`regex_fields` in the rule definition.
 
 In the following example the field :code:`ip_address` is defined as regex field.
@@ -84,24 +95,39 @@
 from itertools import chain, zip_longest
 
 # pylint: enable=anomalous-backslash-in-string
-from typing import List, Union, Optional
+from typing import List, Optional, Union
 
+import logging
 import luqum
-from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError
-from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not
+from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser
+from luqum.tree import (
+    AndOperation,
+    FieldGroup,
+    Group,
+    Not,
+    OrOperation,
+    Phrase,
+    Regex,
+    SearchField,
+    Word,
+)
 
 from logprep.filter.expression.filter_expression import (
-    Or,
+    Always,
     And,
-    StringFilterExpression,
-    SigmaFilterExpression,
-    RegExFilterExpression,
-    Not as NotExpression,
     Exists,
-    Null,
-    Always,
     FilterExpression,
 )
+from logprep.filter.expression.filter_expression import Not as NotExpression
+from logprep.filter.expression.filter_expression import (
+    Null,
+    Or,
+    RegExFilterExpression,
+    SigmaFilterExpression,
+    StringFilterExpression,
+)
+
+logger = logging.getLogger("LuceneFilter")
 
 
 class LuceneFilterError(BaseException):
@@ -309,10 +335,22 @@ def _get_filter_expression(
                 return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)
 
         dotted_field = ".".join(key)
+
         if self._special_fields.items():
             for sf_key, sf_value in self._special_fields.items():
                 if sf_value is True or dotted_field in sf_value:
+                    if sf_key == "regex_fields":
+                        logger.warning(
+                            "[Deprecated]: regex_fields are no longer necessary. "
+                            "Use Lucene regex annotation."
+                        )
+
                     return self._special_fields_map[sf_key](key, value)
+
+        if value.startswith("/") and value.endswith("/"):
+            value = value.strip("/")
+            return RegExFilterExpression(key, value)
+
         return StringFilterExpression(key, value)
 
     @staticmethod
diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py
index d843fb438..2873a7a8c 100644
--- a/tests/unit/filter/test_lucene_filter.py
+++ b/tests/unit/filter/test_lucene_filter.py
@@ -5,16 +5,20 @@
 import pytest
 from pytest import raises
 
-from logprep.filter.lucene_filter import LuceneFilter, LuceneFilterError, LuceneTransformer
 from logprep.filter.expression.filter_expression import (
-    StringFilterExpression,
-    RegExFilterExpression,
-    Or,
     And,
-    Null,
     Always,
     Exists,
     Not,
+    Null,
+    Or,
+    RegExFilterExpression,
+    StringFilterExpression,
+)
+from logprep.filter.lucene_filter import (
+    LuceneFilter,
+    LuceneFilterError,
+    LuceneTransformer,
 )
 
 
@@ -451,3 +455,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str):
     def test_create_filter_error(self, testcase, input_str, message):
         with raises(LuceneFilterError, match=re.escape(message)):
             LuceneFilter.create(f'foo: "{input_str}"')
+
+    def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
+        lucene_filter = LuceneFilter.create(
+            'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
+        )
+
+        assert lucene_filter == And(
+            RegExFilterExpression(["regex_key_one"], ".*value.*"),
+            RegExFilterExpression(["regex_key_two"], ".*value.*"),
+        )
+
+    def test_creates_lucene_compliance_filter_one_regex_key(self):
+        lucene_filter = LuceneFilter.create(
+            'regex_key_one: "/.*value.*/"',
+        )
+
+        assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")
+
+    def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
+        lucene_filter = LuceneFilter.create(
+            'regex_key_one: "/.*value.*/" AND key_two: "value"',
+        )
+
+        assert lucene_filter == And(
+            RegExFilterExpression(["regex_key_one"], ".*value.*"),
+            StringFilterExpression(["key_two"], "value"),
+        )
diff --git a/tests/unit/processor/labeler/test_labeler_rule.py b/tests/unit/processor/labeler/test_labeler_rule.py
index bd78a9d47..0bf877017 100644
--- a/tests/unit/processor/labeler/test_labeler_rule.py
+++ b/tests/unit/processor/labeler/test_labeler_rule.py
@@ -214,3 +214,57 @@ def test_null_returns_true_for_matching_document(self):
         document = {"applyrule": None}
 
         assert rule.matches(document)
+
+    def test_lucene_regex_matches_returns_true_for_matching_document(self):
+        rule_definition = {
+            "filter": 'applyrule: "/.*yes.*/"',
+            "labeler": {"label": {"reporter": ["windows"]}},
+        }
+        rule = LabelerRule._create_from_dict(rule_definition)
+        assert rule.matches({"applyrule": "yes"})
+        assert rule.matches({"applyrule": "yes!"})
+        assert rule.matches({"applyrule": "no? yes!"})
+
+    def test_lucene_regex_matches_returns_false_for_non_matching_document(self):
+        rule_definition = {
+            "filter": 'applyrule: "/.*yes.*/"',
+            "labeler": {"label": {"reporter": ["windows"]}},
+        }
+        rule = LabelerRule._create_from_dict(rule_definition)
+        non_matching_documents = [
+            {},
+            {"applyrule": "no"},
+            {"applyrule": "ye s"},
+            {"applyrule": "YES"},
+            {"wrong key": "yes"},
+        ]
+
+        for document in non_matching_documents:
+            assert not rule.matches(document)
+
+    def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):
+        rule_definition = {
+            "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
+            # pylint: disable=line-too-long
+            "labeler": {"label": {"reporter": ["windows"]}},
+        }
+        rule = LabelerRule._create_from_dict(rule_definition)
+        assert rule.matches({"applyrule": "UPlo8888"})
+        assert rule.matches({"applyrule": "UPlo99999"})
+        assert rule.matches({"applyrule": "UPlo$$$$"})
+        assert rule.matches({"applyrule": "UP$$$$88"})
+
+    def test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self):
+        rule_definition = {
+            "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
+            # pylint: disable=line-too-long
+            "labeler": {"label": {"reporter": ["windows"]}},
+        }
+        rule = LabelerRule._create_from_dict(rule_definition)
+        assert not rule.matches({"applyrule": ""})
+        assert not rule.matches({"applyrule": "UPlo777"})
+        assert not rule.matches({"applyrule": "UP888888"})
+        assert not rule.matches({"applyrule": "lo888888"})
+        assert not rule.matches({"applyrule": "UPloXXXX"})
+        assert not rule.matches({"applyrule": "88888888"})
+        assert not rule.matches({"applyrule": "UPlo$$7"})