Add Lucene compliant regex filter expression (#675)
* Rebase

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Adding logger with deprecation warning for regex_fields

* Add comment and documentation for lucene regex filter  annotation

* Quickfix for lucene regex filter

* Adjusting Format

* Adjusting Format 2

* Adjusting Format 3

* Attempting to remove indices for regex filter string

* Adding notebook for lucene regex filter development

* WIP notebook for lucene regex filter development

* Adding Notebook for lucene regex filter testing.

* Adding Notebook for lucene regex filter testing same results as unit test

* Adding first running version of lucene regex filter

* Improving notebook for lucene conform regex filter.

* Improving notebook for lucene conform regex filter 2.

* Slight improve

* Bug fix in regex notebook.

* Adding Deprecated Warning

* Removing temporary test

* Adding rule tests for lucene compliance

* Black formatting

* Black formatting

* Remove prototypey

* add changelog entry and some prototypey things that actually do nothing yet

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Quickfix for lucene regex filter

* Adjusting Format 2

* Adding Deprecated Warning

* Black formatting

* Add documentation

* Delete prototypeclass

* add notebook to documentation

---------

Co-authored-by: FabianMoessner <[email protected]>
Co-authored-by: MoessnerFabian(Group) <[email protected]>
3 people authored Oct 29, 2024
1 parent 499ff55 commit 1a42a12
Showing 7 changed files with 348 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@
* adds `desired_cluster_status` option to opensearch output to signal healthy cluster status
* initially run health checks on setup for every configured component
* make `imagePullPolicy` configurable for helm chart deployments
* it is now possible to use Lucene-compliant filter expressions
* make `terminationGracePeriodSeconds` configurable in helm chart values


1 change: 1 addition & 0 deletions doc/source/development/coding_examples.rst
@@ -4,6 +4,7 @@ Processor Case Examples
.. toctree::
:maxdepth: 1

notebooks/processor_examples/regex.ipynb
notebooks/processor_examples/concatenator.ipynb
notebooks/processor_examples/calculator.ipynb
notebooks/processor_examples/dissector.ipynb
206 changes: 206 additions & 0 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This presentations contains an example of a filter with a lucene conform regular expression. \n",
"A concatenator that merges different fields form an event is used as a processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
"\n",
"expected = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
"rule_file = rule_path / \"data-stream.yml\"\n",
"\n",
"if rule_file.exists():\n",
" rule_file.unlink()\n",
"\n",
"processor_config = {\n",
" \"myconcatenator\":{ \n",
" \"type\": \"concatenator\",\n",
" \"specific_rules\": [str(rule_path)],\n",
" \"generic_rules\": [\"/dev\"],\n",
" }\n",
" }\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
" rule_file.unlink()\n",
" rule_file.write_text(rule_yaml)\n",
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"vscode": {
"interpreter": {
"hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
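The notebook above exercises the new annotation indirectly, through a Concatenator rule. To see directly what the parser produces, a filter string can be handed to LuceneFilter.create, as the new unit tests at the end of this diff do. The following minimal sketch is not part of the commit; it mirrors one of those tests and assumes a logprep checkout or installation is importable.

# Minimal sketch mirroring the new unit tests: a value wrapped in forward slashes
# becomes a RegExFilterExpression, a plain quoted value stays a StringFilterExpression.
from logprep.filter.lucene_filter import LuceneFilter
from logprep.filter.expression.filter_expression import (
    And,
    RegExFilterExpression,
    StringFilterExpression,
)

lucene_filter = LuceneFilter.create('regex_key_one: "/.*value.*/" AND key_two: "value"')

assert lucene_filter == And(
    RegExFilterExpression(["regex_key_one"], ".*value.*"),
    StringFilterExpression(["key_two"], "value"),
)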
2 changes: 1 addition & 1 deletion logprep/filter/expression/filter_expression.py
@@ -3,7 +3,7 @@
import re
from abc import ABC, abstractmethod
from itertools import chain, zip_longest
from typing import List, Any
from typing import Any, List


class FilterExpressionError(BaseException):
60 changes: 49 additions & 11 deletions logprep/filter/lucene_filter.py
@@ -62,7 +62,18 @@
------------
It is possible to use regex expressions to match values.
To be recognized as a regular expression, the filter value has to start and end with
:code:`/`.

.. code-block:: yaml
    :linenos:
    :caption: Example

    filter: 'ip_address: "/192\.168\.0\..*/"'

[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
:code:`regex_fields` in the rule definition.
In the following example, the field :code:`ip_address` is defined as a regex field.
@@ -84,24 +95,39 @@
from itertools import chain, zip_longest

# pylint: enable=anomalous-backslash-in-string
from typing import List, Union, Optional
from typing import List, Optional, Union

import logging
import luqum
from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError
from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not
from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser
from luqum.tree import (
AndOperation,
FieldGroup,
Group,
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)

from logprep.filter.expression.filter_expression import (
Or,
Always,
And,
StringFilterExpression,
SigmaFilterExpression,
RegExFilterExpression,
Not as NotExpression,
Exists,
Null,
Always,
FilterExpression,
)
from logprep.filter.expression.filter_expression import Not as NotExpression
from logprep.filter.expression.filter_expression import (
Null,
Or,
RegExFilterExpression,
SigmaFilterExpression,
StringFilterExpression,
)

logger = logging.getLogger("LuceneFilter")


class LuceneFilterError(BaseException):
@@ -309,10 +335,22 @@ def _get_filter_expression(
return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)

dotted_field = ".".join(key)

if self._special_fields.items():
for sf_key, sf_value in self._special_fields.items():
if sf_value is True or dotted_field in sf_value:
if sf_key == "regex_fields":
logger.warning(
"[Deprecated]: regex_fields are no longer necessary. "
"Use Lucene regex annotation."
)

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

@staticmethod
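The tail of _get_filter_expression shown above now distinguishes three cases: a field listed in the deprecated regex_fields option (which triggers the warning), a value wrapped in forward slashes (the new Lucene regex annotation), and everything else (a plain string filter). The following standalone sketch condenses that decision; it is an illustration, not the logprep implementation, and the function name classify_filter_value is made up.

# Illustrative sketch of the decision added in _get_filter_expression (simplified;
# not the actual logprep code). regex_fields still works but logs a deprecation
# warning; values wrapped in "/" are treated as regex without any extra rule field.
import logging

logger = logging.getLogger("LuceneFilter")


def classify_filter_value(dotted_field: str, value: str, regex_fields: list) -> str:
    if dotted_field in regex_fields:
        logger.warning(
            "[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation."
        )
        return f"regex: {value}"
    if value.startswith("/") and value.endswith("/"):
        return f"regex: {value.strip('/')}"
    return f"string: {value}"


print(classify_filter_value("data_stream.type", "/.*lo.*/", []))                  # regex: .*lo.*
print(classify_filter_value("data_stream.type", ".*lo.*", ["data_stream.type"]))  # regex: .*lo.* (plus warning)
print(classify_filter_value("data_stream.type", "logs", []))                      # string: logs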
41 changes: 36 additions & 5 deletions tests/unit/filter/test_lucene_filter.py
@@ -5,16 +5,20 @@
import pytest
from pytest import raises

from logprep.filter.lucene_filter import LuceneFilter, LuceneFilterError, LuceneTransformer
from logprep.filter.expression.filter_expression import (
StringFilterExpression,
RegExFilterExpression,
Or,
And,
Null,
Always,
Exists,
Not,
Null,
Or,
RegExFilterExpression,
StringFilterExpression,
)
from logprep.filter.lucene_filter import (
LuceneFilter,
LuceneFilterError,
LuceneTransformer,
)


@@ -451,3 +455,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str):
def test_create_filter_error(self, testcase, input_str, message):
with raises(LuceneFilterError, match=re.escape(message)):
LuceneFilter.create(f'foo: "{input_str}"')

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
RegExFilterExpression(["regex_key_two"], ".*value.*"),
)

def test_creates_lucene_compliance_filter_one_regex_key(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/"',
)

assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND key_two: "value"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
StringFilterExpression(["key_two"], "value"),
)