From 1a42a12454da6c6a73efd67d0f15e7167687aba4 Mon Sep 17 00:00:00 2001 From: djkhl <49399649+djkhl@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:20:31 +0100 Subject: [PATCH] Add Lucene compliant regex filter expression (#675) * Rebase * Adding lucine compliance unit test for development * Adding lucene compliance for filter parsing of a rule. * Adding logger with deprecation warning for regex_fields * Add comment and documentation for lucene regex filter annotation * Quickfix for lucene regex filter * Adjusting Format * Adjusting Format 2 * Adjusting Format 3 * Attempting to remove indeces for regex filter string * Adding notebook for lucene regex filter development * WIP notebook for lucene regex filter development * Adding Notebook for lucene regex filter testing. * Adding Notebook for lucene regex filter testing same results as unit test * Adding first running version of lucene regex filter * Improving notebook for lucene conform regex filter. * Improving notebook for lucene conform regex filter 2. * Slight improve * Bug fix in regex notebook. * Adding Deprecated Warning * Removing temporary test * Adding rule tests for lucene compliance * Black formatting * Black formatting * Remove prototypey * add changelog entry and some prototypey things that actually do nothing yet * Adding lucine compliance unit test for development * Adding lucene compliance for filter parsing of a rule. 
* Quickfix for lucene regex filter * Adjusting Format 2 * Adding Deprecated Warning * Black formatting * Add documentation * Delete prototypeclass * add notebook to documentation --------- Co-authored-by: FabianMoessner Co-authored-by: MoessnerFabian(Group) --- CHANGELOG.md | 1 + doc/source/development/coding_examples.rst | 1 + .../notebooks/processor_examples/regex.ipynb | 206 ++++++++++++++++++ .../filter/expression/filter_expression.py | 2 +- logprep/filter/lucene_filter.py | 60 ++++- tests/unit/filter/test_lucene_filter.py | 41 +++- .../processor/labeler/test_labeler_rule.py | 54 +++++ 7 files changed, 348 insertions(+), 17 deletions(-) create mode 100644 doc/source/development/notebooks/processor_examples/regex.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 76e3be40e..78e65bd06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * adds `desired_cluster_status` option to opensearch output to signal healthy cluster status * initially run health checks on setup for every configured component * make `imagePullPolicy` configurable for helm chart deployments +* it is now possible to use Lucene compliant Filter Expressions * make `terminationGracePeriodSeconds` configurable in helm chart values diff --git a/doc/source/development/coding_examples.rst b/doc/source/development/coding_examples.rst index d3d3377d4..51ff96de3 100644 --- a/doc/source/development/coding_examples.rst +++ b/doc/source/development/coding_examples.rst @@ -4,6 +4,7 @@ Processor Case Examples .. 
toctree:: :maxdepth: 1 + notebooks/processor_examples/regex.ipynb notebooks/processor_examples/concatenator.ipynb notebooks/processor_examples/calculator.ipynb notebooks/processor_examples/dissector.ipynb diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb new file mode 100644 index 000000000..f933f9e62 --- /dev/null +++ b/doc/source/development/notebooks/processor_examples/regex.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lucene regex filter\n", + "This presentation contains an example of a filter with a Lucene-compliant regular expression. \n", + "A concatenator that merges different fields from an event is used as a processor for demonstrating the filter function. \n", + "\n", + "Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "document = {\n", + " 'data_stream': {\n", + " 'dataset': 'windows', \n", + " 'namespace': 'devopslab', \n", + " 'type': 'logs'\n", + " }, \n", + " '_op_type': 'create'\n", + " }\n", + "\n", + "expected = {\n", + " 'data_stream': {\n", + " 'dataset': 'windows', \n", + " 'namespace': 'devopslab', \n", + " 'type': 'logs'\n", + " }, \n", + " '_op_type': 'create', \n", + " '_index': 'logs-windows-devopslab'\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define process" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0,\"../../../../../\")\n", + "import tempfile\n", + "from copy import deepcopy\n", + "from pathlib import Path\n", + "\n", + "from unittest import mock\n", + "from logprep.factory import Factory\n", + "\n", + "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n", +
"rule_path.mkdir(exist_ok=True)\n", + "rule_file = rule_path / \"data-stream.yml\"\n", + "\n", + "if rule_file.exists():\n", + " rule_file.unlink()\n", + "\n", + "processor_config = {\n", + " \"myconcatenator\":{ \n", + " \"type\": \"concatenator\",\n", + " \"specific_rules\": [str(rule_path)],\n", + " \"generic_rules\": [\"/dev\"],\n", + " }\n", + " }\n", + "\n", + "def concat_with_rule(rule_yaml):\n", + " mydocument = deepcopy(document)\n", + " if rule_file.exists():\n", + " rule_file.unlink()\n", + " rule_file.write_text(rule_yaml)\n", + " concatenator = Factory.create(processor_config)\n", + " print(f\"before: {mydocument}\")\n", + " concatenator.process(mydocument)\n", + " print(f\"after: {mydocument}\")\n", + " print(mydocument == expected)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### regex_fields version" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "[Deprecation warning]: regex_fields are no longer necessary. 
Use lucene regex annotation.\n", + "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n", + "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n", + "True\n" + ] + } + ], + "source": [ + "rule_yaml = \"\"\"---\n", + "filter: 'data_stream.type: \".*lo.*\"' \n", + "regex_fields:\n", + " - \"data_stream.type\"\n", + "concatenator:\n", + " source_fields:\n", + " - data_stream.type\n", + " - data_stream.dataset\n", + " - data_stream.namespace\n", + " target_field: _index\n", + " separator: \"-\"\n", + " overwrite_target: false\n", + " delete_source_fields: false\n", + "\"\"\"\n", + "\n", + "concat_with_rule(rule_yaml)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lucene-compliant version without the need for regex_fields" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n", + "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n", + "True\n" + ] + } + ], + "source": [ + "rule_yaml = \"\"\"---\n", + "filter: 'data_stream.type: \"/.*lo.*/\"' \n", + "concatenator:\n", + " source_fields:\n", + " - data_stream.type\n", + " - data_stream.dataset\n", + " - data_stream.namespace\n", + " target_field: _index\n", + " separator: \"-\"\n", + " overwrite_target: false\n", + " delete_source_fields: false\n", + "\"\"\"\n", + "concat_with_rule(rule_yaml)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, +
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + }, + "vscode": { + "interpreter": { + "hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/logprep/filter/expression/filter_expression.py b/logprep/filter/expression/filter_expression.py index 738010374..9ef840494 100644 --- a/logprep/filter/expression/filter_expression.py +++ b/logprep/filter/expression/filter_expression.py @@ -3,7 +3,7 @@ import re from abc import ABC, abstractmethod from itertools import chain, zip_longest -from typing import List, Any +from typing import Any, List class FilterExpressionError(BaseException): diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py index ee24d8d7e..18a5187ee 100644 --- a/logprep/filter/lucene_filter.py +++ b/logprep/filter/lucene_filter.py @@ -62,7 +62,18 @@ ------------ It is possible use regex expressions to match values. -For this, the field with the regex pattern must be added to the optional field +To be recognized as a regular expression, the filter value has to start and end with +:code:`/`. + + +.. code-block:: yaml + :linenos: + :caption: Example + + filter: 'ip_address: "/192\.168\.0\..*/"' + + +[Deprecated, but still functional] The field with the regex pattern must be added to the optional field :code:`regex_fields` in the rule definition. In the following example the field :code:`ip_address` is defined as regex field.
@@ -84,24 +95,39 @@ from itertools import chain, zip_longest # pylint: enable=anomalous-backslash-in-string -from typing import List, Union, Optional +from typing import List, Optional, Union +import logging import luqum -from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError -from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not +from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser +from luqum.tree import ( + AndOperation, + FieldGroup, + Group, + Not, + OrOperation, + Phrase, + Regex, + SearchField, + Word, +) from logprep.filter.expression.filter_expression import ( - Or, + Always, And, - StringFilterExpression, - SigmaFilterExpression, - RegExFilterExpression, - Not as NotExpression, Exists, - Null, - Always, FilterExpression, ) +from logprep.filter.expression.filter_expression import Not as NotExpression +from logprep.filter.expression.filter_expression import ( + Null, + Or, + RegExFilterExpression, + SigmaFilterExpression, + StringFilterExpression, +) + +logger = logging.getLogger("LuceneFilter") class LuceneFilterError(BaseException): @@ -309,10 +335,22 @@ def _get_filter_expression( return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value) dotted_field = ".".join(key) + if self._special_fields.items(): for sf_key, sf_value in self._special_fields.items(): if sf_value is True or dotted_field in sf_value: + if sf_key == "regex_fields": + logger.warning( + "[Deprecated]: regex_fields are no longer necessary. " + "Use Lucene regex annotation." 
+ ) + return self._special_fields_map[sf_key](key, value) + + if value.startswith("/") and value.endswith("/"): + value = value.strip("/") + return RegExFilterExpression(key, value) + return StringFilterExpression(key, value) @staticmethod diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py index d843fb438..2873a7a8c 100644 --- a/tests/unit/filter/test_lucene_filter.py +++ b/tests/unit/filter/test_lucene_filter.py @@ -5,16 +5,20 @@ import pytest from pytest import raises -from logprep.filter.lucene_filter import LuceneFilter, LuceneFilterError, LuceneTransformer from logprep.filter.expression.filter_expression import ( - StringFilterExpression, - RegExFilterExpression, - Or, And, - Null, Always, Exists, Not, + Null, + Or, + RegExFilterExpression, + StringFilterExpression, +) +from logprep.filter.lucene_filter import ( + LuceneFilter, + LuceneFilterError, + LuceneTransformer, ) @@ -451,3 +455,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str): def test_create_filter_error(self, testcase, input_str, message): with raises(LuceneFilterError, match=re.escape(message)): LuceneFilter.create(f'foo: "{input_str}"') + + def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self): + lucene_filter = LuceneFilter.create( + 'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"', + ) + + assert lucene_filter == And( + RegExFilterExpression(["regex_key_one"], ".*value.*"), + RegExFilterExpression(["regex_key_two"], ".*value.*"), + ) + + def test_creates_lucene_compliance_filter_one_regex_key(self): + lucene_filter = LuceneFilter.create( + 'regex_key_one: "/.*value.*/"', + ) + + assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*") + + def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self): + lucene_filter = LuceneFilter.create( + 'regex_key_one: "/.*value.*/" AND key_two: "value"', + ) + + assert lucene_filter == And( + 
RegExFilterExpression(["regex_key_one"], ".*value.*"), + StringFilterExpression(["key_two"], "value"), + ) diff --git a/tests/unit/processor/labeler/test_labeler_rule.py b/tests/unit/processor/labeler/test_labeler_rule.py index bd78a9d47..0bf877017 100644 --- a/tests/unit/processor/labeler/test_labeler_rule.py +++ b/tests/unit/processor/labeler/test_labeler_rule.py @@ -214,3 +214,57 @@ def test_null_returns_true_for_matching_document(self): document = {"applyrule": None} assert rule.matches(document) + + def test_lucene_regex_matches_returns_true_for_matching_document(self): + rule_definition = { + "filter": 'applyrule: "/.*yes.*/"', + "labeler": {"label": {"reporter": ["windows"]}}, + } + rule = LabelerRule._create_from_dict(rule_definition) + assert rule.matches({"applyrule": "yes"}) + assert rule.matches({"applyrule": "yes!"}) + assert rule.matches({"applyrule": "no? yes!"}) + + def test_lucene_regex_matches_returns_false_for_non_matching_document(self): + rule_definition = { + "filter": 'applyrule: "/.*yes.*/"', + "labeler": {"label": {"reporter": ["windows"]}}, + } + rule = LabelerRule._create_from_dict(rule_definition) + non_matching_documents = [ + {}, + {"applyrule": "no"}, + {"applyrule": "ye s"}, + {"applyrule": "YES"}, + {"wrong key": "yes"}, + ] + + for document in non_matching_documents: + assert not rule.matches(document) + + def test_complex_lucene_regex_matches_returns_true_for_matching_document(self): + rule_definition = { + "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"', + # pylint: disable=line-too-long + "labeler": {"label": {"reporter": ["windows"]}}, + } + rule = LabelerRule._create_from_dict(rule_definition) + assert rule.matches({"applyrule": "UPlo8888"}) + assert rule.matches({"applyrule": "UPlo99999"}) + assert rule.matches({"applyrule": "UPlo$$$$"}) + assert rule.matches({"applyrule": "UP$$$$88"}) + + def 
test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self): + rule_definition = { + "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"', + # pylint: disable=line-too-long + "labeler": {"label": {"reporter": ["windows"]}}, + } + rule = LabelerRule._create_from_dict(rule_definition) + assert not rule.matches({"applyrule": ""}) + assert not rule.matches({"applyrule": "UPlo777"}) + assert not rule.matches({"applyrule": "UP888888"}) + assert not rule.matches({"applyrule": "lo888888"}) + assert not rule.matches({"applyrule": "UPloXXXX"}) + assert not rule.matches({"applyrule": "88888888"}) + assert not rule.matches({"applyrule": "UPlo$$7"})