Add Lucene compliant regex filter expression (#675)
* Rebase

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Adding logger with deprecation warning for regex_fields

* Add comment and documentation for lucene regex filter  annotation

* Quickfix for lucene regex filter

* Adjusting Format

* Adjusting Format 2

* Adjusting Format 3

* Attempting to remove indices for regex filter string

* Adding notebook for lucene regex filter development

* WIP notebook for lucene regex filter development

* Adding Notebook for lucene regex filter testing.

* Adding Notebook for lucene regex filter testing same results as unit test

* Adding first running version of lucene regex filter

* Improving notebook for lucene conform regex filter.

* Improving notebook for lucene conform regex filter 2.

* Slight improve

* Bug fix in regex notebook.

* Adding Deprecated Warning

* Removing temporary test

* Adding rule tests for lucene compliance

* Black formatting

* Black formatting

* Remove prototypey

* add changelog entry and some prototypey things that actually do nothing yet

* Adding lucene compliance unit test for development

* Adding lucene compliance for filter parsing of a rule.

* Quickfix for lucene regex filter

* Adjusting Format 2

* Adding Deprecated Warning

* Black formatting

* Add documentation

* Delete prototypeclass

* add notebook to documentation

---------

Co-authored-by: FabianMoessner <[email protected]>
Co-authored-by: MoessnerFabian(Group) <[email protected]>
3 people authored Oct 29, 2024
1 parent 499ff55 commit 1a42a12
Showing 7 changed files with 348 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@
* adds `desired_cluster_status` option to opensearch output to signal healthy cluster status
* initially run health checks on setup for every configured component
* make `imagePullPolicy` configurable for helm chart deployments
* it is now possible to use Lucene-compliant filter expressions
* make `terminationGracePeriodSeconds` configurable in helm chart values


1 change: 1 addition & 0 deletions doc/source/development/coding_examples.rst
@@ -4,6 +4,7 @@ Processor Case Examples
.. toctree::
:maxdepth: 1

notebooks/processor_examples/regex.ipynb
notebooks/processor_examples/concatenator.ipynb
notebooks/processor_examples/calculator.ipynb
notebooks/processor_examples/dissector.ipynb
206 changes: 206 additions & 0 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This presentations contains an example of a filter with a lucene conform regular expression. \n",
"A concatenator that merges different fields form an event is used as a processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
"\n",
"expected = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
"rule_file = rule_path / \"data-stream.yml\"\n",
"\n",
"if rule_file.exists():\n",
" rule_file.unlink()\n",
"\n",
"processor_config = {\n",
" \"myconcatenator\":{ \n",
" \"type\": \"concatenator\",\n",
" \"specific_rules\": [str(rule_path)],\n",
" \"generic_rules\": [\"/dev\"],\n",
" }\n",
" }\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
" rule_file.unlink()\n",
" rule_file.write_text(rule_yaml)\n",
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"vscode": {
"interpreter": {
"hash": "586280540a85d3e21edc698fe7b86af2848b9b02644e6c22463da25c40a3f1be"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
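The notebook above exercises the new annotation indirectly, through a Concatenator rule. To see directly what the parser produces, a filter string can be handed to LuceneFilter.create, as the new unit tests at the end of this diff do. The following minimal sketch is not part of the commit; it mirrors one of those tests and assumes a logprep checkout or installation is importable.

# Minimal sketch mirroring the new unit tests: a value wrapped in forward slashes
# becomes a RegExFilterExpression, a plain quoted value stays a StringFilterExpression.
from logprep.filter.lucene_filter import LuceneFilter
from logprep.filter.expression.filter_expression import (
    And,
    RegExFilterExpression,
    StringFilterExpression,
)

lucene_filter = LuceneFilter.create('regex_key_one: "/.*value.*/" AND key_two: "value"')

assert lucene_filter == And(
    RegExFilterExpression(["regex_key_one"], ".*value.*"),
    StringFilterExpression(["key_two"], "value"),
)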
2 changes: 1 addition & 1 deletion logprep/filter/expression/filter_expression.py
@@ -3,7 +3,7 @@
import re
from abc import ABC, abstractmethod
from itertools import chain, zip_longest
from typing import List, Any
from typing import Any, List


class FilterExpressionError(BaseException):
60 changes: 49 additions & 11 deletions logprep/filter/lucene_filter.py
@@ -62,7 +62,18 @@
------------
It is possible to use regex expressions to match values.
To be recognized as a regular expression, the filter value has to start and end with
:code:`/`.

.. code-block:: yaml
    :linenos:
    :caption: Example

    filter: 'ip_address: "/192\.168\.0\..*/"'

[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
:code:`regex_fields` in the rule definition.
In the following example, the field :code:`ip_address` is defined as a regex field.
@@ -84,24 +95,39 @@
from itertools import chain, zip_longest

# pylint: enable=anomalous-backslash-in-string
from typing import List, Union, Optional
from typing import List, Optional, Union

import logging
import luqum
from luqum.parser import parser, ParseSyntaxError, IllegalCharacterError
from luqum.tree import OrOperation, AndOperation, Group, FieldGroup, SearchField, Phrase, Word, Not
from luqum.parser import IllegalCharacterError, ParseSyntaxError, parser
from luqum.tree import (
AndOperation,
FieldGroup,
Group,
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)

from logprep.filter.expression.filter_expression import (
Or,
Always,
And,
StringFilterExpression,
SigmaFilterExpression,
RegExFilterExpression,
Not as NotExpression,
Exists,
Null,
Always,
FilterExpression,
)
from logprep.filter.expression.filter_expression import Not as NotExpression
from logprep.filter.expression.filter_expression import (
Null,
Or,
RegExFilterExpression,
SigmaFilterExpression,
StringFilterExpression,
)

logger = logging.getLogger("LuceneFilter")


class LuceneFilterError(BaseException):
@@ -309,10 +335,22 @@ def _get_filter_expression(
return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)

dotted_field = ".".join(key)

if self._special_fields.items():
for sf_key, sf_value in self._special_fields.items():
if sf_value is True or dotted_field in sf_value:
if sf_key == "regex_fields":
logger.warning(
"[Deprecated]: regex_fields are no longer necessary. "
"Use Lucene regex annotation."
)

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

@staticmethod
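The tail of _get_filter_expression shown above now distinguishes three cases: a field listed in the deprecated regex_fields option (which triggers the warning), a value wrapped in forward slashes (the new Lucene regex annotation), and everything else (a plain string filter). The following standalone sketch condenses that decision; it is an illustration, not the logprep implementation, and the function name classify_filter_value is made up.

# Illustrative sketch of the decision added in _get_filter_expression (simplified;
# not the actual logprep code). regex_fields still works but logs a deprecation
# warning; values wrapped in "/" are treated as regex without any extra rule field.
import logging

logger = logging.getLogger("LuceneFilter")


def classify_filter_value(dotted_field: str, value: str, regex_fields: list) -> str:
    if dotted_field in regex_fields:
        logger.warning(
            "[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation."
        )
        return f"regex: {value}"
    if value.startswith("/") and value.endswith("/"):
        return f"regex: {value.strip('/')}"
    return f"string: {value}"


print(classify_filter_value("data_stream.type", "/.*lo.*/", []))                  # regex: .*lo.*
print(classify_filter_value("data_stream.type", ".*lo.*", ["data_stream.type"]))  # regex: .*lo.* (plus warning)
print(classify_filter_value("data_stream.type", "logs", []))                      # string: logs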
41 changes: 36 additions & 5 deletions tests/unit/filter/test_lucene_filter.py
@@ -5,16 +5,20 @@
import pytest
from pytest import raises

from logprep.filter.lucene_filter import LuceneFilter, LuceneFilterError, LuceneTransformer
from logprep.filter.expression.filter_expression import (
StringFilterExpression,
RegExFilterExpression,
Or,
And,
Null,
Always,
Exists,
Not,
Null,
Or,
RegExFilterExpression,
StringFilterExpression,
)
from logprep.filter.lucene_filter import (
LuceneFilter,
LuceneFilterError,
LuceneTransformer,
)


@@ -451,3 +455,30 @@ def test_create_filter_success(self, testcase, input_str, cleaned_str):
def test_create_filter_error(self, testcase, input_str, message):
with raises(LuceneFilterError, match=re.escape(message)):
LuceneFilter.create(f'foo: "{input_str}"')

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
RegExFilterExpression(["regex_key_two"], ".*value.*"),
)

def test_creates_lucene_compliance_filter_one_regex_key(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/"',
)

assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND key_two: "value"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
StringFilterExpression(["key_two"], "value"),
)