Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug lucene compliance #734

Merged
merged 23 commits into from
Jan 9, 2025
117 changes: 72 additions & 45 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This presentations contains an example of a filter with a lucene conform regular expression. \n",
"This presentation contains an example of a filter with a Lucene-compliant regular expression. \n",
"A concatenator that merges different fields from an event is used as a processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
"Until now, keys whose values contain a regular expression had to be listed in regex_fields. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set document and define concatenator process to test the filter"
]
},
{
Expand All @@ -17,11 +24,20 @@
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" 'type': '/logs/'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
Expand All @@ -34,30 +50,7 @@
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
" }\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
Expand All @@ -73,6 +66,8 @@
" }\n",
" }\n",
"\n",
"concatenator = Factory.create(processor_config)\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
Expand All @@ -81,21 +76,19 @@
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
" print(f\"after: {mydocument}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
"### Former version with explicit regex_fields annotation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -109,18 +102,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"filter: 'data_stream.type: \".*lo.*\"'\n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
Expand All @@ -134,34 +123,34 @@
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
"### New Lucene-compliant version that does not need regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"filter: 'data_stream.type: /.*log.*/' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
Expand All @@ -174,6 +163,44 @@
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Escaping a slash. One backslash escapes the slash for the Lucene regex; it is doubled because the rule is written inside a regular (non-raw) Python string. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: /\\\\/lo.*/' \n",
" \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)"
]
}
],
"metadata": {
Expand Down
41 changes: 33 additions & 8 deletions logprep/filter/lucene_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
:linenos:
:caption: Example

filter: 'ip_address: "/192\.168\.0\..*/"'
filter: 'ip_address: /192\.168\.0\..*/'


[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
Expand Down Expand Up @@ -107,6 +107,7 @@
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)
Expand Down Expand Up @@ -323,15 +324,32 @@ def _create_field(self, tree: luqum.tree) -> Optional[FilterExpression]:
value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression(key, value)
elif isinstance(tree.expr, Regex):
key = tree.name.replace("\\", "")
key = key.split(".")
if tree.expr.value == "null":
return Null(key)

value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression_regex(key, value)
return None

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:
@staticmethod
def _check_key_and_modifier(key, value):
    """Build a regex filter when the last key segment ends with the ``|re`` modifier.

    Parameters
    ----------
    key
        Key segments of the field; only the last segment is inspected for a
        ``|``-separated modifier.
    value
        Value passed unchanged to the created expression.

    Returns
    -------
    RegExFilterExpression or None
        An expression for the key without the modifier if the last segment has
        the form ``<name>|re``, otherwise ``None``.
    """
    last_segment_parts = key[-1].split("|")
    has_regex_modifier = len(last_segment_parts) == 2 and last_segment_parts[1] == "re"
    if not has_regex_modifier:
        return None

    # Rebuild the key path with the "|re" suffix removed from the last segment
    field_path = key[:-1] + [last_segment_parts[0]]
    return RegExFilterExpression(field_path, value)

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:

key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
if key_and_modifier_check is not None:
return key_and_modifier_check

dotted_field = ".".join(key)

Expand All @@ -346,12 +364,19 @@ def _get_filter_expression(

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

def _get_filter_expression_regex(
    self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:
    """Create the filter expression for a value given in ``/.../`` regex notation.

    Parameters
    ----------
    key : List[str]
        Key segments of the field the expression applies to.
    value
        The regex value including its surrounding ``/`` delimiters.

    Returns
    -------
    RegExFilterExpression
        The expression produced by the ``|re`` key modifier check if it
        applies, otherwise an expression for ``key`` with the delimiters
        removed from ``value``.
    """
    key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
    if key_and_modifier_check is not None:
        return key_and_modifier_check

    # Remove exactly one delimiter per side. str.strip("/") would also eat
    # slashes that belong to the pattern itself, e.g. the escaped slash at the
    # end of "/abc\//" would be reduced to a dangling backslash.
    if len(value) >= 2 and value.startswith("/") and value.endswith("/"):
        value = value[1:-1]
    return RegExFilterExpression(key, value)

@staticmethod
def _create_value_expression(word: luqum.tree) -> Union[Exists, Always]:
value = word.value.replace("\\", "")
Expand Down
23 changes: 19 additions & 4 deletions tests/unit/filter/test_lucene_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,27 +458,42 @@ def test_create_filter_error(self, testcase, input_str, message):

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
"regex_key_one: /.*value.*/ AND regex_key_two: /.*value.*/",
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
RegExFilterExpression(["regex_key_two"], ".*value.*"),
)

def test_creates_lucene_compliance_filter_one_regex_key(self):
def test_creates_StringFilter_not_Regex(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/"',
)

assert lucene_filter == StringFilterExpression(["regex_key_one"], "/.*value.*/")

def test_new_lucene_compliance(self):
    """An unquoted ``/.../`` value is turned into a regex filter expression."""
    expected_filter = RegExFilterExpression(["regex_key_one"], ".*value.*")

    assert LuceneFilter.create("regex_key_one:/.*value.*/") == expected_filter

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND key_two: "value"',
'regex_key_one:/.*value.*/ AND key_two: "/.*value.*/"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
StringFilterExpression(["key_two"], "value"),
StringFilterExpression(["key_two"], "/.*value.*/"),
)

def test_new_lucene_compliance_double_escape(self):
    """An escaped slash inside the regex stays escaped in the resulting pattern."""
    # "\\/" is an explicit backslash followed by a slash. The expected pattern is
    # spelled the same way: "\/" is not a valid escape sequence in a normal string
    # literal and raises a SyntaxWarning on Python 3.12+.
    lucene_filter = LuceneFilter.create("regex_key_one:/\\/.*value.*/")

    assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\\/.*value.*")

def test_new_lucene_compliance_single_escape(self):
    """The same filter written with a single backslash in raw string literals."""
    # Raw strings keep the single backslash as written instead of relying on the
    # invalid "\/" escape sequence (SyntaxWarning on Python 3.12+). The runtime
    # strings are unchanged.
    lucene_filter = LuceneFilter.create(r"regex_key_one:/\/.*value.*/")

    assert lucene_filter == RegExFilterExpression(["regex_key_one"], r"\/.*value.*")
8 changes: 4 additions & 4 deletions tests/unit/processor/labeler/test_labeler_rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def test_null_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
Expand All @@ -228,7 +228,7 @@ def test_lucene_regex_matches_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_false_for_non_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
Expand All @@ -245,7 +245,7 @@ def test_lucene_regex_matches_returns_false_for_non_matching_document(self):

def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
Expand All @@ -257,7 +257,7 @@ def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):

def test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
Expand Down
Loading