diff --git a/doc/source/development/notebooks/processor_examples/regex.ipynb b/doc/source/development/notebooks/processor_examples/regex.ipynb
index 2c05d28bb..ab109ca53 100644
--- a/doc/source/development/notebooks/processor_examples/regex.ipynb
+++ b/doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -5,10 +5,17 @@
    "metadata": {},
    "source": [
     "# Lucene regex filter\n",
-    "This presentations contains an example of a filter with a lucene conform regular expression. \n",
+    "This presentation contains an example of a filter with a Lucene-compliant regular expression. \n",
     "A concatenator that merges different fields form an event is used as a processor for demonstrating the filter function. \n",
     "\n",
-    "Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
+    "Until now it was necessary to flag keys in regex_fields whenever their values contained a regular expression. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the document and define the concatenator processor to test the filter"
    ]
   },
   {
@@ -17,11 +24,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import sys\n",
+    "sys.path.insert(0,\"../../../../../\")\n",
+    "import tempfile\n",
+    "from copy import deepcopy\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from unittest import mock\n",
+    "from logprep.factory import Factory\n",
+    "\n",
     "document = {\n",
     "    'data_stream': {\n",
     "        'dataset': 'windows', \n",
     "        'namespace': 'devopslab', \n",
-    "        'type': 'logs'\n",
+    "        'type': '/logs/'\n",
     "        }, \n",
     "    '_op_type': 'create'\n",
     "    }\n",
@@ -34,30 +50,7 @@
     "        }, \n",
     "    '_op_type': 'create', \n",
     "    '_index': 'logs-windows-devopslab'\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Define process"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.insert(0,\"../../../../../\")\n",
-    "import tempfile\n",
-    "from copy import deepcopy\n",
-    "from pathlib import Path\n",
-    "\n",
-    "from unittest import mock\n",
-    "from logprep.factory import Factory\n",
+    "    }\n",
     "\n",
     "rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
     "rule_path.mkdir(exist_ok=True)\n",
@@ -73,6 +66,8 @@
     "        }\n",
     "    }\n",
     "\n",
+    "concatenator = Factory.create(processor_config)\n",
+    "\n",
     "def concat_with_rule(rule_yaml):\n",
     "    mydocument = deepcopy(document)\n",
     "    if rule_file.exists():\n",
@@ -81,21 +76,19 @@
     "    concatenator = Factory.create(processor_config)\n",
     "    print(f\"before: {mydocument}\")\n",
     "    concatenator.process(mydocument)\n",
-    "    print(f\"after: {mydocument}\")\n",
-    "    print(mydocument == expected)\n",
-    "    "
+    "    print(f\"after: {mydocument}\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### regex_fields version"
+    "### Former version with explicit regex_fields annotation"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@@ -109,18 +102,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "\n",
-      "[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
-      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
-      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
-      "True\n"
+      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
+      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
     ]
    }
   ],
   "source": [
    "rule_yaml = \"\"\"---\n",
-    "filter: 'data_stream.type: \".*lo.*\"' \n",
+    "filter: 'data_stream.type: \".*lo.*\"'\n",
    "regex_fields:\n",
    "  - \"data_stream.type\"\n",
    "concatenator:\n",
@@ -134,6 +123,7 @@
    "  delete_source_fields: false\n",
    "\"\"\"\n",
    "\n",
+    "\n",
    "concat_with_rule(rule_yaml)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### Lucene conform version without the need of regex_fields"
+    "### New Lucene-compliant version without the need for regex_fields"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
-      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
-      "True\n"
+      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
+      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
     ]
    }
   ],
   "source": [
    "rule_yaml = \"\"\"---\n",
-    "filter: 'data_stream.type: \"/.*lo.*/\"' \n",
+    "filter: 'data_stream.type: /.*log.*/' \n",
    "concatenator:\n",
    "  source_fields:\n",
    "    - data_stream.type\n",
    "    - data_stream.dataset\n",
    "    - data_stream.namespace\n",
    "  target_field: _index\n",
    "  separator: \"-\"\n",
    "  overwrite_target: false\n",
    "  delete_source_fields: false\n",
@@ -174,6 +163,44 @@
    "\"\"\"\n",
    "concat_with_rule(rule_yaml)\n"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Escaping a slash. One escape is needed for the YAML format, the other for the Lucene syntax."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
+      "after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "rule_yaml = \"\"\"---\n",
+    "filter: 'data_stream.type: /\\\\/lo.*/' \n",
+    " \n",
+    "concatenator:\n",
+    "  source_fields:\n",
+    "    - data_stream.type\n",
+    "    - data_stream.dataset\n",
+    "    - data_stream.namespace\n",
+    "  target_field: _index\n",
+    "  separator: \"-\"\n",
+    "  overwrite_target: false\n",
+    "  delete_source_fields: false\n",
+    "\"\"\"\n",
+    "concat_with_rule(rule_yaml)"
+   ]
  }
 ],
 "metadata": {
diff --git a/logprep/filter/lucene_filter.py b/logprep/filter/lucene_filter.py
index 703eec5fb..f2eb3a60f 100644
--- a/logprep/filter/lucene_filter.py
+++ b/logprep/filter/lucene_filter.py
@@ -70,7 +70,7 @@
         :linenos:
         :caption: Example

-        filter: 'ip_address: "/192\.168\.0\..*/"'
+        filter: 'ip_address: /192\.168\.0\..*/'

    [Deprecated, but still functional]
    The field with the regex pattern must be added to the optional field
@@ -107,6 +107,7 @@
     Not,
     OrOperation,
     Phrase,
+    Regex,
     SearchField,
     Word,
 )
@@ -323,15 +324,32 @@ def _create_field(self, tree: luqum.tree) -> Optional[FilterExpression]:
                 value = self._strip_quote_from_string(tree.expr.value)
                 value = self._remove_lucene_escaping(value)
                 return self._get_filter_expression(key, value)
+            elif isinstance(tree.expr, Regex):
+                key = tree.name.replace("\\", "")
+                key = key.split(".")
+                if tree.expr.value == "null":
+                    return Null(key)
+
+                value = self._strip_quote_from_string(tree.expr.value)
+                value = self._remove_lucene_escaping(value)
+                return self._get_filter_expression_regex(key, value)
         return None

-    def _get_filter_expression(
-        self, key: List[str], value
-    ) -> Union[RegExFilterExpression, StringFilterExpression]:
+    @staticmethod
+    def _check_key_and_modifier(key, value):
         key_and_modifier = key[-1].split("|")
         if len(key_and_modifier) == 2:
             if key_and_modifier[-1] == "re":
                 return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)
+        return None
+
+    def _get_filter_expression(
+        self, key: List[str], value
+    ) -> Union[RegExFilterExpression, StringFilterExpression]:
+
+        key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
+        if key_and_modifier_check is not None:
+            return key_and_modifier_check

         dotted_field = ".".join(key)

@@ -346,12 +364,19 @@ def _get_filter_expression(

             return self._special_fields_map[sf_key](key, value)

-        if value.startswith("/") and value.endswith("/"):
-            value = value.strip("/")
-            return RegExFilterExpression(key, value)
-
-        return StringFilterExpression(key, value)
+        return StringFilterExpression(key, value)
+
+    def _get_filter_expression_regex(
+        self, key: List[str], value
+    ) -> Union[RegExFilterExpression, StringFilterExpression]:
+
+        key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
+        if key_and_modifier_check is not None:
+            return key_and_modifier_check
+
+        value = value.strip("/")
+        return RegExFilterExpression(key, value)

     @staticmethod
     def _create_value_expression(word: luqum.tree) -> Union[Exists, Always]:
         value = word.value.replace("\\", "")
diff --git a/tests/unit/filter/test_lucene_filter.py b/tests/unit/filter/test_lucene_filter.py
index 2873a7a8c..8ab949619 100644
--- a/tests/unit/filter/test_lucene_filter.py
+++ b/tests/unit/filter/test_lucene_filter.py
@@ -458,7 +458,7 @@ def test_create_filter_error(self, testcase, input_str, message):

     def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
         lucene_filter = LuceneFilter.create(
-            'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
+            "regex_key_one: /.*value.*/ AND regex_key_two: /.*value.*/",
         )

         assert lucene_filter == And(
@@ -466,19 +466,34 @@ def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
             RegExFilterExpression(["regex_key_two"], ".*value.*"),
         )

-    def test_creates_lucene_compliance_filter_one_regex_key(self):
+    def test_creates_StringFilter_not_Regex(self):
         lucene_filter = LuceneFilter.create(
             'regex_key_one: "/.*value.*/"',
         )

+        assert lucene_filter == StringFilterExpression(["regex_key_one"], "/.*value.*/")
+
+    def test_new_lucene_compliance(self):
+        lucene_filter = LuceneFilter.create("regex_key_one:/.*value.*/")
+
         assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

     def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
         lucene_filter = LuceneFilter.create(
-            'regex_key_one: "/.*value.*/" AND key_two: "value"',
+            'regex_key_one:/.*value.*/ AND key_two: "/.*value.*/"',
         )

         assert lucene_filter == And(
             RegExFilterExpression(["regex_key_one"], ".*value.*"),
-            StringFilterExpression(["key_two"], "value"),
+            StringFilterExpression(["key_two"], "/.*value.*/"),
         )
+
+    def test_new_lucene_compliance_double_escape(self):
+        lucene_filter = LuceneFilter.create("regex_key_one:/\\/.*value.*/")
+
+        assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")
+
+    def test_new_lucene_compliance_single_escape(self):
+        lucene_filter = LuceneFilter.create("regex_key_one:/\/.*value.*/")
+
+        assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")
diff --git a/tests/unit/processor/labeler/test_labeler_rule.py b/tests/unit/processor/labeler/test_labeler_rule.py
index 72afb22da..aade96836 100644
--- a/tests/unit/processor/labeler/test_labeler_rule.py
+++ b/tests/unit/processor/labeler/test_labeler_rule.py
@@ -218,7 +218,7 @@ def test_null_returns_true_for_matching_document(self):

     def test_lucene_regex_matches_returns_true_for_matching_document(self):
         rule_definition = {
-            "filter": 'applyrule: "/.*yes.*/"',
+            "filter": "applyrule: /.*yes.*/",
             "labeler": {"label": {"reporter": ["windows"]}},
         }
         rule = LabelerRule._create_from_dict(rule_definition)
@@ -228,7 +228,7 @@ def test_lucene_regex_matches_returns_false_for_non_matching_document(self):
         rule_definition = {
-            "filter": 'applyrule: "/.*yes.*/"',
+            "filter": "applyrule: /.*yes.*/",
             "labeler": {"label": {"reporter": ["windows"]}},
         }
         rule = LabelerRule._create_from_dict(rule_definition)
@@ -245,7 +245,7 @@ def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):
         rule_definition = {
-            "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
+            "filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",  # pylint: disable=line-too-long
             "labeler": {"label": {"reporter": ["windows"]}},
         }
@@ -257,7 +257,7 @@ def test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self):
         rule_definition = {
-            "filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
+            "filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",  # pylint: disable=line-too-long
             "labeler": {"label": {"reporter": ["windows"]}},
         }
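
Usage sketch of the change, condensed from the new unit tests in this diff. The import path for the expression classes is an assumption (it is not shown in the diff); everything else mirrors the test cases above, so treat this as an illustration rather than additional API.

    from logprep.filter.lucene_filter import LuceneFilter
    # Assumed location of the expression classes; adjust if the project keeps them elsewhere.
    from logprep.filter.expression.filter_expression import (
        RegExFilterExpression,
        StringFilterExpression,
    )

    # An unquoted /.../ value is parsed as a Lucene regex term and yields a RegExFilterExpression.
    regex_filter = LuceneFilter.create("regex_key_one:/.*value.*/")
    assert regex_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

    # A quoted "/.../" value is now treated as a plain string match, so the old
    # implicit regex detection on quoted values no longer applies.
    string_filter = LuceneFilter.create('regex_key_one: "/.*value.*/"')
    assert string_filter == StringFilterExpression(["regex_key_one"], "/.*value.*/")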