Skip to content

Commit

Permalink
WIP notebook for lucene regex filter development
Browse files Browse the repository at this point in the history
  • Loading branch information
fabian-moessner committed Oct 10, 2024
1 parent 6ba5b2f commit bea645c
Showing 1 changed file with 54 additions and 12 deletions.
66 changes: 54 additions & 12 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,34 +13,54 @@
"\n",
"\n",
"document = { \n",
" \"regex_key_one\": \"*1value2*\", \n",
" \"regex_key_two\": \"Here is 1234 in the message\" \n",
" \"regex_key_one\": \"value\", \n",
" \"regex_key_two\": \"*1value2*\" \n",
"}\n",
"\n",
"\n",
"rule_yaml = \"\"\"\n",
"---\n",
"filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*1234.*/\"'\n",
"filter: 'regex_key_one: \"/.*value.*/\" AND regex_key_two: \"/.*value.*/\"'\n",
"regex_fields:\n",
" - \"regex_key_one\"\n",
" - \"regex_key_two\"\n",
"pseudonymizer:\n",
" mapping:\n",
" winlog.event_data.param1: \"RE_WHOLE_FIELD\"\n",
" winlog.event_data.param2: \"RE_WHOLE_FIELD\"\n",
" regex_key_one: \"RE_WHOLE_FIELD\"\n",
" regex_key_two: \"RE_WHOLE_FIELD\"\n",
" description: \"...\"\n",
"\"\"\"\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"\n",
"\n",
"document = { \n",
" \"regex_key_one\": \"*value*\", \n",
" \"regex_key_two\": \"value\", \n",
" \"test_pseudonymizer\": \"test\", \"something_special\": \"pseudonymize_me\"\n",
"}\n",
"\n",
"rule_yaml = \"\"\"\n",
"---\n",
"filter: \"test_pseudonymizer AND something_special\"\n",
"pseudonymizer:\n",
" id: pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f\n",
" mapping:\n",
" something_special: \"RE_WHOLE_FIELD\"\n",
"description: \"...\"\n",
"\"\"\"\n",
"\n",
"\n",
"\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"pseudonymizer\"\n",
"rule_path.mkdir(exist_ok=True)\n",
"rule_file = rule_path / \"data-stream.yml\"\n",
"rule_file.write_text(rule_yaml)\n",
"\n",
"processor_config = {\n",
" \"mydropper\": {\n",
" \"mypseudonymizer\": {\n",
" \"type\": \"pseudonymizer\",\n",
" \"specific_rules\": [str(rule_path)],\n",
" \"generic_rules\": [\"/dev\"],\n",
" \"generic_rules\": [\"../../../../../examples/exampledata/rules/pseudonymizer/generic/\"],\n",
" \"outputs\": [{\"kafka\": \"topic\"}],\n",
" \"pubkey_analyst\": \"../../../../../tests/testdata/unit/pseudonymizer/example_analyst_pub.pem\",\n",
" \"pubkey_depseudo\": \"../../../../../tests/testdata/unit/pseudonymizer/example_depseudo_pub.pem\",\n",
Expand Down Expand Up @@ -85,8 +105,30 @@
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n",
"after: {'regex_key_one': '*1value2*', 'regex_key_two': 'Here is 1234 in the message'}\n"
"before: {'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}\n"
]
},
{
"ename": "ProcessingCriticalError",
"evalue": "ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:194\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 194\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProcessingWarning \u001b[38;5;28;01mas\u001b[39;00m error:\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:261\u001b[0m, in \u001b[0;36mPseudonymizer._apply_rules\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 261\u001b[0m field_value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_pseudonymize_field\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdotted_field\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mregex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfield_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 262\u001b[0m _ \u001b[38;5;241m=\u001b[39m add_field_to(event, dotted_field, field_value, overwrite_output_field\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/processor/pseudonymizer/processor.py:271\u001b[0m, in \u001b[0;36mPseudonymizer._pseudonymize_field\u001b[0;34m(self, rule, dotted_field, regex, field_value)\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_pseudonymize_field\u001b[39m(\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28mself\u001b[39m, rule: PseudonymizerRule, dotted_field: \u001b[38;5;28mstr\u001b[39m, regex: Pattern, field_value: \u001b[38;5;28mstr\u001b[39m\n\u001b[1;32m 270\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m--> 271\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mregex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 272\u001b[0m plaintext_values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m(value \u001b[38;5;28;01mfor\u001b[39;00m value \u001b[38;5;129;01min\u001b[39;00m regex\u001b[38;5;241m.\u001b[39mfindall(field_value) \u001b[38;5;28;01mif\u001b[39;00m value)\n",
"\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'groups'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mProcessingCriticalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m 2\u001b[0m mydocument \u001b[38;5;241m=\u001b[39m deepcopy(document)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbefore: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[43mpseudonymizer_processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmydocument\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mafter : \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmydocument\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:161\u001b[0m, in \u001b[0;36mProcessor.process\u001b[0;34m(self, event)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data\u001b[38;5;241m.\u001b[39mclear()\n\u001b[1;32m 160\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdescribe()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m processing event \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_rule_tree\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_specific_tree\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process_rule_tree(event, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generic_tree)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_extra_data \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:190\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree\u001b[0;34m(self, event, tree)\u001b[0m\n\u001b[1;32m 188\u001b[0m _process_rule_tree_multiple_times(tree, event)\n\u001b[1;32m 189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 190\u001b[0m \u001b[43m_process_rule_tree_once\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtree\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:185\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.<locals>._process_rule_tree_once\u001b[0;34m(tree, event)\u001b[0m\n\u001b[1;32m 183\u001b[0m matching_rules \u001b[38;5;241m=\u001b[39m tree\u001b[38;5;241m.\u001b[39mget_matching_rules(event)\n\u001b[1;32m 184\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m rule \u001b[38;5;129;01min\u001b[39;00m matching_rules:\n\u001b[0;32m--> 185\u001b[0m \u001b[43m_process_rule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/metrics/metrics.py:207\u001b[0m, in \u001b[0;36mMetric.measure_time.<locals>.without_append.<locals>.inner\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 205\u001b[0m metric \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetrics, metric_name)\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m metric\u001b[38;5;241m.\u001b[39mtracker\u001b[38;5;241m.\u001b[39mlabels(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmetric\u001b[38;5;241m.\u001b[39mlabels)\u001b[38;5;241m.\u001b[39mtime():\n\u001b[0;32m--> 207\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:170\u001b[0m, in \u001b[0;36mProcessor._process_rule_tree.<locals>._process_rule\u001b[0;34m(rule, event)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;129m@Metric\u001b[39m\u001b[38;5;241m.\u001b[39mmeasure_time()\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_process_rule\u001b[39m(rule, event):\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_rules_wrapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrule\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 171\u001b[0m rule\u001b[38;5;241m.\u001b[39mmetrics\u001b[38;5;241m.\u001b[39mnumber_of_processed_events \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 172\u001b[0m applied_rules\u001b[38;5;241m.\u001b[39madd(rule)\n",
"File \u001b[0;32m~/arbeit/venvs/logprep/lib/python3.12/site-packages/logprep/abc/processor.py:200\u001b[0m, in \u001b[0;36mProcessor._apply_rules_wrapper\u001b[0;34m(self, event, rule)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error \u001b[38;5;66;03m# is needed to prevent wrapping it in itself\u001b[39;00m\n\u001b[1;32m 199\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ProcessingCriticalError(\u001b[38;5;28mstr\u001b[39m(error), rule, event) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merror\u001b[39;00m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(rule, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdelete_source_fields\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n",
"\u001b[0;31mProcessingCriticalError\u001b[0m: ProcessingCriticalError: ProcessingCriticalError: 'str' object has no attribute 'groups' -> event was send to error output and further processing stopped, rule.id='pseudonymizer-1352bc0a-53ae-4740-bb9e-1e865f63375f', rule.description='...', event={'regex_key_one': '*value*', 'regex_key_two': 'value', 'test_pseudonymizer': 'test', 'something_special': 'pseudonymize_me'}"
]
}
],
Expand All @@ -96,7 +138,7 @@
"\n",
"print(f\"before: {mydocument}\")\n",
"pseudonymizer_processor.process(mydocument)\n",
"print(f\"after: {mydocument}\")"
"print(f\"after : {mydocument}\")"
]
}
],
Expand Down

0 comments on commit bea645c

Please sign in to comment.