Skip to content

Commit

Permalink
prevent duplicate pseudonyms (#494)
Browse files Browse the repository at this point in the history
  • Loading branch information
dtrai2 authored Dec 7, 2023
1 parent 4a7f67e commit 3c2700b
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

### Bugfix

* remove duplicate pseudonyms in extra outputs of pseudonymizer

## v9.0.1
### Breaking

Expand Down
7 changes: 5 additions & 2 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class Config(Processor.Config):
max_cached_pseudonymized_urls: int = field(
validator=[validators.instance_of(int), validators.gt(0)], default=10000
)
"""The maximum number of cached pseudonymized urls. Default is 10000.
"""The maximum number of cached pseudonymized urls. Default is 10000.
Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0."""
tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
Expand Down Expand Up @@ -220,7 +220,10 @@ def load_rules(self, specific_rules_targets: List[str], generic_rules_targets: L
def process(self, event: dict):
    """Process ``event`` and return the de-duplicated pseudonyms as extra output.

    ``self.pseudonyms`` is reset before delegating to the parent ``process``
    (presumably the parent run fills it while rules are applied -- the
    filling code is not part of this block).

    Entries are de-duplicated by their ``"pseudonym"`` value: when several
    entries share a pseudonym, the last one collected is kept, at the
    position where that pseudonym was first seen.

    Returns a tuple ``(unique_pseudonyms, self._config.outputs)``, or ``None``
    when no pseudonyms were collected.
    """
    self.pseudonyms = []
    super().process(event)

    # A dict keyed by the pseudonym keeps exactly one entry per pseudonym
    # and preserves the order in which each pseudonym first appeared.
    by_pseudonym = {}
    for entry in self.pseudonyms:
        by_pseudonym[entry["pseudonym"]] = entry
    unique_pseudonyms = list(by_pseudonym.values())

    if not unique_pseudonyms:
        return None
    return (unique_pseudonyms, self._config.outputs)

def _apply_rules(self, event: dict, rule: PseudonymizerRule):
for dotted_field, regex in rule.pseudonyms.items():
Expand Down
86 changes: 84 additions & 2 deletions tests/unit/processor/pseudonymizer/test_pseudonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,6 @@ def test_process_returns_extra_output(self):
"pseudonymizer": {
"pseudonyms": {
"winlog.event_data.param1": "RE_WHOLE_FIELD",
"winlog.event_data.param2": "RE_WHOLE_FIELD",
}
},
}
Expand All @@ -923,7 +922,6 @@ def test_process_returns_extra_output(self):
"provider_name": "Test456",
"event_data": {
"param1": "Pseudonymize me!",
"param2": "Pseudonymize me!",
},
},
}
Expand All @@ -936,9 +934,93 @@ def test_process_returns_extra_output(self):
assert isinstance(extra_output[1], tuple)
assert isinstance(extra_output[1][0], dict)
assert extra_output[1][0] == {"kafka": "topic"}, "Output is set as in CONFIG"
assert len(extra_output[0]) == 1, "Should contain only one pseudonym"
assert extra_output[0][0].get("pseudonym"), "pseudonym is set"
assert extra_output[0][0].get("origin"), "encrypted original is set"
assert extra_output[0][0].get("@timestamp"), "timestamp is set if present in event"

def test_extra_output_contains_only_one_pseudonym_even_if_pseudonym_appears_multiple_times_in_event(
    self,
):
    """Two pseudonymized fields holding the same value yield a single extra-output entry."""
    repeated_value = "Pseudonymize me - appears twice!"
    event = {
        "@timestamp": "custom timestamp",
        "winlog": {
            "event_id": 1234,
            "provider_name": "Test456",
            "event_data": {
                "param1": repeated_value,
                "param2": repeated_value,
            },
        },
    }
    # Both fields are pseudonymized as a whole by the same rule.
    pseudonymized_fields = {
        "winlog.event_data.param1": "RE_WHOLE_FIELD",
        "winlog.event_data.param2": "RE_WHOLE_FIELD",
    }
    rule_dict = {
        "filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456",
        "pseudonymizer": {"pseudonyms": pseudonymized_fields},
    }
    self._load_specific_rule(rule_dict)

    extra_output = self.object.process(event)

    # Shape of the extra output: (list of pseudonyms, tuple of output mappings).
    assert extra_output
    assert isinstance(extra_output, tuple)
    assert len(extra_output) == 2
    pseudonyms, outputs = extra_output
    assert isinstance(pseudonyms, list)
    assert isinstance(outputs, tuple)
    assert isinstance(outputs[0], dict)
    assert outputs[0] == {"kafka": "topic"}, "Output is set as in CONFIG"

    assert (
        len(pseudonyms) == 1
    ), "Should contain only one pseudonym, as the value for both is the same"

    only_entry = pseudonyms[0]
    expected_keys = (
        ("pseudonym", "pseudonym is set"),
        ("origin", "encrypted original is set"),
        ("@timestamp", "timestamp is set if present in event"),
    )
    for key, message in expected_keys:
        assert only_entry.get(key), message

def test_extra_output_contains_different_pseudonyms_for_different_values(self):
    """Two pseudonymized fields with distinct values yield two distinct extra-output entries."""
    event = {
        "@timestamp": "custom timestamp",
        "winlog": {
            "event_id": 1234,
            "provider_name": "Test456",
            "event_data": {
                "param1": "Pseudonymize me - first!",
                "param2": "Pseudonymize me - second!",
            },
        },
    }
    # Both fields are pseudonymized as a whole by the same rule.
    pseudonymized_fields = {
        "winlog.event_data.param1": "RE_WHOLE_FIELD",
        "winlog.event_data.param2": "RE_WHOLE_FIELD",
    }
    rule_dict = {
        "filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456",
        "pseudonymizer": {"pseudonyms": pseudonymized_fields},
    }
    self._load_specific_rule(rule_dict)

    extra_output = self.object.process(event)

    # Shape of the extra output: (list of pseudonyms, tuple of output mappings).
    assert extra_output
    assert isinstance(extra_output, tuple)
    assert len(extra_output) == 2
    pseudonyms, outputs = extra_output
    assert isinstance(pseudonyms, list)
    assert isinstance(outputs, tuple)
    assert isinstance(outputs[0], dict)
    assert outputs[0] == {"kafka": "topic"}, "Output is set as in CONFIG"

    assert len(pseudonyms) == 2, "Should contain two pseudonyms, for each value one"

    # Every entry must carry all three fields.
    expected_keys = (
        ("pseudonym", "pseudonym is set"),
        ("origin", "encrypted original is set"),
        ("@timestamp", "timestamp is set if present in event"),
    )
    first, second = pseudonyms[0], pseudonyms[1]
    for entry in (first, second):
        for key, message in expected_keys:
            assert entry.get(key), message

    # Distinct input values must not collapse into the same pseudonym or origin.
    assert first.get("pseudonym") != second.get("pseudonym"), "pseudonyms should differ"
    assert first.get("origin") != second.get("origin"), "origins should differ"

def test_ignores_missing_field(self):
rule_dict = {
Expand Down

0 comments on commit 3c2700b

Please sign in to comment.