From 03b8ccd1db6d03ed37cb745545adce47b842e649 Mon Sep 17 00:00:00 2001 From: ekneg54 Date: Tue, 26 Sep 2023 20:10:57 +0000 Subject: [PATCH] fix bug following new tldextract feature --- logprep/processor/pseudonymizer/processor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 8e9accf60..e0ba367ad 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -39,7 +39,7 @@ import re from functools import cached_property from logging import Logger -from typing import Any, List, Optional, Tuple, Union, Pattern +from typing import Any, List, Optional, Pattern, Tuple, Union from urllib.parse import parse_qs from attr import define, field, validators @@ -328,7 +328,9 @@ def _parse_url_parts(self, tld_extractor: TLDExtract, url_str: str) -> dict: parts["domain"] = url.domain parts["subdomain"] = url.subdomain parts["suffix"] = url.suffix - url_list = ".".join(list(url)) + url_list = list(url) + url_list.pop() + url_list = ".".join(url_list) parts["path"] = self._find_first( rf"(?:^[a-z0-9]+\:\/\/)?{url_list}(?:\:\d+)?([^#^\?]*).*", url_str )