remove-tldextract-and-urlextract-dependencies (#718)
* update changelog
* remove urlextract dependency
* remove tldextractor from pseudonymizer
* remove tldextractor from domain_resolver
* remove tldextractor from domain_label_extractor
* add tldlist

---------

Co-authored-by: dtrai2 <[email protected]>
ekneg54 and dtrai2 authored Dec 9, 2024
1 parent 96f9a26 commit b20afb6
Showing 13 changed files with 16,054 additions and 263 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -6,10 +6,12 @@
* `CriticalInputError` is raised when the input preprocessor values can't be set; so far this was only true
  for the hmac preprocessor, but it now applies to all other preprocessors as well.
* fix `delimiter` typo in `StringSplitterRule` configuration
* removed the configuration `tld_lists` in `domain_resolver`, `domain_label_extractor` and `pseudonymizer`, as
  the list is now bundled with the logprep package (see the sketch after this file's diff)

### Features

* configuration of `initContainers` in logprep helm chart is now possible

### Improvements

@@ -24,6 +26,8 @@
* refactored some processors to make use of the new helper methods
* add `pre-commit` hooks to the repository, install new dev dependency and run `pre-commit install` in the root dir
* the default `securityContext` for the pod is now configurable
* remove `tldextract` dependency
* remove `urlextract` dependency

### Bugfix

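With the `tld_lists` option gone, the suffix data ships inside logprep itself; the roughly 16,000 added lines in this commit are consistent with a vendored copy of the public suffix list. Purely as a rough illustration of what matching a domain against such a packaged list involves, here is a much-simplified sketch — the file path and function names are assumptions, not logprep's actual implementation, and real public-suffix handling also covers wildcard and exception rules, which this skips:

```python
from pathlib import Path

# hypothetical location of the vendored list; not logprep's actual path
SUFFIX_LIST_PATH = Path("logprep/util/url/public_suffix_list.dat")

def load_suffixes(path: Path) -> set:
    """Parse a public-suffix-list style file, skipping comments and blank lines."""
    lines = path.read_text(encoding="utf-8").splitlines()
    return {line.strip() for line in lines if line.strip() and not line.startswith("//")}

def longest_suffix(domain: str, suffixes: set) -> str:
    """Return the longest public suffix matching the tail of the domain."""
    labels = domain.lower().split(".")
    # start with the full domain, then drop leading labels, so the
    # first hit is automatically the longest matching suffix
    for i in range(len(labels)):
        candidate = ".".join(labels[i:])
        if candidate in suffixes:
            return candidate
    return ""

# e.g. longest_suffix("www.example.co.uk", suffixes) -> "co.uk"
```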
49 changes: 7 additions & 42 deletions logprep/processor/domain_label_extractor/processor.py
@@ -22,7 +22,6 @@
- tests/testdata/rules/specific/
generic_rules:
- tests/testdata/rules/generic/
tld_lists: /path/to/list/file
tagging_field_name: resolved
.. autoclass:: logprep.processor.domain_label_extractor.processor.DomainLabelExtractor.Config
@@ -36,21 +35,14 @@

import ipaddress
import logging
import os
import tempfile
from functools import cached_property
from pathlib import Path
from typing import Optional
from urllib.parse import urlsplit

from attr import define, field, validators
from filelock import FileLock
from tldextract import TLDExtract

from logprep.processor.domain_label_extractor.rule import DomainLabelExtractorRule
from logprep.processor.field_manager.processor import FieldManager
from logprep.util.getter import GetterFactory
from logprep.util.helper import add_and_overwrite, add_fields_to, get_dotted_field_value
from logprep.util.validators import list_of_urls_validator
from logprep.util.url.url import Domain

logger = logging.getLogger("DomainLabelExtractor")

@@ -68,40 +60,9 @@ class Config(FieldManager.Config):
"""Optional configuration field that defines into which field in the event the
informational tags should be written to. If this field is not present it defaults
to :code:`tags`."""
tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
(like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given,
a default list will be retrieved online and cached in a local directory. For local
files the path has to be given with :code:`file:///path/to/file.dat`."""

rule_class = DomainLabelExtractorRule

__slots__ = ["detection_results", "_pre_detector_topic", "_ids"]

@cached_property
def _tld_extractor(self):
if self._config.tld_lists is not None:
_tld_extractor = TLDExtract(suffix_list_urls=self._config.tld_lists)
else:
_tld_extractor = TLDExtract()
return _tld_extractor

def setup(self):
super().setup()
if self._config.tld_lists:
downloaded_tld_lists_paths = []
logger.debug("start tldlists download...")
for index, tld_list in enumerate(self._config.tld_lists):
logprep_tmp_dir = Path(tempfile.gettempdir()) / "logprep"
os.makedirs(logprep_tmp_dir, exist_ok=True)
list_path = logprep_tmp_dir / f"{self.name}-tldlist-{index}.dat"
if not os.path.isfile(list_path):
with FileLock(list_path):
list_path.touch()
list_path.write_bytes(GetterFactory.from_string(tld_list).get_raw())
downloaded_tld_lists_paths.append(f"file://{str(list_path.absolute())}")
logger.debug("finished tldlists download...")

def _apply_rules(self, event, rule: DomainLabelExtractorRule):
"""
Apply matching rule to given log event. Such that a given domain,
@@ -135,7 +96,11 @@ def _apply_rules(self, event, rule: DomainLabelExtractorRule):
)
return

labels = self._tld_extractor(domain)
url = urlsplit(domain)
domain = url.hostname
if url.scheme == "":
domain = url.path
labels = Domain(domain)
if labels.suffix != "":
fields = {
f"{rule.target_field}.registered_domain": f"{labels.domain}.{labels.suffix}",
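The replacement hunk above relies on a standard-library detail: `urlsplit` only populates `hostname` when the input carries a scheme, while a bare domain ends up in `path`. A minimal sketch of just that stdlib step (the new `Domain` helper from `logprep.util.url.url` is not shown on this page, so it is left out here):

```python
from typing import Optional
from urllib.parse import urlsplit

def extract_domain_candidate(value: str) -> Optional[str]:
    """Mirror the hunk above: take the hostname when the value parses as
    a full URL, otherwise fall back to ``path``, which is where urlsplit
    puts a bare domain that has no scheme."""
    url = urlsplit(value)
    domain = url.hostname
    if url.scheme == "":  # bare domain: urlsplit saw no scheme
        domain = url.path
    return domain

# full URL: hostname is populated
assert extract_domain_candidate("https://www.example.com/index.html") == "www.example.com"
# bare domain: scheme and hostname are empty, the value lands in path
assert extract_domain_candidate("www.example.com") == "www.example.com"
```

The extracted value is then handed to the new `Domain` helper, which — judging from the `suffix` and `domain` attributes used in the hunk — splits it against the packaged suffix list.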
42 changes: 6 additions & 36 deletions logprep/processor/domain_resolver/processor.py
@@ -15,7 +15,6 @@
- tests/testdata/rules/specific/
generic_rules:
- tests/testdata/rules/generic/
tld_list: tmp/path/tld.dat
timeout: 0.5
max_cached_domains: 20000
max_caching_days: 1
@@ -34,27 +33,21 @@

import datetime
import logging
import os
import socket
import tempfile
from functools import cached_property
from multiprocessing import context
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import Optional
from urllib.parse import urlsplit

from attr import define, field, validators
from filelock import FileLock
from tldextract import TLDExtract

from logprep.abc.processor import Processor
from logprep.metrics.metrics import CounterMetric
from logprep.processor.domain_resolver.rule import DomainResolverRule
from logprep.util.cache import Cache
from logprep.util.getter import GetterFactory
from logprep.util.hasher import SHA256Hasher
from logprep.util.helper import add_fields_to, get_dotted_field_value
from logprep.util.validators import list_of_urls_validator

logger = logging.getLogger("DomainResolver")

@@ -66,11 +59,6 @@ class DomainResolver(Processor):
class Config(Processor.Config):
"""DomainResolver config"""

tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
(like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given,
a default list will be retrieved online and cached in a local directory. For local
files the path has to be given with :code:`file:///path/to/file.dat`."""
timeout: Optional[float] = field(
default=0.5,
validator=validators.optional(validators.instance_of(float)),
@@ -158,34 +146,16 @@ def _hasher(self):
def _thread_pool(self):
return ThreadPool(processes=1)

@cached_property
def _tld_extractor(self):
if self._config.tld_lists is not None:
return TLDExtract(suffix_list_urls=self._config.tld_lists)
return TLDExtract()

def setup(self):
super().setup()
if self._config.tld_lists:
downloaded_tld_lists_paths = []
logger.debug("start tldlists download...")
for index, tld_list in enumerate(self._config.tld_lists):
logprep_tmp_dir = Path(tempfile.gettempdir()) / "logprep"
os.makedirs(logprep_tmp_dir, exist_ok=True)
list_path = logprep_tmp_dir / f"{self.name}-tldlist-{index}.dat"
if not os.path.isfile(list_path):
with FileLock(list_path):
list_path.touch()
list_path.write_bytes(GetterFactory.from_string(tld_list).get_raw())
downloaded_tld_lists_paths.append(f"file://{str(list_path.absolute())}")
logger.debug("finished tldlists download...")

def _apply_rules(self, event, rule):
source_field = rule.source_fields[0]
domain_or_url_str = get_dotted_field_value(event, source_field)
if not domain_or_url_str:
return
domain = self._tld_extractor(domain_or_url_str).fqdn

url = urlsplit(domain_or_url_str)
domain = url.hostname
if url.scheme == "":
domain = url.path
if not domain:
return
self.metrics.total_urls += 1
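The resolver keeps its surviving imports (`socket`, `ThreadPool`, `multiprocessing.context`) and the `timeout: 0.5` option for the actual DNS lookup, which the rendered hunks don't cover. A hypothetical sketch of how those pieces typically combine — the function name and structure are assumptions, not the processor's actual code:

```python
import socket
from multiprocessing import context
from multiprocessing.pool import ThreadPool

# a single worker thread, matching the processor's _thread_pool property
pool = ThreadPool(processes=1)

def resolve_with_timeout(domain: str, timeout: float = 0.5):
    """Resolve a domain in a worker thread so a slow DNS lookup cannot
    stall the pipeline for longer than ``timeout`` seconds."""
    result = pool.apply_async(socket.gethostbyname, (domain,))
    try:
        return result.get(timeout=timeout)
    except (context.TimeoutError, socket.gaierror):
        # lookup too slow or domain not resolvable
        return None
```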
36 changes: 9 additions & 27 deletions logprep/processor/pseudonymizer/processor.py
@@ -35,8 +35,6 @@
regex_mapping: /path/to/regex_mapping.json
max_cached_pseudonyms: 1000000
mode: GCM
tld_lists:
- /path/to/tld_list.dat
.. autoclass:: logprep.processor.pseudonymizer.processor.Pseudonymizer.Config
:members:
@@ -50,12 +48,10 @@
import re
from functools import cached_property, lru_cache
from itertools import chain
from typing import Optional, Pattern
from typing import Pattern
from urllib.parse import parse_qs, urlencode, urlparse

from attrs import define, field, validators
from tldextract import TLDExtract
from urlextract import URLExtract

from logprep.abc.processor import Processor
from logprep.factory_error import InvalidConfigurationError
@@ -70,7 +66,7 @@
DualPKCS1HybridGCMEncrypter,
Encrypter,
)
from logprep.util.validators import list_of_urls_validator
from logprep.util.url.url import extract_urls


class Pseudonymizer(FieldManager):
@@ -137,12 +133,6 @@ class Config(FieldManager.Config):
)
"""The maximum number of cached pseudonymized urls. Default is 10000.
Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0."""
tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
(like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given,
a default list will be retrieved online and cached in a local directory. For local
files the path has to be given with :code:`file:///path/to/file.dat`."""

mode: str = field(
validator=[validators.instance_of(str), validators.in_(("GCM", "CTR"))], default="GCM"
)
@@ -198,10 +188,6 @@ class Metrics(Processor.Metrics):

rule_class = PseudonymizerRule

@cached_property
def _url_extractor(self):
return URLExtract()

@cached_property
def _hasher(self):
return SHA256Hasher()
@@ -215,12 +201,6 @@ def _encrypter(self) -> Encrypter:
encrypter.load_public_keys(self._config.pubkey_analyst, self._config.pubkey_depseudo)
return encrypter

@cached_property
def _tld_extractor(self) -> TLDExtract:
if self._config.tld_lists is not None:
return TLDExtract(suffix_list_urls=self._config.tld_lists)
return TLDExtract()

@cached_property
def _regex_mapping(self) -> dict:
return GetterFactory.from_string(self._config.regex_mapping).get_yaml()
@@ -280,7 +260,7 @@ def _pseudonymize_field(
else:
plaintext_values = set(chain(*[value for value in regex.findall(field_value) if value]))
if plaintext_values and dotted_field in rule.url_fields:
for url_string in self._url_extractor.gen_urls(field_value):
for url_string in extract_urls(field_value):
field_value = field_value.replace(
url_string, self._pseudonymize_url_cached(url_string)
)
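`extract_urls` comes from the new `logprep/util/url/url.py` module, whose diff is not rendered on this page, so its actual implementation is unknown here. Purely as an assumed stand-in, a naive regex version might look like the sketch below; note that it only catches scheme-prefixed URLs, whereas `urlextract.gen_urls` also found scheme-less ones by TLD matching, which the real helper presumably reproduces against the packaged suffix list.

```python
import re

# assumed stand-in only; the real extract_urls in logprep/util/url/url.py
# is not shown in this diff and likely handles scheme-less URLs as well
URL_PATTERN = re.compile(r"""https?://[^\s"'<>]+""", re.IGNORECASE)

def extract_urls_sketch(text: str) -> list:
    """Return URL-looking substrings found in free text."""
    return URL_PATTERN.findall(text)
```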
@@ -309,13 +289,15 @@ def _pseudonymize(self, value):
return {"pseudonym": hash_string, "origin": encrypted_origin}

def _pseudonymize_url(self, url_string: str) -> str:
url = self._tld_extractor(url_string)
if url_string.startswith(("http://", "https://")):
parsed_url = urlparse(url_string)
else:
parsed_url = urlparse("http://" + url_string)
if url.subdomain:
url_string = url_string.replace(url.subdomain, self._pseudonymize_string(url.subdomain))
parsed_url = urlparse(f"http://{url_string}")
if parsed_url.hostname:
splitted_hostname = parsed_url.hostname.split(".")
if len(splitted_hostname) > 2:
subdomain = ".".join(splitted_hostname[0:-2])
url_string = url_string.replace(subdomain, self._pseudonymize_string(subdomain))
if parsed_url.fragment:
url_string = url_string.replace(
f"#{parsed_url.fragment}", f"#{self._pseudonymize_string(parsed_url.fragment)}"
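The rewritten `_pseudonymize_url` above trades tldextract's suffix-aware subdomain detection for a plain label split: everything before the last two hostname labels counts as the subdomain. A stripped-down sketch of just that heuristic (the function name is invented for illustration):

```python
from typing import Optional
from urllib.parse import urlparse

def subdomain_of(url_string: str) -> Optional[str]:
    """Prefix a scheme when missing so urlparse yields a hostname, then
    treat everything before the last two labels as the subdomain."""
    if url_string.startswith(("http://", "https://")):
        parsed = urlparse(url_string)
    else:
        parsed = urlparse(f"http://{url_string}")
    if parsed.hostname:
        labels = parsed.hostname.split(".")
        if len(labels) > 2:
            return ".".join(labels[:-2])
    return None

assert subdomain_of("https://www.test.example.com/p") == "www.test"
assert subdomain_of("example.com") is None
```

One behavioral difference is worth noting: without a suffix list, multi-label public suffixes are invisible to this split, so for `www.example.co.uk` the computed subdomain is `www.example` rather than `www`.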
Empty file added logprep/util/url/__init__.py
Empty file.
