From b6027a920f78ea12af29a345a9163a71fa5cea29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Zimmermann?= <101292599+ekneg54@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:24:16 +0200 Subject: [PATCH] add depseudo tools (#599) * add depseudo tools * update changelog --------- Co-authored-by: djkhl --- CHANGELOG.md | 2 + doc/source/user_manual/execution.rst | 44 ++++++ logprep/processor/pseudonymizer/encrypter.py | 6 +- logprep/run_logprep.py | 14 ++ logprep/util/pseudo/__init__.py | 0 logprep/util/pseudo/commands/__init__.py | 0 .../util/pseudo/commands/depseudonymize.py | 21 +++ logprep/util/pseudo/commands/generate_keys.py | 19 +++ logprep/util/pseudo/commands/pseudonymize.py | 17 +++ .../util/pseudo/depseudonymizer/__init__.py | 0 .../pseudo/depseudonymizer/depseudonymizer.py | 137 ++++++++++++++++++ logprep/util/pseudo/keygenerator/__init__.py | 0 .../pseudo/keygenerator/generate_rsa_key.py | 8 + tests/unit/util/pseudo/__init__.py | 0 tests/unit/util/pseudo/test_depseudomize.py | 133 +++++++++++++++++ .../unit/util/pseudo/test_generate_rsa_key.py | 18 +++ tests/unit/util/pseudo/test_pseudonymize.py | 18 +++ 17 files changed, 433 insertions(+), 4 deletions(-) create mode 100644 logprep/util/pseudo/__init__.py create mode 100644 logprep/util/pseudo/commands/__init__.py create mode 100644 logprep/util/pseudo/commands/depseudonymize.py create mode 100644 logprep/util/pseudo/commands/generate_keys.py create mode 100644 logprep/util/pseudo/commands/pseudonymize.py create mode 100644 logprep/util/pseudo/depseudonymizer/__init__.py create mode 100644 logprep/util/pseudo/depseudonymizer/depseudonymizer.py create mode 100644 logprep/util/pseudo/keygenerator/__init__.py create mode 100644 logprep/util/pseudo/keygenerator/generate_rsa_key.py create mode 100644 tests/unit/util/pseudo/__init__.py create mode 100644 tests/unit/util/pseudo/test_depseudomize.py create mode 100644 tests/unit/util/pseudo/test_generate_rsa_key.py create mode 100644 
tests/unit/util/pseudo/test_pseudonymize.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 197af8230..bc9254331 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,11 +18,13 @@ * `timestamper` now writes `_timestamper_missing_field_warning` tag to event tags instead of `_timestamper_failure` in case of missing fields * rename `--thread_count` parameter to `--thread-count` in http generator * removed `--report` parameter and feature from http generator +* when using `extend_target_list` in the `field manager` the ordering of the given source fields is now preserved ### Features * add UCL into the Quickstart Setup * add logprep http output connector +* add pseudonymization tools to logprep -> see: `logprep pseudo --help` ### Improvements diff --git a/doc/source/user_manual/execution.rst b/doc/source/user_manual/execution.rst index 7f6edf8ce..6aab6c331 100644 --- a/doc/source/user_manual/execution.rst +++ b/doc/source/user_manual/execution.rst @@ -128,3 +128,47 @@ To find out more about the usage of the http event generator execute: logprep generate http --help + +Pseudonymization Tools +---------------------- + +Logprep provides tools to pseudonymize and depseudonymize values. This can be useful for testing +and debugging purposes. But this can also be used to depseudonymize values pseudonymized by +Logprep's :code:`Pseudonymizer` Processor. + +These tools can be used to pseudonymize given strings using the same method as used in Logprep +and provide functionality to depseudonymize values using a pair of keys. + +generate keys +^^^^^^^^^^^^^ + +.. code-block:: bash + + logprep pseudo generate -f analyst 1024 + logprep pseudo generate -f depseudo 2048 + +This will generate four files to pseudonymize in the next step. +The depseudo key has to be longer than the analyst key due to the hash padding involved in the procedure. + +* get help with :code:`logprep pseudo generate --help` + +pseudonymize +^^^^^^^^^^^^ + +.. 
code-block:: bash + + logprep pseudo pseudonymize analyst depseudo mystring + +This will pseudonymize the provided string using the analyst and depseudo keys. + get help with :code:`logprep pseudo pseudonymize --help` + +depseudonymize +^^^^^^^^^^^^^^ + +.. code-block:: bash + + logprep pseudo depseudonymize analyst depseudo PSEUDO_STRING + +This will depseudonymize the provided string using the analyst and depseudo keys. + +* get help with :code:`logprep pseudo depseudonymize --help` diff --git a/logprep/processor/pseudonymizer/encrypter.py b/logprep/processor/pseudonymizer/encrypter.py index 4325d100c..ac4cdddf9 100644 --- a/logprep/processor/pseudonymizer/encrypter.py +++ b/logprep/processor/pseudonymizer/encrypter.py @@ -6,6 +6,7 @@ from Crypto.Cipher import AES, PKCS1_OAEP from Crypto.PublicKey import RSA from Crypto.Random import get_random_bytes + from logprep.util.getter import GetterFactory @@ -39,10 +40,7 @@ def load_public_keys(self, keyfile_analyst: str, keyfile_depseudo: str): pub_key_depseudo_str = GetterFactory.from_string(keyfile_depseudo).get() self._pubkey_depseudo = RSA.import_key(pub_key_depseudo_str) - def encrypt( - self, - input_str: str, - ) -> str: + def encrypt(self, input_str: str) -> str: """Encrypt a string using hybrid encryption. 
The input string is encrypted with AES in CTR mode using a random diff --git a/logprep/run_logprep.py b/logprep/run_logprep.py index 50da4f8f7..82953688c 100644 --- a/logprep/run_logprep.py +++ b/logprep/run_logprep.py @@ -18,6 +18,7 @@ from logprep.util.configuration import Configuration, InvalidConfigurationError from logprep.util.defaults import DEFAULT_LOG_CONFIG from logprep.util.helper import get_versions_string, print_fcolor +from logprep.util.pseudo.commands import depseudonymize, generate_keys, pseudonymize from logprep.util.rule_dry_runner import DryRunner warnings.simplefilter("always", DeprecationWarning) @@ -298,6 +299,19 @@ def print_config(configs: tuple[str], output) -> None: print(config.as_yaml()) +@cli.group(short_help="pseudonymization toolbox") +def pseudo(): + """ + The pseudo command group offers a set of commands to + generate keys, pseudonymize and depseudonymize + """ + + +pseudo.add_command(cmd=generate_keys.generate, name="generate") +pseudo.add_command(cmd=pseudonymize.pseudonymize, name="pseudonymize") +pseudo.add_command(cmd=depseudonymize.depseudonymize, name="depseudonymize") + + def signal_handler(__: int, _) -> None: """Handle signals for stopping the runner and reloading the configuration.""" Runner.get_runner(Configuration()).stop() diff --git a/logprep/util/pseudo/__init__.py b/logprep/util/pseudo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/logprep/util/pseudo/commands/__init__.py b/logprep/util/pseudo/commands/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/logprep/util/pseudo/commands/depseudonymize.py b/logprep/util/pseudo/commands/depseudonymize.py new file mode 100644 index 000000000..67657f2af --- /dev/null +++ b/logprep/util/pseudo/commands/depseudonymize.py @@ -0,0 +1,21 @@ +"""Command line tool to depseudonymize a string using the given keys.""" + +import click + +from logprep.util.pseudo.depseudonymizer.depseudonymizer import Depseudonymizer + + +@click.command() 
+@click.argument("analyst-key", type=str) +@click.argument("depseudo-key", type=str) +@click.argument("pseudo-string", type=str) +def depseudonymize(analyst_key: str, depseudo_key: str, pseudo_string: str): + """depseudonymize a string using the given keys.""" + depseudo = Depseudonymizer(pseudo_string) + keys = {} + for key_file_name in analyst_key, depseudo_key: + with open(f"{key_file_name}.key", "r", encoding="utf8") as key_file: + keys[key_file_name] = key_file.read() + depseudo.depseudo_key = keys[depseudo_key] + depseudo.analyst_key = keys[analyst_key] + print(depseudo.depseudonymize()) diff --git a/logprep/util/pseudo/commands/generate_keys.py b/logprep/util/pseudo/commands/generate_keys.py new file mode 100644 index 000000000..c996e1ecc --- /dev/null +++ b/logprep/util/pseudo/commands/generate_keys.py @@ -0,0 +1,19 @@ +import click + +from logprep.util.pseudo.keygenerator import generate_rsa_key + + +@click.command() +@click.argument("key-length", default="1024", type=int) +@click.option("-f", "--file") +def generate(key_length: int, file: str): + """Generate RSA keys for pseudonymization.""" + priv_key, pub_key = generate_rsa_key.generate_keys(key_length=key_length) + if not file: + print(priv_key.decode("utf8")) + print(pub_key.decode("utf8")) + else: + with open(f"{file}.key", "w", encoding="utf8") as private_key_file: + private_key_file.write(priv_key.decode("utf8")) + with open(f"{file}.crt", "w", encoding="utf8") as public_key_file: + public_key_file.write(pub_key.decode("utf8")) diff --git a/logprep/util/pseudo/commands/pseudonymize.py b/logprep/util/pseudo/commands/pseudonymize.py new file mode 100644 index 000000000..2a5d18a9e --- /dev/null +++ b/logprep/util/pseudo/commands/pseudonymize.py @@ -0,0 +1,17 @@ +import click + +from logprep.processor.pseudonymizer.encrypter import DualPKCS1HybridEncrypter + + +@click.command() +@click.argument("analyst-key", type=str) +@click.argument("depseudo-key", type=str) +@click.argument("string", type=str) +def 
pseudonymize(analyst_key: str, depseudo_key: str, string: str): + """pseudonymize a string using the given keys.""" + encrypter = DualPKCS1HybridEncrypter() + encrypter.load_public_keys( + keyfile_analyst=f"{analyst_key}.crt", + keyfile_depseudo=f"{depseudo_key}.crt", + ) + print(encrypter.encrypt(string)) diff --git a/logprep/util/pseudo/depseudonymizer/__init__.py b/logprep/util/pseudo/depseudonymizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/logprep/util/pseudo/depseudonymizer/depseudonymizer.py b/logprep/util/pseudo/depseudonymizer/depseudonymizer.py new file mode 100644 index 000000000..9d0aa4efa --- /dev/null +++ b/logprep/util/pseudo/depseudonymizer/depseudonymizer.py @@ -0,0 +1,137 @@ +"""module to depseudonymize""" + +import base64 +from dataclasses import dataclass +from Crypto.PublicKey import RSA +from Crypto.Cipher.PKCS1_OAEP import PKCS1OAEP_Cipher +from Crypto.Cipher import AES, PKCS1_OAEP + + +class DepseudonymizeError(Exception): + """Depseudonymizer custom Exception""" + + +@dataclass +class Depseudonymizer: + """class to depseudonymize a pseudonymized string + + Parameters + ---------- + + pseudonymized_string: str + The base64 encoded pseudonymized string. 
+ Base64 decoding is done in __post_init__ method + """ + + pseudonymized_string: str + """the pseudonymized string""" + + _analyst_key: PKCS1OAEP_Cipher = None + + _depseudo_key: PKCS1OAEP_Cipher = None + + def __post_init__(self) -> None: + self.pseudonymized_string = base64.b64decode(self.pseudonymized_string) + + @property + def encrypted_session_key(self) -> bytes: + """the encrypted session key + + Returns + ------- + bytes + the first 256 bytes of the pseudonymized_string + """ + return self.pseudonymized_string[:256] + + @property + def cipher_nonce(self) -> bytes: + """the cipher nonce + + Returns + ------- + bytes + The 8 bytes after the session key + """ + return self.pseudonymized_string[256:264] + + @property + def ciphertext(self) -> bytes: + """the cipher text + + Returns + ------- + bytes + All bytes after the first 264 bytes + """ + return self.pseudonymized_string[264:] + + @property + def depseudo_key(self) -> PKCS1OAEP_Cipher: + """getter for depseudo_key + + Returns + ------- + PKCS1OAEP_Cipher + returns a PKCS1OAEP_Cipher representation of the depseudo key + """ + return self._depseudo_key + + @depseudo_key.setter + def depseudo_key(self, depseudo_key: str) -> None: + """setter for the depseudo_key + saves the depseudo_key as PKCS1OAEP_Cipher in _depseudo_key + + Parameters + ---------- + depseudo_key : str + the depseudo private key + """ + self._depseudo_key = RSA.import_key(depseudo_key) + + @property + def analyst_key(self) -> PKCS1OAEP_Cipher: + """getter for analyst_key + + Returns + ------- + PKCS1OAEP_Cipher + returns a PKCS1OAEP_Cipher representation of the analyst key + """ + return self._analyst_key + + @analyst_key.setter + def analyst_key(self, analyst_key: str) -> None: + """setter for the analyst_key + saves the analyst_key as PKCS1OAEP_Cipher in _analyst_key + + Parameters + ---------- + analyst_key : str + the analyst private key + """ + self._analyst_key = RSA.import_key(analyst_key) + + def depseudonymize(self) -> str: +
"""depseudonymizes after setting the depseudo and analyst keys + + Returns + ------- + str + the depseudonymized string + + Raises + ------ + DepseudonymizeError + if depseudo_key or analyst_key is not set + """ + if self._depseudo_key is None: + raise DepseudonymizeError("No depseudo key") + if self._analyst_key is None: + raise DepseudonymizeError("No analyst key") + cipher_rsa_depseudo = PKCS1_OAEP.new(self._depseudo_key) + cipher_rsa_analyst = PKCS1_OAEP.new(self._analyst_key) + depseudo_decrypted_session_key = cipher_rsa_depseudo.decrypt(self.encrypted_session_key) + analyst_decrypted_session_key = cipher_rsa_analyst.decrypt(depseudo_decrypted_session_key) + cipher_aes = AES.new(analyst_decrypted_session_key, AES.MODE_CTR, nonce=self.cipher_nonce) + return cipher_aes.decrypt(self.ciphertext).decode("utf-8") diff --git a/logprep/util/pseudo/keygenerator/__init__.py b/logprep/util/pseudo/keygenerator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/logprep/util/pseudo/keygenerator/generate_rsa_key.py b/logprep/util/pseudo/keygenerator/generate_rsa_key.py new file mode 100644 index 000000000..7fb6868a8 --- /dev/null +++ b/logprep/util/pseudo/keygenerator/generate_rsa_key.py @@ -0,0 +1,8 @@ +from Crypto.PublicKey import RSA + + +def generate_keys(key_length): + key = RSA.generate(key_length) + pv_key_string = key.exportKey() + pb_key_string = key.publickey().exportKey() + return pv_key_string, pb_key_string diff --git a/tests/unit/util/pseudo/__init__.py b/tests/unit/util/pseudo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/util/pseudo/test_depseudomize.py b/tests/unit/util/pseudo/test_depseudomize.py new file mode 100644 index 000000000..b6719b01e --- /dev/null +++ b/tests/unit/util/pseudo/test_depseudomize.py @@ -0,0 +1,133 @@ +# pylint: disable=missing-docstring +# pylint: disable=protected-access +import base64 + +import pytest +from Crypto.Cipher import AES, PKCS1_OAEP +from Crypto.PublicKey import 
RSA + +from logprep.processor.pseudonymizer.encrypter import DualPKCS1HybridEncrypter +from logprep.util.pseudo.depseudonymizer.depseudonymizer import ( + DepseudonymizeError, + Depseudonymizer, +) +from logprep.util.pseudo.keygenerator.generate_rsa_key import generate_keys + + +@pytest.fixture(name="analyst_keys", scope="module") +def get_analyst_keys(): + return generate_keys(key_length=1024) + + +@pytest.fixture(name="depseudo_keys", scope="module") +def get_depseudo_keys(): + return generate_keys(key_length=2048) + + +def encrypt(plaintext, pubkey_analyst, pubkey_depseudo): + + encrypter = DualPKCS1HybridEncrypter() + encrypter._pubkey_analyst = RSA.import_key(pubkey_analyst.decode("utf-8")) + encrypter._pubkey_depseudo = RSA.import_key(pubkey_depseudo.decode("utf-8")) + return encrypter.encrypt(plaintext) + + +class TestDepseudonymizer: + def test_depseudonymize_manual(self, analyst_keys, depseudo_keys): + privkey_analyst, pubkey_analyst = analyst_keys + privkey_depseudo, pubkey_depseudo = depseudo_keys + cipher_rsa_analyst = PKCS1_OAEP.new(RSA.import_key(privkey_analyst.decode("utf-8"))) + cipher_rsa_depseudo = PKCS1_OAEP.new(RSA.import_key(privkey_depseudo.decode("utf-8"))) + encrypted_value = encrypt( + plaintext="1", + pubkey_analyst=pubkey_analyst, + pubkey_depseudo=pubkey_depseudo, + ) + assert encrypted_value != "1" + encrypted_value_b64decoded = base64.b64decode(encrypted_value) + encrypted_session_key = encrypted_value_b64decoded[:256] + cipher_nonce = encrypted_value_b64decoded[256:264] + ciphertext = encrypted_value_b64decoded[264:] + encrypted_session_key = cipher_rsa_depseudo.decrypt(encrypted_session_key) + decrypted_session_key = cipher_rsa_analyst.decrypt(encrypted_session_key) + cipher_aes = AES.new(decrypted_session_key, AES.MODE_CTR, nonce=cipher_nonce) + decrypted_value = cipher_aes.decrypt(ciphertext).decode("utf-8") + assert decrypted_value == "1" + + def test_depseudonymizer_populates_properties(self, analyst_keys, depseudo_keys): + _, 
pubkey_analyst = analyst_keys + _, pubkey_depseudo = depseudo_keys + encrypted_value = encrypt( + plaintext="1", + pubkey_analyst=pubkey_analyst, + pubkey_depseudo=pubkey_depseudo, + ) + + depseudo = Depseudonymizer(encrypted_value) + encrypted_value_b64decoded = base64.b64decode(encrypted_value) + assert depseudo.pseudonymized_string == encrypted_value_b64decoded + assert depseudo.encrypted_session_key == encrypted_value_b64decoded[:256] + assert depseudo.cipher_nonce == encrypted_value_b64decoded[256:264] + assert depseudo.ciphertext == encrypted_value_b64decoded[264:] + + def test_depseudonymizer_depseudonymize_raises_if_no_depseudo_key( + self, analyst_keys, depseudo_keys + ): + _, pubkey_analyst = analyst_keys + _, pubkey_depseudo = depseudo_keys + encrypted_value = encrypt( + plaintext="1", + pubkey_analyst=pubkey_analyst, + pubkey_depseudo=pubkey_depseudo, + ) + + depseudo = Depseudonymizer(encrypted_value) + with pytest.raises(DepseudonymizeError, match=r"No depseudo key"): + depseudo.depseudonymize() + + def test_depseudonymizer_depseudonymize_raises_if_no_analyst_key( + self, analyst_keys, depseudo_keys + ): + _, pubkey_analyst = analyst_keys + privkey_depseudo, pubkey_depseudo = depseudo_keys + encrypted_value = encrypt( + plaintext="1", + pubkey_analyst=pubkey_analyst, + pubkey_depseudo=pubkey_depseudo, + ) + + depseudo = Depseudonymizer(encrypted_value) + depseudo.depseudo_key = privkey_depseudo.decode("utf-8") + with pytest.raises(DepseudonymizeError, match=r"No analyst key"): + depseudo.depseudonymize() + + @pytest.mark.parametrize( + "plaintext", + [ + "1", + "message", + "second_message", + "asökdfjqqiweuraö", + "23884ß10239847ß", + "§$RFWSF", + "askjf2q903rui0üajfdskalsdhfkj9pw8ue7rfdaödsjiöaldfjfjq093r7uüadsjfaskdjfu20984r290fda6ds5f4a6sd54fa65sdff4asd2f1a6s5d4fa6s5df4asdf4a6sd54fa6s5d4füojedsaüfjk", # pylint: disable=line-too-long + """ + asdkfjasödlfkj + asdkfjasdufasopid + 237429034 + """, + ], + ) + def 
test_depseudonymizer_depseudonymize_messages(self, analyst_keys, depseudo_keys, plaintext): + privkey_analyst, pubkey_analyst = analyst_keys + privkey_depseudo, pubkey_depseudo = depseudo_keys + encrypted_value = encrypt( + plaintext=plaintext, + pubkey_analyst=pubkey_analyst, + pubkey_depseudo=pubkey_depseudo, + ) + + depseudo = Depseudonymizer(encrypted_value) + depseudo.depseudo_key = privkey_depseudo.decode("utf-8") + depseudo.analyst_key = privkey_analyst.decode("utf-8") + assert depseudo.depseudonymize() == plaintext diff --git a/tests/unit/util/pseudo/test_generate_rsa_key.py b/tests/unit/util/pseudo/test_generate_rsa_key.py new file mode 100644 index 000000000..5a76664f8 --- /dev/null +++ b/tests/unit/util/pseudo/test_generate_rsa_key.py @@ -0,0 +1,18 @@ +from logprep.util.pseudo.keygenerator.generate_rsa_key import generate_keys + + +class TestRsaKeyGenerator: + def test_generate_keys_returns_tuple(self): + pv_key_string, pb_key_string = generate_keys(key_length=1024) + assert pv_key_string + assert pb_key_string + + def test_generate_keys_returns_bytes(self): + pv_key_string, pb_key_string = generate_keys(key_length=1024) + assert isinstance(pv_key_string, bytes) + assert isinstance(pb_key_string, bytes) + + def test_generate_tubles_returns_public_and_private_keys(self): + pv_key_string, pb_key_string = generate_keys(key_length=1024) + assert "BEGIN PUBLIC KEY" in pb_key_string.decode("utf-8") + assert "BEGIN RSA PRIVATE KEY" in pv_key_string.decode("utf-8") diff --git a/tests/unit/util/pseudo/test_pseudonymize.py b/tests/unit/util/pseudo/test_pseudonymize.py new file mode 100644 index 000000000..0473e24dc --- /dev/null +++ b/tests/unit/util/pseudo/test_pseudonymize.py @@ -0,0 +1,18 @@ +from Crypto.PublicKey import RSA + +from logprep.processor.pseudonymizer.encrypter import DualPKCS1HybridEncrypter +from logprep.util.pseudo.keygenerator.generate_rsa_key import generate_keys + + +class TestPseudonymizer: + def test_pseudonymize(self): + 
public_key_analyst, private_key_analyst = generate_keys( + key_length=1024, + ) + public_key_depseudo, private_key_depseudo = generate_keys(key_length=2048) + encrypter = DualPKCS1HybridEncrypter() + encrypter._pubkey_analyst = RSA.import_key(public_key_analyst.decode("utf-8")) + encrypter._pubkey_depseudo = RSA.import_key(public_key_depseudo.decode("utf-8")) + value = "1" + encrypted_origin = encrypter.encrypt(value) + assert value != encrypted_origin