From 92c89f0414ae2cf7f701210d221a7b34233df4be Mon Sep 17 00:00:00 2001 From: lionick <44701697+lionick@users.noreply.github.com> Date: Fri, 3 Nov 2023 09:44:54 +0200 Subject: [PATCH] Add blacklist functionality while ingesting stats (#37) * add blacklist functionality while ingesting stats * minor fix --- app/ingester/loginsIngester.py | 37 ++++++++++++++++++++++++++----- app/ingester/membeshipIngester.py | 33 +++++++++++++++++++++++---- app/ingester/usersIngester.py | 34 +++++++++++++++++++++++----- app/utils/configParser.py | 3 +-- 4 files changed, 91 insertions(+), 16 deletions(-) diff --git a/app/ingester/loginsIngester.py b/app/ingester/loginsIngester.py index 2556ccf..c210c71 100644 --- a/app/ingester/loginsIngester.py +++ b/app/ingester/loginsIngester.py @@ -1,4 +1,5 @@ from app.logger import log +from app.utils import configParser from app.utils.ipDatabase import geoip2Database from sqlalchemy.exc import NoResultFound from .utilsIngester import utilsIngester @@ -7,6 +8,7 @@ class LoginDataIngester: logger = log.get_logger("LoginDataIngester") + @classmethod def getIdpId(cls, entityid, idpName, tenenvId, session): # Check if IdP exists @@ -152,8 +154,20 @@ def getCountryFromIP(cls, ipAddress, session): return countryId @classmethod - def ingestLoginDataPerTenenv(cls, tenenvId, session): + def ingestLoginDataPerTenenv(cls, tenenv, session): + tenenvId = tenenv['id'] + tenant_name = tenenv['tenant_name'] + environment_name = tenenv['environment_name'] + hashed_user_ids = [] + + config_file = f'config.{tenant_name.lower()}.{environment_name.lower()}.py' + if (configParser.getConfig('user_id_blacklist', config_file) is not False and + 'user_ids' in configParser.getConfig('user_id_blacklist', config_file)): + user_ids = configParser.getConfig('user_id_blacklist', config_file)['user_ids'].split('\n') + # Hash each value using MD5 + hashed_user_ids = [hashlib.md5(value.strip().encode()).hexdigest() for value in user_ids] + # get dates not mapped for logi5ns 
data datesNotMapped = utilsIngester.getDatesNotMapped( "statistics_country_hashed", @@ -175,7 +189,14 @@ def ingestLoginDataPerTenenv(cls, tenenvId, session): AND tenenv_id={0} {1} """.format(tenenvId, between)).all() loginMappedItems = 0 + for login in loginsNotMapped: + + if (login[0]['voPersonId'] in hashed_user_ids): + cls.logger.info("""Ignore this user with + hash {0} as he is at the blacklist""". format(login[0]['voPersonId'])) + continue + if (not login[0]['failedLogin'] and utilsIngester.validateTenenv(login[0]['tenenvId'], session) and 'voPersonId' in login[0] @@ -197,7 +218,7 @@ def ingestLoginDataPerTenenv(cls, tenenvId, session): login[0]['spName'], login[0]['tenenvId'], session) - + if ('countryCode' in login[0] and 'countryName' in login[0]): # find countryId countryId = LoginDataIngester.getCountryFromCountryCode([login[0]['countryCode'], login[0]['countryName']], session) @@ -222,6 +243,12 @@ def ingestLoginDataPerTenenv(cls, tenenvId, session): @classmethod def ingestLoginData(cls, session): - tenenvIds = session.exec("""SELECT id FROM tenenv_info""").all() - for tenenvId in tenenvIds: - LoginDataIngester.ingestLoginDataPerTenenv(tenenvId[0], session) + tenenvIds = session.exec("""SELECT tenenv_info.id, + tenant_info.name AS tenant_name, + environment_info.name AS environment_name + FROM tenenv_info + JOIN tenant_info ON tenant_id=tenant_info.id + JOIN environment_info ON env_id=environment_info.id + """).all() + for tenenv in tenenvIds: + LoginDataIngester.ingestLoginDataPerTenenv(tenenv, session) diff --git a/app/ingester/membeshipIngester.py b/app/ingester/membeshipIngester.py index 0396bae..7389256 100644 --- a/app/ingester/membeshipIngester.py +++ b/app/ingester/membeshipIngester.py @@ -1,6 +1,8 @@ from app.logger import log +from app.utils import configParser from sqlalchemy.exc import NoResultFound from .utilsIngester import utilsIngester +import hashlib class MembershipDataIngester: @@ -27,7 +29,20 @@ def getCommunityId(cls, communityName, 
tenenvId, session): return communityId @classmethod - def ingestMembershipDataPerTenenv(cls, tenenvId, session): + def ingestMembershipDataPerTenenv(cls, tenenv, session): + tenenvId = tenenv['id'] + tenant_name = tenenv['tenant_name'] + environment_name = tenenv['environment_name'] + hashed_user_ids = [] + + config_file = f'config.{tenant_name.lower()}.{environment_name.lower()}.py' + + if (configParser.getConfig('user_id_blacklist', config_file) is not False and + 'user_ids' in configParser.getConfig('user_id_blacklist', config_file)): + user_ids = configParser.getConfig('user_id_blacklist', config_file)['user_ids'].split('\n') + # Hash each value using MD5 + hashed_user_ids = [hashlib.md5(value.strip().encode()).hexdigest() for value in user_ids] + # get dates not mapped for users data datesNotMapped = utilsIngester.getDatesNotMapped( "members", @@ -54,6 +69,10 @@ def ingestMembershipDataPerTenenv(cls, tenenvId, session): cls.logger.error(""" VO name '{0}' not found """.format(membership[0]['voName'])) continue + if (membership[0]['voPersonId'] in hashed_user_ids): + cls.logger.info("""Ignore this user with + hash {0} as he is at the blacklist""". 
format(membership[0]['voPersonId'])) + continue session.exec("""INSERT INTO members(community_id, hasheduserid, status, tenenv_id, created, updated) VALUES ('{0}','{1}','{2}', {3}, '{4}', '{4}') @@ -69,9 +88,15 @@ def ingestMembershipDataPerTenenv(cls, tenenvId, session): @classmethod def ingestMembershipData(cls, session): - tenenvIds = session.exec("""SELECT id FROM tenenv_info""").all() + tenenvIds = session.exec("""SELECT tenenv_info.id, + tenant_info.name AS tenant_name, + environment_info.name AS environment_name + FROM tenenv_info + JOIN tenant_info ON tenant_id=tenant_info.id + JOIN environment_info ON env_id=environment_info.id + """).all() # for each tenenv on database try to ingest UserData # from statistics_raw table - for tenenvId in tenenvIds: + for tenenv in tenenvIds: MembershipDataIngester.ingestMembershipDataPerTenenv( - tenenvId[0], session) + tenenv, session) diff --git a/app/ingester/usersIngester.py b/app/ingester/usersIngester.py index 213272c..718145c 100644 --- a/app/ingester/usersIngester.py +++ b/app/ingester/usersIngester.py @@ -1,12 +1,26 @@ from app.logger import log +from app.utils import configParser from .utilsIngester import utilsIngester - +import hashlib class UserDataIngester: logger = log.get_logger("UserDataIngester") @classmethod - def ingestUserDataPerTenenv(cls, tenenvId, session): + def ingestUserDataPerTenenv(cls, tenenv, session): + tenenvId = tenenv['id'] + tenant_name = tenenv['tenant_name'] + environment_name = tenenv['environment_name'] + hashed_user_ids = [] + + config_file = f'config.{tenant_name.lower()}.{environment_name.lower()}.py' + + if (configParser.getConfig('user_id_blacklist', config_file) is not False and + 'user_ids' in configParser.getConfig('user_id_blacklist', config_file)): + user_ids = configParser.getConfig('user_id_blacklist', config_file)['user_ids'].split('\n') + # Hash each value using MD5 + hashed_user_ids = [hashlib.md5(value.strip().encode()).hexdigest() for value in user_ids] + # get 
dates not mapped for users data datesNotMapped = utilsIngester.getDatesNotMapped( "users", @@ -34,6 +48,10 @@ def ingestUserDataPerTenenv(cls, tenenvId, session): cls.logger.error(""" user status '{0}' is not valid """.format(user[0]['status'])) continue + if (user[0]['voPersonId'] in hashed_user_ids): + cls.logger.info("""Ignore this user with + hash {0} as he is at the blacklist""". format(user[0]['voPersonId'])) + continue session.exec("""INSERT INTO users(hasheduserid, created, updated, status, tenenv_id) VALUES ('{0}','{1}','{1}', '{2}', {3}) ON CONFLICT(hasheduserid, tenenv_id) @@ -48,8 +66,14 @@ def ingestUserDataPerTenenv(cls, tenenvId, session): @classmethod def ingestUserData(cls, session): - tenenvIds = session.exec("""SELECT id FROM tenenv_info""").all() + tenenvIds = session.exec("""SELECT tenenv_info.id, + tenant_info.name AS tenant_name, + environment_info.name AS environment_name + FROM tenenv_info + JOIN tenant_info ON tenant_id=tenant_info.id + JOIN environment_info ON env_id=environment_info.id + """).all() # for each tenenv on database try to ingest UserData # from statistics_raw table - for tenenvId in tenenvIds: - UserDataIngester.ingestUserDataPerTenenv(tenenvId[0], session) \ No newline at end of file + for tenenv in tenenvIds: + UserDataIngester.ingestUserDataPerTenenv(tenenv, session) \ No newline at end of file diff --git a/app/utils/configParser.py b/app/utils/configParser.py index 8de64cc..75b6017 100644 --- a/app/utils/configParser.py +++ b/app/utils/configParser.py @@ -30,7 +30,6 @@ def getConfig(section='source_database', config_file='config.py'): config[param[0]] = param[1] else: - raise Exception( - 'Section {0} not found in the {1} file'.format(section, config_file)) + return False return config