From 7a1c360c2bb21ca9d6f8fd5a9c4bf74d179a9ce9 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Tue, 30 Jan 2024 14:14:25 +0100 Subject: [PATCH 01/51] Removed all unused or unnecessary code Signed-off-by: Felix Zailskas --- {src => scripts}/create_geojson.py | 6 +- .../evp => scripts/model_testing}/ml_model.py | 0 .../evp => scripts/model_testing}/nn_model.py | 0 .../model_testing}/sagemaker_training.py | 0 {src => scripts}/report.py | 0 src/bdc/__init__.py | 2 - src/bdc/data_collector.py | 95 --------- src/controller/Controller.py | 189 ------------------ src/controller/messenger.py | 48 ----- src/database/__init__.py | 2 - src/database/database_dummy.py | 31 --- src/database/db_connection.py | 17 -- src/database/models.py | 129 ------------ src/database/parsers.py | 73 ------- src/demo/__init__.py | 2 +- src/demo/demos.py | 29 --- src/evp/data_processing.py | 48 ----- src/main.py | 4 - tests/conftest.py | 31 --- tests/mock_components.py | 8 - tests/test_leadparser.py | 12 -- 21 files changed, 6 insertions(+), 720 deletions(-) rename {src => scripts}/create_geojson.py (93%) rename {src/evp => scripts/model_testing}/ml_model.py (100%) rename {src/evp => scripts/model_testing}/nn_model.py (100%) rename {src/evp => scripts/model_testing}/sagemaker_training.py (100%) rename {src => scripts}/report.py (100%) delete mode 100644 src/bdc/data_collector.py delete mode 100644 src/controller/Controller.py delete mode 100644 src/controller/messenger.py delete mode 100644 src/database/database_dummy.py delete mode 100644 src/database/db_connection.py delete mode 100644 src/database/models.py delete mode 100644 src/database/parsers.py delete mode 100644 src/evp/data_processing.py delete mode 100644 tests/mock_components.py delete mode 100644 tests/test_leadparser.py diff --git a/src/create_geojson.py b/scripts/create_geojson.py similarity index 93% rename from src/create_geojson.py rename to scripts/create_geojson.py index daab877..9ef0260 100644 --- a/src/create_geojson.py +++ b/scripts/create_geojson.py @@ -7,7 +7,11 @@ import pandas as pd import regex as re -geojson_directory = "./src/data" +abspath = os.path.abspath(__file__) +dname = os.path.dirname(abspath) +os.chdir(dname) + +geojson_directory = "../src/data" # List all GeoJSON files in the directory geojson_files = [ diff --git a/src/evp/ml_model.py b/scripts/model_testing/ml_model.py similarity index 100% rename from src/evp/ml_model.py rename to scripts/model_testing/ml_model.py diff --git a/src/evp/nn_model.py b/scripts/model_testing/nn_model.py similarity index 100% rename from src/evp/nn_model.py rename to scripts/model_testing/nn_model.py diff --git a/src/evp/sagemaker_training.py b/scripts/model_testing/sagemaker_training.py similarity index 100% rename from src/evp/sagemaker_training.py rename to scripts/model_testing/sagemaker_training.py diff --git a/src/report.py b/scripts/report.py similarity index 100% rename from src/report.py rename to scripts/report.py diff --git a/src/bdc/__init__.py b/src/bdc/__init__.py index 5d5bca2..c46ca70 100644 --- a/src/bdc/__init__.py +++ b/src/bdc/__init__.py @@ -1,4 +1,2 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Ruchita Nathani - -from .data_collector import DataCollector diff --git a/src/bdc/data_collector.py b/src/bdc/data_collector.py deleted file mode 100644 index 3802183..0000000 --- a/src/bdc/data_collector.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Ruchita Nathani - -import csv -import json -import os -import 
random - -import requests - -from database.models import AnnualIncome, ProductOfInterest -from logger import get_logger - -log = get_logger() - - -class DataCollector: - # Limit API calls for testing - API_LIMIT = 10 - - def __init__(self): - self.data = [] - - def get_data_from_csv(self, file_path: str = "../data/sumup_leads_email.csv"): - """Retrieve information from the CSV file and utilize it in the Google API""" - self.data = [] - file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), file_path) - try: - with open(file_path, "r", encoding="utf8") as file: - csv_reader = csv.reader(file) - next(csv_reader) - - for row in csv_reader: - data_dict = { - "last_name": row[0], - "first_name": row[1], - "company_account": row[2], - "phone_number": row[3], - "email_address": row[4], - } - - self.data.append(data_dict) - log.info(f"Successfully read data from {file_path}") - except FileNotFoundError as e: - log.error(f"Error: Input file {file_path} for BDC not found.") - - return self.data - - def get_data_from_api(self, file_path: str = "../data/collected_data.json"): - """will utilize the data from the CSV file in the API key we are using, retrieve the necessary information from the API, and extract specific information that we need for the predictor. This relevant data will be stored in a JSON file.""" - api_url = "https://dummyjson.com/users" - try: - response = requests.get(api_url) - except Exception as e: - log.error("Error when fetching dummies") - return None - - if response.status_code == 200: - data = response.json() - file_path = os.path.join( - os.path.abspath(os.path.dirname(__file__)), - file_path, - ) - with open(file_path, "w") as json_file: - user_data = [] - for users in data["users"]: - data_dict = { - "lead_id": users["id"], - "first_name": users["firstName"], - "last_name": users["lastName"], - "phone_number": users["phone"], - "email_address": users["email"], - "company_address": users["company"]["address"]["address"], - "company_department": users["company"]["department"], - "company_name": users["company"]["name"], - "annual_income": random.randint(0, AnnualIncome.Class10.value), - "life_time_value": random.randint( - 0, AnnualIncome.Class10.value - ), - "customer_probability": random.random(), - "product_of_interest": random.choice(list(ProductOfInterest)), - } - - user_data.append(data_dict) - - json.dump(user_data, json_file, indent=4) - log.info( - f"Successfully fetched data from {api_url} and stored at {file_path}" - ) - return random.choice(user_data) - else: - log.warning( - f"Failed to fetch data from {api_url}. 
Status code: {response.status_code}" - ) - return None diff --git a/src/controller/Controller.py b/src/controller/Controller.py deleted file mode 100644 index 16cf2b7..0000000 --- a/src/controller/Controller.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Berkay Bozkurt - -from queue import Queue -from threading import Lock, Thread -from typing import Any - -from messenger import Message, MessageType, create_data_message - -from logger import get_logger - -log = get_logger() - - -class ControllerMeta(type): - - """ - Thread safe singleton implementation of Controller - """ - - _instances = {} # Dictionary to store instances of Controller - _lock: Lock = Lock() # A lock to ensure thread-safety when creating instances - - def __call__(cls, *args: Any, **kwds: Any): - with cls._lock: - if cls not in cls._instances: - instance = super().__call__( - *args, **kwds - ) # Create a new instance of Controller - cls._instances[ - cls - ] = instance # Store the instance in the _instances dictionary - - return cls._instances[cls] # Return the instance of Controller - - -class Controller(metaclass=ControllerMeta): - """ - Controller class with message processing and sending functionality. - """ - - def __init__(self, name: str) -> None: - self.name = name - self._finish_flag = False - self._finish_flag_lock = Lock() - self._message_queue: Queue[Message] = Queue(0) # Queue for processing messages - self._routing_queue: Queue[Message] = Queue(0) # Queue for routing messages - self._message_queue_processor_thread: Thread = ( - None # Thread for processing messages - ) - self._routing_queue_processor_thread: Thread = ( - None # Thread for routing messages - ) - self._start_message_queue_processing_thread() # Start the message processing thread - self._start_routing_queue_processing_thread() # Start the routing thread - - def _message_queue_processor(self): - while True: - # do read operation with lock to get latest value - with self._finish_flag_lock: - # if set True then exit loop - if self._finish_flag: - break - if not self._message_queue.empty(): - try: - # Get a message from the message queue and process it‚ - msg = self._message_queue.get() - # Simulate processing of the message - log.debug(f"Processing on {msg}") - self._enqueue_routing(msg) - # Simulate completion of processing - log.debug(f"Processed {msg}") - # Handle any errors during message processing - except Exception as e: - log.error(f"Error while processing message: {e}") - finally: - # Mark the task as done in the processing queue - self._message_queue.task_done() - log.debug(f"Message queue processor thread exited.") - - def _routing_queue_processor(self): - while True: - with self._finish_flag_lock: - if self._finish_flag: - break - if not self._routing_queue.empty(): - try: - # Mark the task as done in the processing queue - msg = self._routing_queue.get() - log.debug(f"Routing {msg}") - if msg.data_type == MessageType.DATA: - self._route_to_EVP(msg) - elif msg.data_type == MessageType.PREDICTION: - self._route_to_BDC(msg) - else: - log.warning(f"Unknown message type: {msg.data_type}") - log.debug(f"Routed {msg}") - # Handle any errors during message routing - except Exception as e: - log.error(f"Error while routing message: {e}") - finally: - # Mark the task as done in the processing queue - self._routing_queue.task_done() - log.debug(f"Routing queue processor thread exited.") - - # Start the message processing thread - def _start_message_queue_processing_thread(self): - if ( - not 
self._message_queue_processor_thread - or not self._message_queue_processor_thread.is_alive() - ): - self._message_queue_processor_thread = Thread( - target=self._message_queue_processor, daemon=True - ) - self._message_queue_processor_thread.start() - - # Start the message routing thread - def _start_routing_queue_processing_thread(self): - if ( - not self._routing_queue_processor_thread - or not self._routing_queue_processor_thread.is_alive() - ): - self._routing_queue_processor_thread = Thread( - target=self._routing_queue_processor, daemon=True - ) - self._routing_queue_processor_thread.start() - - def _route_to_BDC(self, msg: Message): - # TODO call the method of base data collector - return - - def _route_to_EVP(self, msg: Message): - # TODO call the method of estimated value predictor - return - - # Enqueue a message in the processing queue - def _enqueue_message(self, msg: Message): - self._message_queue.put(msg) - - # Enqueue a message in the routing queue - def _enqueue_routing(self, msg: Message): - self._routing_queue.put(msg) - - # Public interface to send a message - def send_message(self, msg: Message): - """ - processes message, forwards to related components. - """ - if not self._finish_flag: - self._enqueue_message(msg) - else: - log.debug(f"Controller finished can not send messages... ") - - def finish(self): - """ - finishes controller, after all waiting messages are processed and routed - """ - - # wait till queues are empty. - while not self._message_queue.empty() or not self._routing_queue.empty(): - log.debug( - f"Waiting for message and routing threads to finish their jobs... " - ) - - with self._finish_flag_lock: - # Set the finish flag to signal threads to stop - self._finish_flag = True - - log.debug(f"Finishing threads... ") - - # Wait for the message queue processing thread to finish - if ( - self._message_queue_processor_thread - and self._message_queue_processor_thread.is_alive() - ): - log.debug(f"Finishing message queue processor thread...") - self._message_queue_processor_thread.join() - # Wait for the routing queue processing thread to finish - if ( - self._routing_queue_processor_thread - and self._routing_queue_processor_thread.is_alive() - ): - log.debug(f"Finishing routing queue processor thread...") - self._routing_queue_processor_thread.join() - - # check if there are any elements in queues, if not, all cool! - log.debug(f"Threads finished... ") - log.debug(f"routing queue size... {self._routing_queue.unfinished_tasks}") - log.debug(f"message queue size... {self._message_queue.unfinished_tasks}") diff --git a/src/controller/messenger.py b/src/controller/messenger.py deleted file mode 100644 index 7af5723..0000000 --- a/src/controller/messenger.py +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Berkay Bozkurt - -from enum import Enum -from typing import Dict, Optional - -from pydantic import BaseModel - - -class MessageType(Enum): - DATA = "data" - PREDICTION = "prediction" - - -class SenderType(Enum): - BDC = "base_data_collector" - EVP = "estimated_value_predictor" - - -class Message(BaseModel): - sender_name: SenderType - data_type: MessageType - data: Optional[Dict] = {} - result: Optional[Dict] = {} - - -def create_data_message(lead_id, **features): - """ - Creates a data message, called by BDC. 
- """ - message = Message( - sender_name=SenderType.BDC, - data_type=MessageType.DATA, - data={"lead_id": lead_id, **features}, - ) - return message - - -def create_prediction_message(lead_id, prediction_value): - """ - Create a prediction message, called by EVP. - """ - message = Message( - sender_name=SenderType.EVP, - data_type=MessageType.PREDICTION, - result={"lead_id": lead_id, "prediction value": prediction_value}, - ) - return message diff --git a/src/database/__init__.py b/src/database/__init__.py index 27d62e8..1c26196 100644 --- a/src/database/__init__.py +++ b/src/database/__init__.py @@ -4,8 +4,6 @@ from config import DATABASE_TYPE from logger import get_logger -from .database_dummy import DatabaseDummy -from .db_connection import mongo_connection from .leads import LocalRepository, Repository, S3Repository _database = None diff --git a/src/database/database_dummy.py b/src/database/database_dummy.py deleted file mode 100644 index 1bd61a9..0000000 --- a/src/database/database_dummy.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -from typing import List - -import pandas as pd - -from database.models import Lead -from database.parsers import LeadParser -from logger import get_logger - -log = get_logger() - - -class DatabaseDummy: - def __init__(self, input_file: str = "data/leads_enriched.csv") -> None: - self.file = input_file - self.leads = LeadParser.parse_leads_from_csv(self.file) - - def get_lead_by_id(self, id_: int) -> Lead: - return self.leads[id_] - - def get_all_leads(self) -> List[Lead]: - return self.leads - - def get_cardinality(self) -> int: - return len(self.leads) - - def update_lead(self, lead: Lead): - log.debug(f"Updating database entry for lead#{lead.lead_id}") - log.debug(f"Update values: {lead}") diff --git a/src/database/db_connection.py b/src/database/db_connection.py deleted file mode 100644 index f0d5d99..0000000 --- a/src/database/db_connection.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Ruchita Nathani - -import pymongo - -from config import DB_CONNECTION - -_client = None - - -def mongo_connection(collection_name="default"): - global _client - if _client is None: - _client = pymongo.MongoClient(DB_CONNECTION) - db = _client["leads_enriched"] - collection = db[collection_name] - return collection diff --git a/src/database/models.py b/src/database/models.py deleted file mode 100644 index 2632e5d..0000000 --- a/src/database/models.py +++ /dev/null @@ -1,129 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -from enum import Enum, IntEnum -from typing import Optional - -import numpy as np -from pydantic import BaseModel, EmailStr, Field -from sklearn.preprocessing import OneHotEncoder - - -class AnnualIncome(IntEnum): - Undefined = 0 # 0€ - Class1 = 1 # (0€, 35000€] - Class2 = 35001 # (35000€, 60000€] - Class3 = 60001 # (60000€, 100000€] - Class4 = 100001 # (100000€, 200000€] - Class5 = 200001 # (200000€, 400000€] - Class6 = 400001 # (400000€, 600000€] - Class7 = 600001 # (600000€, 1000000€] - Class8 = 1000001 # (1000000€, 2000000€] - Class9 = 2000001 # (2000000€, 5000000€] - Class10 = 5000001 # (5000000€, inf€] - - @classmethod - def _missing_(cls, value): - annual_income = cls.Undefined - for income_value in cls: - if value < income_value: - break - annual_income = income_value - return annual_income - - -class UserRatingsTotal(IntEnum): - Undefined = 0 - Class1 = 50 - Class2 = 100 - Class3 = 500 
- Class4 = 1000 - Class5 = 10000 - - @classmethod - def _missing_(cls, value): - rating_total = cls.Undefined - for income_value in cls: - if value < income_value: - break - rating_total = income_value - return rating_total - - -class ProductOfInterest(str, Enum): - Undefined = "Undefined" - Nothing = "Nothing" - Terminals = "Terminals" - CashRegisterSystem = "Cash Register System" - BusinessAccount = "Business Account" - All = "All" - Other = "Other" - - @classmethod - def _missing_(cls, value): - return cls.Undefined - - -class BusinessStatus(str, Enum): - Undefined = "Undefined" - Operational = "OPERATIONAL" - ClosedTemporarily = "CLOSED_TEMPORARILY" - ClosedPermanently = "CLOSED_PERMANENTLY" - - @classmethod - def _missing_(cls, value): - return cls.Undefined - - -def encode_category(value, categories): - ohe = OneHotEncoder(sparse_output=False) - ohe.fit(np.array(categories).reshape(-1, 1)) - encoded = ohe.transform(np.array([value]).reshape(-1, 1)) - return encoded - - -class Lead(BaseModel): - lead_id: int # could be expended to a UUID later - first_name: str - last_name: str - email_address: str - phone_number: str - annual_income: Optional[AnnualIncome] - product_of_interest: Optional[ProductOfInterest] - lead_value: Optional[float] - domain: Optional[str] - number_valid: Optional[bool] - number_possible: Optional[bool] - google_places_business_status: Optional[BusinessStatus] - google_places_user_ratings_total: Optional[UserRatingsTotal] - - def to_one_hot_vector(self): - vector = np.array([]) - vector = np.append( - vector, - encode_category( - self.annual_income.value, [item.value for item in AnnualIncome] - ), - ) - vector = np.append( - vector, - encode_category( - self.product_of_interest.value, - [item.value for item in ProductOfInterest], - ), - ) - vector = np.append( - vector, np.array([int(self.domain is not None)]).astype(float) - ) - vector = np.append( - vector, - np.array([int(self.number_valid and self.number_possible)]).astype(float), - ) - vector = np.append( - vector, - encode_category( - self.google_places_business_status.value, - [item.value for item in BusinessStatus], - ), - ) - return vector diff --git a/src/database/parsers.py b/src/database/parsers.py deleted file mode 100644 index 9ce7e64..0000000 --- a/src/database/parsers.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -from typing import Dict, List - -import numpy as np -import pandas as pd - -from database.models import ( - AnnualIncome, - BusinessStatus, - Lead, - ProductOfInterest, - UserRatingsTotal, -) -from logger import get_logger - -log = get_logger() - - -class LeadParser: - @staticmethod - def parse_leads_from_csv(path: str) -> List[Lead]: - try: - data_df = pd.read_csv(path) - except FileNotFoundError: - log.error(f"Could not find {path} while parsing leads") - leads = data_df.apply( - lambda row: Lead( - lead_id=row.name, - first_name=row["First Name"], - last_name=row["Last Name"], - email_address=str(row["Email"]), - phone_number=str(row["Phone"]), - annual_income=AnnualIncome.Undefined, - product_of_interest=ProductOfInterest.Undefined, - lead_value=float(row["lead_value"]) if "lead_value" in row else None, - domain=row["domain"] if not pd.isna(row["domain"]) else None, - number_valid=row["number_valid"], - number_possible=row["number_possible"], - google_places_business_status=BusinessStatus( - row["google_places_business_status"] - ), - google_places_user_ratings_total=UserRatingsTotal( - 
row["google_places_user_ratings_total"] - ), - ), - axis=1, - ).to_list() - return leads - - @staticmethod - def parse_lead_from_dict(data: Dict) -> Lead: - print(data) - return Lead( - lead_id=data["lead_id"], - first_name=data["First Name"], - last_name=data["Last Name"], - email_address=str(data["Email"]), - phone_number=str(data["Phone"]), - annual_income=AnnualIncome.Undefined, - product_of_interest=ProductOfInterest.Undefined, - lead_value=float(data["lead_value"]) if "lead_value" in data else None, - domain=data["domain"] if not pd.isna(data["domain"]) else None, - number_valid=data["number_valid"], - number_possible=data["number_possible"], - google_places_business_status=BusinessStatus( - data["google_places_business_status"] - ), - google_places_user_ratings_total=UserRatingsTotal( - data["google_places_user_ratings_total"] - ), - ) diff --git a/src/demo/__init__.py b/src/demo/__init__.py index a03c078..86afb75 100644 --- a/src/demo/__init__.py +++ b/src/demo/__init__.py @@ -2,4 +2,4 @@ # SPDX-FileCopyrightText: 2023 Berkay Bozkurt from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input -from .demos import bdc_demo, db_demo, evp_demo, pipeline_demo, preprocessing_demo +from .demos import evp_demo, pipeline_demo, preprocessing_demo diff --git a/src/demo/demos.py b/src/demo/demos.py index 0ad094b..4fb80ea 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -9,7 +9,6 @@ from sklearn.metrics import classification_report -from bdc import DataCollector from bdc.pipeline import Pipeline from database import get_database from demo.console_utils import ( @@ -37,22 +36,6 @@ INPUT_FILE_BDC = "../data/sumup_leads_email.csv" OUTPUT_FILE_BDC = "../data/collected_data.json" - -# bdc_demo -def bdc_demo(): - dc = DataCollector() - try: - choice = get_int_input("(1) Read CSV\n(2) Dummy API\n", range(1, 3)) - if choice == 1: - dc.get_data_from_csv(file_path=INPUT_FILE_BDC) - elif choice == 2: - dc.get_data_from_api(file_path=OUTPUT_FILE_BDC) - else: - print("Invalid choice") - except ValueError: - print("Invalid choice") - - # evp demo def evp_demo(): data = get_database().load_preprocessed_data() @@ -145,18 +128,6 @@ def predict_single_lead(evp: EstimatedValuePredictor): print("Invalid Choice") -# db_demo -def db_demo(): - amt_leads = get_database().get_cardinality() - lead_id = get_int_input( - f"Choose a lead_id in range [1, {amt_leads}]\n", range(1, amt_leads + 1) - ) - if 1 <= lead_id <= amt_leads: - print(get_database().get_lead_by_id(lead_id)) - else: - print("Invalid Choice") - - def add_step_if_requested(steps, step_class, step_desc, step_warning_message: str = ""): if get_yes_no_input(f"Run {step_desc} {step_warning_message}(y/N)?\n"): force = get_yes_no_input("Force execution if data is present? (y/N)\n") diff --git a/src/evp/data_processing.py b/src/evp/data_processing.py deleted file mode 100644 index ecfc7e3..0000000 --- a/src/evp/data_processing.py +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -import numpy as np -import pandas as pd -from sklearn.model_selection import train_test_split - -from logger import get_logger - -log = get_logger() - - -def split_dataset( - in_path: str, - out_path: str, - train_size: float, - val_size: float, - test_size: float, - add_labels: bool = False, -): - valid_sizes = train_size + val_size + test_size == 1 - if not valid_sizes: - log.error( - "Invalid size combination. 
Training, validation and test size must add to 1" - ) - return None - try: - full_df = pd.read_csv(in_path, index_col=None) - if add_labels: - full_df["lead_value"] = np.random.uniform( - low=1000, high=1000000, size=len(full_df) - ) - except FileNotFoundError: - log.error(f"Could not find {in_path} splitting data") - return - relative_val_size = val_size / (1 - test_size) - train_val_df, test_df = train_test_split( - full_df, - test_size=test_size, - ) - train_df, val_df = train_test_split(train_val_df, test_size=relative_val_size) - train_df = train_df.reset_index(drop=True) - val_df = val_df.reset_index(drop=True) - test_df = test_df.reset_index(drop=True) - train_df.to_csv(f"{out_path}_train.csv") - val_df.to_csv(f"{out_path}_val.csv") - test_df.to_csv(f"{out_path}_test.csv") - return train_df, val_df, test_df diff --git a/src/main.py b/src/main.py index 49cf73e..aec2fda 100644 --- a/src/main.py +++ b/src/main.py @@ -4,8 +4,6 @@ import os from demo import ( - bdc_demo, - db_demo, evp_demo, get_multiple_choice, pipeline_demo, @@ -20,9 +18,7 @@ log = get_logger() DEMOS = { - "BDC": bdc_demo, "EVP": evp_demo, - "DB": db_demo, "Pipeline": pipeline_demo, "Data preprocessing": preprocessing_demo, } diff --git a/tests/conftest.py b/tests/conftest.py index 33dbbcd..5a15f67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,33 +1,2 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Felix Zailskas - -from typing import Dict - -import pytest -from mock_components import get_database_mock - - -@pytest.fixture -def mock_database(): - import database - - database._database = get_database_mock() - yield database.get_database() - database._database = None - - -@pytest.fixture -def create_lead_dict(request) -> Dict: - lead_value_adjustments = request.param - lead_data = { - "lead_id": 0, - "annual_income": 0, - "product_of_interest": "Nothing", - "first_name": "Manu", - "last_name": "Musterperson", - "phone_number": "49123123123", - "email_address": "test@test.de", - } - for key, value in lead_value_adjustments.items(): - lead_data[key] = value - yield lead_data diff --git a/tests/mock_components.py b/tests/mock_components.py deleted file mode 100644 index 393dc1e..0000000 --- a/tests/mock_components.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -from database import DatabaseDummy - - -def get_database_mock(): - return DatabaseDummy("tests/test_data/database_dummies.csv") diff --git a/tests/test_leadparser.py b/tests/test_leadparser.py deleted file mode 100644 index 620eda3..0000000 --- a/tests/test_leadparser.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -from mock_components import get_database_mock - -from database.models import Lead - - -def test_parser(): - leads = get_database_mock().get_all_leads() - for lead in leads: - assert type(lead) == Lead From ed4b0b56c45e836d2f969ecb6ecb9b77d8df1c61 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Tue, 30 Jan 2024 14:43:11 +0100 Subject: [PATCH 02/51] removed models folder as models are stored elsewhere now Signed-off-by: Felix Zailskas --- src/models/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/models/.gitkeep diff --git a/src/models/.gitkeep b/src/models/.gitkeep deleted file mode 100644 index e69de29..0000000 From 3c02157da3701648b8f381f0d6498c0cb000287f Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Thu, 1 Feb 2024 10:59:24 +0100 Subject: [PATCH 
03/51] Added documentation on Controller and possible prediction improvement

Signed-off-by: Felix Zailskas 
---
 Documentation/ideas.md              |  26 +++
 deprecated/controller/Controller.py | 189 ++++++++++++++++++
 deprecated/controller/messenger.py  |  48 +++++
 .../model_testing}/requirements.txt |   0
 4 files changed, 263 insertions(+)
 create mode 100644 Documentation/ideas.md
 create mode 100644 deprecated/controller/Controller.py
 create mode 100644 deprecated/controller/messenger.py
 rename {src/evp => scripts/model_testing}/requirements.txt (100%)

diff --git a/Documentation/ideas.md b/Documentation/ideas.md
new file mode 100644
index 0000000..abbd41c
--- /dev/null
+++ b/Documentation/ideas.md
@@ -0,0 +1,26 @@
+
+
+# Unused Ideas
+
+This document lists ideas and implementations that have either not been tried yet or that have been deprecated because they are not used in the current product version but still carry some conceptual value.
+
+## Deprecated
+
+The original implementation of the deprecated modules can be found in the `deprecated/` directory.
+
+### Controller
+
+The controller module was originally planned to be used as a communication device between EVP and BDC. Whenever the salesperson interface would register a new lead the controller is supposed to trigger the BDC pipeline to enrich the data of that lead and preprocess it to create a feature vector. The successful completion of the BDC pipeline is then registered at the controller which will then trigger an inference of the EVP to compute the predicted merchant size and write this back to the lead data. The computed merchant size can then be used to rank the leads and allow the salesperson to decide the value of the leads and which one to call.
+
+The current implementation of the module supports queueing messages from the BDC and EVP as indicated by their type. Depending on the message type the message is then routed to the corresponding module (EVP or BDC). The actual processing of the messages by the modules is not implemented. All of this is done asynchronously by using the python threading library.
+
+## Possible ML improvements
+
+### Creating data subsets
+
+The data collected by the BDC pipeline has not been refined to only include semantically valuable data fields. It is possible that some data fields contain no predictive power. This would mean they are practically polluting the dataset with unnecessary information. A proper analysis of the predictive power of all data fields would allow cutting down on the amount of data for each lead, reducing processing time and possibly making predictions more precise. This approach has been explored very briefly with subset 1, as described in `Classifier-Comparison.md`. However, the choice of included features has not been justified by experiments, making it somewhat arbitrary. Additionally, an analysis of this type could give insight into which data fields to expand on and what new data one might want to collect to increase the EVP's performance in predicting merchant sizes.
+
+Possibly filtering data based on some quality metric could also improve general performance. The regional_atlas_score and google_confidence_score have been tried for this but did not improve performance. However, these values are computed somewhat arbitrarily, and implementing a more refined quality metric might yield more promising results.
diff --git a/deprecated/controller/Controller.py b/deprecated/controller/Controller.py new file mode 100644 index 0000000..16cf2b7 --- /dev/null +++ b/deprecated/controller/Controller.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +from queue import Queue +from threading import Lock, Thread +from typing import Any + +from messenger import Message, MessageType, create_data_message + +from logger import get_logger + +log = get_logger() + + +class ControllerMeta(type): + + """ + Thread safe singleton implementation of Controller + """ + + _instances = {} # Dictionary to store instances of Controller + _lock: Lock = Lock() # A lock to ensure thread-safety when creating instances + + def __call__(cls, *args: Any, **kwds: Any): + with cls._lock: + if cls not in cls._instances: + instance = super().__call__( + *args, **kwds + ) # Create a new instance of Controller + cls._instances[ + cls + ] = instance # Store the instance in the _instances dictionary + + return cls._instances[cls] # Return the instance of Controller + + +class Controller(metaclass=ControllerMeta): + """ + Controller class with message processing and sending functionality. + """ + + def __init__(self, name: str) -> None: + self.name = name + self._finish_flag = False + self._finish_flag_lock = Lock() + self._message_queue: Queue[Message] = Queue(0) # Queue for processing messages + self._routing_queue: Queue[Message] = Queue(0) # Queue for routing messages + self._message_queue_processor_thread: Thread = ( + None # Thread for processing messages + ) + self._routing_queue_processor_thread: Thread = ( + None # Thread for routing messages + ) + self._start_message_queue_processing_thread() # Start the message processing thread + self._start_routing_queue_processing_thread() # Start the routing thread + + def _message_queue_processor(self): + while True: + # do read operation with lock to get latest value + with self._finish_flag_lock: + # if set True then exit loop + if self._finish_flag: + break + if not self._message_queue.empty(): + try: + # Get a message from the message queue and process it‚ + msg = self._message_queue.get() + # Simulate processing of the message + log.debug(f"Processing on {msg}") + self._enqueue_routing(msg) + # Simulate completion of processing + log.debug(f"Processed {msg}") + # Handle any errors during message processing + except Exception as e: + log.error(f"Error while processing message: {e}") + finally: + # Mark the task as done in the processing queue + self._message_queue.task_done() + log.debug(f"Message queue processor thread exited.") + + def _routing_queue_processor(self): + while True: + with self._finish_flag_lock: + if self._finish_flag: + break + if not self._routing_queue.empty(): + try: + # Mark the task as done in the processing queue + msg = self._routing_queue.get() + log.debug(f"Routing {msg}") + if msg.data_type == MessageType.DATA: + self._route_to_EVP(msg) + elif msg.data_type == MessageType.PREDICTION: + self._route_to_BDC(msg) + else: + log.warning(f"Unknown message type: {msg.data_type}") + log.debug(f"Routed {msg}") + # Handle any errors during message routing + except Exception as e: + log.error(f"Error while routing message: {e}") + finally: + # Mark the task as done in the processing queue + self._routing_queue.task_done() + log.debug(f"Routing queue processor thread exited.") + + # Start the message processing thread + def _start_message_queue_processing_thread(self): + if ( + not self._message_queue_processor_thread + 
or not self._message_queue_processor_thread.is_alive() + ): + self._message_queue_processor_thread = Thread( + target=self._message_queue_processor, daemon=True + ) + self._message_queue_processor_thread.start() + + # Start the message routing thread + def _start_routing_queue_processing_thread(self): + if ( + not self._routing_queue_processor_thread + or not self._routing_queue_processor_thread.is_alive() + ): + self._routing_queue_processor_thread = Thread( + target=self._routing_queue_processor, daemon=True + ) + self._routing_queue_processor_thread.start() + + def _route_to_BDC(self, msg: Message): + # TODO call the method of base data collector + return + + def _route_to_EVP(self, msg: Message): + # TODO call the method of estimated value predictor + return + + # Enqueue a message in the processing queue + def _enqueue_message(self, msg: Message): + self._message_queue.put(msg) + + # Enqueue a message in the routing queue + def _enqueue_routing(self, msg: Message): + self._routing_queue.put(msg) + + # Public interface to send a message + def send_message(self, msg: Message): + """ + processes message, forwards to related components. + """ + if not self._finish_flag: + self._enqueue_message(msg) + else: + log.debug(f"Controller finished can not send messages... ") + + def finish(self): + """ + finishes controller, after all waiting messages are processed and routed + """ + + # wait till queues are empty. + while not self._message_queue.empty() or not self._routing_queue.empty(): + log.debug( + f"Waiting for message and routing threads to finish their jobs... " + ) + + with self._finish_flag_lock: + # Set the finish flag to signal threads to stop + self._finish_flag = True + + log.debug(f"Finishing threads... ") + + # Wait for the message queue processing thread to finish + if ( + self._message_queue_processor_thread + and self._message_queue_processor_thread.is_alive() + ): + log.debug(f"Finishing message queue processor thread...") + self._message_queue_processor_thread.join() + # Wait for the routing queue processing thread to finish + if ( + self._routing_queue_processor_thread + and self._routing_queue_processor_thread.is_alive() + ): + log.debug(f"Finishing routing queue processor thread...") + self._routing_queue_processor_thread.join() + + # check if there are any elements in queues, if not, all cool! + log.debug(f"Threads finished... ") + log.debug(f"routing queue size... {self._routing_queue.unfinished_tasks}") + log.debug(f"message queue size... {self._message_queue.unfinished_tasks}") diff --git a/deprecated/controller/messenger.py b/deprecated/controller/messenger.py new file mode 100644 index 0000000..7af5723 --- /dev/null +++ b/deprecated/controller/messenger.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +from enum import Enum +from typing import Dict, Optional + +from pydantic import BaseModel + + +class MessageType(Enum): + DATA = "data" + PREDICTION = "prediction" + + +class SenderType(Enum): + BDC = "base_data_collector" + EVP = "estimated_value_predictor" + + +class Message(BaseModel): + sender_name: SenderType + data_type: MessageType + data: Optional[Dict] = {} + result: Optional[Dict] = {} + + +def create_data_message(lead_id, **features): + """ + Creates a data message, called by BDC. 
+ """ + message = Message( + sender_name=SenderType.BDC, + data_type=MessageType.DATA, + data={"lead_id": lead_id, **features}, + ) + return message + + +def create_prediction_message(lead_id, prediction_value): + """ + Create a prediction message, called by EVP. + """ + message = Message( + sender_name=SenderType.EVP, + data_type=MessageType.PREDICTION, + result={"lead_id": lead_id, "prediction value": prediction_value}, + ) + return message diff --git a/src/evp/requirements.txt b/scripts/model_testing/requirements.txt similarity index 100% rename from src/evp/requirements.txt rename to scripts/model_testing/requirements.txt From 06330b018996138519e178bc2a5766ec30179aa7 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Thu, 1 Feb 2024 11:00:35 +0100 Subject: [PATCH 04/51] Removed docker support Signed-off-by: Felix Zailskas --- Dockerfile | 16 ---------------- build_app.sh | 8 -------- docker-compose.yml | 30 ------------------------------ run_app.sh | 25 ------------------------- 4 files changed, 79 deletions(-) delete mode 100644 Dockerfile delete mode 100755 build_app.sh delete mode 100644 docker-compose.yml delete mode 100755 run_app.sh diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 328ddbe..0000000 --- a/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -FROM python:3.10-slim - -WORKDIR /app - -ADD Pipfile . -RUN pip install pipenv -RUN pipenv install - -ADD src . -ADD .env . - -ENTRYPOINT [ "pipenv", "run" ] -CMD [ "python", "main.py" ] diff --git a/build_app.sh b/build_app.sh deleted file mode 100755 index 5556ce2..0000000 --- a/build_app.sh +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -#!/bin/bash - -application_name="sumup_app" - -docker build -t $application_name . diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index bdf1f14..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Ruchita Nathani - -version: "3" -services: - mongodb: - image: mongo:latest - ports: - - "27017:27017" - environment: - MONGO_INITDB_ROOT_USERNAME: ${DB_USER} - MONGO_INITDB_ROOT_PASSWORD: ${DB_PASSWORD} - networks: - - network_private - - sumup_app: - build: . - depends_on: - - mongodb - env_file: - - .env - # volumes: - # - .:opt/sumup_app - command: python main.py - networks: - - network_private - -networks: - network_private: - driver: bridge diff --git a/run_app.sh b/run_app.sh deleted file mode 100755 index 33f544c..0000000 --- a/run_app.sh +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2023 Felix Zailskas - -#!/bin/bash - -application_name="sumup_app" - -# Start building the docker command -command="docker run -i" - -# Read each line in the file -while IFS= read -r line; do - # Skip lines that start with '#' or are empty - if [[ $line =~ ^# ]] || [[ -z $line ]]; then - continue - fi - - # Append non-comment lines to the command - command+=" -e $line" -done < ".env" - -command+=" $application_name" - -# Run the command -eval $command From 91e4090245d52541308ef3eccf2b00f2ac373347 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Thu, 1 Feb 2024 15:06:46 +0100 Subject: [PATCH 05/51] Cleanup Pipfile and remove unsused imports Add facebook step to deprecated directory. 
Document possible future improvements

Signed-off-by: Felix Zailskas 
---
 .env.template                                 |  10 +-
 Documentation/ideas.md                        |   8 +
 Pipfile                                       |  60 +-
 Pipfile.lock                                  | 713 ++++--------------
 .../steps/social_media_api.py                 |   0
 src/bdc/pipeline.py                           |   2 +-
 src/bdc/steps/__init__.py                     |   1 -
 src/demo/console_utils.py                     |   2 +-
 .../config_sprint09_release.json              |   4 -
 src/demo/pipeline_configs/config_template     |   4 -
 src/demo/pipeline_configs/run_all_steps.json  |   4 -
 src/demo/pipeline_utils.py                    |   3 -
 src/main.py                                   |  11 +-
 tests/test_console_utils.py                   |   2 +-
 14 files changed, 169 insertions(+), 655 deletions(-)
 rename {src/bdc => deprecated}/steps/social_media_api.py (100%)

diff --git a/.env.template b/.env.template
index a260b2c..e417cd6 100644
--- a/.env.template
+++ b/.env.template
@@ -9,15 +9,9 @@ GOOGLE_PLACES_API_KEY=
 OPEN_AI_API_KEY=
 
-DB_USER=
-DB_PASSWORD=
-DB_CONNECTION=
-
-FACEBOOK_APP_ID=
-FACEBOOK_APP_SECRET=
-OPEN_AI_API_KEY=
-
+# Need to be set when 'DATABASE_TYPE' is 'S3'
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
 
+# Choose between 'Local' and 'S3'
 DATABASE_TYPE=
 
diff --git a/Documentation/ideas.md b/Documentation/ideas.md
index abbd41c..12a9868 100644
--- a/Documentation/ideas.md
+++ b/Documentation/ideas.md
@@ -13,10 +13,18 @@ The original implementation of the deprecated modules can be found in the `depre
 
 ### Controller
 
+**_Note:_** This package has the additional dependency `pydantic==2.4.2`
+
 The controller module was originally planned to be used as a communication device between EVP and BDC. Whenever the salesperson interface would register a new lead the controller is supposed to trigger the BDC pipeline to enrich the data of that lead and preprocess it to create a feature vector. The successful completion of the BDC pipeline is then registered at the controller which will then trigger an inference of the EVP to compute the predicted merchant size and write this back to the lead data. The computed merchant size can then be used to rank the leads and allow the salesperson to decide the value of the leads and which one to call.
 
 The current implementation of the module supports queueing messages from the BDC and EVP as indicated by their type. Depending on the message type the message is then routed to the corresponding module (EVP or BDC). The actual processing of the messages by the modules is not implemented. All of this is done asynchronously by using the python threading library.
 
+### FacebookGraphAPI
+
+**_Note:_** This package has the additional dependency `facebook-sdk==3.1.0`. Also the environment variables `FACEBOOK_APP_ID` and `FACEBOOK_APP_SECRET` need to be set with a valid token.
+
+This step was supposed to be used for querying lead data from Facebook using either the business owner's name or the company name. The attempt was deprecated because the cost of the required API token was deemed too high and because the usage permissions of the Facebook API were changed. Furthermore, it is paramount to check the legal ramifications of querying Facebook for this kind of data, as searching for individuals on Facebook instead of their businesses may have legal consequences under data privacy regulations in the EU.
+ ## Possible ML improvements ### Creating data subsets diff --git a/Pipfile b/Pipfile index c8c7e99..14abe41 100644 --- a/Pipfile +++ b/Pipfile @@ -7,53 +7,47 @@ verify_ssl = true name = "pypi" [dev-packages] -pytest = "==7.4.0" coverage = "==7.4.1" -pre-commit = "==3.5.0" flake8 = "==6.0.0" -pytest-env = "==1.0.1" -matplotlib = "==3.8.2" -plotly = "==5.18.0" geopy = "==2.4.1" +matplotlib = "==3.8.2" notebook = "==7.0.6" +plotly = "==5.18.0" +pre-commit = "==3.5.0" +pytest = "==7.4.0" +pytest-env = "==1.0.1" [packages] -numpy = "==1.26.1" -requests = "==2.31.0" -scikit-learn = "==1.3.2" -pydantic = "==2.4.2" -email-validator = "==2.1.0.post1" -pandas = "==2.0.3" +autocorrect = "==2.6.1" beautifulsoup4 = "==4.12.2" -tqdm = "==4.65.0" -python-dotenv = "==0.21.0" -googlemaps = "==4.10.0" -phonenumbers = "==8.13.25" -pymongo = "==4.6.0" -facebook-sdk = "==3.1.0" boto3 = "==1.33.1" +colorama = "==0.4.6" +deep-translator = "==1.11.4" +deutschland = "==0.4.0" +email-validator = "==2.1.0.post1" +fsspec = "==2023.12.2" +geopandas = "==0.14.1" +googlemaps = "==4.10.0" +joblib = "==1.3.2" +lightgbm = "==4.3.0" +numpy = "==1.26.1" openai = "==1.3.3" -tiktoken = "==0.5.1" +osmnx = "==1.7.1" +pandas = "==2.0.3" +phonenumbers = "==8.13.25" pylanguagetool = "==0.10.0" +pyspellchecker = "==0.7.2" +python-dotenv = "==0.21.0" reportlab = "==4.0.7" -osmnx = "==1.7.1" -geopandas = "==0.14.1" +requests = "==2.31.0" +s3fs = "==2023.12.2" +scikit-learn = "==1.3.2" shapely = "==2.0.2" -pyspellchecker = "==0.7.2" -autocorrect = "==2.6.1" textblob = "==0.17.1" -deep-translator = "==1.11.4" -fsspec = "2023.12.2" -s3fs = "2023.12.2" -imblearn = "==0.0" -sagemaker = "==2.198.0" -joblib = "1.3.2" +tiktoken = "==0.5.1" +torch = "==2.1.2" +tqdm = "==4.65.0" xgboost = "==2.0.3" -colorama = "==0.4.6" -torch = "2.1.2" -deutschland = "0.4.0" -bs4 = "0.0.2" -lightgbm = "==4.3.0" [requires] python_version = "3.10" diff --git a/Pipfile.lock b/Pipfile.lock index 290f0f5..ebdaba4 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "6ab737c4b2ed7f02c0f9bbd5eaca04ea76853afbc7da60d5416b3a79d3ccb58f" + "sha256": "2a572dbe6247b7d3f516703542275c39b368211d26d4a47ea102f361a43fc4b4" }, "pipfile-spec": 6, "requires": { @@ -187,14 +187,6 @@ "markers": "python_version >= '3.7'", "version": "==1.33.13" }, - "bs4": { - "hashes": [ - "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925", - "sha256:abf8742c0805ef7f662dce4b51cca104cffe52b835238afc169142ab9b3fbccc" - ], - "index": "pypi", - "version": "==0.0.2" - }, "certifi": { "hashes": [ "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", @@ -319,17 +311,9 @@ "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27", "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' and python_version < '4'", "version": "==0.7.2" }, - "cloudpickle": { - "hashes": [ - "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f", - "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5" - ], - "markers": "python_version >= '3.6'", - "version": "==2.2.1" - }, "colorama": { "hashes": [ "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", @@ -355,14 +339,6 @@ "markers": "python_version >= '3.5'", "version": "==1.7" }, - "contextlib2": { - "hashes": 
[ - "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f", - "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869" - ], - "markers": "python_version >= '3.6'", - "version": "==21.6.0" - }, "dateparser": { "hashes": [ "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830", @@ -509,14 +485,6 @@ "markers": "python_version >= '3.8' and python_version < '4'", "version": "==0.4.0" }, - "dill": { - "hashes": [ - "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", - "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7" - ], - "markers": "python_version >= '3.8'", - "version": "==0.3.8" - }, "distro": { "hashes": [ "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", @@ -550,14 +518,6 @@ "markers": "python_version < '3.11'", "version": "==1.2.0" }, - "facebook-sdk": { - "hashes": [ - "sha256:2e987b3e0f466a6f4ee77b935eb023dba1384134f004a2af21f1cfff7fe0806e", - "sha256:cabcd2e69ea3d9f042919c99b353df7aa1e2be86d040121f6e9f5e63c1cf0f8d" - ], - "index": "pypi", - "version": "==3.1.0" - }, "filelock": { "hashes": [ "sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e", @@ -704,14 +664,6 @@ "markers": "python_version >= '3.9'", "version": "==0.14.1" }, - "google-pasta": { - "hashes": [ - "sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954", - "sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed", - "sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e" - ], - "version": "==0.2.0" - }, "googlemaps": { "hashes": [ "sha256:3055fcbb1aa262a9159b589b5e6af762b10e80634ae11c59495bd44867e47d88" @@ -760,29 +712,6 @@ "markers": "python_version >= '3.5'", "version": "==3.6" }, - "imbalanced-learn": { - "hashes": [ - "sha256:02ef5bc9ef046f44aa20353a904366a948f7944155e77b6f09b500a70981fd13", - "sha256:b9ccd9aaa3028699079d43a6d4d9fc9d039f55376733b31f87c7d9b125dcc165" - ], - "version": "==0.12.0" - }, - "imblearn": { - "hashes": [ - "sha256:d42c2d709d22c00d2b9a91e638d57240a8b79b4014122d92181fcd2549a2f79a", - "sha256:d8fbb662919c1b16f438ad91a8256220e53bcf6815c9ad5502c518b798de34f2" - ], - "index": "pypi", - "version": "==0.0" - }, - "importlib-metadata": { - "hashes": [ - "sha256:1231cf92d825c9e03cfc4da076a16de6422c863558229ea0b22b675657463443", - "sha256:f0afba6205ad8f8947c7d338b5342d5db2afbfd82f9cbef7879a9539cc12eb9b" - ], - "markers": "python_version >= '3.8'", - "version": "==6.11.0" - }, "jinja2": { "hashes": [ "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa", @@ -808,22 +737,6 @@ "markers": "python_version >= '3.7'", "version": "==1.3.2" }, - "jsonschema": { - "hashes": [ - "sha256:7996507afae316306f9e2290407761157c6f78002dcf7419acb99822143d1c6f", - "sha256:85727c00279f5fa6bedbe6238d2aa6403bedd8b4864ab11207d07df3cc1b2ee5" - ], - "markers": "python_version >= '3.8'", - "version": "==4.21.1" - }, - "jsonschema-specifications": { - "hashes": [ - "sha256:48a76787b3e70f5ed53f1160d2b81f586e4ca6d1548c5de7085d1682674764cc", - "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c" - ], - "markers": "python_version >= '3.8'", - "version": "==2023.12.1" - }, "lightgbm": { "hashes": [ "sha256:006f5784a9bcee43e5a7e943dc4f02de1ba2ee7a7af1ee5f190d383f3b6c9ebe", @@ -1104,24 +1017,6 @@ "markers": "python_version >= '3.7'", "version": "==6.0.4" }, - "multiprocess": { - "hashes": [ - "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", - 
"sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", - "sha256:37b55f71c07e2d741374998c043b9520b626a8dddc8b3129222ca4f1a06ef67a", - "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", - "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", - "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", - "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", - "sha256:ba8c31889abf4511c7308a8c52bb4a30b9d590e7f58523302ba00237702ca054", - "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", - "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", - "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", - "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e" - ], - "markers": "python_version >= '3.8'", - "version": "==0.70.16" - }, "networkx": { "hashes": [ "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", @@ -1179,32 +1074,33 @@ }, "onnxruntime": { "hashes": [ - "sha256:00cccc37a5195c8fca5011b9690b349db435986bd508eb44c9fce432da9228a4", - "sha256:04ebcd29c20473596a1412e471524b2fb88d55e6301c40b98dd2407b5911595f", - "sha256:212741b519ee61a4822c79c47147d63a8b0ffde25cd33988d3d7be9fbd51005d", - "sha256:28ff758b17ce3ca6bcad3d936ec53bd7f5482e7630a13f6dcae518eba8f71d85", - "sha256:3bc41f323ac77acfed190be8ffdc47a6a75e4beeb3473fbf55eeb075ccca8df2", - "sha256:3c467eaa3d2429c026b10c3d17b78b7f311f718ef9d2a0d6938e5c3c2611b0cf", - "sha256:3e253e572021563226a86f1c024f8f70cdae28f2fb1cc8c3a9221e8b1ce37db5", - "sha256:4137e5d443e2dccebe5e156a47f1d6d66f8077b03587c35f11ee0c7eda98b533", - "sha256:4c2dcf1b70f8434abb1116fe0975c00e740722aaf321997195ea3618cc00558e", - "sha256:5b8f5083f903408238883821dd8c775f8120cb4a604166dbdabe97f4715256d5", - "sha256:5f91f5497fe3df4ceee2f9e66c6148d9bfeb320cd6a71df361c66c5b8bac985a", - "sha256:6829dc2a79d48c911fedaf4c0f01e03c86297d32718a3fdee7a282766dfd282a", - "sha256:76f876c53bfa912c6c242fc38213a6f13f47612d4360bc9d599bd23753e53161", - "sha256:78d81d9af457a1dc90db9a7da0d09f3ccb1288ea1236c6ab19f0ca61f3eee2d3", - "sha256:985a029798744ce4743fcf8442240fed35c8e4d4d30ec7d0c2cdf1388cd44408", - "sha256:9996bab0f202a6435ab867bc55598f15210d0b72794d5de83712b53d564084ae", - "sha256:9aded21fe3d898edd86be8aa2eb995aa375e800ad3dfe4be9f618a20b8ee3630", - "sha256:a225bb683991001d111f75323d355b3590e75e16b5e0f07a0401e741a0143ea1", - "sha256:a82a8f0b4c978d08f9f5c7a6019ae51151bced9fd91e5aaa0c20a9e4ac7a60b6", - "sha256:c56695c1a343c7c008b647fff3df44da63741fbe7b6003ef576758640719be7b", - "sha256:d4a0151e1accd04da6711f6fd89024509602f82c65a754498e960b032359b02d", - "sha256:e8aa5bba78afbd4d8a2654b14ec7462ff3ce4a6aad312a3c2d2c2b65009f2541", - "sha256:ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889", - "sha256:f36b56a593b49a3c430be008c2aea6658d91a3030115729609ec1d5ffbaab1b6" - ], - "version": "==1.16.3" + "sha256:16d26badd092c8c257fa57c458bb600d96dc15282c647ccad0ed7b2732e6c03b", + "sha256:1ec485643b93e0a3896c655eb2426decd63e18a278bb7ccebc133b340723624f", + "sha256:461fa0fc7d9c392c352b6cccdedf44d818430f3d6eacd924bb804fdea2dcfd02", + "sha256:4b038324586bc905299e435f7c00007e6242389c856b82fe9357fdc3b1ef2bdc", + "sha256:5632077c3ab8b0cd4f74b0af9c4e924be012b1a7bcd7daa845763c6c6bf14b7d", + "sha256:5a06ab84eaa350bf64b1d747b33ccf10da64221ed1f38f7287f15eccbec81603", + "sha256:5d3d11db2c8242766212a68d0b139745157da7ce53bd96ba349a5c65e5a02357", + 
"sha256:61a12732cba869b3ad2d4e29ab6cb62c7a96f61b8c213f7fcb961ba412b70b37", + "sha256:6f1273bebcdb47ed932d076c85eb9488bc4768fcea16d5f2747ca692fad4f9d3", + "sha256:7466724e809a40e986b1637cba156ad9fc0d1952468bc00f79ef340bc0199552", + "sha256:83c35809cda898c5a11911c69ceac8a2ac3925911854c526f73bad884582f911", + "sha256:90c0890e36f880281c6c698d9bc3de2afbeee2f76512725ec043665c25c67d21", + "sha256:93d39b3fa1ee01f034f098e1c7769a811a21365b4883f05f96c14a2b60c6028b", + "sha256:ac2f286da3494b29b4186ca193c7d4e6a2c1f770c4184c7192c5da142c3dec28", + "sha256:b4c87d83c6f58d1af2675fc99e3dc810f2dbdb844bcefd0c1b7573632661f6fc", + "sha256:b7b337cd0586f7836601623cbd30a443df9528ef23965860d11c753ceeb009f2", + "sha256:bb1bf1ee575c665b8bbc3813ab906e091a645a24ccc210be7932154b8260eca1", + "sha256:cb60fd3c2c1acd684752eb9680e89ae223e9801a9b0e0dc7b28adabe45a2e380", + "sha256:d2b22a25a94109cc983443116da8d9805ced0256eb215c5e6bc6dcbabefeab96", + "sha256:d47bee7557a8b99c8681b6882657a515a4199778d6d5e24e924d2aafcef55b0a", + "sha256:dba55723bf9b835e358f48c98a814b41692c393eb11f51e02ece0625c756b797", + "sha256:ee48422349cc500273beea7607e33c2237909f58468ae1d6cccfc4aecd158565", + "sha256:f34cc46553359293854e38bdae2ab1be59543aad78a6317e7746d30e311110c3", + "sha256:fa464aa4d81df818375239e481887b656e261377d5b6b9a4692466f5f3261edc", + "sha256:fbb9faaf51d01aa2c147ef52524d9326744c852116d8005b9041809a71838878" + ], + "version": "==1.17.0" }, "openai": { "hashes": [ @@ -1264,14 +1160,6 @@ "markers": "python_version >= '3.8'", "version": "==2.0.3" }, - "pathos": { - "hashes": [ - "sha256:4f2a42bc1e10ccf0fe71961e7145fc1437018b6b21bd93b2446abc3983e49a7a", - "sha256:d669275e6eb4b3fbcd2846d7a6d1bba315fe23add0c614445ba1408d8b38bafe" - ], - "markers": "python_version >= '3.8'", - "version": "==0.3.2" - }, "phonenumbers": { "hashes": [ "sha256:4ae2d2e253a4752a269ae1147822b9aa500f14b2506a91f884e68b136901f128", @@ -1354,30 +1242,6 @@ "markers": "python_version >= '3.8'", "version": "==10.2.0" }, - "platformdirs": { - "hashes": [ - "sha256:11c8f37bcca40db96d8144522d925583bdb7a31f7b0e37e3ed4318400a8e2380", - "sha256:906d548203468492d432bcb294d4bc2fff751bf84971fbb2c10918cc206ee420" - ], - "markers": "python_version >= '3.8'", - "version": "==4.1.0" - }, - "pox": { - "hashes": [ - "sha256:16e6eca84f1bec3828210b06b052adf04cf2ab20c22fd6fbef5f78320c9a6fed", - "sha256:651b8ae8a7b341b7bfd267f67f63106daeb9805f1ac11f323d5280d2da93fdb6" - ], - "markers": "python_version >= '3.8'", - "version": "==0.3.4" - }, - "ppft": { - "hashes": [ - "sha256:76a429a7d7b74c4d743f6dba8351e58d62b6432ed65df9fe204790160dab996d", - "sha256:de2dd4b1b080923dd9627fbdea52649fd741c752fce4f3cf37e26f785df23d9b" - ], - "markers": "python_version >= '3.8'", - "version": "==1.7.6.8" - }, "protobuf": { "hashes": [ "sha256:10894a2885b7175d3984f2be8d9850712c57d5e7587a2410720af8be56cdaf62", @@ -1446,124 +1310,96 @@ }, "pydantic": { "hashes": [ - "sha256:94f336138093a5d7f426aac732dcfe7ab4eb4da243c88f891d65deb4a2556ee7", - "sha256:bc3ddf669d234f4220e6e1c4d96b061abe0998185a8d7855c0126782b7abc8c1" + "sha256:1440966574e1b5b99cf75a13bec7b20e3512e8a61b894ae252f56275e2c465ae", + "sha256:ae887bd94eb404b09d86e4d12f93893bdca79d766e738528c6fa1c849f3c6bcf" ], - "index": "pypi", - "markers": "python_version >= '3.7'", - "version": "==2.4.2" + "markers": "python_version >= '3.8'", + "version": "==2.6.0" }, "pydantic-core": { "hashes": [ - "sha256:042462d8d6ba707fd3ce9649e7bf268633a41018d6a998fb5fbacb7e928a183e", - "sha256:0523aeb76e03f753b58be33b26540880bac5aa54422e4462404c432230543f33", - 
"sha256:05560ab976012bf40f25d5225a58bfa649bb897b87192a36c6fef1ab132540d7", - "sha256:0675ba5d22de54d07bccde38997e780044dcfa9a71aac9fd7d4d7a1d2e3e65f7", - "sha256:073d4a470b195d2b2245d0343569aac7e979d3a0dcce6c7d2af6d8a920ad0bea", - "sha256:07ec6d7d929ae9c68f716195ce15e745b3e8fa122fc67698ac6498d802ed0fa4", - "sha256:0880e239827b4b5b3e2ce05e6b766a7414e5f5aedc4523be6b68cfbc7f61c5d0", - "sha256:0c27f38dc4fbf07b358b2bc90edf35e82d1703e22ff2efa4af4ad5de1b3833e7", - "sha256:0d8a8adef23d86d8eceed3e32e9cca8879c7481c183f84ed1a8edc7df073af94", - "sha256:0e2a35baa428181cb2270a15864ec6286822d3576f2ed0f4cd7f0c1708472aff", - "sha256:0f8682dbdd2f67f8e1edddcbffcc29f60a6182b4901c367fc8c1c40d30bb0a82", - "sha256:0fa467fd300a6f046bdb248d40cd015b21b7576c168a6bb20aa22e595c8ffcdd", - "sha256:128552af70a64660f21cb0eb4876cbdadf1a1f9d5de820fed6421fa8de07c893", - "sha256:1396e81b83516b9d5c9e26a924fa69164156c148c717131f54f586485ac3c15e", - "sha256:149b8a07712f45b332faee1a2258d8ef1fb4a36f88c0c17cb687f205c5dc6e7d", - "sha256:14ac492c686defc8e6133e3a2d9eaf5261b3df26b8ae97450c1647286750b901", - "sha256:14cfbb00959259e15d684505263d5a21732b31248a5dd4941f73a3be233865b9", - "sha256:14e09ff0b8fe6e46b93d36a878f6e4a3a98ba5303c76bb8e716f4878a3bee92c", - "sha256:154ea7c52e32dce13065dbb20a4a6f0cc012b4f667ac90d648d36b12007fa9f7", - "sha256:15d6bca84ffc966cc9976b09a18cf9543ed4d4ecbd97e7086f9ce9327ea48891", - "sha256:1d40f55222b233e98e3921df7811c27567f0e1a4411b93d4c5c0f4ce131bc42f", - "sha256:25bd966103890ccfa028841a8f30cebcf5875eeac8c4bde4fe221364c92f0c9a", - "sha256:2cf5bb4dd67f20f3bbc1209ef572a259027c49e5ff694fa56bed62959b41e1f9", - "sha256:2e0e2959ef5d5b8dc9ef21e1a305a21a36e254e6a34432d00c72a92fdc5ecda5", - "sha256:320f14bd4542a04ab23747ff2c8a778bde727158b606e2661349557f0770711e", - "sha256:3625578b6010c65964d177626fde80cf60d7f2e297d56b925cb5cdeda6e9925a", - "sha256:39215d809470f4c8d1881758575b2abfb80174a9e8daf8f33b1d4379357e417c", - "sha256:3f0ac9fb8608dbc6eaf17956bf623c9119b4db7dbb511650910a82e261e6600f", - "sha256:417243bf599ba1f1fef2bb8c543ceb918676954734e2dcb82bf162ae9d7bd514", - "sha256:420a692b547736a8d8703c39ea935ab5d8f0d2573f8f123b0a294e49a73f214b", - "sha256:443fed67d33aa85357464f297e3d26e570267d1af6fef1c21ca50921d2976302", - "sha256:48525933fea744a3e7464c19bfede85df4aba79ce90c60b94d8b6e1eddd67096", - "sha256:485a91abe3a07c3a8d1e082ba29254eea3e2bb13cbbd4351ea4e5a21912cc9b0", - "sha256:4a5be350f922430997f240d25f8219f93b0c81e15f7b30b868b2fddfc2d05f27", - "sha256:4d966c47f9dd73c2d32a809d2be529112d509321c5310ebf54076812e6ecd884", - "sha256:524ff0ca3baea164d6d93a32c58ac79eca9f6cf713586fdc0adb66a8cdeab96a", - "sha256:53df009d1e1ba40f696f8995683e067e3967101d4bb4ea6f667931b7d4a01357", - "sha256:5994985da903d0b8a08e4935c46ed8daf5be1cf217489e673910951dc533d430", - "sha256:5cabb9710f09d5d2e9e2748c3e3e20d991a4c5f96ed8f1132518f54ab2967221", - "sha256:5fdb39f67c779b183b0c853cd6b45f7db84b84e0571b3ef1c89cdb1dfc367325", - "sha256:600d04a7b342363058b9190d4e929a8e2e715c5682a70cc37d5ded1e0dd370b4", - "sha256:631cb7415225954fdcc2a024119101946793e5923f6c4d73a5914d27eb3d3a05", - "sha256:63974d168b6233b4ed6a0046296803cb13c56637a7b8106564ab575926572a55", - "sha256:64322bfa13e44c6c30c518729ef08fda6026b96d5c0be724b3c4ae4da939f875", - "sha256:655f8f4c8d6a5963c9a0687793da37b9b681d9ad06f29438a3b2326d4e6b7970", - "sha256:6835451b57c1b467b95ffb03a38bb75b52fb4dc2762bb1d9dbed8de31ea7d0fc", - "sha256:6db2eb9654a85ada248afa5a6db5ff1cf0f7b16043a6b070adc4a5be68c716d6", - "sha256:7c4d1894fe112b0864c1fa75dffa045720a194b227bed12f4be7f6045b25209f", - 
"sha256:7eb037106f5c6b3b0b864ad226b0b7ab58157124161d48e4b30c4a43fef8bc4b", - "sha256:8282bab177a9a3081fd3d0a0175a07a1e2bfb7fcbbd949519ea0980f8a07144d", - "sha256:82f55187a5bebae7d81d35b1e9aaea5e169d44819789837cdd4720d768c55d15", - "sha256:8572cadbf4cfa95fb4187775b5ade2eaa93511f07947b38f4cd67cf10783b118", - "sha256:8cdbbd92154db2fec4ec973d45c565e767ddc20aa6dbaf50142676484cbff8ee", - "sha256:8f6e6aed5818c264412ac0598b581a002a9f050cb2637a84979859e70197aa9e", - "sha256:92f675fefa977625105708492850bcbc1182bfc3e997f8eecb866d1927c98ae6", - "sha256:962ed72424bf1f72334e2f1e61b68f16c0e596f024ca7ac5daf229f7c26e4208", - "sha256:9badf8d45171d92387410b04639d73811b785b5161ecadabf056ea14d62d4ede", - "sha256:9c120c9ce3b163b985a3b966bb701114beb1da4b0468b9b236fc754783d85aa3", - "sha256:9f6f3e2598604956480f6c8aa24a3384dbf6509fe995d97f6ca6103bb8c2534e", - "sha256:a1254357f7e4c82e77c348dabf2d55f1d14d19d91ff025004775e70a6ef40ada", - "sha256:a1392e0638af203cee360495fd2cfdd6054711f2db5175b6e9c3c461b76f5175", - "sha256:a1c311fd06ab3b10805abb72109f01a134019739bd3286b8ae1bc2fc4e50c07a", - "sha256:a5cb87bdc2e5f620693148b5f8f842d293cae46c5f15a1b1bf7ceeed324a740c", - "sha256:a7a7902bf75779bc12ccfc508bfb7a4c47063f748ea3de87135d433a4cca7a2f", - "sha256:aad7bd686363d1ce4ee930ad39f14e1673248373f4a9d74d2b9554f06199fb58", - "sha256:aafdb89fdeb5fe165043896817eccd6434aee124d5ee9b354f92cd574ba5e78f", - "sha256:ae8a8843b11dc0b03b57b52793e391f0122e740de3df1474814c700d2622950a", - "sha256:b00bc4619f60c853556b35f83731bd817f989cba3e97dc792bb8c97941b8053a", - "sha256:b1f22a9ab44de5f082216270552aa54259db20189e68fc12484873d926426921", - "sha256:b3c01c2fb081fced3bbb3da78510693dc7121bb893a1f0f5f4b48013201f362e", - "sha256:b3dcd587b69bbf54fc04ca157c2323b8911033e827fffaecf0cafa5a892a0904", - "sha256:b4a6db486ac8e99ae696e09efc8b2b9fea67b63c8f88ba7a1a16c24a057a0776", - "sha256:bec7dd208a4182e99c5b6c501ce0b1f49de2802448d4056091f8e630b28e9a52", - "sha256:c0877239307b7e69d025b73774e88e86ce82f6ba6adf98f41069d5b0b78bd1bf", - "sha256:caa48fc31fc7243e50188197b5f0c4228956f97b954f76da157aae7f67269ae8", - "sha256:cfe1090245c078720d250d19cb05d67e21a9cd7c257698ef139bc41cf6c27b4f", - "sha256:d43002441932f9a9ea5d6f9efaa2e21458221a3a4b417a14027a1d530201ef1b", - "sha256:d64728ee14e667ba27c66314b7d880b8eeb050e58ffc5fec3b7a109f8cddbd63", - "sha256:d6495008733c7521a89422d7a68efa0a0122c99a5861f06020ef5b1f51f9ba7c", - "sha256:d8f1ebca515a03e5654f88411420fea6380fc841d1bea08effb28184e3d4899f", - "sha256:d99277877daf2efe074eae6338453a4ed54a2d93fb4678ddfe1209a0c93a2468", - "sha256:da01bec0a26befab4898ed83b362993c844b9a607a86add78604186297eb047e", - "sha256:db9a28c063c7c00844ae42a80203eb6d2d6bbb97070cfa00194dff40e6f545ab", - "sha256:dda81e5ec82485155a19d9624cfcca9be88a405e2857354e5b089c2a982144b2", - "sha256:e357571bb0efd65fd55f18db0a2fb0ed89d0bb1d41d906b138f088933ae618bb", - "sha256:e544246b859f17373bed915182ab841b80849ed9cf23f1f07b73b7c58baee5fb", - "sha256:e562617a45b5a9da5be4abe72b971d4f00bf8555eb29bb91ec2ef2be348cd132", - "sha256:e570ffeb2170e116a5b17e83f19911020ac79d19c96f320cbfa1fa96b470185b", - "sha256:e6f31a17acede6a8cd1ae2d123ce04d8cca74056c9d456075f4f6f85de055607", - "sha256:e9121b4009339b0f751955baf4543a0bfd6bc3f8188f8056b1a25a2d45099934", - "sha256:ebedb45b9feb7258fac0a268a3f6bec0a2ea4d9558f3d6f813f02ff3a6dc6698", - "sha256:ecaac27da855b8d73f92123e5f03612b04c5632fd0a476e469dfc47cd37d6b2e", - "sha256:ecdbde46235f3d560b18be0cb706c8e8ad1b965e5c13bbba7450c86064e96561", - "sha256:ed550ed05540c03f0e69e6d74ad58d026de61b9eaebebbaaf8873e585cbb18de", - 
"sha256:eeb3d3d6b399ffe55f9a04e09e635554012f1980696d6b0aca3e6cf42a17a03b", - "sha256:ef337945bbd76cce390d1b2496ccf9f90b1c1242a3a7bc242ca4a9fc5993427a", - "sha256:f1365e032a477c1430cfe0cf2856679529a2331426f8081172c4a74186f1d595", - "sha256:f23b55eb5464468f9e0e9a9935ce3ed2a870608d5f534025cd5536bca25b1402", - "sha256:f2e9072d71c1f6cfc79a36d4484c82823c560e6f5599c43c1ca6b5cdbd54f881", - "sha256:f323306d0556351735b54acbf82904fe30a27b6a7147153cbe6e19aaaa2aa429", - "sha256:f36a3489d9e28fe4b67be9992a23029c3cec0babc3bd9afb39f49844a8c721c5", - "sha256:f64f82cc3443149292b32387086d02a6c7fb39b8781563e0ca7b8d7d9cf72bd7", - "sha256:f6defd966ca3b187ec6c366604e9296f585021d922e666b99c47e78738b5666c", - "sha256:f7c2b8eb9fc872e68b46eeaf835e86bccc3a58ba57d0eedc109cbb14177be531", - "sha256:fa7db7558607afeccb33c0e4bf1c9a9a835e26599e76af6fe2fcea45904083a6", - "sha256:fcb83175cc4936a5425dde3356f079ae03c0802bbdf8ff82c035f8a54b333521" - ], - "markers": "python_version >= '3.7'", - "version": "==2.10.1" + "sha256:06f0d5a1d9e1b7932477c172cc720b3b23c18762ed7a8efa8398298a59d177c7", + "sha256:07982b82d121ed3fc1c51faf6e8f57ff09b1325d2efccaa257dd8c0dd937acca", + "sha256:0f478ec204772a5c8218e30eb813ca43e34005dff2eafa03931b3d8caef87d51", + "sha256:102569d371fadc40d8f8598a59379c37ec60164315884467052830b28cc4e9da", + "sha256:10dca874e35bb60ce4f9f6665bfbfad050dd7573596608aeb9e098621ac331dc", + "sha256:150ba5c86f502c040b822777e2e519b5625b47813bd05f9273a8ed169c97d9ae", + "sha256:1661c668c1bb67b7cec96914329d9ab66755911d093bb9063c4c8914188af6d4", + "sha256:1a2fe7b00a49b51047334d84aafd7e39f80b7675cad0083678c58983662da89b", + "sha256:1ae8048cba95f382dba56766525abca438328455e35c283bb202964f41a780b0", + "sha256:20f724a023042588d0f4396bbbcf4cffd0ddd0ad3ed4f0d8e6d4ac4264bae81e", + "sha256:2133b0e412a47868a358713287ff9f9a328879da547dc88be67481cdac529118", + "sha256:21e3298486c4ea4e4d5cc6fb69e06fb02a4e22089304308817035ac006a7f506", + "sha256:21ebaa4bf6386a3b22eec518da7d679c8363fb7fb70cf6972161e5542f470798", + "sha256:23632132f1fd608034f1a56cc3e484be00854db845b3a4a508834be5a6435a6f", + "sha256:2d5bea8012df5bb6dda1e67d0563ac50b7f64a5d5858348b5c8cb5043811c19d", + "sha256:300616102fb71241ff477a2cbbc847321dbec49428434a2f17f37528721c4948", + "sha256:30a8259569fbeec49cfac7fda3ec8123486ef1b729225222f0d41d5f840b476f", + "sha256:399166f24c33a0c5759ecc4801f040dbc87d412c1a6d6292b2349b4c505effc9", + "sha256:3fac641bbfa43d5a1bed99d28aa1fded1984d31c670a95aac1bf1d36ac6ce137", + "sha256:42c29d54ed4501a30cd71015bf982fa95e4a60117b44e1a200290ce687d3e640", + "sha256:462d599299c5971f03c676e2b63aa80fec5ebc572d89ce766cd11ca8bcb56f3f", + "sha256:4eebbd049008eb800f519578e944b8dc8e0f7d59a5abb5924cc2d4ed3a1834ff", + "sha256:502c062a18d84452858f8aea1e520e12a4d5228fc3621ea5061409d666ea1706", + "sha256:5317c04349472e683803da262c781c42c5628a9be73f4750ac7d13040efb5d2d", + "sha256:5511f962dd1b9b553e9534c3b9c6a4b0c9ded3d8c2be96e61d56f933feef9e1f", + "sha256:561be4e3e952c2f9056fba5267b99be4ec2afadc27261505d4992c50b33c513c", + "sha256:601d3e42452cd4f2891c13fa8c70366d71851c1593ed42f57bf37f40f7dca3c8", + "sha256:644904600c15816a1f9a1bafa6aab0d21db2788abcdf4e2a77951280473f33e1", + "sha256:653a5dfd00f601a0ed6654a8b877b18d65ac32c9d9997456e0ab240807be6cf7", + "sha256:694a5e9f1f2c124a17ff2d0be613fd53ba0c26de588eb4bdab8bca855e550d95", + "sha256:71b4a48a7427f14679f0015b13c712863d28bb1ab700bd11776a5368135c7d60", + "sha256:72bf9308a82b75039b8c8edd2be2924c352eda5da14a920551a8b65d5ee89253", + "sha256:735dceec50fa907a3c314b84ed609dec54b76a814aa14eb90da31d1d36873a5e", + 
"sha256:73802194f10c394c2bedce7a135ba1d8ba6cff23adf4217612bfc5cf060de34c", + "sha256:780daad9e35b18d10d7219d24bfb30148ca2afc309928e1d4d53de86822593dc", + "sha256:8655f55fe68c4685673265a650ef71beb2d31871c049c8b80262026f23605ee3", + "sha256:877045a7969ace04d59516d5d6a7dee13106822f99a5d8df5e6822941f7bedc8", + "sha256:87bce04f09f0552b66fca0c4e10da78d17cb0e71c205864bab4e9595122cb9d9", + "sha256:8d4dfc66abea3ec6d9f83e837a8f8a7d9d3a76d25c9911735c76d6745950e62c", + "sha256:8ec364e280db4235389b5e1e6ee924723c693cbc98e9d28dc1767041ff9bc388", + "sha256:8fa00fa24ffd8c31fac081bf7be7eb495be6d248db127f8776575a746fa55c95", + "sha256:920c4897e55e2881db6a6da151198e5001552c3777cd42b8a4c2f72eedc2ee91", + "sha256:920f4633bee43d7a2818e1a1a788906df5a17b7ab6fe411220ed92b42940f818", + "sha256:9795f56aa6b2296f05ac79d8a424e94056730c0b860a62b0fdcfe6340b658cc8", + "sha256:98f0edee7ee9cc7f9221af2e1b95bd02810e1c7a6d115cfd82698803d385b28f", + "sha256:99c095457eea8550c9fa9a7a992e842aeae1429dab6b6b378710f62bfb70b394", + "sha256:99d3a433ef5dc3021c9534a58a3686c88363c591974c16c54a01af7efd741f13", + "sha256:99f9a50b56713a598d33bc23a9912224fc5d7f9f292444e6664236ae471ddf17", + "sha256:9c46e556ee266ed3fb7b7a882b53df3c76b45e872fdab8d9cf49ae5e91147fd7", + "sha256:9f5d37ff01edcbace53a402e80793640c25798fb7208f105d87a25e6fcc9ea06", + "sha256:a0b4cfe408cd84c53bab7d83e4209458de676a6ec5e9c623ae914ce1cb79b96f", + "sha256:a497be217818c318d93f07e14502ef93d44e6a20c72b04c530611e45e54c2196", + "sha256:ac89ccc39cd1d556cc72d6752f252dc869dde41c7c936e86beac5eb555041b66", + "sha256:adf28099d061a25fbcc6531febb7a091e027605385de9fe14dd6a97319d614cf", + "sha256:afa01d25769af33a8dac0d905d5c7bb2d73c7c3d5161b2dd6f8b5b5eea6a3c4c", + "sha256:b1fc07896fc1851558f532dffc8987e526b682ec73140886c831d773cef44b76", + "sha256:b49c604ace7a7aa8af31196abbf8f2193be605db6739ed905ecaf62af31ccae0", + "sha256:b9f3e0bffad6e238f7acc20c393c1ed8fab4371e3b3bc311020dfa6020d99212", + "sha256:ba07646f35e4e49376c9831130039d1b478fbfa1215ae62ad62d2ee63cf9c18f", + "sha256:bd88f40f2294440d3f3c6308e50d96a0d3d0973d6f1a5732875d10f569acef49", + "sha256:c0be58529d43d38ae849a91932391eb93275a06b93b79a8ab828b012e916a206", + "sha256:c45f62e4107ebd05166717ac58f6feb44471ed450d07fecd90e5f69d9bf03c48", + "sha256:c56da23034fe66221f2208c813d8aa509eea34d97328ce2add56e219c3a9f41c", + "sha256:c94b5537bf6ce66e4d7830c6993152940a188600f6ae044435287753044a8fe2", + "sha256:cebf8d56fee3b08ad40d332a807ecccd4153d3f1ba8231e111d9759f02edfd05", + "sha256:d0bf6f93a55d3fa7a079d811b29100b019784e2ee6bc06b0bb839538272a5610", + "sha256:d195add190abccefc70ad0f9a0141ad7da53e16183048380e688b466702195dd", + "sha256:d25ef0c33f22649b7a088035fd65ac1ce6464fa2876578df1adad9472f918a76", + "sha256:d6cbdf12ef967a6aa401cf5cdf47850559e59eedad10e781471c960583f25aa1", + "sha256:d8c032ccee90b37b44e05948b449a2d6baed7e614df3d3f47fe432c952c21b60", + "sha256:daff04257b49ab7f4b3f73f98283d3dbb1a65bf3500d55c7beac3c66c310fe34", + "sha256:e83ebbf020be727d6e0991c1b192a5c2e7113eb66e3def0cd0c62f9f266247e4", + "sha256:ed3025a8a7e5a59817b7494686d449ebfbe301f3e757b852c8d0d1961d6be864", + "sha256:f1936ef138bed2165dd8573aa65e3095ef7c2b6247faccd0e15186aabdda7f66", + "sha256:f5247a3d74355f8b1d780d0f3b32a23dd9f6d3ff43ef2037c6dcd249f35ecf4c", + "sha256:fa496cd45cda0165d597e9d6f01e36c33c9508f75cf03c0a650018c5048f578e", + "sha256:fb4363e6c9fc87365c2bc777a1f585a22f2f56642501885ffc7942138499bf54", + "sha256:fb4370b15111905bf8b5ba2129b926af9470f014cb0493a67d23e9d7a48348e8", + "sha256:fbec2af0ebafa57eb82c18c304b37c86a8abddf7022955d1742b3d5471a6339e" + ], + 
"markers": "python_version >= '3.8'", + "version": "==2.16.1" }, "pylanguagetool": { "hashes": [ @@ -1574,95 +1410,6 @@ "markers": "python_version >= '3.8'", "version": "==0.10.0" }, - "pymongo": { - "hashes": [ - "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330", - "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651", - "sha256:05c30fd35cc97f14f354916b45feea535d59060ef867446b5c3c7f9b609dd5dc", - "sha256:0634994b026336195778e5693583c060418d4ab453eff21530422690a97e1ee8", - "sha256:09c7de516b08c57647176b9fc21d929d628e35bcebc7422220c89ae40b62126a", - "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161", - "sha256:10a379fb60f1b2406ae57b8899bacfe20567918c8e9d2d545e1b93628fcf2050", - "sha256:128b1485753106c54af481789cdfea12b90a228afca0b11fb3828309a907e10e", - "sha256:1394c4737b325166a65ae7c145af1ebdb9fb153ebedd37cf91d676313e4a67b8", - "sha256:1c63e3a2e8fb815c4b1f738c284a4579897e37c3cfd95fdb199229a1ccfb638a", - "sha256:1e4ed21029d80c4f62605ab16398fe1ce093fff4b5f22d114055e7d9fbc4adb0", - "sha256:1ec71ac633b126c0775ed4604ca8f56c3540f5c21a1220639f299e7a544b55f9", - "sha256:21812453354b151200034750cd30b0140e82ec2a01fd4357390f67714a1bfbde", - "sha256:256c503a75bd71cf7fb9ebf889e7e222d49c6036a48aad5a619f98a0adf0e0d7", - "sha256:2703a9f8f5767986b4f51c259ff452cc837c5a83c8ed5f5361f6e49933743b2f", - "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a", - "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf", - "sha256:2973f113e079fb98515722cd728e1820282721ec9fd52830e4b73cabdbf1eb28", - "sha256:2ca0ba501898b2ec31e6c3acf90c31910944f01d454ad8e489213a156ccf1bda", - "sha256:2d2be5c9c3488fa8a70f83ed925940f488eac2837a996708d98a0e54a861f212", - "sha256:2f8c04277d879146eacda920476e93d520eff8bec6c022ac108cfa6280d84348", - "sha256:325701ae7b56daa5b0692305b7cb505ca50f80a1288abb32ff420a8a209b01ca", - "sha256:3729b8db02063da50eeb3db88a27670d85953afb9a7f14c213ac9e3dca93034b", - "sha256:3919708594b86d0f5cdc713eb6fccd3f9b9532af09ea7a5d843c933825ef56c4", - "sha256:39a1cd5d383b37285641d5a7a86be85274466ae336a61b51117155936529f9b3", - "sha256:3ec6c20385c5a58e16b1ea60c5e4993ea060540671d7d12664f385f2fb32fe79", - "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4", - "sha256:49f2af6cf82509b15093ce3569229e0d53c90ad8ae2eef940652d4cf1f81e045", - "sha256:4a0269811661ba93c472c8a60ea82640e838c2eb148d252720a09b5123f2c2fe", - "sha256:518c90bdd6e842c446d01a766b9136fec5ec6cc94f3b8c3f8b4a332786ee6b64", - "sha256:5717a308a703dda2886a5796a07489c698b442f5e409cf7dc2ac93de8d61d764", - "sha256:5802acc012bbb4bce4dff92973dff76482f30ef35dd4cb8ab5b0e06aa8f08c80", - "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6", - "sha256:6695d7136a435c1305b261a9ddb9b3ecec9863e05aab3935b96038145fd3a977", - "sha256:680fa0fc719e1a3dcb81130858368f51d83667d431924d0bcf249644bce8f303", - "sha256:6b18276f14b4b6d92e707ab6db19b938e112bd2f1dc3f9f1a628df58e4fd3f0d", - "sha256:6bafea6061d63059d8bc2ffc545e2f049221c8a4457d236c5cd6a66678673eab", - "sha256:6d6a1b1361f118e7fefa17ae3114e77f10ee1b228b20d50c47c9f351346180c8", - "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34", - "sha256:79f41576b3022c2fe9780ae3e44202b2438128a25284a8ddfa038f0785d87019", - "sha256:7b0e6361754ac596cd16bfc6ed49f69ffcd9b60b7bc4bcd3ea65c6a83475e4ff", - "sha256:7e3b0127b260d4abae7b62203c4c7ef0874c901b55155692353db19de4b18bc4", - "sha256:7fc2bb8a74dcfcdd32f89528e38dcbf70a3a6594963d60dc9595e3b35b66e414", - 
"sha256:806e094e9e85d8badc978af8c95b69c556077f11844655cb8cd2d1758769e521", - "sha256:81dd1308bd5630d2bb5980f00aa163b986b133f1e9ed66c66ce2a5bc3572e891", - "sha256:82e620842e12e8cb4050d2643a81c8149361cd82c0a920fa5a15dc4ca8a4000f", - "sha256:85f2cdc400ee87f5952ebf2a117488f2525a3fb2e23863a8efe3e4ee9e54e4d1", - "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8", - "sha256:8adf014f2779992eba3b513e060d06f075f0ab2fb3ad956f413a102312f65cdf", - "sha256:9b0f98481ad5dc4cb430a60bbb8869f05505283b9ae1c62bdb65eb5e020ee8e3", - "sha256:9bea9138b0fc6e2218147e9c6ce1ff76ff8e29dc00bb1b64842bd1ca107aee9f", - "sha256:a09bfb51953930e7e838972ddf646c5d5f984992a66d79da6ba7f6a8d8a890cd", - "sha256:a0be99b599da95b7a90a918dd927b20c434bea5e1c9b3efc6a3c6cd67c23f813", - "sha256:a49aca4d961823b2846b739380c847e8964ff7ae0f0a683992b9d926054f0d6d", - "sha256:a4dc1319d0c162919ee7f4ee6face076becae2abbd351cc14f1fe70af5fb20d9", - "sha256:a8273e1abbcff1d7d29cbbb1ea7e57d38be72f1af3c597c854168508b91516c2", - "sha256:a8f7f9feecae53fa18d6a3ea7c75f9e9a1d4d20e5c3f9ce3fba83f07bcc4eee2", - "sha256:ad4f66fbb893b55f96f03020e67dcab49ffde0177c6565ccf9dec4fdf974eb61", - "sha256:af425f323fce1b07755edd783581e7283557296946212f5b1a934441718e7528", - "sha256:b14dd73f595199f4275bed4fb509277470d9b9059310537e3b3daba12b30c157", - "sha256:b4ad70d7cac4ca0c7b31444a0148bd3af01a2662fa12b1ad6f57cd4a04e21766", - "sha256:b80a4ee19b3442c57c38afa978adca546521a8822d663310b63ae2a7d7b13f3a", - "sha256:ba51129fcc510824b6ca6e2ce1c27e3e4d048b6e35d3ae6f7e517bed1b8b25ce", - "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f", - "sha256:cc94f9fea17a5af8cf1a343597711a26b0117c0b812550d99934acb89d526ed2", - "sha256:ccd785fafa1c931deff6a7116e9a0d402d59fabe51644b0d0c268295ff847b25", - "sha256:d16a534da0e39785687b7295e2fcf9a339f4a20689024983d11afaa4657f8507", - "sha256:d3077a31633beef77d057c6523f5de7271ddef7bde5e019285b00c0cc9cac1e3", - "sha256:d603edea1ff7408638b2504905c032193b7dcee7af269802dbb35bc8c3310ed5", - "sha256:db082f728160369d9a6ed2e722438291558fc15ce06d0a7d696a8dad735c236b", - "sha256:ddef295aaf80cefb0c1606f1995899efcb17edc6b327eb6589e234e614b87756", - "sha256:e16ade71c93f6814d095d25cd6d28a90d63511ea396bd96e9ffcb886b278baaa", - "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67", - "sha256:e3db7d833a7c38c317dc95b54e27f1d27012e031b45a7c24e360b53197d5f6e7", - "sha256:e5e193f89f4f8c1fe273f9a6e6df915092c9f2af6db2d1afb8bd53855025c11f", - "sha256:eb438a8bf6b695bf50d57e6a059ff09652a07968b2041178b3744ea785fcef9b", - "sha256:ebf02c32afa6b67e5861a27183dd98ed88419a94a2ab843cc145fb0bafcc5b28", - "sha256:ecd9e1fa97aa11bf67472220285775fa15e896da108f425e55d23d7540a712ce", - "sha256:ef67fedd863ffffd4adfd46d9d992b0f929c7f61a8307366d664d93517f2c78e", - "sha256:f28ae33dc5a0b9cee06e95fd420e42155d83271ab75964baf747ce959cac5f52", - "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2", - "sha256:fe03bf25fae4b95d8afe40004a321df644400fdcba4c8e5e1a19c1085b740888" - ], - "index": "pypi", - "markers": "python_version >= '3.7'", - "version": "==4.6.0" - }, "pyproj": { "hashes": [ "sha256:18faa54a3ca475bfe6255156f2f2874e9a1c8917b0004eee9f664b86ccc513d3", @@ -1710,7 +1457,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==2.8.2" 
}, "python-dotenv": { @@ -1729,71 +1476,6 @@ ], "version": "==2023.4" }, - "pyyaml": { - "hashes": [ - "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", - "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", - "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", - "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", - "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", - "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", - "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", - "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", - "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", - "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", - "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", - "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", - "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", - "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", - "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", - "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", - "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", - "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", - "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", - "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", - "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", - "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", - "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", - "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", - "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", - "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", - "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", - "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", - "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", - "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef", - "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", - "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", - "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", - "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", - "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", - "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", - "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", - "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", - "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", - "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", - "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", - "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", - "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", - "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", - 
"sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", - "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", - "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", - "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", - "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", - "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", - "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" - ], - "markers": "python_version >= '3.6'", - "version": "==6.0.1" - }, - "referencing": { - "hashes": [ - "sha256:39240f2ecc770258f28b642dd47fd74bc8b02484de54e1882b74b35ebd779bd5", - "sha256:c775fedf74bc0f9189c2a3be1c12fd03e8c23f4d371dce795df44e06c5b412f7" - ], - "markers": "python_version >= '3.8'", - "version": "==0.33.0" - }, "regex": { "hashes": [ "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5", @@ -1911,111 +1593,6 @@ "markers": "python_version >= '3.7'", "version": "==2.31.0" }, - "rpds-py": { - "hashes": [ - "sha256:01f58a7306b64e0a4fe042047dd2b7d411ee82e54240284bab63e325762c1147", - "sha256:0210b2668f24c078307260bf88bdac9d6f1093635df5123789bfee4d8d7fc8e7", - "sha256:02866e060219514940342a1f84303a1ef7a1dad0ac311792fbbe19b521b489d2", - "sha256:0387ce69ba06e43df54e43968090f3626e231e4bc9150e4c3246947567695f68", - "sha256:060f412230d5f19fc8c8b75f315931b408d8ebf56aec33ef4168d1b9e54200b1", - "sha256:071bc28c589b86bc6351a339114fb7a029f5cddbaca34103aa573eba7b482382", - "sha256:0bfb09bf41fe7c51413f563373e5f537eaa653d7adc4830399d4e9bdc199959d", - "sha256:10162fe3f5f47c37ebf6d8ff5a2368508fe22007e3077bf25b9c7d803454d921", - "sha256:149c5cd24f729e3567b56e1795f74577aa3126c14c11e457bec1b1c90d212e38", - "sha256:1701fc54460ae2e5efc1dd6350eafd7a760f516df8dbe51d4a1c79d69472fbd4", - "sha256:1957a2ab607f9added64478a6982742eb29f109d89d065fa44e01691a20fc20a", - "sha256:1a746a6d49665058a5896000e8d9d2f1a6acba8a03b389c1e4c06e11e0b7f40d", - "sha256:1bfcad3109c1e5ba3cbe2f421614e70439f72897515a96c462ea657261b96518", - "sha256:1d36b2b59e8cc6e576f8f7b671e32f2ff43153f0ad6d0201250a7c07f25d570e", - "sha256:1db228102ab9d1ff4c64148c96320d0be7044fa28bd865a9ce628ce98da5973d", - "sha256:1dc29db3900cb1bb40353772417800f29c3d078dbc8024fd64655a04ee3c4bdf", - "sha256:1e626b365293a2142a62b9a614e1f8e331b28f3ca57b9f05ebbf4cf2a0f0bdc5", - "sha256:1f3c3461ebb4c4f1bbc70b15d20b565759f97a5aaf13af811fcefc892e9197ba", - "sha256:20de7b7179e2031a04042e85dc463a93a82bc177eeba5ddd13ff746325558aa6", - "sha256:24e4900a6643f87058a27320f81336d527ccfe503984528edde4bb660c8c8d59", - "sha256:2528ff96d09f12e638695f3a2e0c609c7b84c6df7c5ae9bfeb9252b6fa686253", - "sha256:25f071737dae674ca8937a73d0f43f5a52e92c2d178330b4c0bb6ab05586ffa6", - "sha256:270987bc22e7e5a962b1094953ae901395e8c1e1e83ad016c5cfcfff75a15a3f", - "sha256:292f7344a3301802e7c25c53792fae7d1593cb0e50964e7bcdcc5cf533d634e3", - "sha256:2953937f83820376b5979318840f3ee47477d94c17b940fe31d9458d79ae7eea", - "sha256:2a792b2e1d3038daa83fa474d559acfd6dc1e3650ee93b2662ddc17dbff20ad1", - "sha256:2a7b2f2f56a16a6d62e55354dd329d929560442bd92e87397b7a9586a32e3e76", - "sha256:2f4eb548daf4836e3b2c662033bfbfc551db58d30fd8fe660314f86bf8510b93", - "sha256:3664d126d3388a887db44c2e293f87d500c4184ec43d5d14d2d2babdb4c64cad", - "sha256:3677fcca7fb728c86a78660c7fb1b07b69b281964673f486ae72860e13f512ad", - "sha256:380e0df2e9d5d5d339803cfc6d183a5442ad7ab3c63c2a0982e8c824566c5ccc", - "sha256:3ac732390d529d8469b831949c78085b034bff67f584559340008d0f6041a049", - 
"sha256:4128980a14ed805e1b91a7ed551250282a8ddf8201a4e9f8f5b7e6225f54170d", - "sha256:4341bd7579611cf50e7b20bb8c2e23512a3dc79de987a1f411cb458ab670eb90", - "sha256:436474f17733c7dca0fbf096d36ae65277e8645039df12a0fa52445ca494729d", - "sha256:4dc889a9d8a34758d0fcc9ac86adb97bab3fb7f0c4d29794357eb147536483fd", - "sha256:4e21b76075c01d65d0f0f34302b5a7457d95721d5e0667aea65e5bb3ab415c25", - "sha256:516fb8c77805159e97a689e2f1c80655c7658f5af601c34ffdb916605598cda2", - "sha256:5576ee2f3a309d2bb403ec292d5958ce03953b0e57a11d224c1f134feaf8c40f", - "sha256:5a024fa96d541fd7edaa0e9d904601c6445e95a729a2900c5aec6555fe921ed6", - "sha256:5d0e8a6434a3fbf77d11448c9c25b2f25244226cfbec1a5159947cac5b8c5fa4", - "sha256:5e7d63ec01fe7c76c2dbb7e972fece45acbb8836e72682bde138e7e039906e2c", - "sha256:60e820ee1004327609b28db8307acc27f5f2e9a0b185b2064c5f23e815f248f8", - "sha256:637b802f3f069a64436d432117a7e58fab414b4e27a7e81049817ae94de45d8d", - "sha256:65dcf105c1943cba45d19207ef51b8bc46d232a381e94dd38719d52d3980015b", - "sha256:698ea95a60c8b16b58be9d854c9f993c639f5c214cf9ba782eca53a8789d6b19", - "sha256:70fcc6c2906cfa5c6a552ba7ae2ce64b6c32f437d8f3f8eea49925b278a61453", - "sha256:720215373a280f78a1814becb1312d4e4d1077b1202a56d2b0815e95ccb99ce9", - "sha256:7450dbd659fed6dd41d1a7d47ed767e893ba402af8ae664c157c255ec6067fde", - "sha256:7b7d9ca34542099b4e185b3c2a2b2eda2e318a7dbde0b0d83357a6d4421b5296", - "sha256:7fbd70cb8b54fe745301921b0816c08b6d917593429dfc437fd024b5ba713c58", - "sha256:81038ff87a4e04c22e1d81f947c6ac46f122e0c80460b9006e6517c4d842a6ec", - "sha256:810685321f4a304b2b55577c915bece4c4a06dfe38f6e62d9cc1d6ca8ee86b99", - "sha256:82ada4a8ed9e82e443fcef87e22a3eed3654dd3adf6e3b3a0deb70f03e86142a", - "sha256:841320e1841bb53fada91c9725e766bb25009cfd4144e92298db296fb6c894fb", - "sha256:8587fd64c2a91c33cdc39d0cebdaf30e79491cc029a37fcd458ba863f8815383", - "sha256:8ffe53e1d8ef2520ebcf0c9fec15bb721da59e8ef283b6ff3079613b1e30513d", - "sha256:9051e3d2af8f55b42061603e29e744724cb5f65b128a491446cc029b3e2ea896", - "sha256:91e5a8200e65aaac342a791272c564dffcf1281abd635d304d6c4e6b495f29dc", - "sha256:93432e747fb07fa567ad9cc7aaadd6e29710e515aabf939dfbed8046041346c6", - "sha256:938eab7323a736533f015e6069a7d53ef2dcc841e4e533b782c2bfb9fb12d84b", - "sha256:9584f8f52010295a4a417221861df9bea4c72d9632562b6e59b3c7b87a1522b7", - "sha256:9737bdaa0ad33d34c0efc718741abaafce62fadae72c8b251df9b0c823c63b22", - "sha256:99da0a4686ada4ed0f778120a0ea8d066de1a0a92ab0d13ae68492a437db78bf", - "sha256:99f567dae93e10be2daaa896e07513dd4bf9c2ecf0576e0533ac36ba3b1d5394", - "sha256:9bdf1303df671179eaf2cb41e8515a07fc78d9d00f111eadbe3e14262f59c3d0", - "sha256:9f0e4dc0f17dcea4ab9d13ac5c666b6b5337042b4d8f27e01b70fae41dd65c57", - "sha256:a000133a90eea274a6f28adc3084643263b1e7c1a5a66eb0a0a7a36aa757ed74", - "sha256:a3264e3e858de4fc601741498215835ff324ff2482fd4e4af61b46512dd7fc83", - "sha256:a71169d505af63bb4d20d23a8fbd4c6ce272e7bce6cc31f617152aa784436f29", - "sha256:a967dd6afda7715d911c25a6ba1517975acd8d1092b2f326718725461a3d33f9", - "sha256:aa5bfb13f1e89151ade0eb812f7b0d7a4d643406caaad65ce1cbabe0a66d695f", - "sha256:ae35e8e6801c5ab071b992cb2da958eee76340e6926ec693b5ff7d6381441745", - "sha256:b686f25377f9c006acbac63f61614416a6317133ab7fafe5de5f7dc8a06d42eb", - "sha256:b760a56e080a826c2e5af09002c1a037382ed21d03134eb6294812dda268c811", - "sha256:b86b21b348f7e5485fae740d845c65a880f5d1eda1e063bc59bef92d1f7d0c55", - "sha256:b9412abdf0ba70faa6e2ee6c0cc62a8defb772e78860cef419865917d86c7342", - "sha256:bd345a13ce06e94c753dab52f8e71e5252aec1e4f8022d24d56decd31e1b9b23", - 
"sha256:be22ae34d68544df293152b7e50895ba70d2a833ad9566932d750d3625918b82", - "sha256:bf046179d011e6114daf12a534d874958b039342b347348a78b7cdf0dd9d6041", - "sha256:c3d2010656999b63e628a3c694f23020322b4178c450dc478558a2b6ef3cb9bb", - "sha256:c64602e8be701c6cfe42064b71c84ce62ce66ddc6422c15463fd8127db3d8066", - "sha256:d65e6b4f1443048eb7e833c2accb4fa7ee67cc7d54f31b4f0555b474758bee55", - "sha256:d8bbd8e56f3ba25a7d0cf980fc42b34028848a53a0e36c9918550e0280b9d0b6", - "sha256:da1ead63368c04a9bded7904757dfcae01eba0e0f9bc41d3d7f57ebf1c04015a", - "sha256:dbbb95e6fc91ea3102505d111b327004d1c4ce98d56a4a02e82cd451f9f57140", - "sha256:dbc56680ecf585a384fbd93cd42bc82668b77cb525343170a2d86dafaed2a84b", - "sha256:df3b6f45ba4515632c5064e35ca7f31d51d13d1479673185ba8f9fefbbed58b9", - "sha256:dfe07308b311a8293a0d5ef4e61411c5c20f682db6b5e73de6c7c8824272c256", - "sha256:e796051f2070f47230c745d0a77a91088fbee2cc0502e9b796b9c6471983718c", - "sha256:efa767c220d94aa4ac3a6dd3aeb986e9f229eaf5bce92d8b1b3018d06bed3772", - "sha256:f0b8bf5b8db49d8fd40f54772a1dcf262e8be0ad2ab0206b5a2ec109c176c0a4", - "sha256:f175e95a197f6a4059b50757a3dca33b32b61691bdbd22c29e8a8d21d3914cae", - "sha256:f2f3b28b40fddcb6c1f1f6c88c6f3769cd933fa493ceb79da45968a21dccc920", - "sha256:f6c43b6f97209e370124baf2bf40bb1e8edc25311a158867eb1c3a5d449ebc7a", - "sha256:f7f4cb1f173385e8a39c29510dd11a78bf44e360fb75610594973f5ea141028b", - "sha256:fad059a4bd14c45776600d223ec194e77db6c20255578bb5bcdd7c18fd169361", - "sha256:ff1dcb8e8bc2261a088821b2595ef031c91d499a0c1b031c152d43fe0a6ecec8", - "sha256:ffee088ea9b593cc6160518ba9bd319b5475e5f3e578e4552d63818773c6f56a" - ], - "markers": "python_version >= '3.8'", - "version": "==0.17.1" - }, "s3fs": { "hashes": [ "sha256:0d5a99039665f30b2dbee5495de3b299a022d51b3195a9440f5df47c2621b777", @@ -2033,21 +1610,6 @@ "markers": "python_version >= '3.7'", "version": "==0.8.2" }, - "sagemaker": { - "hashes": [ - "sha256:6798d51a32e583c6d29355d947423b53e0d10271ae36bce609c8dd5ddced3e7b" - ], - "index": "pypi", - "markers": "python_version >= '3.8'", - "version": "==2.198.0" - }, - "schema": { - "hashes": [ - "sha256:f06717112c61895cabc4707752b88716e8420a8819d71404501e114f91043197", - "sha256:f3ffdeeada09ec34bf40d7d79996d9f7175db93b7a5065de0faa7f41083c1e6c" - ], - "version": "==0.7.5" - }, "scikit-learn": { "hashes": [ "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107", @@ -2173,17 +1735,9 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==1.16.0" }, - "smdebug-rulesconfig": { - "hashes": [ - "sha256:104da3e6931ecf879dfc687ca4bbb3bee5ea2bc27f4478e9dbb3ee3655f1ae61", - "sha256:7a19e6eb2e6bcfefbc07e4a86ef7a88f32495001a038bf28c7d8e77ab793fcd6" - ], - "markers": "python_version >= '2.7'", - "version": "==1.0.1" - }, "sniffio": { "hashes": [ "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101", @@ -2208,14 +1762,6 @@ "markers": "python_version >= '3.8'", "version": "==1.12" }, - "tblib": { - "hashes": [ - "sha256:059bd77306ea7b419d4f76016aef6d7027cc8a0785579b5aad198803435f882c", - "sha256:289fa7359e580950e7d9743eab36b0691f0310fce64dee7d9c31065b8f723e23" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==1.7.0" - }, "textblob": { "hashes": [ 
"sha256:15546d7f309e96a3f542bee42751c8e5ce4d519d3d274ee79df2318141f0b788", @@ -2520,14 +2066,6 @@ ], "markers": "python_version >= '3.7'", "version": "==1.9.4" - }, - "zipp": { - "hashes": [ - "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31", - "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0" - ], - "markers": "python_version >= '3.8'", - "version": "==3.17.0" } }, "develop": { @@ -3126,11 +2664,11 @@ }, "ipython": { "hashes": [ - "sha256:2f21bd3fc1d51550c89ee3944ae04bbc7bc79e129ea0937da6e6c68bfdbf117a", - "sha256:bc9716aad6f29f36c449e30821c9dd0c1c1a7b59ddcc26931685b87b4c569619" + "sha256:1050a3ab8473488d7eee163796b02e511d0735cf43a04ba2a8348bd0f2eaf8a5", + "sha256:48fbc236fbe0e138b88773fa0437751f14c3645fb483f1d4c5dee58b37e5ce73" ], "markers": "python_version >= '3.10'", - "version": "==8.20.0" + "version": "==8.21.0" }, "isoduration": { "hashes": [ @@ -3170,6 +2708,7 @@ "version": "==2.4" }, "jsonschema": { + "extras": ["format-nongpl"], "hashes": [ "sha256:7996507afae316306f9e2290407761157c6f78002dcf7419acb99822143d1c6f", "sha256:85727c00279f5fa6bedbe6238d2aa6403bedd8b4864ab11207d07df3cc1b2ee5" @@ -3704,11 +3243,11 @@ }, "platformdirs": { "hashes": [ - "sha256:11c8f37bcca40db96d8144522d925583bdb7a31f7b0e37e3ed4318400a8e2380", - "sha256:906d548203468492d432bcb294d4bc2fff751bf84971fbb2c10918cc206ee420" + "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068", + "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768" ], "markers": "python_version >= '3.8'", - "version": "==4.1.0" + "version": "==4.2.0" }, "plotly": { "hashes": [ @@ -3851,7 +3390,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==2.8.2" }, "python-json-logger": { @@ -4177,7 +3716,7 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", "version": "==1.16.0" }, "sniffio": { diff --git a/src/bdc/steps/social_media_api.py b/deprecated/steps/social_media_api.py similarity index 100% rename from src/bdc/steps/social_media_api.py rename to deprecated/steps/social_media_api.py diff --git a/src/bdc/pipeline.py b/src/bdc/pipeline.py index 7764899..22f2294 100644 --- a/src/bdc/pipeline.py +++ b/src/bdc/pipeline.py @@ -52,7 +52,7 @@ def run(self): # cleanup step.finish() - except StepError as e: + except (StepError, Exception) as e: error_occurred = True log.error(f"Step {step.name} failed! 
{e}") finally: diff --git a/src/bdc/steps/__init__.py b/src/bdc/steps/__init__.py index 3150b5e..40a9222 100644 --- a/src/bdc/steps/__init__.py +++ b/src/bdc/steps/__init__.py @@ -11,4 +11,3 @@ from .regionalatlas import RegionalAtlas from .scrape_address import ScrapeAddress from .search_offeneregister import SearchOffeneRegister -from .social_media_api import FacebookGraphAPI diff --git a/src/demo/console_utils.py b/src/demo/console_utils.py index b4911dc..ab9de27 100644 --- a/src/demo/console_utils.py +++ b/src/demo/console_utils.py @@ -87,7 +87,7 @@ def get_multiple_choice(prompt: str, choices: list) -> str: prompt += "".join( f"({index}) : {choice} \n" for index, choice in enumerate(choices) ) - ind = get_int_input(prompt, range(len(choices) + 1)) + ind = get_int_input(prompt, range(len(choices))) return choices[ind] except ValueError: print("Invalid input. Please enter a valid integer.") diff --git a/src/demo/pipeline_configs/config_sprint09_release.json b/src/demo/pipeline_configs/config_sprint09_release.json index a6ff45b..080cd4b 100644 --- a/src/demo/pipeline_configs/config_sprint09_release.json +++ b/src/demo/pipeline_configs/config_sprint09_release.json @@ -10,10 +10,6 @@ "name": "ScrapeAddress", "force_refresh": true }, - { - "name": "FacebookGraphAPI", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/config_template b/src/demo/pipeline_configs/config_template index fb505d2..c48e5e8 100644 --- a/src/demo/pipeline_configs/config_template +++ b/src/demo/pipeline_configs/config_template @@ -11,10 +11,6 @@ "name": "ScrapeAddress", "force_refresh": true }, - { - "name": "FacebookGraphAPI", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/run_all_steps.json b/src/demo/pipeline_configs/run_all_steps.json index 139a8d8..1e442f0 100644 --- a/src/demo/pipeline_configs/run_all_steps.json +++ b/src/demo/pipeline_configs/run_all_steps.json @@ -14,10 +14,6 @@ "name": "ScrapeAddress", "force_refresh": true }, - { - "name": "FacebookGraphAPI", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_utils.py b/src/demo/pipeline_utils.py index c7120c7..23c77b5 100644 --- a/src/demo/pipeline_utils.py +++ b/src/demo/pipeline_utils.py @@ -10,7 +10,6 @@ from bdc.steps import ( AnalyzeEmails, - FacebookGraphAPI, GooglePlaces, GooglePlacesDetailed, GPTReviewSentimentAnalyzer, @@ -28,7 +27,6 @@ STEP_STR_TO_CLASS = { "HashGenerator": HashGenerator, "AnalyzeEmails": AnalyzeEmails, - "FacebookGraphAPI": FacebookGraphAPI, "GooglePlaces": GooglePlaces, "GooglePlacesDetailed": GooglePlacesDetailed, "GPTReviewSentimentAnalyzer": GPTReviewSentimentAnalyzer, @@ -44,7 +42,6 @@ _additional_pipeline_steps = [ (ScrapeAddress, "Scrape Address", "(will take a long time)"), (SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"), - (FacebookGraphAPI, "Facebook Graph API", "(will use token)"), (PreprocessPhonenumbers, "Phone Number Validation", ""), ( GooglePlaces, diff --git a/src/main.py b/src/main.py index aec2fda..be59119 100644 --- a/src/main.py +++ b/src/main.py @@ -3,12 +3,7 @@ import os -from demo import ( - evp_demo, - get_multiple_choice, - pipeline_demo, - preprocessing_demo, -) +from demo import evp_demo, get_multiple_choice, pipeline_demo, preprocessing_demo from logger import get_logger abspath = os.path.abspath(__file__) @@ -18,9 +13,9 @@ log = get_logger() DEMOS = { - 
"EVP": evp_demo, - "Pipeline": pipeline_demo, + "Base Data Collector": pipeline_demo, "Data preprocessing": preprocessing_demo, + "Estimated Value Predictor": evp_demo, } PROMPT = "Choose demo:\n" diff --git a/tests/test_console_utils.py b/tests/test_console_utils.py index 7b02f9d..c0092d9 100644 --- a/tests/test_console_utils.py +++ b/tests/test_console_utils.py @@ -124,7 +124,7 @@ def test_valid_input(self, mock_get_int_input): result = get_multiple_choice("Select an option: ", choices) self.assertEqual(result, "Option A") mock_get_int_input.assert_called_with( - "Select an option: " + choice_string, range(len(choices) + 1) + "Select an option: " + choice_string, range(len(choices)) ) From bd398df8366e04ab9947d155db07e32b70b1fc68 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Fri, 2 Feb 2024 01:15:31 +0100 Subject: [PATCH 06/51] added functionality of applying ml model on lead data Signed-off-by: Ahmed Sheta --- src/evp/applying_ml.py | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 src/evp/applying_ml.py diff --git a/src/evp/applying_ml.py b/src/evp/applying_ml.py new file mode 100644 index 0000000..739f537 --- /dev/null +++ b/src/evp/applying_ml.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Ahmed Sheta + +import pickle +from io import BytesIO + +import boto3 +import joblib +import pandas as pd + +############################## adapting the preprocessing files ########################### +# Load the data from the CSV files +historical_preprocessed_data = pd.read_csv( + "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" +) +toBePredicted_preprocessed_data = pd.read_csv( + "s3://amos--data--events/leads/preprocessed_leads_data.csv" +) + +# Get the columns in the order of f2.csv +historical_columns_order = historical_preprocessed_data.columns + +# Create missing columns in f1.csv and fill with 0 +missing_columns = set(historical_columns_order) - set( + toBePredicted_preprocessed_data.columns +) +for column in missing_columns: + toBePredicted_preprocessed_data[column] = 0 + +for column in toBePredicted_preprocessed_data.columns: + if column not in historical_columns_order: + toBePredicted_preprocessed_data = toBePredicted_preprocessed_data.drop( + column, axis=1 + ) + +# Reorder columns in f1.csv to match f2.csv +toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ + historical_columns_order +] + +toBePredicted_preprocessed_data.to_csv( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", + index=False, +) + +assert list(toBePredicted_preprocessed_data.columns) == list( + historical_preprocessed_data.columns +), "Column names are different" + +####################### Applying ML model on lead data #################################### + +bucket_name = "amos--models" +file_key = "models/lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model_updated.pkl" # adjust according to the desired model + +# create an S3 client +s3 = boto3.client("s3") + +# download the file from S3 +response = s3.get_object(Bucket=bucket_name, Key=file_key) +model_content = response["Body"].read() + +# load model +with BytesIO(model_content) as model_file: + model = joblib.load(model_file) + +data_path = "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" +df = pd.read_csv(data_path) +input = df.drop("MerchantSizeByDPV", axis=1) + +print(f"df.columns = {df.columns}") +predictions = model.predict(input) +size_mapping = {0: "XS", 1: "S", 2: "M", 3: 
"L", 4: "XL"} +remapped_predictions = [size_mapping[prediction] for prediction in predictions] + +print(remapped_predictions) From 4ee3015e39766e99f5f0c040e06231a5a7da44d0 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Fri, 2 Feb 2024 01:32:55 +0100 Subject: [PATCH 07/51] documentation Signed-off-by: Ahmed Sheta --- src/evp/applying_ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/evp/applying_ml.py b/src/evp/applying_ml.py index 739f537..8e7954a 100644 --- a/src/evp/applying_ml.py +++ b/src/evp/applying_ml.py @@ -33,7 +33,7 @@ column, axis=1 ) -# Reorder columns in f1.csv to match f2.csv +# reorder columns toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ historical_columns_order ] @@ -43,6 +43,7 @@ index=False, ) +# check if columns in both dataframe are in same order and same number assert list(toBePredicted_preprocessed_data.columns) == list( historical_preprocessed_data.columns ), "Column names are different" @@ -67,7 +68,6 @@ df = pd.read_csv(data_path) input = df.drop("MerchantSizeByDPV", axis=1) -print(f"df.columns = {df.columns}") predictions = model.predict(input) size_mapping = {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} remapped_predictions = [size_mapping[prediction] for prediction in predictions] From 4a92080fb01f32240ee1f4481fae82dab112af26 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Fri, 2 Feb 2024 09:37:16 +0100 Subject: [PATCH 08/51] added functionality to export the predictions and to update the preprocessed data paths Signed-off-by: Ahmed Sheta --- src/evp/applying_ml.py | 34 ++++++++++++++++++++++++++++-- src/preprocessing/preprocessing.py | 16 ++++++++------ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/evp/applying_ml.py b/src/evp/applying_ml.py index 8e7954a..af4feac 100644 --- a/src/evp/applying_ml.py +++ b/src/evp/applying_ml.py @@ -1,15 +1,32 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2024 Ahmed Sheta +import os import pickle +import sys from io import BytesIO import boto3 import joblib import pandas as pd +######################### preprocessing the leads ################################## +current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() +parent_dir = os.path.join(current_dir, "..") +sys.path.append(parent_dir) +from preprocessing import Preprocessing + +preprocessor = Preprocessing(filter_null_data=False, historical_data=False) +leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" +preprocessor.data_path = leads_enriched_path +preprocessor.prerocessed_data_output_path = ( + "s3://amos--data--events/leads/preprocessed_leads_data.csv" +) +df = preprocessor.implement_preprocessing_pipeline() +preprocessor.save_preprocessed_data() + ############################## adapting the preprocessing files ########################### -# Load the data from the CSV files +# load the data from the CSV files historical_preprocessed_data = pd.read_csv( "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" ) @@ -72,4 +89,17 @@ size_mapping = {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} remapped_predictions = [size_mapping[prediction] for prediction in predictions] -print(remapped_predictions) +# print(remapped_predictions) + +enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") + +# first 5 columns: Last Name,First Name,Company / Account,Phone,Email, +raw_data = enriched_data.iloc[:, :5] +raw_data["PredictedMerchantSize"] = remapped_predictions + +raw_data.to_csv( + 
"s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", index=True +) +print( + f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" +) diff --git a/src/preprocessing/preprocessing.py b/src/preprocessing/preprocessing.py index 9f11a10..78f7c06 100644 --- a/src/preprocessing/preprocessing.py +++ b/src/preprocessing/preprocessing.py @@ -31,9 +31,11 @@ class Preprocessing: def __init__(self, filter_null_data=True, historical_data=False): data_repo = get_database() - data_path = data_repo.get_output_path() + self.data_path = data_repo.get_output_path() if historical_data: - input_path_components = data_path.split("\\" if "\\" in data_path else "/") + input_path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) input_path_components.pop() input_path_components.pop() input_path_components.append("historical_data/100k_historic_enriched.csv") @@ -41,15 +43,17 @@ def __init__(self, filter_null_data=True, historical_data=False): data = pd.read_csv(input_path) log.debug(f"Data path = {input_path}") else: - log.debug(f"Data path = {data_path}") - data = pd.read_csv(data_path) - self.preprocessed_df = data.copy() + log.debug(f"Data path = {self.data_path}") + data = pd.read_csv(self.data_path) + self.preprocessed_df = data.copy() if historical_data: self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" else: # created the new output path based on which repo used - path_components = data_path.split("\\" if "\\" in data_path else "/") + path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) path_components.pop() path_components.append("preprocessed_data.csv") self.prerocessed_data_output_path = "/".join(path_components) From f2c2286636c07c400ed9ba7acce859f61abfbc32 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Fri, 2 Feb 2024 10:50:29 +0100 Subject: [PATCH 09/51] added MerchantSize Prediction to demos Signed-off-by: Ahmed Sheta --- src/demo/__init__.py | 9 ++- src/demo/demos.py | 141 +++++++++++++++++++++++++++++++++++++++++++ src/main.py | 2 + 3 files changed, 151 insertions(+), 1 deletion(-) diff --git a/src/demo/__init__.py b/src/demo/__init__.py index a03c078..5a97b1a 100644 --- a/src/demo/__init__.py +++ b/src/demo/__init__.py @@ -2,4 +2,11 @@ # SPDX-FileCopyrightText: 2023 Berkay Bozkurt from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input -from .demos import bdc_demo, db_demo, evp_demo, pipeline_demo, preprocessing_demo +from .demos import ( + bdc_demo, + db_demo, + evp_demo, + pipeline_demo, + predict_MerchantSize_on_lead_data_demo, + preprocessing_demo, +) diff --git a/src/demo/demos.py b/src/demo/demos.py index 0ad094b..9294fcd 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -7,6 +7,10 @@ # SPDX-FileCopyrightText: 2023 Ahmed Sheta +import re +import subprocess + +import xgboost as xgb from sklearn.metrics import classification_report from bdc import DataCollector @@ -252,3 +256,140 @@ def preprocessing_demo(): ) df = preprocessor.implement_preprocessing_pipeline() preprocessor.save_preprocessed_data() + + +def predict_MerchantSize_on_lead_data_demo(): + import os + import pickle + import sys + from io import BytesIO + + import boto3 + import joblib + import pandas as pd + + log.info( + "Note: Enriched data must be located at s3://amos--data--events/leads/enriched.csv" + ) + + ######################### preprocessing the leads ################################## + 
current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() + parent_dir = os.path.join(current_dir, "..") + sys.path.append(parent_dir) + from preprocessing import Preprocessing + + preprocessor = Preprocessing(filter_null_data=False, historical_data=False) + leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" + if not leads_enriched_path: + log.error( + "No such file exists in the directory s3://amos--data--events/leads/enriched.csv" + ) + preprocessor.data_path = leads_enriched_path + preprocessor.prerocessed_data_output_path = ( + "s3://amos--data--events/leads/preprocessed_leads_data.csv" + ) + df = preprocessor.implement_preprocessing_pipeline() + preprocessor.save_preprocessed_data() + + ############################## adapting the preprocessing files ########################### + # load the data from the CSV files + historical_preprocessed_data = pd.read_csv( + "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" + ) + toBePredicted_preprocessed_data = pd.read_csv( + "s3://amos--data--events/leads/preprocessed_leads_data.csv" + ) + + historical_columns_order = historical_preprocessed_data.columns + + missing_columns = set(historical_columns_order) - set( + toBePredicted_preprocessed_data.columns + ) + for column in missing_columns: + toBePredicted_preprocessed_data[column] = 0 + + for column in toBePredicted_preprocessed_data.columns: + if column not in historical_columns_order: + toBePredicted_preprocessed_data = toBePredicted_preprocessed_data.drop( + column, axis=1 + ) + + # reorder columns + toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ + historical_columns_order + ] + + toBePredicted_preprocessed_data.to_csv( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", + index=False, + ) + + # check if columns in both dataframe are in same order and same number + assert list(toBePredicted_preprocessed_data.columns) == list( + historical_preprocessed_data.columns + ), "Column names are different" + + ####################### Applying ML model on lead data #################################### + + bucket_name = "amos--models" + + model_name = get_string_input( + "Provide model file name in amos--models/models S3 Bucket\nInput example: lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model.pkl\n" + ) + # file_key = "models/lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model_updated.pkl" # adjust according to the desired model + model_name = model_name.replace(" ", "") + xgb_bool = False + if model_name[:3].lower() == "xgb": + xgb_bool = True + + file_key = f"models/" + model_name + + def check_classification_task(string): + match = re.search(r"\d+", string) + if match: + last_number = int(match.group()) + if last_number == 3: + return True + else: + False + + classification_task_3 = check_classification_task(file_key) + # create an S3 client + s3 = boto3.client("s3") + + # download the file from S3 + response = s3.get_object(Bucket=bucket_name, Key=file_key) + model_content = response["Body"].read() + + # load model + with BytesIO(model_content) as model_file: + model = joblib.load(model_file) + log.info(f"Loaded the model sucessfully!") + + data_path = ( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" + ) + df = pd.read_csv(data_path) + input = df.drop("MerchantSizeByDPV", axis=1) + if xgb_bool: + input = xgb.DMatrix(input) + + predictions = model.predict(input) + if classification_task_3: + size_mapping = {0: "XS", 1: "{S, M, L}", 2: "XL"} + else: + size_mapping 
= {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} + remapped_predictions = [size_mapping[prediction] for prediction in predictions] + + enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") + + # first 5 columns: Last Name,First Name,Company / Account,Phone,Email, + raw_data = enriched_data.iloc[:, :5] + raw_data["PredictedMerchantSize"] = remapped_predictions + + raw_data.to_csv( + "s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", index=True + ) + log.info( + f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" + ) diff --git a/src/main.py b/src/main.py index 49cf73e..63f3f64 100644 --- a/src/main.py +++ b/src/main.py @@ -9,6 +9,7 @@ evp_demo, get_multiple_choice, pipeline_demo, + predict_MerchantSize_on_lead_data_demo, preprocessing_demo, ) from logger import get_logger @@ -25,6 +26,7 @@ "DB": db_demo, "Pipeline": pipeline_demo, "Data preprocessing": preprocessing_demo, + "Merchant Size prediction": predict_MerchantSize_on_lead_data_demo, } PROMPT = "Choose demo:\n" From 98d315e3be7aefe4eee99d05d73356628c77c402 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Fri, 2 Feb 2024 19:56:12 +0100 Subject: [PATCH 10/51] modified the merge conflicts and removed the applying_ml.py Signed-off-by: Ahmed Sheta --- src/demo/__init__.py | 7 ++- src/evp/applying_ml.py | 105 ----------------------------------------- src/main.py | 10 +++- 3 files changed, 14 insertions(+), 108 deletions(-) delete mode 100644 src/evp/applying_ml.py diff --git a/src/demo/__init__.py b/src/demo/__init__.py index 95ec72f..3da6dd3 100644 --- a/src/demo/__init__.py +++ b/src/demo/__init__.py @@ -2,4 +2,9 @@ # SPDX-FileCopyrightText: 2023 Berkay Bozkurt from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input -from .demos import evp_demo, pipeline_demo, preprocessing_demo, predict_MerchantSize_on_lead_data_demo, +from .demos import ( + evp_demo, + pipeline_demo, + predict_MerchantSize_on_lead_data_demo, + preprocessing_demo, +) diff --git a/src/evp/applying_ml.py b/src/evp/applying_ml.py deleted file mode 100644 index af4feac..0000000 --- a/src/evp/applying_ml.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: MIT -# SPDX-FileCopyrightText: 2024 Ahmed Sheta - -import os -import pickle -import sys -from io import BytesIO - -import boto3 -import joblib -import pandas as pd - -######################### preprocessing the leads ################################## -current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() -parent_dir = os.path.join(current_dir, "..") -sys.path.append(parent_dir) -from preprocessing import Preprocessing - -preprocessor = Preprocessing(filter_null_data=False, historical_data=False) -leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" -preprocessor.data_path = leads_enriched_path -preprocessor.prerocessed_data_output_path = ( - "s3://amos--data--events/leads/preprocessed_leads_data.csv" -) -df = preprocessor.implement_preprocessing_pipeline() -preprocessor.save_preprocessed_data() - -############################## adapting the preprocessing files ########################### -# load the data from the CSV files -historical_preprocessed_data = pd.read_csv( - "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" -) -toBePredicted_preprocessed_data = pd.read_csv( - "s3://amos--data--events/leads/preprocessed_leads_data.csv" -) - -# Get the columns in the order of f2.csv -historical_columns_order = 
historical_preprocessed_data.columns - -# Create missing columns in f1.csv and fill with 0 -missing_columns = set(historical_columns_order) - set( - toBePredicted_preprocessed_data.columns -) -for column in missing_columns: - toBePredicted_preprocessed_data[column] = 0 - -for column in toBePredicted_preprocessed_data.columns: - if column not in historical_columns_order: - toBePredicted_preprocessed_data = toBePredicted_preprocessed_data.drop( - column, axis=1 - ) - -# reorder columns -toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ - historical_columns_order -] - -toBePredicted_preprocessed_data.to_csv( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", - index=False, -) - -# check if columns in both dataframe are in same order and same number -assert list(toBePredicted_preprocessed_data.columns) == list( - historical_preprocessed_data.columns -), "Column names are different" - -####################### Applying ML model on lead data #################################### - -bucket_name = "amos--models" -file_key = "models/lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model_updated.pkl" # adjust according to the desired model - -# create an S3 client -s3 = boto3.client("s3") - -# download the file from S3 -response = s3.get_object(Bucket=bucket_name, Key=file_key) -model_content = response["Body"].read() - -# load model -with BytesIO(model_content) as model_file: - model = joblib.load(model_file) - -data_path = "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" -df = pd.read_csv(data_path) -input = df.drop("MerchantSizeByDPV", axis=1) - -predictions = model.predict(input) -size_mapping = {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} -remapped_predictions = [size_mapping[prediction] for prediction in predictions] - -# print(remapped_predictions) - -enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") - -# first 5 columns: Last Name,First Name,Company / Account,Phone,Email, -raw_data = enriched_data.iloc[:, :5] -raw_data["PredictedMerchantSize"] = remapped_predictions - -raw_data.to_csv( - "s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", index=True -) -print( - f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" -) diff --git a/src/main.py b/src/main.py index 98b7618..266f07f 100644 --- a/src/main.py +++ b/src/main.py @@ -3,7 +3,13 @@ import os -from demo import evp_demo, get_multiple_choice, pipeline_demo, preprocessing_demo, predict_MerchantSize_on_lead_data_demo +from demo import ( + evp_demo, + get_multiple_choice, + pipeline_demo, + predict_MerchantSize_on_lead_data_demo, + preprocessing_demo, +) from logger import get_logger abspath = os.path.abspath(__file__) @@ -16,7 +22,7 @@ "Base Data Collector": pipeline_demo, "Data preprocessing": preprocessing_demo, "Estimated Value Predictor": evp_demo, - "Merchant Size prediction": predict_MerchantSize_on_lead_data_demo, + "Merchant Size Prediction": predict_MerchantSize_on_lead_data_demo, } PROMPT = "Choose demo:\n" From 4d997568b9b4b207c11eb7c25f485b7d46fac542 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Sat, 3 Feb 2024 02:19:09 +0100 Subject: [PATCH 11/51] inital commit to fix the file names bug Signed-off-by: Ahmed Sheta --- src/database/leads/local_repository.py | 4 +- src/database/leads/s3_repository.py | 4 +- src/demo/demos.py | 72 ++++++++++++++++++++++++-- src/preprocessing/preprocessing.py | 58 +++++++++++---------- 4 files changed, 107 insertions(+), 31 
deletions(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index ebeb90b..c5e53e4 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -249,7 +249,9 @@ def save_classification_report(self, report, model_name: str): except Exception as e: log.error(f"Could not save report at {report_file_path}! Error: {str(e)}") - def load_preprocessed_data(self, file_name: str = "preprocessed_data.csv"): + def load_preprocessed_data( + self, file_name: str = "historical_preprocessed_data.csv" + ): try: return pd.read_csv(os.path.join(self.DF_PREPROCESSED_INPUT, file_name)) except FileNotFoundError: diff --git a/src/database/leads/s3_repository.py b/src/database/leads/s3_repository.py index 4264ef4..2e11ed5 100644 --- a/src/database/leads/s3_repository.py +++ b/src/database/leads/s3_repository.py @@ -374,7 +374,9 @@ def save_classification_report(self, report, model_name: str): except Exception as e: log.error(f"Could not save report for '{model_name}' to S3: {str(e)}") - def load_preprocessed_data(self, file_name: str = "preprocessed_data.csv"): + def load_preprocessed_data( + self, file_name: str = "historical_preprocessed_data.csv" + ): file_path = self.DF_PREPROCESSED_INPUT + file_name if not file_path.startswith("s3://"): log.error( diff --git a/src/demo/demos.py b/src/demo/demos.py index c4de78d..41d3054 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -10,6 +10,7 @@ import re import subprocess +import pandas as pd import xgboost as xgb from sklearn.metrics import classification_report @@ -40,6 +41,7 @@ INPUT_FILE_BDC = "../data/sumup_leads_email.csv" OUTPUT_FILE_BDC = "../data/collected_data.json" + # evp demo def evp_demo(): data = get_database().load_preprocessed_data() @@ -212,19 +214,71 @@ def pipeline_demo(): def preprocessing_demo(): - if get_yes_no_input("Filter out the API-irrelevant data? (y/n)"): + if get_yes_no_input("Filter out the API-irrelevant data? (y/n)\n"): filter_bool = True else: filter_bool = False if get_yes_no_input( - "Run on historical data ? (y/n)\nNote: DATABASE_TYPE should be S3!" + "Run on historical data ? (y/n)\n'n' means it will run on lead data!\n" ): historical_bool = True else: historical_bool = False + if get_yes_no_input("Run on S3? 
(y/n)\n'n' means it will run locally!\n"): + S3_bool = True + else: + S3_bool = False + preprocessor = Preprocessing( filter_null_data=filter_bool, historical_data=historical_bool ) + if historical_bool and S3_bool: + preprocessor.data_path = ( + "s3://amos--data--events/historical_data/100k_historic_enriched.csv" + ) + preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/historical_preprocessed_data.csv" + elif historical_bool and not S3_bool: + # input path + input_path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + input_path_components.pop() + input_path_components.append("100k_historic_enriched.csv") + input_path = "/".join(input_path_components) + preprocessor.data_path = input_path + + # output path + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append( + "preprocessed_data_files/historical_preprocessed_data.csv" + ) + preprocessor.prerocessed_data_output_path = "/".join(path_components) + elif not historical_bool and S3_bool: + preprocessor.data_path = "s3://amos--data--events/leads/enriched.csv" + preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/leads_preprocessed_data.csv" + elif not historical_bool and not S3_bool: + # input path + input_path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + input_path_components.pop() + input_path_components.append("leads_enriched.csv") + input_path = "/".join(input_path_components) + preprocessor.data_path = input_path + + # output path + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append("preprocessed_data_files/leads_preprocessed_data.csv") + preprocessor.prerocessed_data_output_path = "/".join(path_components) + + preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path) + df = preprocessor.implement_preprocessing_pipeline() preprocessor.save_preprocessed_data() @@ -250,7 +304,18 @@ def predict_MerchantSize_on_lead_data_demo(): from preprocessing import Preprocessing preprocessor = Preprocessing(filter_null_data=False, historical_data=False) - leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" + + leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" # S3 path + + # # input path + # input_path_components = preprocessor.data_path.split( + # "\\" if "\\" in preprocessor.data_path else "/" + # ) + # input_path_components.pop() + # input_path_components.append("leads_enriched.csv") + # input_path = "/".join(input_path_components) # local path + # preprocessor.data_path = input_path + if not leads_enriched_path: log.error( "No such file exists in the directory s3://amos--data--events/leads/enriched.csv" @@ -259,6 +324,7 @@ def predict_MerchantSize_on_lead_data_demo(): preprocessor.prerocessed_data_output_path = ( "s3://amos--data--events/leads/preprocessed_leads_data.csv" ) + preprocessor.preprocessed_df = pd.read_csv(leads_enriched_path) df = preprocessor.implement_preprocessing_pipeline() preprocessor.save_preprocessed_data() diff --git a/src/preprocessing/preprocessing.py b/src/preprocessing/preprocessing.py index 78f7c06..f47510b 100644 --- a/src/preprocessing/preprocessing.py +++ b/src/preprocessing/preprocessing.py @@ -32,31 +32,34 @@ class Preprocessing: def __init__(self, filter_null_data=True, historical_data=False): 
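         # data_repo is resolved by get_database() to either the local repository or the
         # S3 repository (depending on the configured DATABASE_TYPE), so self.data_path
         # below may be a local file path or an s3:// URL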
data_repo = get_database() self.data_path = data_repo.get_output_path() - if historical_data: - input_path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - input_path_components.pop() - input_path_components.pop() - input_path_components.append("historical_data/100k_historic_enriched.csv") - input_path = "/".join(input_path_components) - data = pd.read_csv(input_path) - log.debug(f"Data path = {input_path}") - else: - log.debug(f"Data path = {self.data_path}") - data = pd.read_csv(self.data_path) - self.preprocessed_df = data.copy() - - if historical_data: - self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" - else: - # created the new output path based on which repo used - path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - path_components.pop() - path_components.append("preprocessed_data.csv") - self.prerocessed_data_output_path = "/".join(path_components) + self.preprocessed_df = None + self.prerocessed_data_output_path = None + # if historical_data: + # input_path_components = self.data_path.split( + # "\\" if "\\" in self.data_path else "/" + # ) + # input_path_components.pop() + # input_path_components.pop() + # input_path_components.append("historical_data/100k_historic_enriched.csv") + # input_path = "/".join(input_path_components) + # data = pd.read_csv(input_path) + # log.debug(f"Data path = {input_path}") + # self.preprocessed_df = data.copy() + # else: + # log.debug(f"Data path = {self.data_path}") + # data = pd.read_csv(self.data_path) + # self.preprocessed_df = data.copy() + + # if historical_data: + # self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" + # else: + # # created the new output path based on which repo used + # path_components = self.data_path.split( + # "\\" if "\\" in self.data_path else "/" + # ) + # path_components.pop() + # path_components.append("preprocessed_data_files/preprocessed_data.csv") + # self.prerocessed_data_output_path = "/".join(path_components) self.filter_bool = filter_null_data # columns that would be added later after one-hot encoding each class @@ -114,7 +117,10 @@ def filter_out_null_data(self): ] def fill_missing_values(self, column, strategy="constant"): - if column in self.preprocessed_df.columns: + if ( + column in self.preprocessed_df.columns + and not self.preprocessed_df[column].empty + ): imputer = SimpleImputer(strategy=strategy) self.preprocessed_df[column] = imputer.fit_transform( self.preprocessed_df[[column]] From 8debd4ae628348108e0d10d2e30d9f79841fa0d2 Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 02:46:01 +0100 Subject: [PATCH 12/51] documentation.yml created Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 36 ++++++++++++++++++++++++++++ .gitignore | 6 +++++ Pipfile | 2 ++ src/bdc/__init__.py | 3 +++ src/bdc/steps/__init__.py | 21 ++++++++-------- src/bdc/steps/helpers/__init__.py | 6 ++--- src/database/leads/__init__.py | 6 ++--- src/demo/__init__.py | 1 + src/docs/.gitkeep | 0 src/docs/Makefile | 23 ++++++++++++++++++ src/docs/conf.py | 34 ++++++++++++++++++++++++++ src/docs/index.rst | 23 ++++++++++++++++++ src/docs/make.bat | 37 +++++++++++++++++++++++++++++ src/evp/__init__.py | 3 ++- src/logger/__init__.py | 2 +- src/preprocessing/__init__.py | 2 +- 16 files changed, 186 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/documentation.yml create mode 100644 
src/docs/.gitkeep create mode 100644 src/docs/Makefile create mode 100644 src/docs/conf.py create mode 100644 src/docs/index.rst create mode 100644 src/docs/make.bat diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..937d492 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +name: documentation + +on: [push, pull_request, workflow_dispatch] + +permissions: + contents: write + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pipenv + # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pipenv install --dev + - name: Generate Sphinx + run: | + cd src/docs + sphinx-apidoc -o . .. + make clean + make html + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/feature/pdoc' }} + with: + publish_branch: gh-pages + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: _build/html/ + force_orphan: true diff --git a/.gitignore b/.gitignore index 202c3dc..1ef6d26 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,12 @@ bin/ **/data/models/* **/data/classification_reports/* +**/docs/* +!**/docs/conf.py +!**/docs/index.rst +!**/docs/make.bat +!**/docs/Makefile + # Env files *.env diff --git a/Pipfile b/Pipfile index 14abe41..35f96b9 100644 --- a/Pipfile +++ b/Pipfile @@ -48,6 +48,8 @@ tiktoken = "==0.5.1" torch = "==2.1.2" tqdm = "==4.65.0" xgboost = "==2.0.3" +sphinx = "==7.2.6" +sphinx_rtd_theme = "==2.0.0" [requires] python_version = "3.10" diff --git a/src/bdc/__init__.py b/src/bdc/__init__.py index c46ca70..b443ba0 100644 --- a/src/bdc/__init__.py +++ b/src/bdc/__init__.py @@ -1,2 +1,5 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Ruchita Nathani + +from .pipeline import * +from .steps import * diff --git a/src/bdc/steps/__init__.py b/src/bdc/steps/__init__.py index 40a9222..9b0533b 100644 --- a/src/bdc/steps/__init__.py +++ b/src/bdc/steps/__init__.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Lucca Baumgärtner -from .analyze_emails import AnalyzeEmails -from .analyze_reviews import GPTReviewSentimentAnalyzer, SmartReviewInsightsEnhancer -from .google_places import GooglePlaces -from .google_places_detailed import GooglePlacesDetailed -from .gpt_summarizer import GPTSummarizer -from .hash_generator import HashGenerator -from .preprocess_phonenumbers import PreprocessPhonenumbers -from .regionalatlas import RegionalAtlas -from .scrape_address import ScrapeAddress -from .search_offeneregister import SearchOffeneRegister +from .analyze_emails import * +from .analyze_reviews import * +from .google_places import * +from .google_places_detailed import * +from .gpt_summarizer import * +from .hash_generator import * +from .preprocess_phonenumbers import * +from .regionalatlas import * +from .scrape_address import * +from .search_offeneregister import * +from .step import * diff --git a/src/bdc/steps/helpers/__init__.py b/src/bdc/steps/helpers/__init__.py index 8b04f93..31a6103 100644 --- a/src/bdc/steps/helpers/__init__.py +++ b/src/bdc/steps/helpers/__init__.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Berkay Bozkurt -from .generate_hash_leads import LeadHashGenerator -from 
.offeneregister_api import OffeneRegisterAPI -from .text_analyzer import TextAnalyzer +from .generate_hash_leads import * +from .offeneregister_api import * +from .text_analyzer import * _lead_hash_generator = None diff --git a/src/database/leads/__init__.py b/src/database/leads/__init__.py index 7631206..72e0837 100644 --- a/src/database/leads/__init__.py +++ b/src/database/leads/__init__.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Sophie Heasman -from .local_repository import LocalRepository -from .repository import Repository -from .s3_repository import S3Repository, decode_s3_url +from .local_repository import * +from .repository import * +from .s3_repository import * diff --git a/src/demo/__init__.py b/src/demo/__init__.py index 3da6dd3..bb87121 100644 --- a/src/demo/__init__.py +++ b/src/demo/__init__.py @@ -8,3 +8,4 @@ predict_MerchantSize_on_lead_data_demo, preprocessing_demo, ) +from .pipeline_utils import * diff --git a/src/docs/.gitkeep b/src/docs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/docs/Makefile b/src/docs/Makefile new file mode 100644 index 0000000..da3682c --- /dev/null +++ b/src/docs/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/src/docs/conf.py b/src/docs/conf.py new file mode 100644 index 0000000..55d1434 --- /dev/null +++ b/src/docs/conf.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +import os +import sys + +sys.path.insert(0, os.path.abspath("..")) + +project = "Sales Lead Qualifier" +copyright = "2024, Sum Insight" +author = "Sum Insight" +release = "01.00.00" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.napoleon"] +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/src/docs/index.rst b/src/docs/index.rst new file mode 100644 index 0000000..5cdad59 --- /dev/null +++ b/src/docs/index.rst @@ -0,0 +1,23 @@ + .. SPDX-License-Identifier: MIT + SPDX-FileCopyrightText: 2023 Berkay Bozkurt + +.. 
Sales Lead Qualifier documentation master file, created by + sphinx-quickstart on Sat Feb 3 02:33:45 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Sales Lead Qualifier's documentation! +================================================ + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/src/docs/make.bat b/src/docs/make.bat new file mode 100644 index 0000000..000a0f0 --- /dev/null +++ b/src/docs/make.bat @@ -0,0 +1,37 @@ +:: # SPDX-License-Identifier: MIT +:: # SPDX-FileCopyrightText: 2023 Berkay Bozkurt +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/src/evp/__init__.py b/src/evp/__init__.py index dd4f024..5259ee9 100644 --- a/src/evp/__init__.py +++ b/src/evp/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Felix Zailskas -from .evp import EstimatedValuePredictor +from .evp import * +from .predictors import * diff --git a/src/logger/__init__.py b/src/logger/__init__.py index 7eb19e1..5cc57bd 100644 --- a/src/logger/__init__.py +++ b/src/logger/__init__.py @@ -3,7 +3,7 @@ import os -from .logger import CustomLogger +from .logger import * _logger = None diff --git a/src/preprocessing/__init__.py b/src/preprocessing/__init__.py index 9acc755..cb29c38 100644 --- a/src/preprocessing/__init__.py +++ b/src/preprocessing/__init__.py @@ -1,4 +1,4 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Ahmed Sheta -from .preprocessing import Preprocessing +from .preprocessing import * From 4a25985a18882fc92a76a5d8112af7ed67f1842d Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 02:51:53 +0100 Subject: [PATCH 13/51] pip install fixed Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 937d492..81c6401 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -17,9 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pipenv - # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pipenv install --dev + pip install sphinx sphinx_rtd_theme - name: Generate Sphinx run: | cd src/docs From c746b9408b87e2fff3305c805cfedaadc38a3fe3 Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 02:59:35 +0100 Subject: [PATCH 14/51] path fixed Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/documentation.yml 
b/.github/workflows/documentation.yml index 81c6401..369534d 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -30,5 +30,5 @@ jobs: with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: _build/html/ + publish_dir: src/docs/_build/html/* force_orphan: true From f8ff03261852ec34cb4d8d3397230fe14957869a Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 03:04:18 +0100 Subject: [PATCH 15/51] path fixed Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 369534d..4a77f63 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -30,5 +30,5 @@ jobs: with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: src/docs/_build/html/* + publish_dir: src/docs/_build/html/ force_orphan: true From 1fadef13b879cc16958bf1d2c5d1d7036e4e88ed Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 03:16:46 +0100 Subject: [PATCH 16/51] path fixed Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 4a77f63..51b168e 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -12,12 +12,14 @@ jobs: docs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install sphinx sphinx_rtd_theme + pip install sphinx sphinx_rtd_theme myst_parser - name: Generate Sphinx run: | cd src/docs From 0cd4e45e374b50b30f621672add3da9cbef05312 Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 03:24:54 +0100 Subject: [PATCH 17/51] install all dependencies Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 11 +++++++---- Pipfile | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 51b168e..bb5c163 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -19,13 +19,16 @@ jobs: python-version: "3.10" - name: Install dependencies run: | - pip install sphinx sphinx_rtd_theme myst_parser + python -m pip install --upgrade pip + pip install pipenv + # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pipenv install --dev - name: Generate Sphinx run: | cd src/docs - sphinx-apidoc -o . .. - make clean - make html + pipenv run sphinx-apidoc -o . .. 
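+          # sphinx-apidoc rebuilds one .rst stub per module from the sources one level up,
+          # so autodoc picks up newly added modules before "make html" renders the pages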
+ pipenv run make clean + pipenv run make html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/feature/pdoc' }} diff --git a/Pipfile b/Pipfile index 35f96b9..e407593 100644 --- a/Pipfile +++ b/Pipfile @@ -50,6 +50,7 @@ tqdm = "==4.65.0" xgboost = "==2.0.3" sphinx = "==7.2.6" sphinx_rtd_theme = "==2.0.0" +myst_parser = "==2.0.0" [requires] python_version = "3.10" From f23de91d46d7eeafd807b0c75090493b81d0b20c Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 03:33:32 +0100 Subject: [PATCH 18/51] workflow set to main Signed-off-by: Berkay Bozkurt --- .github/workflows/documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index bb5c163..ab325af 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -31,7 +31,7 @@ jobs: pipenv run make html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/feature/pdoc' }} + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} From 93ba51cb6336d5cdb95bca82865820357105deac Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Sat, 3 Feb 2024 05:21:10 +0100 Subject: [PATCH 19/51] fixed bug and now the pipeline can run locally Signed-off-by: Ahmed Sheta --- src/demo/demos.py | 151 +++++++++++++---------------- src/preprocessing/preprocessing.py | 74 ++++++++------ 2 files changed, 113 insertions(+), 112 deletions(-) diff --git a/src/demo/demos.py b/src/demo/demos.py index 41d3054..83723b2 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -230,52 +230,8 @@ def preprocessing_demo(): S3_bool = False preprocessor = Preprocessing( - filter_null_data=filter_bool, historical_data=historical_bool + filter_null_data=filter_bool, historical_bool=historical_bool, S3_bool=S3_bool ) - if historical_bool and S3_bool: - preprocessor.data_path = ( - "s3://amos--data--events/historical_data/100k_historic_enriched.csv" - ) - preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/historical_preprocessed_data.csv" - elif historical_bool and not S3_bool: - # input path - input_path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - input_path_components.pop() - input_path_components.append("100k_historic_enriched.csv") - input_path = "/".join(input_path_components) - preprocessor.data_path = input_path - - # output path - path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append( - "preprocessed_data_files/historical_preprocessed_data.csv" - ) - preprocessor.prerocessed_data_output_path = "/".join(path_components) - elif not historical_bool and S3_bool: - preprocessor.data_path = "s3://amos--data--events/leads/enriched.csv" - preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/leads_preprocessed_data.csv" - elif not historical_bool and not S3_bool: - # input path - input_path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - input_path_components.pop() - input_path_components.append("leads_enriched.csv") - input_path = "/".join(input_path_components) - preprocessor.data_path = input_path - - # 
output path - path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append("preprocessed_data_files/leads_preprocessed_data.csv") - preprocessor.prerocessed_data_output_path = "/".join(path_components) preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path) @@ -294,37 +250,23 @@ def predict_MerchantSize_on_lead_data_demo(): import pandas as pd log.info( - "Note: Enriched data must be located at s3://amos--data--events/leads/enriched.csv" + "Note: In case of running locally, enriched data must be located at src/data/leads_enriched.csv locally\nIn case of running on S3, enriched data must be located at s3://amos--data--events/leads/enriched.csv or" ) ######################### preprocessing the leads ################################## + if get_yes_no_input("Run on S3? (y/n)\n'n' means it will run locally!\n"): + S3_bool = True + else: + S3_bool = False current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() parent_dir = os.path.join(current_dir, "..") sys.path.append(parent_dir) from preprocessing import Preprocessing - preprocessor = Preprocessing(filter_null_data=False, historical_data=False) - - leads_enriched_path = "s3://amos--data--events/leads/enriched.csv" # S3 path - - # # input path - # input_path_components = preprocessor.data_path.split( - # "\\" if "\\" in preprocessor.data_path else "/" - # ) - # input_path_components.pop() - # input_path_components.append("leads_enriched.csv") - # input_path = "/".join(input_path_components) # local path - # preprocessor.data_path = input_path - - if not leads_enriched_path: - log.error( - "No such file exists in the directory s3://amos--data--events/leads/enriched.csv" - ) - preprocessor.data_path = leads_enriched_path - preprocessor.prerocessed_data_output_path = ( - "s3://amos--data--events/leads/preprocessed_leads_data.csv" + preprocessor = Preprocessing( + filter_null_data=False, historical_bool=False, S3_bool=S3_bool ) - preprocessor.preprocessed_df = pd.read_csv(leads_enriched_path) + preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path) df = preprocessor.implement_preprocessing_pipeline() preprocessor.save_preprocessed_data() @@ -333,9 +275,18 @@ def predict_MerchantSize_on_lead_data_demo(): historical_preprocessed_data = pd.read_csv( "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" ) - toBePredicted_preprocessed_data = pd.read_csv( - "s3://amos--data--events/leads/preprocessed_leads_data.csv" - ) + if S3_bool: + toBePredicted_preprocessed_data = pd.read_csv( + "s3://amos--data--events/leads/preprocessed_leads_data.csv" + ) + else: + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append("preprocessed_data_files/leads_preprocessed_data.csv") + leads_preprocessed_data_path = "/".join(path_components) + toBePredicted_preprocessed_data = pd.read_csv(leads_preprocessed_data_path) historical_columns_order = historical_preprocessed_data.columns @@ -355,11 +306,21 @@ def predict_MerchantSize_on_lead_data_demo(): toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ historical_columns_order ] - - toBePredicted_preprocessed_data.to_csv( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", - index=False, - ) + if S3_bool: + toBePredicted_preprocessed_data.to_csv( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", + 
index=False, + ) + else: + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append("toBePredicted_preprocessed_data_updated.csv") + local_preprocessed_data_path = "/".join(path_components) + toBePredicted_preprocessed_data.to_csv( + local_preprocessed_data_path, index=False + ) # check if columns in both dataframe are in same order and same number assert list(toBePredicted_preprocessed_data.columns) == list( @@ -403,9 +364,13 @@ def check_classification_task(string): model = joblib.load(model_file) log.info(f"Loaded the model sucessfully!") - data_path = ( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" - ) + if S3_bool: + data_path = ( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" + ) + else: + data_path = local_preprocessed_data_path + df = pd.read_csv(data_path) input = df.drop("MerchantSizeByDPV", axis=1) if xgb_bool: @@ -418,15 +383,31 @@ def check_classification_task(string): size_mapping = {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} remapped_predictions = [size_mapping[prediction] for prediction in predictions] - enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") + if S3_bool: + enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") + else: + enriched_data = pd.read_csv(preprocessor.data_path) # first 5 columns: Last Name,First Name,Company / Account,Phone,Email, raw_data = enriched_data.iloc[:, :5] + print(f"raw_data = {raw_data.shape}") + print(f"remapped_predictions = {len(remapped_predictions)}") raw_data["PredictedMerchantSize"] = remapped_predictions - raw_data.to_csv( - "s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", index=True - ) - log.info( - f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" - ) + if S3_bool: + raw_data.to_csv( + "s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", + index=True, + ) + log.info( + f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" + ) + else: + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append("predicted_MerchantSize_of_leads.csv") + output_path = "/".join(path_components) + raw_data.to_csv(output_path, index=True) + log.info(f"Saved the predicted Merchant Size of the leads at {output_path}") diff --git a/src/preprocessing/preprocessing.py b/src/preprocessing/preprocessing.py index f47510b..a278b2a 100644 --- a/src/preprocessing/preprocessing.py +++ b/src/preprocessing/preprocessing.py @@ -29,37 +29,57 @@ class Preprocessing: - def __init__(self, filter_null_data=True, historical_data=False): + def __init__(self, filter_null_data=True, historical_bool=True, S3_bool=False): data_repo = get_database() self.data_path = data_repo.get_output_path() self.preprocessed_df = None self.prerocessed_data_output_path = None - # if historical_data: - # input_path_components = self.data_path.split( - # "\\" if "\\" in self.data_path else "/" - # ) - # input_path_components.pop() - # input_path_components.pop() - # input_path_components.append("historical_data/100k_historic_enriched.csv") - # input_path = "/".join(input_path_components) - # data = pd.read_csv(input_path) - # log.debug(f"Data path = {input_path}") - # self.preprocessed_df = data.copy() - # else: - # log.debug(f"Data path = 
{self.data_path}") - # data = pd.read_csv(self.data_path) - # self.preprocessed_df = data.copy() - - # if historical_data: - # self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" - # else: - # # created the new output path based on which repo used - # path_components = self.data_path.split( - # "\\" if "\\" in self.data_path else "/" - # ) - # path_components.pop() - # path_components.append("preprocessed_data_files/preprocessed_data.csv") - # self.prerocessed_data_output_path = "/".join(path_components) + if historical_bool and S3_bool: + self.data_path = ( + "s3://amos--data--events/historical_data/100k_historic_enriched.csv" + ) + self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/historical_preprocessed_data.csv" + elif historical_bool and not S3_bool: + # input path + input_path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) + input_path_components.pop() + input_path_components.append("100k_historic_enriched.csv") + input_path = "/".join(input_path_components) + self.data_path = input_path + + # output path + path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) + path_components.pop() + path_components.append( + "preprocessed_data_files/historical_preprocessed_data.csv" + ) + self.prerocessed_data_output_path = "/".join(path_components) + elif not historical_bool and S3_bool: + self.data_path = "s3://amos--data--events/leads/enriched.csv" + self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/leads_preprocessed_data.csv" + elif not historical_bool and not S3_bool: + # input path + input_path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) + input_path_components.pop() + input_path_components.append("leads_enriched.csv") + input_path = "/".join(input_path_components) + self.data_path = input_path + + # output path + path_components = self.data_path.split( + "\\" if "\\" in self.data_path else "/" + ) + path_components.pop() + path_components.append( + "preprocessed_data_files/leads_preprocessed_data.csv" + ) + self.prerocessed_data_output_path = "/".join(path_components) self.filter_bool = filter_null_data # columns that would be added later after one-hot encoding each class From 46cf06587a5b67a3468b5de8fa3b7cd3ab6cb7a7 Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 13:00:28 +0100 Subject: [PATCH 20/51] sphinx author name changed Signed-off-by: Berkay Bozkurt --- src/docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/docs/conf.py b/src/docs/conf.py index 55d1434..8ae4ed8 100644 --- a/src/docs/conf.py +++ b/src/docs/conf.py @@ -15,8 +15,8 @@ sys.path.insert(0, os.path.abspath("..")) project = "Sales Lead Qualifier" -copyright = "2024, Sum Insight" -author = "Sum Insight" +copyright = "2024, SumInsights" +author = "SumInsights" release = "01.00.00" # -- General configuration --------------------------------------------------- From d9195678296a562380c30586763374b559924b7c Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 13:02:10 +0100 Subject: [PATCH 21/51] demo/__init__.py failing imports are added Signed-off-by: Berkay Bozkurt --- src/demo/__init__.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/demo/__init__.py b/src/demo/__init__.py index bb87121..2ce7a72 100644 --- a/src/demo/__init__.py +++ b/src/demo/__init__.py @@ -1,11 +1,6 @@ # 
SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2023 Berkay Bozkurt -from .console_utils import get_int_input, get_multiple_choice, get_yes_no_input -from .demos import ( - evp_demo, - pipeline_demo, - predict_MerchantSize_on_lead_data_demo, - preprocessing_demo, -) +from .console_utils import * +from .demos import * from .pipeline_utils import * From ce6bf6a797a14a5ea8ee342a9565508d63e22c5e Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 14:11:20 +0100 Subject: [PATCH 22/51] README.md typo fixed and included to sphinx Signed-off-by: Berkay Bozkurt --- Pipfile.lock | 645 ++++++++++++++++++++++++++++++--------------- README.md | 2 +- src/docs/conf.py | 8 +- src/docs/index.rst | 1 + 4 files changed, 444 insertions(+), 212 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index ebdaba4..f6b083a 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "2a572dbe6247b7d3f516703542275c39b368211d26d4a47ea102f361a43fc4b4" + "sha256": "9e3d29b16e3d34d5c059c059728a9a36510bd8554044aa224a482cc910d553c1" }, "pipfile-spec": 6, "requires": { @@ -18,11 +18,11 @@ "default": { "aiobotocore": { "hashes": [ - "sha256:0b095af50da2d6f94e93ca959e2a4876f0f0d84d534b61b21d8e050832d04ab6", - "sha256:904a7ad7cc8671d662cfd596906dafe839118ea2a66332c37908e3dcfdee1e45" + "sha256:487fede588040bfa3a43df945275c28c1c73ca75bf705295adb9fbadd2e89be7", + "sha256:6dd7352248e3523019c5a54a395d2b1c31080697fc80a9ad2672de4eec8c7abd" ], "markers": "python_version >= '3.8'", - "version": "==2.11.1" + "version": "==2.11.2" }, "aiohttp": { "hashes": [ @@ -122,6 +122,14 @@ "markers": "python_version >= '3.7'", "version": "==1.3.1" }, + "alabaster": { + "hashes": [ + "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", + "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92" + ], + "markers": "python_version >= '3.9'", + "version": "==0.7.16" + }, "annotated-types": { "hashes": [ "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43", @@ -161,6 +169,14 @@ "index": "pypi", "version": "==2.6.1" }, + "babel": { + "hashes": [ + "sha256:6919867db036398ba21eb5c7a0f6b28ab8cbc3ae7a73a44ebe34ae74a4e7d363", + "sha256:efb1a25b7118e67ce3a259bed20545c29cb68be8ad2c784c83689981b7a57287" + ], + "markers": "python_version >= '3.7'", + "version": "==2.14.0" + }, "beautifulsoup4": { "hashes": [ "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", @@ -189,11 +205,11 @@ }, "certifi": { "hashes": [ - "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", - "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" + "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f", + "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1" ], "markers": "python_version >= '3.6'", - "version": "==2023.11.17" + "version": "==2024.2.2" }, "charset-normalizer": { "hashes": [ @@ -501,6 +517,14 @@ "markers": "python_version >= '3.8'", "version": "==2.5.0" }, + "docutils": { + "hashes": [ + "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", + "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b" + ], + "markers": "python_version >= '3.7'", + "version": "==0.20.1" + }, "email-validator": { "hashes": [ "sha256:a4b0bd1cf55f073b924258d19321b1f3aa74b4b5a71a42c305575dba920e1a44", @@ -712,6 +736,14 @@ "markers": "python_version >= '3.5'", "version": "==3.6" }, + "imagesize": { + "hashes": [ + 
"sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", + "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.4.1" + }, "jinja2": { "hashes": [ "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa", @@ -856,71 +888,95 @@ "markers": "python_version >= '3.8' and python_version < '4.0'", "version": "==2.0.1" }, + "markdown-it-py": { + "hashes": [ + "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", + "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb" + ], + "markers": "python_version >= '3.8'", + "version": "==3.0.0" + }, "markupsafe": { "hashes": [ - "sha256:0042d6a9880b38e1dd9ff83146cc3c9c18a059b9360ceae207805567aacccc69", - "sha256:0c26f67b3fe27302d3a412b85ef696792c4a2386293c53ba683a89562f9399b0", - "sha256:0fbad3d346df8f9d72622ac71b69565e621ada2ce6572f37c2eae8dacd60385d", - "sha256:15866d7f2dc60cfdde12ebb4e75e41be862348b4728300c36cdf405e258415ec", - "sha256:1c98c33ffe20e9a489145d97070a435ea0679fddaabcafe19982fe9c971987d5", - "sha256:21e7af8091007bf4bebf4521184f4880a6acab8df0df52ef9e513d8e5db23411", - "sha256:23984d1bdae01bee794267424af55eef4dfc038dc5d1272860669b2aa025c9e3", - "sha256:31f57d64c336b8ccb1966d156932f3daa4fee74176b0fdc48ef580be774aae74", - "sha256:3583a3a3ab7958e354dc1d25be74aee6228938312ee875a22330c4dc2e41beb0", - "sha256:36d7626a8cca4d34216875aee5a1d3d654bb3dac201c1c003d182283e3205949", - "sha256:396549cea79e8ca4ba65525470d534e8a41070e6b3500ce2414921099cb73e8d", - "sha256:3a66c36a3864df95e4f62f9167c734b3b1192cb0851b43d7cc08040c074c6279", - "sha256:3aae9af4cac263007fd6309c64c6ab4506dd2b79382d9d19a1994f9240b8db4f", - "sha256:3ab3a886a237f6e9c9f4f7d272067e712cdb4efa774bef494dccad08f39d8ae6", - "sha256:47bb5f0142b8b64ed1399b6b60f700a580335c8e1c57f2f15587bd072012decc", - "sha256:49a3b78a5af63ec10d8604180380c13dcd870aba7928c1fe04e881d5c792dc4e", - "sha256:4df98d4a9cd6a88d6a585852f56f2155c9cdb6aec78361a19f938810aa020954", - "sha256:5045e892cfdaecc5b4c01822f353cf2c8feb88a6ec1c0adef2a2e705eef0f656", - "sha256:5244324676254697fe5c181fc762284e2c5fceeb1c4e3e7f6aca2b6f107e60dc", - "sha256:54635102ba3cf5da26eb6f96c4b8c53af8a9c0d97b64bdcb592596a6255d8518", - "sha256:54a7e1380dfece8847c71bf7e33da5d084e9b889c75eca19100ef98027bd9f56", - "sha256:55d03fea4c4e9fd0ad75dc2e7e2b6757b80c152c032ea1d1de487461d8140efc", - "sha256:698e84142f3f884114ea8cf83e7a67ca8f4ace8454e78fe960646c6c91c63bfa", - "sha256:6aa5e2e7fc9bc042ae82d8b79d795b9a62bd8f15ba1e7594e3db243f158b5565", - "sha256:7653fa39578957bc42e5ebc15cf4361d9e0ee4b702d7d5ec96cdac860953c5b4", - "sha256:765f036a3d00395a326df2835d8f86b637dbaf9832f90f5d196c3b8a7a5080cb", - "sha256:78bc995e004681246e85e28e068111a4c3f35f34e6c62da1471e844ee1446250", - "sha256:7a07f40ef8f0fbc5ef1000d0c78771f4d5ca03b4953fc162749772916b298fc4", - "sha256:8b570a1537367b52396e53325769608f2a687ec9a4363647af1cded8928af959", - "sha256:987d13fe1d23e12a66ca2073b8d2e2a75cec2ecb8eab43ff5624ba0ad42764bc", - "sha256:9896fca4a8eb246defc8b2a7ac77ef7553b638e04fbf170bff78a40fa8a91474", - "sha256:9e9e3c4020aa2dc62d5dd6743a69e399ce3de58320522948af6140ac959ab863", - "sha256:a0b838c37ba596fcbfca71651a104a611543077156cb0a26fe0c475e1f152ee8", - "sha256:a4d176cfdfde84f732c4a53109b293d05883e952bbba68b857ae446fa3119b4f", - "sha256:a76055d5cb1c23485d7ddae533229039b850db711c554a12ea64a0fd8a0129e2", - "sha256:a76cd37d229fc385738bd1ce4cba2a121cf26b53864c1772694ad0ad348e509e", - 
"sha256:a7cc49ef48a3c7a0005a949f3c04f8baa5409d3f663a1b36f0eba9bfe2a0396e", - "sha256:abf5ebbec056817057bfafc0445916bb688a255a5146f900445d081db08cbabb", - "sha256:b0fe73bac2fed83839dbdbe6da84ae2a31c11cfc1c777a40dbd8ac8a6ed1560f", - "sha256:b6f14a9cd50c3cb100eb94b3273131c80d102e19bb20253ac7bd7336118a673a", - "sha256:b83041cda633871572f0d3c41dddd5582ad7d22f65a72eacd8d3d6d00291df26", - "sha256:b835aba863195269ea358cecc21b400276747cc977492319fd7682b8cd2c253d", - "sha256:bf1196dcc239e608605b716e7b166eb5faf4bc192f8a44b81e85251e62584bd2", - "sha256:c669391319973e49a7c6230c218a1e3044710bc1ce4c8e6eb71f7e6d43a2c131", - "sha256:c7556bafeaa0a50e2fe7dc86e0382dea349ebcad8f010d5a7dc6ba568eaaa789", - "sha256:c8f253a84dbd2c63c19590fa86a032ef3d8cc18923b8049d91bcdeeb2581fbf6", - "sha256:d18b66fe626ac412d96c2ab536306c736c66cf2a31c243a45025156cc190dc8a", - "sha256:d5291d98cd3ad9a562883468c690a2a238c4a6388ab3bd155b0c75dd55ece858", - "sha256:d5c31fe855c77cad679b302aabc42d724ed87c043b1432d457f4976add1c2c3e", - "sha256:d6e427c7378c7f1b2bef6a344c925b8b63623d3321c09a237b7cc0e77dd98ceb", - "sha256:dac1ebf6983148b45b5fa48593950f90ed6d1d26300604f321c74a9ca1609f8e", - "sha256:de8153a7aae3835484ac168a9a9bdaa0c5eee4e0bc595503c95d53b942879c84", - "sha256:e1a0d1924a5013d4f294087e00024ad25668234569289650929ab871231668e7", - "sha256:e7902211afd0af05fbadcc9a312e4cf10f27b779cf1323e78d52377ae4b72bea", - "sha256:e888ff76ceb39601c59e219f281466c6d7e66bd375b4ec1ce83bcdc68306796b", - "sha256:f06e5a9e99b7df44640767842f414ed5d7bedaaa78cd817ce04bbd6fd86e2dd6", - "sha256:f6be2d708a9d0e9b0054856f07ac7070fbe1754be40ca8525d5adccdbda8f475", - "sha256:f9917691f410a2e0897d1ef99619fd3f7dd503647c8ff2475bf90c3cf222ad74", - "sha256:fc1a75aa8f11b87910ffd98de62b29d6520b6d6e8a3de69a70ca34dea85d2a8a", - "sha256:fe8512ed897d5daf089e5bd010c3dc03bb1bdae00b35588c49b98268d4a01e00" + "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", + "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", + "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", + "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", + "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", + "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", + "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", + "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df", + "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", + "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", + "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", + "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", + "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", + "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371", + "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2", + "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", + "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52", + "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", + "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", + "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", + "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", + "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", + 
"sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", + "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", + "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", + "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", + "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", + "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", + "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", + "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9", + "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", + "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", + "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", + "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", + "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", + "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", + "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a", + "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", + "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", + "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", + "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", + "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", + "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", + "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", + "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", + "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f", + "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50", + "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", + "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", + "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", + "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", + "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", + "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", + "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", + "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf", + "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", + "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", + "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", + "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", + "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68" + ], + "markers": "python_version >= '3.7'", + "version": "==2.1.5" + }, + "mdit-py-plugins": { + "hashes": [ + "sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9", + "sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b" + ], + "markers": "python_version >= '3.8'", + "version": "==0.4.0" + }, + "mdurl": { + "hashes": [ + "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba" ], "markers": "python_version >= '3.7'", - "version": "==2.1.4" + "version": "==0.1.2" }, "more-itertools": { 
"hashes": [ @@ -939,83 +995,107 @@ }, "multidict": { "hashes": [ - "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9", - "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8", - "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03", - "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710", - "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161", - "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664", - "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569", - "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067", - "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313", - "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706", - "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2", - "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636", - "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49", - "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93", - "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603", - "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0", - "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60", - "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4", - "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e", - "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1", - "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60", - "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951", - "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc", - "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe", - "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95", - "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d", - "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8", - "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed", - "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2", - "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775", - "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87", - "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c", - "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2", - "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98", - "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3", - "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe", - "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78", - "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660", - "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176", - "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e", - "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988", - "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c", - "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c", - "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0", - "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449", - 
"sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f", - "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde", - "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5", - "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d", - "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac", - "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a", - "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9", - "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca", - "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11", - "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35", - "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063", - "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b", - "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982", - "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258", - "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1", - "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52", - "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480", - "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7", - "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461", - "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d", - "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc", - "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779", - "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a", - "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547", - "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0", - "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171", - "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf", - "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d", - "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba" + "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556", + "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c", + "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29", + "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b", + "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8", + "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7", + "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd", + "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40", + "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6", + "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3", + "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c", + "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9", + "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5", + "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae", + "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442", + "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9", + "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc", + 
"sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c", + "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea", + "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5", + "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50", + "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182", + "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453", + "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e", + "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600", + "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733", + "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda", + "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241", + "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461", + "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e", + "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e", + "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b", + "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e", + "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7", + "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386", + "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd", + "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9", + "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf", + "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee", + "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5", + "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a", + "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271", + "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54", + "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4", + "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496", + "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb", + "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319", + "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3", + "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f", + "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527", + "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed", + "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604", + "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef", + "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8", + "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5", + "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5", + "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626", + "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c", + "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d", + "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c", + "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc", + "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc", + "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b", + 
"sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38", + "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450", + "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1", + "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f", + "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3", + "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755", + "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226", + "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a", + "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046", + "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf", + "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479", + "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e", + "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1", + "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a", + "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83", + "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929", + "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93", + "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a", + "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c", + "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44", + "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89", + "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba", + "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e", + "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da", + "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24", + "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423", + "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef" ], "markers": "python_version >= '3.7'", - "version": "==6.0.4" + "version": "==6.0.5" + }, + "myst-parser": { + "hashes": [ + "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14", + "sha256:ea929a67a6a0b1683cdbe19b8d2e724cd7643f8aa3e7bb18dd65beac3483bead" + ], + "markers": "python_version >= '3.8'", + "version": "==2.0.0" }, "networkx": { "hashes": [ @@ -1401,6 +1481,14 @@ "markers": "python_version >= '3.8'", "version": "==2.16.1" }, + "pygments": { + "hashes": [ + "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c", + "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367" + ], + "markers": "python_version >= '3.7'", + "version": "==2.17.2" + }, "pylanguagetool": { "hashes": [ "sha256:406629d7ed1a78d95499ebebc7f5a4950f714904a8117edb78f89757fcd90fbe", @@ -1471,10 +1559,67 @@ }, "pytz": { "hashes": [ - "sha256:31d4583c4ed539cd037956140d695e42c033a19e984bfce9964a3f7d59bc2b40", - "sha256:f90ef520d95e7c46951105338d918664ebfd6f1d995bd7d153127ce90efafa6a" + "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812", + "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319" ], - "version": "==2023.4" + "version": "==2024.1" + }, + "pyyaml": { + "hashes": [ + "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", + "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", + 
"sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", + "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", + "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", + "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", + "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", + "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", + "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", + "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", + "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", + "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", + "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", + "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", + "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", + "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", + "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", + "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", + "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", + "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", + "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", + "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", + "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", + "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", + "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", + "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", + "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", + "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", + "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", + "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef", + "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", + "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", + "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", + "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", + "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", + "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", + "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", + "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", + "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", + "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", + "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", + "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", + "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", + "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", + "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", + "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", + "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", + 
"sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", + "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", + "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" + ], + "markers": "python_version >= '3.6'", + "version": "==6.0.1" }, "regex": { "hashes": [ @@ -1746,6 +1891,13 @@ "markers": "python_version >= '3.7'", "version": "==1.3.0" }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, "soupsieve": { "hashes": [ "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", @@ -1754,6 +1906,79 @@ "markers": "python_version >= '3.8'", "version": "==2.5" }, + "sphinx": { + "hashes": [ + "sha256:1e09160a40b956dc623c910118fa636da93bd3ca0b9876a7b3df90f07d691560", + "sha256:9a5160e1ea90688d5963ba09a2dcd8bdd526620edbb65c328728f1b2228d5ab5" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==7.2.6" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:bd5d7b80622406762073a04ef8fadc5f9151261563d47027de09910ce03afe6b", + "sha256:ec93d0856dc280cf3aee9a4c9807c60e027c7f7b461b77aeffed682e68f0e586" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:c40a4f96f3776c4393d933412053962fac2b84f4c99a7982ba42e09576a70619", + "sha256:cb61eb0ec1b61f349e5cc36b2028e9e7ca765be05e49641c97241274753067b4" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.8" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:6485d09629944511c893fa11355bda18b742b83a2b181f9a009f7e500595c90f", + "sha256:9893fd3f90506bc4b97bdb977ceb8fbd823989f4316b28c3841ec128544372d3" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.6" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:0dc87637d5de53dd5eec3a6a01753b1ccf99494bd756aafecd74b4fa9e729015", + "sha256:393f04f112b4d2f53d93448d4bce35842f62b307ccdc549ec1585e950bc35e04" + ], + "markers": "python_version >= '3.9'", + "version": "==2.0.5" + }, + "sphinxcontrib-jquery": { + "hashes": [ + "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", + "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae" + ], + "markers": "python_version >= '2.7'", + "version": "==4.1" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:053dedc38823a80a7209a80860b16b722e9e0209e32fea98c90e4e6624588ed6", + "sha256:e2ae3b5c492d58fcbd73281fbd27e34b8393ec34a073c792642cd8e529288182" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.7" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7", + "sha256:93f3f5dc458b91b192fe10c397e324f262cf163d79f3282c158e8436a2c4511f" + ], + "markers": "python_version >= '3.9'", + "version": "==1.1.10" + }, "sympy": { "hashes": [ "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5", @@ -2178,11 +2403,11 @@ }, "certifi": { "hashes": [ - "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1", - "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474" + 
"sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f", + "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1" ], "markers": "python_version >= '3.6'", - "version": "==2023.11.17" + "version": "==2024.2.2" }, "cffi": { "hashes": [ @@ -2908,69 +3133,69 @@ }, "markupsafe": { "hashes": [ - "sha256:0042d6a9880b38e1dd9ff83146cc3c9c18a059b9360ceae207805567aacccc69", - "sha256:0c26f67b3fe27302d3a412b85ef696792c4a2386293c53ba683a89562f9399b0", - "sha256:0fbad3d346df8f9d72622ac71b69565e621ada2ce6572f37c2eae8dacd60385d", - "sha256:15866d7f2dc60cfdde12ebb4e75e41be862348b4728300c36cdf405e258415ec", - "sha256:1c98c33ffe20e9a489145d97070a435ea0679fddaabcafe19982fe9c971987d5", - "sha256:21e7af8091007bf4bebf4521184f4880a6acab8df0df52ef9e513d8e5db23411", - "sha256:23984d1bdae01bee794267424af55eef4dfc038dc5d1272860669b2aa025c9e3", - "sha256:31f57d64c336b8ccb1966d156932f3daa4fee74176b0fdc48ef580be774aae74", - "sha256:3583a3a3ab7958e354dc1d25be74aee6228938312ee875a22330c4dc2e41beb0", - "sha256:36d7626a8cca4d34216875aee5a1d3d654bb3dac201c1c003d182283e3205949", - "sha256:396549cea79e8ca4ba65525470d534e8a41070e6b3500ce2414921099cb73e8d", - "sha256:3a66c36a3864df95e4f62f9167c734b3b1192cb0851b43d7cc08040c074c6279", - "sha256:3aae9af4cac263007fd6309c64c6ab4506dd2b79382d9d19a1994f9240b8db4f", - "sha256:3ab3a886a237f6e9c9f4f7d272067e712cdb4efa774bef494dccad08f39d8ae6", - "sha256:47bb5f0142b8b64ed1399b6b60f700a580335c8e1c57f2f15587bd072012decc", - "sha256:49a3b78a5af63ec10d8604180380c13dcd870aba7928c1fe04e881d5c792dc4e", - "sha256:4df98d4a9cd6a88d6a585852f56f2155c9cdb6aec78361a19f938810aa020954", - "sha256:5045e892cfdaecc5b4c01822f353cf2c8feb88a6ec1c0adef2a2e705eef0f656", - "sha256:5244324676254697fe5c181fc762284e2c5fceeb1c4e3e7f6aca2b6f107e60dc", - "sha256:54635102ba3cf5da26eb6f96c4b8c53af8a9c0d97b64bdcb592596a6255d8518", - "sha256:54a7e1380dfece8847c71bf7e33da5d084e9b889c75eca19100ef98027bd9f56", - "sha256:55d03fea4c4e9fd0ad75dc2e7e2b6757b80c152c032ea1d1de487461d8140efc", - "sha256:698e84142f3f884114ea8cf83e7a67ca8f4ace8454e78fe960646c6c91c63bfa", - "sha256:6aa5e2e7fc9bc042ae82d8b79d795b9a62bd8f15ba1e7594e3db243f158b5565", - "sha256:7653fa39578957bc42e5ebc15cf4361d9e0ee4b702d7d5ec96cdac860953c5b4", - "sha256:765f036a3d00395a326df2835d8f86b637dbaf9832f90f5d196c3b8a7a5080cb", - "sha256:78bc995e004681246e85e28e068111a4c3f35f34e6c62da1471e844ee1446250", - "sha256:7a07f40ef8f0fbc5ef1000d0c78771f4d5ca03b4953fc162749772916b298fc4", - "sha256:8b570a1537367b52396e53325769608f2a687ec9a4363647af1cded8928af959", - "sha256:987d13fe1d23e12a66ca2073b8d2e2a75cec2ecb8eab43ff5624ba0ad42764bc", - "sha256:9896fca4a8eb246defc8b2a7ac77ef7553b638e04fbf170bff78a40fa8a91474", - "sha256:9e9e3c4020aa2dc62d5dd6743a69e399ce3de58320522948af6140ac959ab863", - "sha256:a0b838c37ba596fcbfca71651a104a611543077156cb0a26fe0c475e1f152ee8", - "sha256:a4d176cfdfde84f732c4a53109b293d05883e952bbba68b857ae446fa3119b4f", - "sha256:a76055d5cb1c23485d7ddae533229039b850db711c554a12ea64a0fd8a0129e2", - "sha256:a76cd37d229fc385738bd1ce4cba2a121cf26b53864c1772694ad0ad348e509e", - "sha256:a7cc49ef48a3c7a0005a949f3c04f8baa5409d3f663a1b36f0eba9bfe2a0396e", - "sha256:abf5ebbec056817057bfafc0445916bb688a255a5146f900445d081db08cbabb", - "sha256:b0fe73bac2fed83839dbdbe6da84ae2a31c11cfc1c777a40dbd8ac8a6ed1560f", - "sha256:b6f14a9cd50c3cb100eb94b3273131c80d102e19bb20253ac7bd7336118a673a", - "sha256:b83041cda633871572f0d3c41dddd5582ad7d22f65a72eacd8d3d6d00291df26", - 
"sha256:b835aba863195269ea358cecc21b400276747cc977492319fd7682b8cd2c253d", - "sha256:bf1196dcc239e608605b716e7b166eb5faf4bc192f8a44b81e85251e62584bd2", - "sha256:c669391319973e49a7c6230c218a1e3044710bc1ce4c8e6eb71f7e6d43a2c131", - "sha256:c7556bafeaa0a50e2fe7dc86e0382dea349ebcad8f010d5a7dc6ba568eaaa789", - "sha256:c8f253a84dbd2c63c19590fa86a032ef3d8cc18923b8049d91bcdeeb2581fbf6", - "sha256:d18b66fe626ac412d96c2ab536306c736c66cf2a31c243a45025156cc190dc8a", - "sha256:d5291d98cd3ad9a562883468c690a2a238c4a6388ab3bd155b0c75dd55ece858", - "sha256:d5c31fe855c77cad679b302aabc42d724ed87c043b1432d457f4976add1c2c3e", - "sha256:d6e427c7378c7f1b2bef6a344c925b8b63623d3321c09a237b7cc0e77dd98ceb", - "sha256:dac1ebf6983148b45b5fa48593950f90ed6d1d26300604f321c74a9ca1609f8e", - "sha256:de8153a7aae3835484ac168a9a9bdaa0c5eee4e0bc595503c95d53b942879c84", - "sha256:e1a0d1924a5013d4f294087e00024ad25668234569289650929ab871231668e7", - "sha256:e7902211afd0af05fbadcc9a312e4cf10f27b779cf1323e78d52377ae4b72bea", - "sha256:e888ff76ceb39601c59e219f281466c6d7e66bd375b4ec1ce83bcdc68306796b", - "sha256:f06e5a9e99b7df44640767842f414ed5d7bedaaa78cd817ce04bbd6fd86e2dd6", - "sha256:f6be2d708a9d0e9b0054856f07ac7070fbe1754be40ca8525d5adccdbda8f475", - "sha256:f9917691f410a2e0897d1ef99619fd3f7dd503647c8ff2475bf90c3cf222ad74", - "sha256:fc1a75aa8f11b87910ffd98de62b29d6520b6d6e8a3de69a70ca34dea85d2a8a", - "sha256:fe8512ed897d5daf089e5bd010c3dc03bb1bdae00b35588c49b98268d4a01e00" + "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", + "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", + "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", + "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", + "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", + "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", + "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", + "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df", + "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", + "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", + "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", + "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", + "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", + "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371", + "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2", + "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", + "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52", + "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", + "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", + "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", + "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", + "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", + "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", + "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", + "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", + "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", + "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", + 
"sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", + "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", + "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9", + "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", + "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", + "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", + "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", + "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", + "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", + "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a", + "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", + "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", + "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", + "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", + "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", + "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", + "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", + "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", + "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f", + "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50", + "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", + "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", + "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", + "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", + "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", + "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", + "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", + "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf", + "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", + "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", + "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", + "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", + "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68" ], "markers": "python_version >= '3.7'", - "version": "==2.1.4" + "version": "==2.1.5" }, "matplotlib": { "hashes": [ diff --git a/README.md b/README.md index c9ab0ae..39a44fa 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ SPDX-FileCopyrightText: 2023 Berkay Bozkurt # Sales-Lead-Qualifier Project (AMOS WS 2023/24) -## Sum Insight Logo +## Sum Insights Logo diff --git a/src/docs/conf.py b/src/docs/conf.py index 8ae4ed8..0288591 100644 --- a/src/docs/conf.py +++ b/src/docs/conf.py @@ -22,7 +22,13 @@ # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.napoleon"] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "myst_parser", +] + templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/src/docs/index.rst b/src/docs/index.rst 
index 5cdad59..ad0ace2 100644 --- a/src/docs/index.rst +++ b/src/docs/index.rst @@ -13,6 +13,7 @@ Welcome to Sales Lead Qualifier's documentation! :maxdepth: 2 :caption: Contents: + readme_link modules Indices and tables From 65f68a83dcb3c3999be472963757f994c439be5b Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 14:14:27 +0100 Subject: [PATCH 23/51] licenses added to readme linker Signed-off-by: Berkay Bozkurt --- .gitignore | 1 + src/docs/readme_link.md | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 src/docs/readme_link.md diff --git a/.gitignore b/.gitignore index 1ef6d26..c346f48 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ bin/ !**/docs/index.rst !**/docs/make.bat !**/docs/Makefile +!**/docs/readme_link.md # Env files *.env diff --git a/src/docs/readme_link.md b/src/docs/readme_link.md new file mode 100644 index 0000000..e5f876e --- /dev/null +++ b/src/docs/readme_link.md @@ -0,0 +1,8 @@ + + +```{include} ../../README.md + +``` From 960fdabd0c64a0da2a767be8681eb2fab7637be2 Mon Sep 17 00:00:00 2001 From: Berkay Bozkurt Date: Sat, 3 Feb 2024 14:17:55 +0100 Subject: [PATCH 24/51] sphinx moved under dev-packages Signed-off-by: Berkay Bozkurt --- Pipfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Pipfile b/Pipfile index e407593..9c56c91 100644 --- a/Pipfile +++ b/Pipfile @@ -16,6 +16,9 @@ plotly = "==5.18.0" pre-commit = "==3.5.0" pytest = "==7.4.0" pytest-env = "==1.0.1" +sphinx = "==7.2.6" +sphinx_rtd_theme = "==2.0.0" +myst_parser = "==2.0.0" [packages] autocorrect = "==2.6.1" @@ -48,9 +51,6 @@ tiktoken = "==0.5.1" torch = "==2.1.2" tqdm = "==4.65.0" xgboost = "==2.0.3" -sphinx = "==7.2.6" -sphinx_rtd_theme = "==2.0.0" -myst_parser = "==2.0.0" [requires] python_version = "3.10" From aa8ced84ae0501678be7172956c577a24bc2b0d2 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Sat, 3 Feb 2024 22:29:48 +0100 Subject: [PATCH 25/51] updated pipfile, removed debugging prints, added logs Signed-off-by: Ahmed Sheta --- Pipfile.lock | 430 ++++++++++++++++++++-------------------------- src/demo/demos.py | 16 +- 2 files changed, 204 insertions(+), 242 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index f6b083a..d3f4ad6 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "9e3d29b16e3d34d5c059c059728a9a36510bd8554044aa224a482cc910d553c1" + "sha256": "8d74161673d9b82cb7933149388452406f1efaf7a82db95bfd11997ef8b36d33" }, "pipfile-spec": 6, "requires": { @@ -122,14 +122,6 @@ "markers": "python_version >= '3.7'", "version": "==1.3.1" }, - "alabaster": { - "hashes": [ - "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", - "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92" - ], - "markers": "python_version >= '3.9'", - "version": "==0.7.16" - }, "annotated-types": { "hashes": [ "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43", @@ -169,14 +161,6 @@ "index": "pypi", "version": "==2.6.1" }, - "babel": { - "hashes": [ - "sha256:6919867db036398ba21eb5c7a0f6b28ab8cbc3ae7a73a44ebe34ae74a4e7d363", - "sha256:efb1a25b7118e67ce3a259bed20545c29cb68be8ad2c784c83689981b7a57287" - ], - "markers": "python_version >= '3.7'", - "version": "==2.14.0" - }, "beautifulsoup4": { "hashes": [ "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da", @@ -327,7 +311,7 @@ "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27", "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df" 
], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'", "version": "==0.7.2" }, "colorama": { @@ -517,14 +501,6 @@ "markers": "python_version >= '3.8'", "version": "==2.5.0" }, - "docutils": { - "hashes": [ - "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", - "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b" - ], - "markers": "python_version >= '3.7'", - "version": "==0.20.1" - }, "email-validator": { "hashes": [ "sha256:a4b0bd1cf55f073b924258d19321b1f3aa74b4b5a71a42c305575dba920e1a44", @@ -736,14 +712,6 @@ "markers": "python_version >= '3.5'", "version": "==3.6" }, - "imagesize": { - "hashes": [ - "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", - "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.4.1" - }, "jinja2": { "hashes": [ "sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa", @@ -888,14 +856,6 @@ "markers": "python_version >= '3.8' and python_version < '4.0'", "version": "==2.0.1" }, - "markdown-it-py": { - "hashes": [ - "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", - "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb" - ], - "markers": "python_version >= '3.8'", - "version": "==3.0.0" - }, "markupsafe": { "hashes": [ "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", @@ -962,22 +922,6 @@ "markers": "python_version >= '3.7'", "version": "==2.1.5" }, - "mdit-py-plugins": { - "hashes": [ - "sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9", - "sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b" - ], - "markers": "python_version >= '3.8'", - "version": "==0.4.0" - }, - "mdurl": { - "hashes": [ - "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", - "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba" - ], - "markers": "python_version >= '3.7'", - "version": "==0.1.2" - }, "more-itertools": { "hashes": [ "sha256:1bc4f91ee5b1b31ac7ceacc17c09befe6a40a503907baf9c839c229b5095cfd2", @@ -1089,14 +1033,6 @@ "markers": "python_version >= '3.7'", "version": "==6.0.5" }, - "myst-parser": { - "hashes": [ - "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14", - "sha256:ea929a67a6a0b1683cdbe19b8d2e724cd7643f8aa3e7bb18dd65beac3483bead" - ], - "markers": "python_version >= '3.8'", - "version": "==2.0.0" - }, "networkx": { "hashes": [ "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", @@ -1481,14 +1417,6 @@ "markers": "python_version >= '3.8'", "version": "==2.16.1" }, - "pygments": { - "hashes": [ - "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c", - "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367" - ], - "markers": "python_version >= '3.7'", - "version": "==2.17.2" - }, "pylanguagetool": { "hashes": [ "sha256:406629d7ed1a78d95499ebebc7f5a4950f714904a8117edb78f89757fcd90fbe", @@ -1531,6 +1459,14 @@ "markers": "python_version >= '3.9'", "version": "==3.6.1" }, + "pyreadline3": { + "hashes": [ + "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae", + "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb" + ], + 
"markers": "python_version >= '3.8' and sys_platform == 'win32'", + "version": "==3.4.1" + }, "pyspellchecker": { "hashes": [ "sha256:b5ef23437702b8d03626f814b9646779b572d378b325ad252d8a8e616b3d76db", @@ -1545,7 +1481,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, "python-dotenv": { @@ -1564,63 +1500,6 @@ ], "version": "==2024.1" }, - "pyyaml": { - "hashes": [ - "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", - "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc", - "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df", - "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741", - "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206", - "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27", - "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595", - "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62", - "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98", - "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696", - "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290", - "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9", - "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d", - "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6", - "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867", - "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47", - "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486", - "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6", - "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3", - "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007", - "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938", - "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0", - "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c", - "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735", - "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d", - "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28", - "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4", - "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba", - "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8", - "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef", - "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5", - "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd", - "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3", - "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0", - "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", - "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c", - "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c", - 
"sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924", - "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34", - "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43", - "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859", - "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673", - "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54", - "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a", - "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b", - "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab", - "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa", - "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c", - "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585", - "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d", - "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f" - ], - "markers": "python_version >= '3.6'", - "version": "==6.0.1" - }, "regex": { "hashes": [ "sha256:0694219a1d54336fd0445ea382d49d36882415c0134ee1e8332afd1529f0baa5", @@ -1880,7 +1759,7 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "sniffio": { @@ -1891,13 +1770,6 @@ "markers": "python_version >= '3.7'", "version": "==1.3.0" }, - "snowballstemmer": { - "hashes": [ - "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", - "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" - ], - "version": "==2.2.0" - }, "soupsieve": { "hashes": [ "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", @@ -1906,79 +1778,6 @@ "markers": "python_version >= '3.8'", "version": "==2.5" }, - "sphinx": { - "hashes": [ - "sha256:1e09160a40b956dc623c910118fa636da93bd3ca0b9876a7b3df90f07d691560", - "sha256:9a5160e1ea90688d5963ba09a2dcd8bdd526620edbb65c328728f1b2228d5ab5" - ], - "index": "pypi", - "markers": "python_version >= '3.9'", - "version": "==7.2.6" - }, - "sphinx-rtd-theme": { - "hashes": [ - "sha256:bd5d7b80622406762073a04ef8fadc5f9151261563d47027de09910ce03afe6b", - "sha256:ec93d0856dc280cf3aee9a4c9807c60e027c7f7b461b77aeffed682e68f0e586" - ], - "markers": "python_version >= '3.6'", - "version": "==2.0.0" - }, - "sphinxcontrib-applehelp": { - "hashes": [ - "sha256:c40a4f96f3776c4393d933412053962fac2b84f4c99a7982ba42e09576a70619", - "sha256:cb61eb0ec1b61f349e5cc36b2028e9e7ca765be05e49641c97241274753067b4" - ], - "markers": "python_version >= '3.9'", - "version": "==1.0.8" - }, - "sphinxcontrib-devhelp": { - "hashes": [ - "sha256:6485d09629944511c893fa11355bda18b742b83a2b181f9a009f7e500595c90f", - "sha256:9893fd3f90506bc4b97bdb977ceb8fbd823989f4316b28c3841ec128544372d3" - ], - "markers": "python_version >= '3.9'", - "version": "==1.0.6" - }, - "sphinxcontrib-htmlhelp": { - "hashes": [ - "sha256:0dc87637d5de53dd5eec3a6a01753b1ccf99494bd756aafecd74b4fa9e729015", - "sha256:393f04f112b4d2f53d93448d4bce35842f62b307ccdc549ec1585e950bc35e04" - ], - "markers": "python_version >= '3.9'", - "version": "==2.0.5" - }, - "sphinxcontrib-jquery": { - "hashes": [ - 
"sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", - "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae" - ], - "markers": "python_version >= '2.7'", - "version": "==4.1" - }, - "sphinxcontrib-jsmath": { - "hashes": [ - "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", - "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" - ], - "markers": "python_version >= '3.5'", - "version": "==1.0.1" - }, - "sphinxcontrib-qthelp": { - "hashes": [ - "sha256:053dedc38823a80a7209a80860b16b722e9e0209e32fea98c90e4e6624588ed6", - "sha256:e2ae3b5c492d58fcbd73281fbd27e34b8393ec34a073c792642cd8e529288182" - ], - "markers": "python_version >= '3.9'", - "version": "==1.0.7" - }, - "sphinxcontrib-serializinghtml": { - "hashes": [ - "sha256:326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7", - "sha256:93f3f5dc458b91b192fe10c397e324f262cf163d79f3282c158e8436a2c4511f" - ], - "markers": "python_version >= '3.9'", - "version": "==1.1.10" - }, "sympy": { "hashes": [ "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5", @@ -2294,6 +2093,14 @@ } }, "develop": { + "alabaster": { + "hashes": [ + "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", + "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92" + ], + "markers": "python_version >= '3.9'", + "version": "==0.7.16" + }, "anyio": { "hashes": [ "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780", @@ -2302,14 +2109,6 @@ "markers": "python_version >= '3.7'", "version": "==3.7.1" }, - "appnope": { - "hashes": [ - "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24", - "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e" - ], - "markers": "platform_system == 'Darwin'", - "version": "==0.1.3" - }, "argon2-cffi": { "hashes": [ "sha256:879c3e79a2729ce768ebb7d36d4609e3a78a4ca2ec3a9f12286ca057e3d0db08", @@ -2571,6 +2370,15 @@ "markers": "python_full_version >= '3.7.0'", "version": "==3.3.2" }, + "colorama": { + "hashes": [ + "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", + "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6" + ], + "index": "pypi", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'", + "version": "==0.4.6" + }, "comm": { "hashes": [ "sha256:0bc91edae1344d39d3661dcbc36937181fdaddb304790458f8b044dbc064b89a", @@ -2743,6 +2551,14 @@ ], "version": "==0.3.8" }, + "docutils": { + "hashes": [ + "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", + "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b" + ], + "markers": "python_version >= '3.7'", + "version": "==0.20.1" + }, "exceptiongroup": { "hashes": [ "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14", @@ -2871,6 +2687,14 @@ "markers": "python_version >= '3.5'", "version": "==3.6" }, + "imagesize": { + "hashes": [ + "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", + "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.4.1" + }, "iniconfig": { "hashes": [ "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", @@ -3131,6 +2955,14 @@ "markers": "python_version >= '3.7'", "version": "==1.4.5" }, + "markdown-it-py": { + "hashes": [ + 
"sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", + "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb" + ], + "markers": "python_version >= '3.8'", + "version": "==3.0.0" + }, "markupsafe": { "hashes": [ "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", @@ -3248,6 +3080,22 @@ "markers": "python_version >= '3.6'", "version": "==0.7.0" }, + "mdit-py-plugins": { + "hashes": [ + "sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9", + "sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b" + ], + "markers": "python_version >= '3.8'", + "version": "==0.4.0" + }, + "mdurl": { + "hashes": [ + "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba" + ], + "markers": "python_version >= '3.7'", + "version": "==0.1.2" + }, "mistune": { "hashes": [ "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205", @@ -3256,6 +3104,14 @@ "markers": "python_version >= '3.7'", "version": "==3.0.2" }, + "myst-parser": { + "hashes": [ + "sha256:7c36344ae39c8e740dad7fdabf5aa6fc4897a813083c6cc9990044eb93656b14", + "sha256:ea929a67a6a0b1683cdbe19b8d2e724cd7643f8aa3e7bb18dd65beac3483bead" + ], + "markers": "python_version >= '3.8'", + "version": "==2.0.0" + }, "nbclient": { "hashes": [ "sha256:4b28c207877cf33ef3a9838cdc7a54c5ceff981194a82eac59d558f05487295e", @@ -3384,14 +3240,6 @@ "markers": "python_version >= '3.6'", "version": "==0.8.3" }, - "pexpect": { - "hashes": [ - "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", - "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f" - ], - "markers": "sys_platform != 'win32'", - "version": "==4.9.0" - }, "pillow": { "hashes": [ "sha256:0304004f8067386b477d20a518b50f3fa658a28d44e4116970abfcd94fac34a8", @@ -3538,14 +3386,6 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "version": "==5.9.8" }, - "ptyprocess": { - "hashes": [ - "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", - "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220" - ], - "markers": "os_name != 'nt'", - "version": "==0.7.0" - }, "pure-eval": { "hashes": [ "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350", @@ -3615,7 +3455,7 @@ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.8.2" }, "python-json-logger": { @@ -3626,6 +3466,38 @@ "markers": "python_version >= '3.6'", "version": "==2.0.7" }, + "pywin32": { + "hashes": [ + "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d", + "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65", + "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e", + "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b", + "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4", + "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040", + "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a", + "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36", + 
"sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8", + "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e", + "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802", + "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a", + "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407", + "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0" + ], + "markers": "sys_platform == 'win32' and platform_python_implementation != 'PyPy'", + "version": "==306" + }, + "pywinpty": { + "hashes": [ + "sha256:1617b729999eb6713590e17665052b1a6ae0ad76ee31e60b444147c5b6a35dca", + "sha256:189380469ca143d06e19e19ff3fba0fcefe8b4a8cc942140a6b863aed7eebb2d", + "sha256:21319cd1d7c8844fb2c970fb3a55a3db5543f112ff9cfcd623746b9c47501575", + "sha256:7520575b6546db23e693cbd865db2764097bd6d4ef5dc18c92555904cd62c3d4", + "sha256:8197de460ae8ebb7f5d1701dfa1b5df45b157bb832e92acba316305e18ca00dd", + "sha256:853985a8f48f4731a716653170cd735da36ffbdc79dcb4c7b7140bce11d8c722" + ], + "markers": "os_name == 'nt'", + "version": "==2.0.12" + }, "pyyaml": { "hashes": [ "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", @@ -3941,7 +3813,7 @@ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.16.0" }, "sniffio": { @@ -3952,6 +3824,13 @@ "markers": "python_version >= '3.7'", "version": "==1.3.0" }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, "soupsieve": { "hashes": [ "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690", @@ -3960,6 +3839,79 @@ "markers": "python_version >= '3.8'", "version": "==2.5" }, + "sphinx": { + "hashes": [ + "sha256:1e09160a40b956dc623c910118fa636da93bd3ca0b9876a7b3df90f07d691560", + "sha256:9a5160e1ea90688d5963ba09a2dcd8bdd526620edbb65c328728f1b2228d5ab5" + ], + "index": "pypi", + "markers": "python_version >= '3.9'", + "version": "==7.2.6" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:bd5d7b80622406762073a04ef8fadc5f9151261563d47027de09910ce03afe6b", + "sha256:ec93d0856dc280cf3aee9a4c9807c60e027c7f7b461b77aeffed682e68f0e586" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:c40a4f96f3776c4393d933412053962fac2b84f4c99a7982ba42e09576a70619", + "sha256:cb61eb0ec1b61f349e5cc36b2028e9e7ca765be05e49641c97241274753067b4" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.8" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:6485d09629944511c893fa11355bda18b742b83a2b181f9a009f7e500595c90f", + "sha256:9893fd3f90506bc4b97bdb977ceb8fbd823989f4316b28c3841ec128544372d3" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.6" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:0dc87637d5de53dd5eec3a6a01753b1ccf99494bd756aafecd74b4fa9e729015", + "sha256:393f04f112b4d2f53d93448d4bce35842f62b307ccdc549ec1585e950bc35e04" + ], + "markers": "python_version >= '3.9'", + "version": "==2.0.5" + }, + "sphinxcontrib-jquery": { + "hashes": [ + 
"sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a", + "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae" + ], + "markers": "python_version >= '2.7'", + "version": "==4.1" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:053dedc38823a80a7209a80860b16b722e9e0209e32fea98c90e4e6624588ed6", + "sha256:e2ae3b5c492d58fcbd73281fbd27e34b8393ec34a073c792642cd8e529288182" + ], + "markers": "python_version >= '3.9'", + "version": "==1.0.7" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7", + "sha256:93f3f5dc458b91b192fe10c397e324f262cf163d79f3282c158e8436a2c4511f" + ], + "markers": "python_version >= '3.9'", + "version": "==1.1.10" + }, "stack-data": { "hashes": [ "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", diff --git a/src/demo/demos.py b/src/demo/demos.py index 83723b2..7437aa1 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -254,6 +254,7 @@ def predict_MerchantSize_on_lead_data_demo(): ) ######################### preprocessing the leads ################################## + log.info(f"Preprocessing the leads!") if get_yes_no_input("Run on S3? (y/n)\n'n' means it will run locally!\n"): S3_bool = True else: @@ -271,6 +272,7 @@ def predict_MerchantSize_on_lead_data_demo(): preprocessor.save_preprocessed_data() ############################## adapting the preprocessing files ########################### + log.info(f"Adapting the leads' preprocessed data for the ML model!") # load the data from the CSV files historical_preprocessed_data = pd.read_csv( "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" @@ -307,10 +309,17 @@ def predict_MerchantSize_on_lead_data_demo(): historical_columns_order ] if S3_bool: + log.info(f"Adapting the leads' preprocessed data for the ML model!") + toBePredicted_output_path_s3 = ( + "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" + ) toBePredicted_preprocessed_data.to_csv( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv", + toBePredicted_output_path_s3, index=False, ) + log.info( + f"Saving the adapted preprocessed data at {toBePredicted_output_path_s3}" + ) else: path_components = preprocessor.data_path.split( "\\" if "\\" in preprocessor.data_path else "/" @@ -321,6 +330,9 @@ def predict_MerchantSize_on_lead_data_demo(): toBePredicted_preprocessed_data.to_csv( local_preprocessed_data_path, index=False ) + log.info( + f"Saving the adapted preprocessed data at {local_preprocessed_data_path}" + ) # check if columns in both dataframe are in same order and same number assert list(toBePredicted_preprocessed_data.columns) == list( @@ -390,8 +402,6 @@ def check_classification_task(string): # first 5 columns: Last Name,First Name,Company / Account,Phone,Email, raw_data = enriched_data.iloc[:, :5] - print(f"raw_data = {raw_data.shape}") - print(f"remapped_predictions = {len(remapped_predictions)}") raw_data["PredictedMerchantSize"] = remapped_predictions if S3_bool: From 98dc73f2960703784b878b7905f21286e0fe6cb9 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Sun, 4 Feb 2024 18:10:37 +0100 Subject: [PATCH 26/51] modified such hat models can be loaded from local path and 
applied in Merchant Size Prediction Signed-off-by: Ahmed Sheta --- src/demo/demos.py | 48 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/demo/demos.py b/src/demo/demos.py index 7437aa1..f5818e5 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -254,7 +254,6 @@ def predict_MerchantSize_on_lead_data_demo(): ) ######################### preprocessing the leads ################################## - log.info(f"Preprocessing the leads!") if get_yes_no_input("Run on S3? (y/n)\n'n' means it will run locally!\n"): S3_bool = True else: @@ -264,6 +263,7 @@ def predict_MerchantSize_on_lead_data_demo(): sys.path.append(parent_dir) from preprocessing import Preprocessing + log.info(f"Preprocessing the leads...") preprocessor = Preprocessing( filter_null_data=False, historical_bool=False, S3_bool=S3_bool ) @@ -272,7 +272,7 @@ def predict_MerchantSize_on_lead_data_demo(): preprocessor.save_preprocessed_data() ############################## adapting the preprocessing files ########################### - log.info(f"Adapting the leads' preprocessed data for the ML model!") + log.info(f"Adapting the leads' preprocessed data for the ML model...") # load the data from the CSV files historical_preprocessed_data = pd.read_csv( "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" @@ -309,7 +309,6 @@ def predict_MerchantSize_on_lead_data_demo(): historical_columns_order ] if S3_bool: - log.info(f"Adapting the leads' preprocessed data for the ML model!") toBePredicted_output_path_s3 = ( "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" ) @@ -343,9 +342,14 @@ def predict_MerchantSize_on_lead_data_demo(): bucket_name = "amos--models" - model_name = get_string_input( - "Provide model file name in amos--models/models S3 Bucket\nInput example: lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model.pkl\n" - ) + if S3_bool: + model_name = get_string_input( + "Provide model file name in amos--models/models S3 Bucket\nInput example: lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model.pkl\n" + ) + else: + model_name = get_string_input( + "Provide model file name in data/models local directory\nInput example: lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model.pkl\n" + ) # file_key = "models/lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model_updated.pkl" # adjust according to the desired model model_name = model_name.replace(" ", "") xgb_bool = False @@ -364,17 +368,29 @@ def check_classification_task(string): False classification_task_3 = check_classification_task(file_key) - # create an S3 client - s3 = boto3.client("s3") - - # download the file from S3 - response = s3.get_object(Bucket=bucket_name, Key=file_key) - model_content = response["Body"].read() - # load model - with BytesIO(model_content) as model_file: - model = joblib.load(model_file) - log.info(f"Loaded the model sucessfully!") + try: + if S3_bool: + # create an S3 client + s3 = boto3.client("s3") + # download the file from S3 + response = s3.get_object(Bucket=bucket_name, Key=file_key) + model_content = response["Body"].read() + # load model + with BytesIO(model_content) as model_file: + model = joblib.load(model_file) + log.info(f"Loaded the model from S3 bucket sucessfully!") + else: + path_components = preprocessor.data_path.split( + "\\" if "\\" in preprocessor.data_path else "/" + ) + path_components.pop() + path_components.append(file_key) + model_local_path = "/".join(path_components) + model = joblib.load(model_local_path) + log.info(f"Loaded 
the model from the local path sucessfully!") + except: + log.error("No model found with the given name!") if S3_bool: data_path = ( From af8a7e2f72ee3fb66513841da600f063f0b25e74 Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Sun, 4 Feb 2024 19:38:48 +0100 Subject: [PATCH 27/51] modifications after review Signed-off-by: Ahmed Sheta --- src/demo/demos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/demo/demos.py b/src/demo/demos.py index f5818e5..b7a1481 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -250,7 +250,7 @@ def predict_MerchantSize_on_lead_data_demo(): import pandas as pd log.info( - "Note: In case of running locally, enriched data must be located at src/data/leads_enriched.csv locally\nIn case of running on S3, enriched data must be located at s3://amos--data--events/leads/enriched.csv or" + "Note: In case of running locally, enriched data must be located at src/data/leads_enriched.csv\nIn case of running on S3, enriched data must be located at s3://amos--data--events/leads/enriched.csv" ) ######################### preprocessing the leads ################################## From b27c0b5567e5e38c699ead45c482d668f7820e08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 11:03:15 +0100 Subject: [PATCH 28/51] quick fix: make sure db type is respected in preprocessing step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/data/preprocessed_data_files/.gitkeep | 0 src/demo/demos.py | 8 ++------ 2 files changed, 2 insertions(+), 6 deletions(-) create mode 100644 src/data/preprocessed_data_files/.gitkeep diff --git a/src/data/preprocessed_data_files/.gitkeep b/src/data/preprocessed_data_files/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/demo/demos.py b/src/demo/demos.py index b7a1481..3b0bdc7 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -8,13 +8,13 @@ import re -import subprocess import pandas as pd import xgboost as xgb from sklearn.metrics import classification_report from bdc.pipeline import Pipeline +from config import DATABASE_TYPE from database import get_database from demo.console_utils import ( get_int_input, @@ -241,7 +241,6 @@ def preprocessing_demo(): def predict_MerchantSize_on_lead_data_demo(): import os - import pickle import sys from io import BytesIO @@ -254,10 +253,7 @@ def predict_MerchantSize_on_lead_data_demo(): ) ######################### preprocessing the leads ################################## - if get_yes_no_input("Run on S3? 
(y/n)\n'n' means it will run locally!\n"): - S3_bool = True - else: - S3_bool = False + S3_bool = DATABASE_TYPE == "S3" current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() parent_dir = os.path.join(current_dir, "..") sys.path.append(parent_dir) From 5dad860f5224a0c6084acaae09945106a12e43ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 11:42:08 +0100 Subject: [PATCH 29/51] add folder for models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- .gitignore | 3 ++- src/data/models/.gitkeep | 0 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 src/data/models/.gitkeep diff --git a/.gitignore b/.gitignore index c346f48..e746e53 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,8 @@ bin/ !**/data/merged_geo.geojson **/data/reviews/*.json **/data/gpt-results/*.json -**/data/models/* +**/data/models/*.pkl +**/data/models/*.joblib **/data/classification_reports/* **/docs/* diff --git a/src/data/models/.gitkeep b/src/data/models/.gitkeep new file mode 100644 index 0000000..e69de29 From c718b1bdd043f0537c89629f0729797f04d5ea47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 11:53:50 +0100 Subject: [PATCH 30/51] return to menu if invalid model name is given MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/demo/demos.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/demo/demos.py b/src/demo/demos.py index 3b0bdc7..2371f1d 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -387,6 +387,7 @@ def check_classification_task(string): log.info(f"Loaded the model from the local path sucessfully!") except: log.error("No model found with the given name!") + return if S3_bool: data_path = ( From 64a23ba7466faa8ff68080955f46e4fc16b55687 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Mon, 5 Feb 2024 19:03:33 +0100 Subject: [PATCH 31/51] Added test suite for AnalyzeEmails and HashGenerator steps Signed-off-by: Felix Zailskas --- .gitignore | 3 + tests/steps/test_analyze_emails.py | 134 +++++++++++++++++++++++++++++ tests/steps/test_hash_generator.py | 51 +++++++++++ 3 files changed, 188 insertions(+) create mode 100644 tests/steps/test_analyze_emails.py create mode 100644 tests/steps/test_hash_generator.py diff --git a/.gitignore b/.gitignore index e746e53..2662d39 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,6 @@ report.pdf **/cache/* !.gitkeep + +# testing +.coverage diff --git a/tests/steps/test_analyze_emails.py b/tests/steps/test_analyze_emails.py new file mode 100644 index 0000000..7136c73 --- /dev/null +++ b/tests/steps/test_analyze_emails.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Felix Zailskas + +import unittest +from unittest.mock import MagicMock, patch + +import pandas as pd + +import bdc.steps.helpers.generate_hash_leads +from bdc.steps.analyze_emails import ( + AnalyzeEmails, + analyze_email_account, + extract_custom_domain, +) + + +def get_mock_lead_hash_generator(): + class MockLeadHashGenerator: + def hash_lead(self, lead_data): + return "" + + def hash_check( + self, + lead_data: pd.Series, + data_fill_function: callable, + step_name: str, + fields_tofill: list[str], + *args, + **kwargs, + ): + return data_fill_function(*args, **kwargs) + + return MockLeadHashGenerator() + + +class TestExtractCustomDomain(unittest.TestCase): + def 
test_valid_email(self): + email = "user@example.com" + result = extract_custom_domain(email) + expected = pd.Series(["example.com", True]) + self.assertTrue(result.equals(expected)) + + def test_invalid_email(self): + email = "invalid_email" + result = extract_custom_domain(email) + expected = pd.Series([None, False]) + self.assertTrue(result.equals(expected)) + + def test_email_with_subdomain(self): + email = "user@sub.example.com" + result = extract_custom_domain(email) + expected = pd.Series(["sub.example.com", True]) + self.assertTrue(result.equals(expected)) + + def test_empty_email(self): + email = "" + result = extract_custom_domain(email) + expected = pd.Series([None, False]) + self.assertTrue(result.equals(expected)) + + +class TestAnalyzeEmailAccount(unittest.TestCase): + def _init_lead(self, Email: str, email_valid: bool): + lead = { + "First Name": "John", + "Last Name": "Doe", + "Email": Email, + "email_valid": email_valid, + } + return lead + + def test_valid_email_account(self): + lead = self._init_lead(Email="john.doe@example.com", email_valid=True) + result = analyze_email_account(lead) + expected = pd.Series([True, True]) + self.assertTrue(result.equals(expected)) + + def test_invalid_email_account(self): + lead = self._init_lead(Email="invalid_email", email_valid=False) + result = analyze_email_account(lead) + expected = pd.Series([False, False]) + self.assertTrue(result.equals(expected)) + + def test_missing_first_name(self): + lead = self._init_lead(Email="john@example.com", email_valid=True) + result = analyze_email_account(lead) + expected = pd.Series([True, False]) + self.assertTrue(result.equals(expected)) + + def test_missing_last_name(self): + lead = self._init_lead(Email="doe123@example.com", email_valid=True) + result = analyze_email_account(lead) + expected = pd.Series([False, True]) + self.assertTrue(result.equals(expected)) + + def test_missing_names(self): + lead = self._init_lead(Email="user@example.com", email_valid=True) + lead = {"Email": "user@example.com", "email_valid": True} + result = analyze_email_account(lead) + expected = pd.Series([False, False]) + self.assertTrue(result.equals(expected)) + + +# class TestStepExecution(unittest.TestCase): +# step: AnalyzeEmails + +# def setUp(self): +# lead_data = { +# "First Name": ["John"] * 3, +# "Last Name": ["Doe"] * 3, +# "Email": [ +# "john.doe@john.com", +# "invalid_email", +# "john@yahoo.com", +# ] +# } +# self.step = AnalyzeEmails(force_refresh=True) +# self.step.df = pd.DataFrame(lead_data) + +# @patch("bdc.steps.helpers.get_lead_hash_generator") +# def test_run_method(self, mock_get_lead_hash_generator): + +# # Mock the hash_check method +# mock_get_lead_hash_generator.return_value = get_mock_lead_hash_generator() + +# # Call the run method +# result = self.step.run() +# assert type(result) is pd.DataFrame +# assert ["First Name", "Last Name", "Email", "domain", "email_valid", "first_name_in_account", +# "last_name_in_account",] in result.columns.to_list() +# assert result["domain"].to_list() == ["john.com", None, None] + +if __name__ == "__main__": + unittest.main() diff --git a/tests/steps/test_hash_generator.py b/tests/steps/test_hash_generator.py new file mode 100644 index 0000000..01f8a24 --- /dev/null +++ b/tests/steps/test_hash_generator.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Felix Zailskas + +import hashlib +import unittest + +import pandas as pd + +from bdc.steps.hash_generator import HashGenerator + + +class YourClassTests(unittest.TestCase): + 
def setUp(self): + self.lead_data = { + "First Name": ["John"], + "Last Name": ["Doe"], + "Company / Account": ["ABC Corp"], + "Phone": ["+4912345678"], + "Email": ["john.doe@john.com"], + } + self.step = HashGenerator(force_refresh=True) + self.step.df = pd.DataFrame(self.lead_data) + + def test_hash_lead(self): + # Calculate the expected hash manually based on the data + expected_hash = hashlib.sha256( + ("John" + "Doe" + "ABC Corp" + "+4912345678" + "john.doe@john.com").encode() + ).hexdigest() + + # Call the hash_lead method with the sample data + result = self.step.run() + + # Assert that the actual hash matches the expected hash + assert type(result) is pd.DataFrame + columns = result.columns.to_list() + assert all( + col in columns + for col in [ + "First Name", + "Last Name", + "Email", + "Company / Account", + "Phone", + "lead_hash", + ] + ) + self.assertEqual(result.iloc[0]["lead_hash"], expected_hash) + + +if __name__ == "__main__": + unittest.main() From 108a57b7009b5918a154980df724e9cf77597ad5 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Mon, 5 Feb 2024 19:16:41 +0100 Subject: [PATCH 32/51] Added simple execution test Signed-off-by: Felix Zailskas --- tests/__init__.py | 16 ++++++ tests/steps/test_analyze_emails.py | 85 +++++++++++++----------------- 2 files changed, 53 insertions(+), 48 deletions(-) create mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..c93c73d --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Felix Zailskas + +import pandas as pd + + +def mock_hash_check( + self, + lead_data: pd.Series, + data_fill_function: callable, + step_name: str, + fields_tofill: list[str], + *args, + **kwargs, +): + return data_fill_function(*args, **kwargs) diff --git a/tests/steps/test_analyze_emails.py b/tests/steps/test_analyze_emails.py index 7136c73..2718bdc 100644 --- a/tests/steps/test_analyze_emails.py +++ b/tests/steps/test_analyze_emails.py @@ -6,31 +6,13 @@ import pandas as pd -import bdc.steps.helpers.generate_hash_leads from bdc.steps.analyze_emails import ( AnalyzeEmails, analyze_email_account, extract_custom_domain, ) - - -def get_mock_lead_hash_generator(): - class MockLeadHashGenerator: - def hash_lead(self, lead_data): - return "" - - def hash_check( - self, - lead_data: pd.Series, - data_fill_function: callable, - step_name: str, - fields_tofill: list[str], - *args, - **kwargs, - ): - return data_fill_function(*args, **kwargs) - - return MockLeadHashGenerator() +from bdc.steps.helpers.generate_hash_leads import LeadHashGenerator +from tests import mock_hash_check class TestExtractCustomDomain(unittest.TestCase): @@ -101,34 +83,41 @@ def test_missing_names(self): self.assertTrue(result.equals(expected)) -# class TestStepExecution(unittest.TestCase): -# step: AnalyzeEmails - -# def setUp(self): -# lead_data = { -# "First Name": ["John"] * 3, -# "Last Name": ["Doe"] * 3, -# "Email": [ -# "john.doe@john.com", -# "invalid_email", -# "john@yahoo.com", -# ] -# } -# self.step = AnalyzeEmails(force_refresh=True) -# self.step.df = pd.DataFrame(lead_data) - -# @patch("bdc.steps.helpers.get_lead_hash_generator") -# def test_run_method(self, mock_get_lead_hash_generator): - -# # Mock the hash_check method -# mock_get_lead_hash_generator.return_value = get_mock_lead_hash_generator() - -# # Call the run method -# result = self.step.run() -# assert type(result) is pd.DataFrame -# assert ["First Name", "Last Name", "Email", "domain", 
"email_valid", "first_name_in_account", -# "last_name_in_account",] in result.columns.to_list() -# assert result["domain"].to_list() == ["john.com", None, None] +class TestStepExecution(unittest.TestCase): + step: AnalyzeEmails + + def setUp(self): + lead_data = { + "First Name": ["John"] * 3, + "Last Name": ["Doe"] * 3, + "Email": [ + "john.doe@john.com", + "invalid_email", + "john@yahoo.com", + ], + } + self.step = AnalyzeEmails(force_refresh=True) + self.step.df = pd.DataFrame(lead_data) + + @patch.object(LeadHashGenerator, "hash_check", mock_hash_check) + def test_run_method(self): + result = self.step.run() + assert type(result) is pd.DataFrame + columns = result.columns.to_list() + assert all( + col in columns + for col in [ + "First Name", + "Last Name", + "Email", + "domain", + "email_valid", + "first_name_in_account", + "last_name_in_account", + ] + ) + assert result["domain"].to_list() == ["john.com", None, None] + if __name__ == "__main__": unittest.main() From 6dcd084658f74cb450549a213e468424618df1d6 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Tue, 6 Feb 2024 09:49:18 +0100 Subject: [PATCH 33/51] Added test case for phone numbers Signed-off-by: Felix Zailskas --- tests/steps/test_hash_generator.py | 2 +- tests/steps/test_preprocess_phonenumbers.py | 114 ++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 tests/steps/test_preprocess_phonenumbers.py diff --git a/tests/steps/test_hash_generator.py b/tests/steps/test_hash_generator.py index 01f8a24..f9b7b56 100644 --- a/tests/steps/test_hash_generator.py +++ b/tests/steps/test_hash_generator.py @@ -9,7 +9,7 @@ from bdc.steps.hash_generator import HashGenerator -class YourClassTests(unittest.TestCase): +class TestStepExecution(unittest.TestCase): def setUp(self): self.lead_data = { "First Name": ["John"], diff --git a/tests/steps/test_preprocess_phonenumbers.py b/tests/steps/test_preprocess_phonenumbers.py new file mode 100644 index 0000000..f4f529b --- /dev/null +++ b/tests/steps/test_preprocess_phonenumbers.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Felix Zailskas + +import unittest +from unittest.mock import patch + +import pandas as pd + +from bdc.steps.helpers.generate_hash_leads import LeadHashGenerator +from bdc.steps.preprocess_phonenumbers import PreprocessPhonenumbers +from tests import mock_hash_check + + +class TestStepExecution(unittest.TestCase): + def setUp(self): + self.lead_data = { + "First Name": ["John"] * 7, + "Last Name": ["Doe"] * 7, + "Phone": [ + "4930183992170", + "invalid_phone", + "442087599036", + "3197010281402", + "436601359011", + "33757056600", + "495111233421", + ], + } + self.step = PreprocessPhonenumbers(force_refresh=True) + self.step.df = pd.DataFrame(self.lead_data) + self.formatted_gt = [ + "+49 30 183992170", + "", + "+44 20 8759 9036", + "+31 970 102 81402", + "+43 660 1359011", + "+33 7 57 05 66 00", + "+49 511 1233421", + ] + self.country_gt = [ + "Germany", + "", + "United Kingdom", + "Netherlands", + "Austria", + "France", + "Germany", + ] + self.area_gt = [ + "Berlin", + "", + "London", + "", + "", + "", + "Hannover", + ] + self.valid_gt = [ + True, + False, + True, + True, + True, + True, + True, + ] + self.possible_gt = [ + True, + False, + True, + True, + True, + True, + True, + ] + + @patch.object(LeadHashGenerator, "hash_check", mock_hash_check) + def test_hash_lead(self): + result = self.step.run() + + assert type(result) is pd.DataFrame + columns = result.columns.to_list() + assert all( + col 
in columns + for col in [ + "First Name", + "Last Name", + "Phone", + "number_formatted", + "number_country", + "number_area", + "number_valid", + "number_possible", + ] + ) + # test formatted number + for test, gt in zip(result["number_formatted"].to_list(), self.formatted_gt): + self.assertEqual(test, gt) + # test country + for test, gt in zip(result["number_country"].to_list(), self.country_gt): + self.assertEqual(test, gt) + # test area + for test, gt in zip(result["number_area"].to_list(), self.area_gt): + self.assertEqual(test, gt) + # test valid + for test, gt in zip(result["number_valid"].to_list(), self.valid_gt): + self.assertEqual(test, gt) + # test possible + for test, gt in zip(result["number_possible"].to_list(), self.possible_gt): + self.assertEqual(test, gt) + + +if __name__ == "__main__": + unittest.main() From 9834ce3cb9fa1d58dffd26ed277524aba217d5b5 Mon Sep 17 00:00:00 2001 From: Felix Zailskas Date: Tue, 6 Feb 2024 10:14:06 +0100 Subject: [PATCH 34/51] Tests for pipeline utils Signed-off-by: Felix Zailskas --- tests/test_pipeline_utils.py | 152 +++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 tests/test_pipeline_utils.py diff --git a/tests/test_pipeline_utils.py b/tests/test_pipeline_utils.py new file mode 100644 index 0000000..434fb6c --- /dev/null +++ b/tests/test_pipeline_utils.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Felix Zailskas + +import os +import unittest +from unittest.mock import MagicMock, mock_open, patch + +from bdc.steps import * +from demo.pipeline_utils import ( + get_all_available_pipeline_json_configs, + get_pipeline_additional_steps, + get_pipeline_config_from_json, + get_pipeline_initial_steps, + get_pipeline_steps, +) + + +class TestPipelineUtils(unittest.TestCase): + def test_get_pipeline_steps(self): + steps = get_pipeline_steps() + self.assertEqual( + [ + (HashGenerator, "Hash Generator", ""), + (AnalyzeEmails, "Analyze Emails", ""), + (ScrapeAddress, "Scrape Address", "(will take a long time)"), + ( + SearchOffeneRegister, + "Search OffeneRegister", + "(will take a long time)", + ), + (PreprocessPhonenumbers, "Phone Number Validation", ""), + ( + GooglePlaces, + "Google API", + "(will use token and generate cost!)", + ), + ( + GooglePlacesDetailed, + "Google API Detailed", + "(will use token and generate cost!)", + ), + ( + GPTReviewSentimentAnalyzer, + "openAI GPT Sentiment Analyzer", + "(will use token and generate cost!)", + ), + ( + GPTSummarizer, + "openAI GPT Summarizer", + "(will use token and generate cost!)", + ), + ( + SmartReviewInsightsEnhancer, + "Smart Review Insights", + "(will take looong time!)", + ), + (RegionalAtlas, "Regionalatlas", ""), + ], + steps, + ) + + def test_get_pipeline_initial_steps(self): + initial_steps = get_pipeline_initial_steps() + self.assertEqual( + [ + (HashGenerator, "Hash Generator", ""), + (AnalyzeEmails, "Analyze Emails", ""), + ], + initial_steps, + ) + + def test_get_pipeline_additional_steps(self): + additional_steps = get_pipeline_additional_steps() + self.assertEqual( + [ + (ScrapeAddress, "Scrape Address", "(will take a long time)"), + ( + SearchOffeneRegister, + "Search OffeneRegister", + "(will take a long time)", + ), + (PreprocessPhonenumbers, "Phone Number Validation", ""), + ( + GooglePlaces, + "Google API", + "(will use token and generate cost!)", + ), + ( + GooglePlacesDetailed, + "Google API Detailed", + "(will use token and generate cost!)", + ), + ( + GPTReviewSentimentAnalyzer, + "openAI GPT 
Sentiment Analyzer", + "(will use token and generate cost!)", + ), + ( + GPTSummarizer, + "openAI GPT Summarizer", + "(will use token and generate cost!)", + ), + ( + SmartReviewInsightsEnhancer, + "Smart Review Insights", + "(will take looong time!)", + ), + (RegionalAtlas, "Regionalatlas", ""), + ], + additional_steps, + ) + + def test_get_all_available_pipeline_json_configs(self): + # Create a temporary directory and add some JSON files for testing + with patch( + "os.listdir", MagicMock(return_value=["config1.json", "config2.json"]) + ): + configs = get_all_available_pipeline_json_configs(config_path="fake_path") + self.assertEqual(configs, ["config1.json", "config2.json"]) + + def test_get_pipeline_config_from_json(self): + # Create a temporary JSON file for testing + mock_json_content = """ + { + "config": { + "steps": [ + {"name": "HashGenerator", "force_refresh": true}, + {"name": "AnalyzeEmails", "force_refresh": false}, + {"name": "GooglePlacesDetailed", "force_refresh": false}, + {"name": "SearchOffeneRegister", "force_refresh": true} + ] + } + } + """ + steps_gt = [ + HashGenerator(force_refresh=True), + AnalyzeEmails(force_refresh=False), + GooglePlacesDetailed(force_refresh=False), + SearchOffeneRegister(force_refresh=True), + ] + with patch("builtins.open", mock_open(read_data=mock_json_content)): + steps = get_pipeline_config_from_json( + "fake_config.json", config_path="fake_path" + ) + for step, gt in zip(steps, steps_gt): + self.assertEqual(type(step), type(gt)) + self.assertEqual(step.name, gt.name) + self.assertEqual(step.force_refresh, gt.force_refresh) + + +if __name__ == "__main__": + unittest.main() From ddc1eb5066a8c5e9f60aaf87eee6ec451d12661a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Tue, 6 Feb 2024 11:15:20 +0100 Subject: [PATCH 35/51] Bugfix/235 all steps bdc errors (#239) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix minor error in regionalatlas step + moved scrape address step to deprecated + added new bdc config Signed-off-by: Lucca Baumgärtner * undo change to input filename Signed-off-by: Lucca Baumgärtner * Adjust gpt caching error logging Signed-off-by: Lucca Baumgärtner * change input file location back to original Signed-off-by: Lucca Baumgärtner * Add explanation for deprecation of scrape_addresses.py Signed-off-by: Lucca Baumgärtner * Adjust error logging for review analysis Signed-off-by: Lucca Baumgärtner * remove deprecated steps from tests Signed-off-by: Lucca Baumgärtner --------- Signed-off-by: Lucca Baumgärtner --- Documentation/ideas.md | 7 +++ .../steps/scrape_address.py | 0 src/bdc/steps/__init__.py | 1 - src/bdc/steps/regionalatlas.py | 5 +-- src/database/leads/local_repository.py | 6 ++- src/database/leads/s3_repository.py | 6 +-- .../config_sprint09_release.json | 4 -- src/demo/pipeline_configs/config_template | 4 -- .../force_refresh_all_steps.json | 43 +++++++++++++++++++ .../force_refresh_all_steps.json.license | 2 + .../pipeline_configs/regionalatlas_only.json | 27 ++++++++++++ .../regionalatlas_only.json.license | 2 + src/demo/pipeline_configs/run_all_steps.json | 24 +++++------ src/demo/pipeline_utils.py | 3 -- tests/test_pipeline_utils.py | 3 -- 15 files changed, 101 insertions(+), 36 deletions(-) rename {src/bdc => deprecated}/steps/scrape_address.py (100%) create mode 100644 src/demo/pipeline_configs/force_refresh_all_steps.json create mode 100644 src/demo/pipeline_configs/force_refresh_all_steps.json.license create mode 100644 
src/demo/pipeline_configs/regionalatlas_only.json create mode 100644 src/demo/pipeline_configs/regionalatlas_only.json.license diff --git a/Documentation/ideas.md b/Documentation/ideas.md index 12a9868..c9eb7bc 100644 --- a/Documentation/ideas.md +++ b/Documentation/ideas.md @@ -25,6 +25,13 @@ The current implementation of the module supports queueing messages from the BDC This step was supposed to be used for querying lead data from the facebook by using either the business owner's name or the company name. The attempt was deprecated as the cost for the needed API token was evaluated too high and because the usage permissions of the facebook API were changed. Furthermore, it is paramount to check the legal ramifications of querying facebook for this kind of data as there might be legal consequences of searching for individuals on facebook instead of their businesses due to data privacy regulations in the EU. +### ScrapeAddresses + +This step was an early experiment, using only the custom domain from an email address. We check if there's a live website running +for the domain, and then try to parse the main site for a business address using a RegEx pattern. The pattern is not very precise +and calling the website, as well as parsing it, takes quite some time, which accumulates for a lot of entries. The Google places +step yields better results for the business address and is faster, that's why `scrape_addresses.py` was deprecated. + ## Possible ML improvements ### Creating data subsets diff --git a/src/bdc/steps/scrape_address.py b/deprecated/steps/scrape_address.py similarity index 100% rename from src/bdc/steps/scrape_address.py rename to deprecated/steps/scrape_address.py diff --git a/src/bdc/steps/__init__.py b/src/bdc/steps/__init__.py index 9b0533b..5736e45 100644 --- a/src/bdc/steps/__init__.py +++ b/src/bdc/steps/__init__.py @@ -9,6 +9,5 @@ from .hash_generator import * from .preprocess_phonenumbers import * from .regionalatlas import * -from .scrape_address import * from .search_offeneregister import * from .step import * diff --git a/src/bdc/steps/regionalatlas.py b/src/bdc/steps/regionalatlas.py index 38b9613..e24f634 100644 --- a/src/bdc/steps/regionalatlas.py +++ b/src/bdc/steps/regionalatlas.py @@ -6,7 +6,6 @@ import geopandas as gpd import osmnx import pandas as pd -from geopandas.tools import sjoin from pandas import DataFrame from tqdm import tqdm @@ -144,13 +143,13 @@ def run(self) -> DataFrame: tqdm.pandas(desc="Computing Regional Score") - self.df[self.added_cols[:-1]] = self.df.progress_apply( + self.df[self.added_cols[-1:]] = self.df.progress_apply( lambda lead: pd.Series( get_lead_hash_generator().hash_check( lead, self.calculate_regional_score, self.name + "_Regional-Score", - self.added_cols[:-1], + self.added_cols[-1:], lead, ) ), diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index c5e53e4..af36e8d 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -4,7 +4,6 @@ import csv import json import os -from datetime import datetime from pathlib import Path import joblib @@ -197,6 +196,11 @@ def fetch_gpt_result(self, file_id, operation_name): try: with open(json_file_path, "r", encoding="utf-8") as json_file: data = json.load(json_file) + if operation_name not in data: + log.info( + f"Data for operation {operation_name} was not found in {json_file_path}" + ) + return "" return data[operation_name] except: log.warning(f"Error loading GPT results from path {json_file_path}.") 
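For context on the `regionalatlas.py` hunk above: the change replaces `self.added_cols[:-1]` with `self.added_cols[-1:]`, so the `progress_apply` result is written into only the last added column instead of into every column except the last. A minimal sketch of the difference between the two slices, using illustrative column names rather than the step's actual `added_cols`:

```python
# Illustrative only — these column names are assumptions, not taken from the RegionalAtlas step.
added_cols = [
    "regional_atlas_pop_density",
    "regional_atlas_pop_development",
    "regional_atlas_regional_score",
]

print(added_cols[:-1])  # every column except the last:
                        # ['regional_atlas_pop_density', 'regional_atlas_pop_development']

print(added_cols[-1:])  # only the last column, which is what
                        # calculate_regional_score() is expected to fill:
                        # ['regional_atlas_regional_score']
```
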
diff --git a/src/database/leads/s3_repository.py b/src/database/leads/s3_repository.py index 2e11ed5..dbdf620 100644 --- a/src/database/leads/s3_repository.py +++ b/src/database/leads/s3_repository.py @@ -95,7 +95,7 @@ def _fetch_object_s3(self, bucket, obj_key): obj = s3.get_object(Bucket=bucket, Key=obj_key) except botocore.exceptions.ClientError as e: log.warning( - f"{e.response['Error']['Code']}: {e.response['Error']['Message']}" + f"{e.response['Error']['Code']}: {e.response['Error']['Message']} (s3://{bucket}/{obj_key})" if "Error" in e.response else f"Error while getting object s3://{bucket}/{obj_key}" ) @@ -209,8 +209,8 @@ def fetch_review(self, place_id): json_content = json.loads(file_content) return json_content except Exception as e: - log.error( - f"Error loading review from S3 with id {place_id}. Error: {str(e)}" + log.info( + f"No reviews in S3 for place with at s3://{bucket}/{key}. Error: {str(e)}" ) return [] diff --git a/src/demo/pipeline_configs/config_sprint09_release.json b/src/demo/pipeline_configs/config_sprint09_release.json index 080cd4b..f726661 100644 --- a/src/demo/pipeline_configs/config_sprint09_release.json +++ b/src/demo/pipeline_configs/config_sprint09_release.json @@ -6,10 +6,6 @@ "name": "AnalyzeEmails", "force_refresh": true }, - { - "name": "ScrapeAddress", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/config_template b/src/demo/pipeline_configs/config_template index c48e5e8..9fc5eb1 100644 --- a/src/demo/pipeline_configs/config_template +++ b/src/demo/pipeline_configs/config_template @@ -7,10 +7,6 @@ "name": "AnalyzeEmails", "force_refresh": true }, - { - "name": "ScrapeAddress", - "force_refresh": true - }, { "name": "PreprocessPhonenumbers", "force_refresh": true diff --git a/src/demo/pipeline_configs/force_refresh_all_steps.json b/src/demo/pipeline_configs/force_refresh_all_steps.json new file mode 100644 index 0000000..8356533 --- /dev/null +++ b/src/demo/pipeline_configs/force_refresh_all_steps.json @@ -0,0 +1,43 @@ +{ + "description": "This config runs all steps with force_refresh set to true.", + "config": { + "steps": [ + { + "name": "HashGenerator", + "force_refresh": true + }, + { + "name": "AnalyzeEmails", + "force_refresh": true + }, + { + "name": "PreprocessPhonenumbers", + "force_refresh": true + }, + { + "name": "GooglePlaces", + "force_refresh": true + }, + { + "name": "GooglePlacesDetailed", + "force_refresh": true + }, + { + "name": "GPTReviewSentimentAnalyzer", + "force_refresh": true + }, + { + "name": "GPTSummarizer", + "force_refresh": true + }, + { + "name": "SmartReviewInsightsEnhancer", + "force_refresh": true + }, + { + "name": "RegionalAtlas", + "force_refresh": true + } + ] + } +} diff --git a/src/demo/pipeline_configs/force_refresh_all_steps.json.license b/src/demo/pipeline_configs/force_refresh_all_steps.json.license new file mode 100644 index 0000000..f079a3f --- /dev/null +++ b/src/demo/pipeline_configs/force_refresh_all_steps.json.license @@ -0,0 +1,2 @@ +SPDX-License-Identifier: MIT +SPDX-FileCopyrightText: 2023 Berkay Bozkurt diff --git a/src/demo/pipeline_configs/regionalatlas_only.json b/src/demo/pipeline_configs/regionalatlas_only.json new file mode 100644 index 0000000..16c15eb --- /dev/null +++ b/src/demo/pipeline_configs/regionalatlas_only.json @@ -0,0 +1,27 @@ +{ + "description": "This config runs all steps with force_refresh set to true.", + "config": { + "steps": [ + { + "name": "HashGenerator", + "force_refresh": true + }, + { + 
"name": "AnalyzeEmails", + "force_refresh": true + }, + { + "name": "PreprocessPhonenumbers", + "force_refresh": true + }, + { + "name": "GooglePlaces", + "force_refresh": true + }, + { + "name": "RegionalAtlas", + "force_refresh": true + } + ] + } +} diff --git a/src/demo/pipeline_configs/regionalatlas_only.json.license b/src/demo/pipeline_configs/regionalatlas_only.json.license new file mode 100644 index 0000000..4ff3a64 --- /dev/null +++ b/src/demo/pipeline_configs/regionalatlas_only.json.license @@ -0,0 +1,2 @@ +SPDX-License-Identifier: MIT +SPDX-FileCopyrightText: 2023 Lucca Baumgärtner diff --git a/src/demo/pipeline_configs/run_all_steps.json b/src/demo/pipeline_configs/run_all_steps.json index 1e442f0..f694adb 100644 --- a/src/demo/pipeline_configs/run_all_steps.json +++ b/src/demo/pipeline_configs/run_all_steps.json @@ -1,46 +1,42 @@ { - "description": "This config runs all steps with force_refresh set to true.", + "description": "This config runs all steps with force_refresh set to false.", "config": { "steps": [ { "name": "HashGenerator", - "force_refresh": true + "force_refresh": false }, { "name": "AnalyzeEmails", - "force_refresh": true - }, - { - "name": "ScrapeAddress", - "force_refresh": true + "force_refresh": false }, { "name": "PreprocessPhonenumbers", - "force_refresh": true + "force_refresh": false }, { "name": "GooglePlaces", - "force_refresh": true + "force_refresh": false }, { "name": "GooglePlacesDetailed", - "force_refresh": true + "force_refresh": false }, { "name": "GPTReviewSentimentAnalyzer", - "force_refresh": true + "force_refresh": false }, { "name": "GPTSummarizer", - "force_refresh": true + "force_refresh": false }, { "name": "SmartReviewInsightsEnhancer", - "force_refresh": true + "force_refresh": false }, { "name": "RegionalAtlas", - "force_refresh": true + "force_refresh": false } ] } diff --git a/src/demo/pipeline_utils.py b/src/demo/pipeline_utils.py index 23c77b5..d95435a 100644 --- a/src/demo/pipeline_utils.py +++ b/src/demo/pipeline_utils.py @@ -17,7 +17,6 @@ HashGenerator, PreprocessPhonenumbers, RegionalAtlas, - ScrapeAddress, SearchOffeneRegister, SmartReviewInsightsEnhancer, ) @@ -33,14 +32,12 @@ "GPTSummarizer": GPTSummarizer, "PreprocessPhonenumbers": PreprocessPhonenumbers, "RegionalAtlas": RegionalAtlas, - "ScrapeAddress": ScrapeAddress, "SearchOffeneRegister": SearchOffeneRegister, "SmartReviewInsightsEnhancer": SmartReviewInsightsEnhancer, } # Please do not write following lists! Use the functions below instead. 
_additional_pipeline_steps = [ - (ScrapeAddress, "Scrape Address", "(will take a long time)"), (SearchOffeneRegister, "Search OffeneRegister", "(will take a long time)"), (PreprocessPhonenumbers, "Phone Number Validation", ""), ( diff --git a/tests/test_pipeline_utils.py b/tests/test_pipeline_utils.py index 434fb6c..966415d 100644 --- a/tests/test_pipeline_utils.py +++ b/tests/test_pipeline_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: MIT # SPDX-FileCopyrightText: 2024 Felix Zailskas -import os import unittest from unittest.mock import MagicMock, mock_open, patch @@ -22,7 +21,6 @@ def test_get_pipeline_steps(self): [ (HashGenerator, "Hash Generator", ""), (AnalyzeEmails, "Analyze Emails", ""), - (ScrapeAddress, "Scrape Address", "(will take a long time)"), ( SearchOffeneRegister, "Search OffeneRegister", @@ -73,7 +71,6 @@ def test_get_pipeline_additional_steps(self): additional_steps = get_pipeline_additional_steps() self.assertEqual( [ - (ScrapeAddress, "Scrape Address", "(will take a long time)"), ( SearchOffeneRegister, "Search OffeneRegister", From 6c55b482baeb715abd882829735ee58a0e7393cf Mon Sep 17 00:00:00 2001 From: Ahmed Sheta Date: Tue, 6 Feb 2024 13:13:57 +0100 Subject: [PATCH 36/51] removed the redundant S3 question prompt from data preprocessing Signed-off-by: Ahmed Sheta --- src/demo/demos.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/demo/demos.py b/src/demo/demos.py index 2371f1d..2a57764 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -224,10 +224,7 @@ def preprocessing_demo(): historical_bool = True else: historical_bool = False - if get_yes_no_input("Run on S3? (y/n)\n'n' means it will run locally!\n"): - S3_bool = True - else: - S3_bool = False + S3_bool = DATABASE_TYPE == "S3" preprocessor = Preprocessing( filter_null_data=filter_bool, historical_bool=historical_bool, S3_bool=S3_bool From 7a5018783af7ffc40d3b6771b99049414394f5dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:18:03 +0100 Subject: [PATCH 37/51] fix minor error in regionalatlas step + moved scrape address step to deprecated + added new bdc config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index af36e8d..eabbe6c 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") + os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From 1db8d9f9f0b2e096da33d6c44cbc2b1b2af328a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:19:19 +0100 Subject: [PATCH 38/51] undo change to input filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index eabbe6c..af36e8d 100644 --- a/src/database/leads/local_repository.py +++ 
b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") + os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From 7f31452d252301c1a65f28e24cd79ad39c6be23f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 16:44:29 +0100 Subject: [PATCH 39/51] Adjust gpt caching error logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index af36e8d..eabbe6c 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") + os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From 2e8eb12214028e6e9b2fd49977f224f15cf1f8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Mon, 5 Feb 2024 17:06:59 +0100 Subject: [PATCH 40/51] change input file location back to original MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index eabbe6c..af36e8d 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -19,7 +19,7 @@ class LocalRepository(Repository): BASE_PATH = os.path.dirname(__file__) DF_INPUT = os.path.abspath( - os.path.join(BASE_PATH, "../../data/demo_leads_email.csv") + os.path.join(BASE_PATH, "../../data/sumup_leads_email.csv") ) DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") From 3f7b1b4f41bb3093f71a87b82e9cfd68e0c63bf7 Mon Sep 17 00:00:00 2001 From: Tims777 Date: Thu, 1 Feb 2024 13:10:07 +0100 Subject: [PATCH 41/51] Fix get_multiple_choice in main.py Signed-off-by: Tims777 --- src/main.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main.py b/src/main.py index 266f07f..ae3ed5b 100644 --- a/src/main.py +++ b/src/main.py @@ -31,10 +31,8 @@ if __name__ == "__main__": options = list(DEMOS.keys()) + [EXIT] while True: - try: - choice = get_multiple_choice(PROMPT, options) - if choice == EXIT: - break + choice = get_multiple_choice(PROMPT, options) + if choice == EXIT: + break + if choice != None: DEMOS[choice]() - except ValueError: - print("Invalid choice") From ea58ace1deaba40eb83d541d2e1005a873ec462e Mon Sep 17 00:00:00 2001 From: Tims777 Date: Tue, 6 Feb 2024 11:25:39 +0100 Subject: [PATCH 42/51] Rename menu options in main.py --- src/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index ae3ed5b..33300ba 100644 --- a/src/main.py +++ b/src/main.py @@ -21,8 +21,8 @@ DEMOS = { "Base Data Collector": pipeline_demo, "Data preprocessing": preprocessing_demo, - "Estimated Value Predictor": evp_demo, - 
"Merchant Size Prediction": predict_MerchantSize_on_lead_data_demo, + "ML model training": evp_demo, + "Merchant Size Predictor": predict_MerchantSize_on_lead_data_demo, } PROMPT = "Choose demo:\n" From 68156ccc4862651516877142d4c0ee3b2754d042 Mon Sep 17 00:00:00 2001 From: Tims777 Date: Tue, 6 Feb 2024 17:21:47 +0100 Subject: [PATCH 43/51] Refactor handling of local and S3 file paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Lucca Baumgärtner Signed-off-by: Tims777 --- src/database/leads/local_repository.py | 25 ++++- src/database/leads/repository.py | 16 ++- src/database/leads/s3_repository.py | 29 +++-- src/demo/demos.py | 141 ++++++------------------- src/preprocessing/preprocessing.py | 56 ++-------- 5 files changed, 93 insertions(+), 174 deletions(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index af36e8d..973562f 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -27,6 +27,9 @@ class LocalRepository(Repository): DF_PREPROCESSED_INPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/preprocessed_data_files/") ) + DF_PREDICTION_OUTPUT = os.path.abspath( + os.path.join(BASE_PATH, "../../data/leads_predicted_size.csv") + ) REVIEWS = os.path.abspath(os.path.join(BASE_PATH, "../../data/reviews/")) SNAPSHOTS = os.path.abspath(os.path.join(BASE_PATH, "../../data/snapshots/")) GPT_RESULTS = os.path.abspath(os.path.join(BASE_PATH, "../../data/gpt-results/")) @@ -51,6 +54,13 @@ def save_dataframe(self): self.df.to_csv(self.DF_OUTPUT, index=False) log.info(f"Saved enriched data locally to {self.DF_OUTPUT}") + def save_prediction(self, df): + """ + Save dataframe in df parameter in chosen output location + """ + df.to_csv(self.DF_PREDICTION_OUTPUT, index=False) + log.info(f"Saved prediction result locally to {self.DF_PREDICTION_OUTPUT}") + def insert_data(self, data): """ TODO: Insert new data into specified dataframe @@ -253,10 +263,17 @@ def save_classification_report(self, report, model_name: str): except Exception as e: log.error(f"Could not save report at {report_file_path}! 
Error: {str(e)}") - def load_preprocessed_data( - self, file_name: str = "historical_preprocessed_data.csv" - ): + def get_preprocessed_data_path(self, historical: bool = True): + file_name = ( + "historical_preprocessed_data.csv" + if historical + else "preprocessed_data.csv" + ) + file_path = os.path.join(self.DF_PREPROCESSED_INPUT, file_name) + return file_path + + def load_preprocessed_data(self, historical: bool = True): try: - return pd.read_csv(os.path.join(self.DF_PREPROCESSED_INPUT, file_name)) + return pd.read_csv(self.get_preprocessed_data_path(historical)) except FileNotFoundError: log.error("Error: Could not find input file for preprocessed data.") diff --git a/src/database/leads/repository.py b/src/database/leads/repository.py index a44cc5b..25afde1 100644 --- a/src/database/leads/repository.py +++ b/src/database/leads/repository.py @@ -82,6 +82,13 @@ def save_dataframe(self): """ pass + @abstractmethod + def save_prediction(self, df): + """ + Save dataframe in df parameter in chosen output location + """ + pass + @abstractmethod def insert_data(self, data): """ @@ -221,7 +228,14 @@ def save_classification_report(self, report, model_name: str): pass @abstractmethod - def load_preprocessed_data(self, file_name: str): + def get_preprocessed_data_path(self, historical: bool = True): + """ + Returns the path for a preprocessed data file (either historical or current) + """ + pass + + @abstractmethod + def load_preprocessed_data(self, historical: bool = True): """ Load the preprocessed data from the given file """ diff --git a/src/database/leads/s3_repository.py b/src/database/leads/s3_repository.py index dbdf620..3e434ce 100644 --- a/src/database/leads/s3_repository.py +++ b/src/database/leads/s3_repository.py @@ -43,6 +43,7 @@ class S3Repository(Repository): MODELS_BUCKET = "amos--models" DF_INPUT = f"s3://{EVENTS_BUCKET}/leads/enriched.csv" DF_OUTPUT = f"s3://{EVENTS_BUCKET}/leads/enriched.csv" + DF_PREDICTION_OUTPUT = f"s3://{EVENTS_BUCKET}/leads/leads_predicted_size.csv" DF_PREPROCESSED_INPUT = f"s3://{FEATURES_BUCKET}/preprocessed_data_files/" REVIEWS = f"s3://{EVENTS_BUCKET}/reviews/" SNAPSHOTS = f"s3://{EVENTS_BUCKET}/snapshots/" @@ -131,6 +132,16 @@ def save_dataframe(self): self._save_to_s3(csv_buffer.getvalue(), bucket, obj_key) log.info(f"Successfully saved enriched leads to s3://{bucket}/{obj_key}") + def save_prediction(self, df): + """ + Save dataframe in df parameter in chosen output location + """ + bucket, obj_key = decode_s3_url(self.DF_PREDICTION_OUTPUT) + csv_buffer = StringIO() + df.to_csv(csv_buffer, index=False) + self._save_to_s3(csv_buffer.getvalue(), bucket, obj_key) + log.info(f"Successfully saved prediction result to s3://{bucket}/{obj_key}") + def _save_to_s3(self, data, bucket, key): s3.put_object( Bucket=bucket, @@ -374,15 +385,17 @@ def save_classification_report(self, report, model_name: str): except Exception as e: log.error(f"Could not save report for '{model_name}' to S3: {str(e)}") - def load_preprocessed_data( - self, file_name: str = "historical_preprocessed_data.csv" - ): + def get_preprocessed_data_path(self, historical: bool = True): + file_name = ( + "historical_preprocessed_data.csv" + if historical + else "preprocessed_data.csv" + ) file_path = self.DF_PREPROCESSED_INPUT + file_name - if not file_path.startswith("s3://"): - log.error( - "S3 location has to be defined like this: s3:///" - ) - return + return file_path + + def load_preprocessed_data(self, historical: bool = True): + file_path = self.get_preprocessed_data_path(historical) 
source = None remote_dataset = None diff --git a/src/demo/demos.py b/src/demo/demos.py index 2a57764..58050cb 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -224,10 +224,9 @@ def preprocessing_demo(): historical_bool = True else: historical_bool = False - S3_bool = DATABASE_TYPE == "S3" preprocessor = Preprocessing( - filter_null_data=filter_bool, historical_bool=historical_bool, S3_bool=S3_bool + filter_null_data=filter_bool, historical_bool=historical_bool ) preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path) @@ -239,10 +238,7 @@ def preprocessing_demo(): def predict_MerchantSize_on_lead_data_demo(): import os import sys - from io import BytesIO - import boto3 - import joblib import pandas as pd log.info( @@ -254,12 +250,13 @@ def predict_MerchantSize_on_lead_data_demo(): current_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd() parent_dir = os.path.join(current_dir, "..") sys.path.append(parent_dir) + from database import get_database from preprocessing import Preprocessing + db = get_database() + log.info(f"Preprocessing the leads...") - preprocessor = Preprocessing( - filter_null_data=False, historical_bool=False, S3_bool=S3_bool - ) + preprocessor = Preprocessing(filter_null_data=False, historical_bool=False) preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path) df = preprocessor.implement_preprocessing_pipeline() preprocessor.save_preprocessed_data() @@ -267,67 +264,35 @@ def predict_MerchantSize_on_lead_data_demo(): ############################## adapting the preprocessing files ########################### log.info(f"Adapting the leads' preprocessed data for the ML model...") # load the data from the CSV files - historical_preprocessed_data = pd.read_csv( - "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv" - ) - if S3_bool: - toBePredicted_preprocessed_data = pd.read_csv( - "s3://amos--data--events/leads/preprocessed_leads_data.csv" - ) - else: - path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append("preprocessed_data_files/leads_preprocessed_data.csv") - leads_preprocessed_data_path = "/".join(path_components) - toBePredicted_preprocessed_data = pd.read_csv(leads_preprocessed_data_path) + historical_preprocessed_data = db.load_preprocessed_data(historical=True) + unlabeled_preprocessed_data = db.load_preprocessed_data(historical=False) historical_columns_order = historical_preprocessed_data.columns missing_columns = set(historical_columns_order) - set( - toBePredicted_preprocessed_data.columns + unlabeled_preprocessed_data.columns ) for column in missing_columns: - toBePredicted_preprocessed_data[column] = 0 + unlabeled_preprocessed_data[column] = 0 - for column in toBePredicted_preprocessed_data.columns: + for column in unlabeled_preprocessed_data.columns: if column not in historical_columns_order: - toBePredicted_preprocessed_data = toBePredicted_preprocessed_data.drop( + unlabeled_preprocessed_data = unlabeled_preprocessed_data.drop( column, axis=1 ) # reorder columns - toBePredicted_preprocessed_data = toBePredicted_preprocessed_data[ - historical_columns_order - ] - if S3_bool: - toBePredicted_output_path_s3 = ( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" - ) - toBePredicted_preprocessed_data.to_csv( - toBePredicted_output_path_s3, - index=False, - ) - log.info( - f"Saving the adapted preprocessed data at {toBePredicted_output_path_s3}" - ) - else: - 
path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append("toBePredicted_preprocessed_data_updated.csv") - local_preprocessed_data_path = "/".join(path_components) - toBePredicted_preprocessed_data.to_csv( - local_preprocessed_data_path, index=False - ) - log.info( - f"Saving the adapted preprocessed data at {local_preprocessed_data_path}" - ) + unlabeled_preprocessed_data = unlabeled_preprocessed_data[historical_columns_order] + unlabeled_preprocessed_data.to_csv( + preprocessor.preprocessed_data_output_path, + index=False, + ) + log.info( + f"Saving the adapted preprocessed data at {preprocessor.preprocessed_data_output_path}" + ) # check if columns in both dataframe are in same order and same number - assert list(toBePredicted_preprocessed_data.columns) == list( + assert list(unlabeled_preprocessed_data.columns) == list( historical_preprocessed_data.columns ), "Column names are different" @@ -343,57 +308,30 @@ def predict_MerchantSize_on_lead_data_demo(): model_name = get_string_input( "Provide model file name in data/models local directory\nInput example: lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model.pkl\n" ) - # file_key = "models/lightgbm_epochs(1)_f1(0.6375)_numclasses(5)_model_updated.pkl" # adjust according to the desired model - model_name = model_name.replace(" ", "") + model_name = model_name.strip() xgb_bool = False - if model_name[:3].lower() == "xgb": + if model_name.lower().startswith("xgb"): xgb_bool = True - file_key = f"models/" + model_name - def check_classification_task(string): - match = re.search(r"\d+", string) + match = re.search(r"numclasses\((\d+)\)", string) if match: - last_number = int(match.group()) + last_number = int(match.group(1)) if last_number == 3: return True else: False - classification_task_3 = check_classification_task(file_key) + classification_task_3 = check_classification_task(model_name) try: - if S3_bool: - # create an S3 client - s3 = boto3.client("s3") - # download the file from S3 - response = s3.get_object(Bucket=bucket_name, Key=file_key) - model_content = response["Body"].read() - # load model - with BytesIO(model_content) as model_file: - model = joblib.load(model_file) - log.info(f"Loaded the model from S3 bucket sucessfully!") - else: - path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append(file_key) - model_local_path = "/".join(path_components) - model = joblib.load(model_local_path) - log.info(f"Loaded the model from the local path sucessfully!") + model = db.load_ml_model(model_name) + log.info(f"Loaded the model {model_name}!") except: log.error("No model found with the given name!") return - if S3_bool: - data_path = ( - "s3://amos--data--events/leads/toBePredicted_preprocessed_data_updated.csv" - ) - else: - data_path = local_preprocessed_data_path - - df = pd.read_csv(data_path) + df = pd.read_csv(preprocessor.preprocessed_data_output_path) input = df.drop("MerchantSizeByDPV", axis=1) if xgb_bool: input = xgb.DMatrix(input) @@ -405,29 +343,10 @@ def check_classification_task(string): size_mapping = {0: "XS", 1: "S", 2: "M", 3: "L", 4: "XL"} remapped_predictions = [size_mapping[prediction] for prediction in predictions] - if S3_bool: - enriched_data = pd.read_csv("s3://amos--data--events/leads/enriched.csv") - else: - enriched_data = pd.read_csv(preprocessor.data_path) + enriched_data = pd.read_csv(preprocessor.data_path) # first 5 
columns: Last Name,First Name,Company / Account,Phone,Email, raw_data = enriched_data.iloc[:, :5] raw_data["PredictedMerchantSize"] = remapped_predictions - if S3_bool: - raw_data.to_csv( - "s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv", - index=True, - ) - log.info( - f"Saved the predicted Merchant Size of the leads at s3://amos--data--events/leads/predicted_MerchantSize_of_leads.csv" - ) - else: - path_components = preprocessor.data_path.split( - "\\" if "\\" in preprocessor.data_path else "/" - ) - path_components.pop() - path_components.append("predicted_MerchantSize_of_leads.csv") - output_path = "/".join(path_components) - raw_data.to_csv(output_path, index=True) - log.info(f"Saved the predicted Merchant Size of the leads at {output_path}") + db.save_prediction(raw_data) diff --git a/src/preprocessing/preprocessing.py b/src/preprocessing/preprocessing.py index a278b2a..d9f9c29 100644 --- a/src/preprocessing/preprocessing.py +++ b/src/preprocessing/preprocessing.py @@ -29,57 +29,13 @@ class Preprocessing: - def __init__(self, filter_null_data=True, historical_bool=True, S3_bool=False): + def __init__(self, filter_null_data=True, historical_bool=True): data_repo = get_database() self.data_path = data_repo.get_output_path() self.preprocessed_df = None - self.prerocessed_data_output_path = None - if historical_bool and S3_bool: - self.data_path = ( - "s3://amos--data--events/historical_data/100k_historic_enriched.csv" - ) - self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/historical_preprocessed_data.csv" - elif historical_bool and not S3_bool: - # input path - input_path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - input_path_components.pop() - input_path_components.append("100k_historic_enriched.csv") - input_path = "/".join(input_path_components) - self.data_path = input_path - - # output path - path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - path_components.pop() - path_components.append( - "preprocessed_data_files/historical_preprocessed_data.csv" - ) - self.prerocessed_data_output_path = "/".join(path_components) - elif not historical_bool and S3_bool: - self.data_path = "s3://amos--data--events/leads/enriched.csv" - self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/leads_preprocessed_data.csv" - elif not historical_bool and not S3_bool: - # input path - input_path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - input_path_components.pop() - input_path_components.append("leads_enriched.csv") - input_path = "/".join(input_path_components) - self.data_path = input_path - - # output path - path_components = self.data_path.split( - "\\" if "\\" in self.data_path else "/" - ) - path_components.pop() - path_components.append( - "preprocessed_data_files/leads_preprocessed_data.csv" - ) - self.prerocessed_data_output_path = "/".join(path_components) + self.preprocessed_data_output_path = data_repo.get_preprocessed_data_path( + historical_bool + ) self.filter_bool = filter_null_data # columns that would be added later after one-hot encoding each class @@ -291,9 +247,9 @@ def save_preprocessed_data(self): except ValueError as e: log.error(f"Failed to save the selected columns for preprocessing! 
{e}") try: - selected_df.to_csv(self.prerocessed_data_output_path, index=False) + selected_df.to_csv(self.preprocessed_data_output_path, index=False) log.info( - f"Preprocessed dataframe of shape {self.preprocessed_df.shape} is saved at {self.prerocessed_data_output_path}" + f"Preprocessed dataframe of shape {self.preprocessed_df.shape} is saved at {self.preprocessed_data_output_path}" ) except ValueError as e: log.error(f"Failed to save preprocessed data file! {e}") From 6a58037a8c529b99d1519819838ce594806c0f33 Mon Sep 17 00:00:00 2001 From: Fabian Utech Date: Tue, 6 Feb 2024 17:44:42 +0100 Subject: [PATCH 44/51] Add the SBOM generator as a markdown file (shellscript) and my feature analysis Signed-off-by: Fabian Utech --- Documentation/SBOM_generator.md | 61 + notebooks/fabian_feature_analysis.ipynb | 1768 +++++++++++++++++ .../fabian_feature_analysis.ipynb.license | 2 + 3 files changed, 1831 insertions(+) create mode 100644 Documentation/SBOM_generator.md create mode 100644 notebooks/fabian_feature_analysis.ipynb create mode 100644 notebooks/fabian_feature_analysis.ipynb.license diff --git a/Documentation/SBOM_generator.md b/Documentation/SBOM_generator.md new file mode 100644 index 0000000..65d5d4b --- /dev/null +++ b/Documentation/SBOM_generator.md @@ -0,0 +1,61 @@ +# Automatic SBOM generation + +```console +pipenv install +pipenv shell + +pip install pipreqs +pip install cyclonedx-bom +pip install pip-licenses + +# Create the SBOM (cyclonedx-bom) based on (pipreqs) requirements that are actually imported in the .py files + +$sbom = pipreqs --print | cyclonedx-py -r -pb -o - -i - + +# Create an XmlDocument object +$xml = New-Object System.Xml.XmlDocument + +# Load XML content into the XmlDocument +$xml.LoadXml($sbom) + + +# Create an empty CSV file +$csvPath = "SBOM.csv" + +# Initialize an empty array to store rows +$result = @() + +# Iterate through the XML nodes and create rows for each node +$xml.SelectNodes("//*[local-name()='component']") | ForEach-Object { + + $row = @{ + "Version" = $_.Version + "Context" = $_.Purl + "Name" = if ($_.Name -eq 'scikit_learn') { 'scikit-learn' } else { $_.Name } + } + + # Get license information + $match = pip-licenses --from=mixed --format=csv --with-system --packages $row.Name | ConvertFrom-Csv + + # Add license information to the row + $result += [PSCustomObject]@{ + "Context" = $row.Context + "Name" = $row.Name + "Version" = $row.Version + "License" = $match.License + } +} + +# Export the data to the CSV file +$result | Export-Csv -Path $csvPath -NoTypeInformation + +# Create the license file +$licensePath = $csvPath + '.license' +@" +SPDX-License-Identifier: CC-BY-4.0 +SPDX-FileCopyrightText: 2023 Fabian-Paul Utech +"@ | Out-File -FilePath $licensePath + +exit + +``` diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb new file mode 100644 index 0000000..1b10567 --- /dev/null +++ b/notebooks/fabian_feature_analysis.ipynb @@ -0,0 +1,1768 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "23afeabd", + "metadata": {}, + "source": [ + "# The Dataset: Summary\n", + "1. Missing values exist\n", + "2. Correlations between RegionalAtlas Data\n", + "3. Input: numerical + bool + categorical datatype\n", + "4. 
Imbalanced Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d3b226f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "18fa38d5", + "metadata": {}, + "source": [ + "--------------------------------\n", + "# Feature Summary\n", + "\n", + "### Not interesting / label\n", + "- Last Name\n", + "- Account Owner\n", + "- Phone\n", + "- domain\n", + "- google_places_place_id_matches_phone_search\n", + "- number_formatted\n", + "- google_places_name\n", + "- MerchantSizeByDPV => **y**\n", + "\n", + "### Only for testing\n", + "- MCC Level: Is only available in the dataset, but it can be used to check if the feature is helpful\n", + "\n", + "### Can be used directly (Categorical / Boolean)\n", + "Regarding the categorical features, the country is important and the price level of the company (google only has ones specific to restaurants). \n", + "\n", + "- *email_valid*\n", + "- number_possible\n", + "- number_valid\n", + "- first_name_in_account\n", + "- last_name_in_account\n", + "- google_places_business_status\n", + "- google_places_place_id_matches_phone_search\n", + "- *number_area*\n", + "- **number_country**\n", + "- **google_places_price_level**\n", + "- google_places_confidence\n", + "\n", + "### Numerical Features\n", + "The most important features overall (not only regarding numerical features) are the number of ratings and the rating itself. Logically if many users rate a company, they most likely bought products at the corresponding company. Therefore the more the better. Interesting is that a rating in the middle range is more likely for larger companies, than the extreme values. \n", + "\n", + "- reviews_sentiment_score\n", + "- **google_places_user_ratings_total**\n", + "- **google_places_rating** \n", + "- google_places_candidate_count_mail \n", + "- google_places_candidate_count_phone\n", + "\n", + "\n", + "### Regionalatlas (Numerical Features)\n", + "Regarding regional information, the population density can be considered the most important feature because the leads that are based in the service sector have a higher chance to attract customers. Additionally the age structure is important. In Germany the age group between 25-44 is the most important one, due to their expenditure behavior and the acceptance of mobile payment. Please note that the information is specific to Germany and most likely vary in other countries. Specific to SumUp Leads we also noticed that a large service sector in the region is indication for leads due to the financial model (using card and mobile payment). 
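
The claims above about rating counts, mid-range ratings, population density, the 25-44 age group and the service-sector share can be spot-checked directly on the enriched dataframe. A minimal sketch (not part of the original notebook), assuming `df` is the dataframe loaded above and that the listed column names exist:

```python
import pandas as pd

# Hypothetical check of the feature claims; assumes `df` and these column names.
key_features = [
    "google_places_user_ratings_total",
    "google_places_rating",
    "regional_atlas_pop_density",
    "regional_atlas_age_2",
    "regional_atlas_per_service_sector",
]

# Median of each feature per merchant size class; a monotonic trend across
# XS..XL would support the claims about rating counts, density, age 25-44
# and the service-sector share.
summary = (
    df.groupby("MerchantSizeByDPV")[key_features]
    .median()
    .reindex(["XS", "S", "M", "L", "XL"])
)
print(summary)
```
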
\n", + "\n", + "- **regional_atlas_pop_density** \n", + "- regional_atlas_pop_development \n", + "- regional_atlas_age_0 \n", + "- regional_atlas_age_1 \n", + "- **regional_atlas_age_2** (pos) \n", + "- **regional_atlas_age_3** (neg) \n", + "- **regional_atlas_age_4** (neg) \n", + "- regional_atlas_pop_avg_age \n", + "- regional_atlas_per_service_sector \n", + "- regional_atlas_per_trade \n", + "- regional_atlas_employment_rate \n", + "- regional_atlas_unemployment_rate \n", + "- regional_atlas_per_long_term_unemployment\n", + "- regional_atlas_investments_p_employee \n", + "- regional_atlas_gross_salary_p_employee \n", + "- regional_atlas_disp_income_p_inhabitant \n", + "- regional_atlas_tot_income_p_taxpayer \n", + "- regional_atlas_gdp_p_employee \n", + "- regional_atlas_gdp_development \n", + "- regional_atlas_gdp_p_inhabitant \n", + "- regional_atlas_gdp_p_workhours \n", + "- regional_atlas_pop_avg_age_zensus \n", + "- regional_atlas_regional_score \n", + "\n", + "\n", + "### Processing needed (Ideas for future)\n", + "- google_places_place_id => b_google_places\n", + "- First Name => Gender\n", + "- google_places_detailed_website => bool website\n", + "- Email (ending / provider / company email address / ...) => CountVectorizer\n", + "- Company Name (GmbH, Restaurant, ...) => CountVectorizer\n", + "- google_places_formatted_address => City\n", + "- google_places_detailed_type => Cluster / one hot encoder" + ] + }, + { + "cell_type": "markdown", + "id": "988d4272", + "metadata": {}, + "source": [ + "----------------\n", + "# Future Ideas \n", + "## Data\n", + "1. Financial reports\n", + " - Very promising to see potential revenue which might need to be combined with merchant category (due to different payment methods)\n", + " - Source: Specific to the country (e.g. Germany has Bundesanzeiger, Handelsregister, ...)\n", + " - Processing: Using an API, either just checking the existence in the corresponding register or detailed information (financial reports) \n", + "2. Neighborhood\n", + " - Gain information about the potential revenue / price of the company based on the neighborhood\n", + " - Source: Google places for location, all other steps / sources mentioned previously for evaluating the neighbors\n", + " - Processing: Clustering of typical neighborhoods \n", + "3. Price & Product / Merch Category\n", + " - Identify price categories and product types which would help identify the merch category\n", + " - Source: Website / social media\n", + " - Processing: website parsing / image detection \n", + "4. User behavior analysis\n", + " - Identify certain user behavior types to see if there are differences\n", + " - Source: Own data\n", + " - Processing: user behavior on the website (duration how long they need to fill the form, ...)\n", + "5. Process analysis\n", + " - Analysis of the process by testing how the interaction looks like and if there is a certain behavior / structure observable that is specific to the size of the company\n", + " - Source: Own data\n", + " - Processing: Information about how long the process takes / how many calls / different numbers or contact / availability\n", + "6. Confidence metric:\n", + " - Creating a sophisticated confidence metric based on similar previous results\n", + " - Using multiple features to compare it with the added sources\n", + " \n", + "## Model\n", + "1. Using GAN or other generative models to handle imbalance and missing values\n", + "2. Using outlier detection for smaller classes\n", + "3. Extensive hyperparameter tuning\n", + "4. 
Weighted models based on importance of the class" + ] + }, + { + "cell_type": "markdown", + "id": "e59d30fb", + "metadata": {}, + "source": [ + "# Dataset Description" + ] + }, + { + "cell_type": "markdown", + "id": "a2bee470", + "metadata": {}, + "source": [ + "## Imbalanced Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f508fadb", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(df))\n", + "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)" + ] + }, + { + "cell_type": "markdown", + "id": "86fdc167", + "metadata": {}, + "source": [ + "## Repetitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a850c5e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n", + "identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n", + "for col in identifier:\n", + " print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})') " + ] + }, + { + "cell_type": "markdown", + "id": "37ce359c", + "metadata": {}, + "source": [ + "**Repetitions exist except for Email => Email as Identifier**" + ] + }, + { + "cell_type": "markdown", + "id": "e85e1911", + "metadata": {}, + "source": [ + "## Placeholder Values of Regionalatlas\n", + "\n", + "### Problem: Regionatlas, Defined Placeholder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7d07397", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\" Regionalatlas: Placeholder\n", + "2222222222: nichts vorhanden, genau 0\n", + "5555555555: Zahlenwert unbekannt oder geheim zu halten\n", + "6666666666: Tabellenfach gesperrt, da Aussage nicht sinnvoll\n", + "7777777777: keine Angabe, da Zahlenwert nicht sicher genug\n", + "8888888888: Angabe fällt später an\n", + "\"\"\"\n", + "\n", + "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n", + "regional_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Dictionary to know which columns and indices have problematic values\n", + "rem_dic = {}\n", + "columns = []\n", + "\n", + "filter_df = regional_df.copy()\n", + "\n", + "for exc in exclude_values:\n", + " # Find all columns that have those values we need to exclude\n", + " col = regional_df.loc[:,(np.sum(df == exc,axis=0)>0)].columns.tolist()\n", + "\n", + " columns+=col\n", + " \n", + " \n", + " # Now we can use those columns to find the corresponding rows\n", + " for c in col:\n", + " indices = regional_df.loc[(np.sum(df == exc,axis=1)>0),col].index.tolist() \n", + " \n", + " rem_dic[c] = {str(exc):indices}\n", + " \n", + " filter_df = filter_df[df[c]!=exc]\n", + " print(f'column:{c}, value:{exc}')\n", + " \n", + "print(rem_dic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16fcbf05", + "metadata": {}, + "outputs": [], + "source": [ + "# Irregular values defined by regionalatlas needs to be translated to nan so we can handle it later on\n", + "import numpy as np\n", + "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n", + "\n", + "print(\"Changed the following features, because of irregular values of regionalatlas:\")\n", + "for col in regional_atlas:\n", + " n_irr = (df[col]>=2222222222).sum()\n", + " n = (df[col].notnull()).sum()\n", + " \n", + " if (n_irr>0):\n", + " print(col+': '+str(n_irr)+' out of '+ 
str(n))\n", + " df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7ab822c", + "metadata": {}, + "source": [ + "Changed the following features, because of irregular values of regionalatlas:\n", + "- regional_atlas_pop_development: 76 out of 68793\n", + "- regional_atlas_investments_p_employee: 3736 out of 68793\n", + "- regional_atlas_gross_salary_p_employee: 632 out of 68793\n", + "- regional_atlas_tot_income_p_taxpayer: 34 out of 68827" + ] + }, + { + "cell_type": "markdown", + "id": "5411cb6e", + "metadata": {}, + "source": [ + "## Empty Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730b574a", + "metadata": {}, + "outputs": [], + "source": [ + "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", + "print(f'Empty: {isna}')\n", + "print(f'Not empty: {df.shape[0]-isna}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35186540", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", + "\n", + "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", + "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" + ] + }, + { + "cell_type": "markdown", + "id": "f507c51d", + "metadata": {}, + "source": [ + "#### Different amounts of NaN values\n", + "- regional_atlas_pop_avg_age : 3213\n", + "- regional_atlas_per_service_sector : 3211\n", + "- regional_atlas_per_trade : 3211\n", + "- regional_atlas_unemployment_rate : 3119\n", + "- regional_atlas_disp_income_p_inhabitant : 3211\n", + "- regional_atlas_tot_income_p_taxpayer : 3211\n", + "- regional_atlas_gdp_p_workhours : 3211\n", + "- regional_atlas_pop_avg_age_zensus : 3119" + ] + }, + { + "cell_type": "markdown", + "id": "7f0a7994", + "metadata": {}, + "source": [ + "----------------------\n", + "# Numerical Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27014d88", + "metadata": {}, + "outputs": [], + "source": [ + "min_max_df = df.agg({\n", + "'google_places_user_ratings_total':[\"min\",\"max\"],\n", + "'google_places_rating':[\"min\",\"max\"],\n", + "'google_places_price_level':[\"min\",\"max\"],\n", + "'reviews_sentiment_score':[\"min\",\"max\"],\n", + "'regional_atlas_age_0':[\"min\",\"max\"],\n", + "'regional_atlas_age_1':[\"min\",\"max\"],\n", + "'regional_atlas_age_2':[\"min\",\"max\"],\n", + "'regional_atlas_age_3':[\"min\",\"max\"],\n", + "'regional_atlas_age_4':[\"min\",\"max\"],\n", + "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", + "'regional_atlas_per_trade':[\"min\",\"max\"],\n", + "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", + "'regional_atlas_pop_density':[\"min\",\"max\"],\n", + "'regional_atlas_pop_development':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", + "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", + 
"'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", + "'regional_atlas_regional_score':[\"min\",\"max\"]\n", + "})\n", + "\n", + "# Apply the function for each column\n", + "for col in min_max_df.columns:\n", + " min_feature = min_max_df[col]['min']\n", + " max_feature = min_max_df[col]['max']\n", + " print(f'{col}: [{min_feature}, {max_feature}]') " + ] + }, + { + "cell_type": "markdown", + "id": "5f90f597", + "metadata": {}, + "source": [ + "## Range of the features\n", + "\n", + "| Variable | Theoretical Range | Practical Range |\n", + "|---------------------------------------------|-------------------|---------------------------|\n", + "| google_places_user_ratings_total | [0, inf] | [0.0, 86141.0] |\n", + "| google_places_rating | [1, 5] | [0.0, 5.0] |\n", + "| google_places_price_level | [1.0, 4.0] | [1.0, 4.0] |\n", + "| reviews_sentiment_score | [-1.0, 1] | [-1.0, 0.95] |\n", + "| regional_atlas_age_0 | [0, 100] | [12.6, 20.1] |\n", + "| regional_atlas_age_1 | [0, 100] | [4.2, 13.1] |\n", + "| regional_atlas_age_2 | [0, 100] | [18.7, 33.7] |\n", + "| regional_atlas_age_3 | [0, 100] | [21.6, 32.9] |\n", + "| regional_atlas_age_4 | [0, 100] | [15.9, 33.5] |\n", + "| regional_atlas_per_service_sector | [0, 100] | [44.4, 94.0] |\n", + "| regional_atlas_per_trade | [0, 100] | [13.8, 48.4] |\n", + "| regional_atlas_employment_rate | [0, 100] | [47.4, 72.4] |\n", + "| regional_atlas_unemployment_rate | [0, 100] | [1.6, 12.0] |\n", + "| regional_atlas_per_long_term_unemployment | [0, 100] | [14.5, 55.5] |\n", + "| regional_atlas_pop_density | [0, inf] | [35.3, 4788.2] |\n", + "| regional_atlas_pop_development | [-inf, inf] | [-180.4, 2567.6] |\n", + "| regional_atlas_pop_avg_age | [0, inf] | [40.7, 51.0] |\n", + "| regional_atlas_investments_p_employee | [0, inf] | [2.4, 51.0] |\n", + "| regional_atlas_gross_salary_p_employee | [0, inf] | [30.1, 90.6] |\n", + "| regional_atlas_disp_income_p_inhabitant | [0, inf] | [17635.0, 36686.0] |\n", + "| regional_atlas_tot_income_p_taxpayer | [0, inf] | [30.0, 74.3] |\n", + "| regional_atlas_gdp_p_employee | [0, inf] | [56707.0, 153485.0] |\n", + "| regional_atlas_gdp_development | [-inf, inf] | [-3.7, 80.0] |\n", + "| regional_atlas_gdp_p_inhabitant | [0, inf] | [17553.0, 158749.0] |\n", + "| regional_atlas_gdp_p_workhours | [0, inf] | [39.3, 114.3] |\n", + "| regional_atlas_pop_avg_age_zensus | [0, inf] | [39.1, 48.6] |\n", + "| regional_atlas_regional_score | [0, inf] | [47.30335218, 10342.70448564]|" + ] + }, + { + "cell_type": "markdown", + "id": "189519a2", + "metadata": {}, + "source": [ + "## Percentiles (for analysis and explanation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d78040", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "\n", + "percentile_col = [\n", + "'regional_atlas_age_0',\n", + "'regional_atlas_age_1',\n", + " 'regional_atlas_age_2',\n", + "'regional_atlas_age_3',\n", + "'regional_atlas_age_4',\n", + "'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", + "'regional_atlas_pop_development',\n", + "'regional_atlas_pop_avg_age',\n", + "'regional_atlas_per_service_sector',\n", + "'regional_atlas_per_trade',\n", + "'regional_atlas_employment_rate',\n", + "'regional_atlas_unemployment_rate',\n", + "'regional_atlas_per_long_term_unemployment',\n", + "'regional_atlas_investments_p_employee',\n", + 
"'regional_atlas_gross_salary_p_employee',\n", + "'regional_atlas_disp_income_p_inhabitant',\n", + "'regional_atlas_tot_income_p_taxpayer',\n", + "'regional_atlas_gdp_p_employee',\n", + "'regional_atlas_gdp_development',\n", + "'regional_atlas_gdp_p_inhabitant',\n", + "'regional_atlas_gdp_p_workhours',\n", + "'regional_atlas_pop_avg_age_zensus',\n", + "'regional_atlas_regional_score']\n", + "\n", + "for col in percentile_col:\n", + " no_nan = df[col][df[col].notnull()]\n", + " col_name = col+'_percentiles' \n", + " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307c0e39", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Adding the percentiles as columns for analysis and report\n", + "\n", + "for col in percentile_col:\n", + " feature = col+\"_percentiles\"\n", + " not_nan = df[feature].notnull()\n", + "\n", + " classes = df['MerchantSizeByDPV'].unique()\n", + "\n", + " for c in classes:\n", + " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", + " \n", + " # Add labels and title\n", + " plt.xlabel('Value')\n", + " plt.ylabel('Density')\n", + " plt.title('Distribution of '+col)\n", + " plt.legend()\n", + "\n", + " # Show the plot\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ac1d5880", + "metadata": {}, + "source": [ + "## Features regarding their discriminative power (univariate)\n", + "Features and how they increase the chance to select bigger companies\n", + "1. **Best** [Criteria: Strong trend + difference between classes]\n", + " - google_places_user_ratings_total: number of ratings\n", + " - google_places_rating: middle rating is the best\n", + " - age_2: 25-44\n", + " - age_3: 45-64 (negative)\n", + " - age_4: >65 years old (negative trend)\n", + " - pop_density: the higher, the better\n", + " - per_service_sector: High amount of service sector\n", + " - pop_avg_age: (negative)\n", + " - regional_score: pop_density * employment_rate * disp_income_p_inhabitant / 1000000\n", + "2. **Middle** [Criteria: Small trend + difference between classes]\n", + " - age_0: 0-17\n", + " - age_1: 18-24 \n", + " - pop_development: population development in one year\n", + " - investments_p_employee: Investments per employee in the region\n", + " - gross_salary_p_employee: Gross salary per employee in the region\n", + " - gdp_p_employee: GDP per employee in the region\n", + " - gdp_development: GDP development in the region\n", + " - gdp_p_inhabitant: GDP per inhabitant in the region\n", + " - gdp_p_workhours: GDP per working hours in the region\n", + " - avg_age_zensus: Average age in the region (source is the census)\n", + "3. 
**Worst** [No trend + no real difference between classes]\n", + " - sentiment_score: Score how polarized the google reviews are\n", + " - employment_rate: Employment rate in the region\n", + " - unemployment_rate: Unemployment rate in the region\n", + " - long_term_unemployment: Long term unemployment rate in the region\n", + " - disp_income_p_inhabitant: Disposable income per inhabitant in the region\n", + " - tot_income_p_inhabitant: Total income per inhabitant in the region" + ] + }, + { + "cell_type": "markdown", + "id": "ef03f2da", + "metadata": {}, + "source": [ + "## Bivariate analysis of the numerical google places features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073b0a8", + "metadata": {}, + "outputs": [], + "source": [ + "for c in classes:\n", + " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", + " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", + "\n", + " plt.xlabel('ratings_total')\n", + " plt.ylabel('rating_avg')\n", + " plt.title('Distribution of '+c)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c73f6841", + "metadata": {}, + "source": [ + "## Conclusion\n", + "1. Number of ratings is the most important feature, and is a sign of popularity\n", + "2. Bigger companies have the tendency to have lower ratings while having higher number of ratings\n", + "3. Regions in Germany that are more lucrative:\n", + " - a region that has a high population density, especially bigger cities\n", + " - with a higher percentage of the age group \"25-44\"\n", + " - and have a relatively high service sector share\n", + " - while at the same time having a high regional score\n", + "4. GDP features can be important\n", + "5. 
Employment rates and income are not very important" + ] + }, + { + "cell_type": "markdown", + "id": "983dc6c8", + "metadata": {}, + "source": [ + "----------------------\n", + "# Categorical Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb5923a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "arr_false = {}\n", + "\n", + "for column in df:\n", + " \n", + " if df[column].dtype == bool:\n", + " false_count = np.count_nonzero(df[column] == False)\n", + " arr_false[column] = false_count\n", + " \n", + "print(arr_false)\n", + "# => remove email_valid, because all are positive" + ] + }, + { + "cell_type": "markdown", + "id": "3568fb12", + "metadata": {}, + "source": [ + "### Gender (Out of First Name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d74633", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Dangerous to come to a conclusion based on Gender.\n", + "Women are as contact persons underrepresented in the XL category.\n", + "Possible reasons:\n", + " - Outside (more likely): Underrepresented in responsible positions with increasing size of the company\n", + " - Inside (less likely): The marketing team convinces men better\n", + "\"\"\"\n", + "import gender_guesser.detector as gender\n", + "gd = gender.Detector()\n", + "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", + "\n", + "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", + "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", + "total_counts = total_counts.rename(columns={'index':group_feature})\n", + "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", + "\n", + "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", + "result['proportion'] = result['count'] / result['total_count']\n", + "\n", + "category_order = ['XS','S','M','L','XL']\n", + "\n", + "\n", + "# Create separate DataFrames for each gender\n", + "# For better depiction .drop(index='XS') and take away XS from category_order\n", + "# andy: androgynous\n", + "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", + "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_male_data = result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n", + "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n", + "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n", + "\n", + "# Plotting\n", + "plt.plot(category_order, andy_data, label='Andy')\n", + "plt.plot(category_order, unknown_data, label='Unknown')\n", + "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n", + "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n", + "plt.plot(category_order, male_data, label='Male')\n", + "plt.plot(category_order, female_data, label='Female')\n", + "\n", + "# Set labels and title\n", + "plt.xlabel('MerchantSizeByDPV')\n", + "plt.ylabel('Proportion')\n", + "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n", + "\n", + "# Display the plot\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2315b4ea", + "metadata": {}, + "source": [ + "### 
Conclusion: Gender\n", + "1. Andy\n", + "2. Unknown + Mostly Female, Male + Mostly Male, Female" + ] + }, + { + "cell_type": "markdown", + "id": "07f9b71f", + "metadata": {}, + "source": [ + "## MCC Level (Type of Business)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4a15f7", + "metadata": {}, + "outputs": [], + "source": [ + "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n", + "grouped = mcc_group.unstack()\n", + "mcc_sum = mcc_group.groupby(level=0).sum()\n", + "\n", + "mcc_df = pd.concat([grouped, sum_test], axis=1)\n", + "tmp = mcc_df[0]\n", + "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n", + "mcc_df['Sum'] = tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e787a6cd", + "metadata": {}, + "outputs": [], + "source": [ + "print('Dropped the rows due to less than 50 examples:')\n", + "print(mcc_df[mcc_df['Sum']<50].index.values)\n", + "mcc_df = mcc_df[mcc_df['Sum']>=50]\n", + "\n", + "# Show every 10 categories (previously ordered by ascending XS), to compare the categories\n", + "# The first categories are the most attractive ones\n", + "for i in range(mcc_df.shape[0]): \n", + " if i % 10 == 0:\n", + " mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()" + ] + }, + { + "cell_type": "markdown", + "id": "9db7b90f", + "metadata": {}, + "source": [ + "### Conclusion: MCC\n", + "For example the most lucrative categories by each group. You have to consider that the first group is more lucrative than the second, etc.\n", + "\n", + "1. Café / Restaurant\n", + "2. Apparel\n", + "3. Book Stores\n", + "4. Pharmacy and Nutrition\n", + "5. Art Dealers and Categories" + ] + }, + { + "cell_type": "markdown", + "id": "ca0dc855", + "metadata": {}, + "source": [ + "## Google_places_detailed_type (Type of Business)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc010da", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "data = df[df['google_places_detailed_type'].notnull()]\n", + "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd76690a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "\"\"\"\n", + "- isna => category 0\n", + "- remove establishment and point_of_interest => category 1\n", + "- aggregate categories like sublocality? 
, or administrative_area_level_x\n", + "\"\"\"\n", + "\n", + "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n", + "docs = docs.apply(lambda row: ast.literal_eval(row))\n", + "\n", + "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n", + "categories = vectorizer.fit_transform(docs).toarray()\n", + "vectorizer.get_feature_names_out()" + ] + }, + { + "cell_type": "markdown", + "id": "78d3f491", + "metadata": {}, + "source": [ + "## Chi squared test for categorical variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd08fc9b", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import chi2_contingency\n", + "\n", + "# Create a contingency table for each feature\n", + "contingency_tables = {}\n", + "\n", + "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level', 'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", + "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", + "\n", + "#for feature_column in df.columns[df.columns != 'label']:\n", + "for feature_column in cat_col.columns:\n", + " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", + " contingency_tables[feature_column] = contingency_table\n", + "\n", + "# Perform chi-squared test for each feature\n", + "results = {}\n", + "for feature, table in contingency_tables.items():\n", + " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", + " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", + "\n", + "# Display the results\n", + "for feature, result in results.items():\n", + " print(f\"\\nChi-squared test for {feature}:\")\n", + " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", + " print(f\"P-value: {result['P-value']:.4f}\")\n", + " print(f\"Degrees of freedom: {result['Degrees of Freedom']}\")\n", + "\n", + "# p-value > 0.05 => ignore because hypothesis can't be rejected\n", + "# The higher the statistic the more influential\n", + "# => Ignore: number_area" + ] + }, + { + "cell_type": "markdown", + "id": "533b0983", + "metadata": {}, + "source": [ + "### Conclusion: Chi squared test\n", + "1. Ignore number_area (p-value > 0.05)\n", + "2. Best: \n", + " - MCC Level: Find the category of a merchant\n", + " - google_places_price_level: Get the prices of a company\n", + " - google_places_rating: different sources for number of ratings, but also possible social media likes [popularity measures]\n", + " \n", + "3. Middle: \n", + " - number_country\n", + " - google_places_confidence\n", + " - number_possible\n", + " - number_valid\n", + " - google_places_candidate_count_phone\n", + " - Gender\n", + "4. Worst:\n", + " - google_places_candidate_count_mail\n", + " - last_name_in_account\n", + " - google_places_place_id_matches_phone_search\n", + " - b_google_website\n", + " - first_name_in_account\n", + " - google_places_business_status\n", + "5. 
Invalid:\n", + " - number_area" + ] + }, + { + "cell_type": "markdown", + "id": "66816550", + "metadata": {}, + "source": [ + "----------------------------------\n", + "# Boolean features (Bayesian)" + ] + }, + { + "cell_type": "markdown", + "id": "c7600a68", + "metadata": {}, + "source": [ + "#### Count false per column\n", + "- email_valid: 0 => **not interesting**\n", + "- first_name_in_account: 7659 / 10,000\n", + "- last_name_in_account: 6872 / 10,000\n", + "- number_valid: 339 / 10,000\n", + "- number_possible: 297 / 10,000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c46eb0f4", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def b_bayesian(df,bin_column,b_value=True):\n", + " \n", + " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", + " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", + " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", + " posterior_B = (prior_A*evidence_A) / prior_B\n", + " \n", + " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "\n", + "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", + "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", + "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", + "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", + "\n", + "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", + "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", + "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", + "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", + "\n", + "# Ensure the 'Category' column is ordered\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "\n", + "# Plot the lines\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", + "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", + "plt.plot(categories_order, series_first_name, label='First name in account')\n", + "plt.plot(categories_order, series_last_name, label='Last name in account')\n", + "plt.plot(categories_order, series_possible, label='Number possible')\n", + "plt.plot(categories_order, series_valid, label='Number valid')\n", + "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", + "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", + "#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", + "\n", + "\n", + "plt.title('Bayesian')\n", + "plt.xlabel('Categories')\n", + "plt.ylabel('Percentages')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "62ca6202", + "metadata": {}, + "source": [ + "## Conclusion: Boolean features (Bayesian)\n", + "1. Email_valid: not interesting\n", + "2. Number not possible and number invalid have much lower XS values (25% less)\n", + "3. First name and last name in account have minor effects on it, and are a sign for smaller categories\n", + "4. 
Number valid, number possible, and first name / last name not in account are not very different from the overall data" + ] + }, + { + "cell_type": "markdown", + "id": "0d647f05", + "metadata": {}, + "source": [ + "-----------------------------------\n", + "# Quality Evaluation of the Discriminative Quality of Regionalatlas Features" + ] + }, + { + "cell_type": "markdown", + "id": "e43187de", + "metadata": {}, + "source": [ + "## Boxplots and Violinplots for Visual Observation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a18821e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "\n", + "class_colors = sns.color_palette(\"colorblind\")[:5]\n", + "regional_df = df.filter(like='regional', axis=1)\n", + "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", + "\n", + "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", + "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", + " \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18949200", + "metadata": {}, + "outputs": [], + "source": [ + "# Same like the boxplots but now with violinplots\n", + "for column in regional_df.filter(like='regional', axis=1).columns: \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, 
color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2fb6790a", + "metadata": {}, + "source": [ + "## Computing a Heuristic Metric to Identify Quality Groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7b2074", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "# Normalize the features before comparing / dividing them\n", + "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "x_scaled = min_max_scaler.fit_transform(x)\n", + "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", + "\n", + "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", + "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", + "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", + "\n", + "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "26a45107", + "metadata": {}, + "source": [ + "## Result: Quality Evaluation of the Discriminative Quality of Regionalatlas Features\n", + "\n", + "We can see that most of the negative values are related to age. The reason behind this might be the generational related consumer and payment behavior, and on socioeconomical reasons. Every generation in germany had a vastly different working/pension experience (f.e. Generation Silent / baby boomer / Gen Y) and finance/housing situation. It seems like the XL companies tend to be in the big german cities, which have a high population density and a high share of age_2 (25-44), and a flourishing service sector.\n", + "\n", + "1. pop_development: might be related to age_0 (more kids)\n", + "2. Employment rate: bigger cities (with high density) have a higher unemployment rate. But the high density might be able to compensate it.\n", + "3. age_0: minors are part of families, that might not be able to spend money in the service industry (biggest part of SumUp)\n", + "4. pop_avg_age & pop_avg_age_zensus: Different results because of different time the data was collected\n", + "5. age_4 (>64),age_3 (45-64): Are not age_0 group and might not be in the big cities / spent as much money as others in the service industry with card payment\n", + "\n", + "The following table provides information from the regionalatlas dataset. Each row represents a different feature, with corresponding absolute values computed previously and quality class. Higher values mean higher discriminative quality (for Value and Quality Class). 
The visual assessment is done by considering also the distribution identified in the violin plot.\n", + "\n", + "| Feature | Value | Quality Class (metric, boxplot related) | Quality Class (after visual assessment of violin plot) |\n", + "|---------------------------------------------|-------------------------|-------|-------|\n", + "| regional_atlas_pop_density | 0.581410 | 1 | 1 |\n", + "| regional_atlas_age_2 | 0.552632 | 1 | 1 |\n", + "| regional_atlas_age_3 | 0.488372 | 1 | 1 |\n", + "| regional_atlas_per_service_sector | 0.467153 | 1 | 1 |\n", + "| regional_atlas_regional_score | 0.420439 | 1 | 1 |\n", + "| regional_atlas_pop_avg_age_zensus | 0.384615 | 1 | 2 |\n", + "| regional_atlas_age_4 | 0.354839 | 1 | 2 |\n", + "| regional_atlas_investments_p_employee | 0.333333 | 1 | 1 |\n", + "| regional_atlas_pop_avg_age | 0.304348 | 1 | 2 |\n", + "| regional_atlas_age_0 | 0.222222 | 1 | 2 |\n", + "| regional_atlas_age_1 | 0.200000 | 2 | 3 |\n", + "| regional_atlas_unemployment_rate | 0.181818 | 2 | 3 |\n", + "| regional_atlas_gdp_p_inhabitant | 0.175996 | 2 | 2 |\n", + "| regional_atlas_gdp_development | 0.163934 | 2 | 3 |\n", + "| regional_atlas_per_trade | 0.162791 | 2 | 3 |\n", + "| regional_atlas_gdp_p_employee | 0.128667 | 2 | 3 |\n", + "| regional_atlas_employment_rate | 0.093333 | 2 | 3 |\n", + "| regional_atlas_gross_salary_p_employee | 0.060377 | 3 | 3 |\n", + "| regional_atlas_tot_income_p_taxpayer | 0.053691 | 3 | 3 |\n", + "| regional_atlas_pop_development | 0.026786 | 3 | 4 |\n", + "| regional_atlas_per_long_term_unemployment | 0.024096 | 3 | 3 |\n", + "| regional_atlas_gdp_p_workhours | 0.018957 | 4 | 4 |\n", + "| regional_atlas_disp_income_p_inhabitant | 0.000000 | 4 | 4 |\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "a3c6e408", + "metadata": {}, + "source": [ + "# Correlation above 0.89" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684af78c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Compute a correlation matrix for all float values of our dataframe\n", + "float_cols = df.columns[df.dtypes==float]\n", + "corr_matrix = df[float_cols].corr()\n", + "\n", + "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", + "np.fill_diagonal(corr_matrix.values, 0)\n", + "\n", + "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", + "correlation_threshold = 0.89\n", + "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", + "\n", + "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", + "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", + "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", + "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", + "\n", + "# Print the new correlation matrix and the corresponding plot\n", + "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Correlation Matrix Heatmap')\n", + "plt.savefig('correlation_matrix.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": 
"7a77e1c4", + "metadata": {}, + "source": [ + "## Result: Correlation Analysis\n", + "Might be recommendable to decide between the following features (based on the discrimination quality):\n", + "1. pop_avg_age and avg_age_zensus are different. Maybe the reason is the method or the time when the data was collected. Remove one of them\n", + "2. age_4 vs. avg_age / avg_age_zensus\n", + "3. gdp_p_workhours vs. gdp_p_employee\n", + "4. age_2 vs. age_3\n", + "5. Might remove Regional Score" + ] + }, + { + "cell_type": "markdown", + "id": "f0ebb3f6", + "metadata": {}, + "source": [ + "---------------------------------------\n", + "# PCA Analysis for Expensive Algorithms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba9ee1b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "reg_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", + "\n", + "# Apply PCA\n", + "pca = PCA()\n", + "principal_components = pca.fit_transform(scaled_data)\n", + "\n", + "# Retrieve explained variance ratios\n", + "explained_variance_ratio = pca.explained_variance_ratio_\n", + "\n", + "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", + "\n", + "# Print explained variance ratios\n", + "for i, ratio in enumerate(explained_variance_ratio, 1):\n", + " print(f\"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}\")\n", + "\n", + "# Plot the cumulative explained variance\n", + "cumulative_variance = explained_variance_ratio.cumsum()\n", + "\n", + "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", + "plt.title('Cumulative Explained Variance')\n", + "plt.xlabel('Number of Principal Components')\n", + "plt.ylabel('Cumulative Variance Explained')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "43e567f8", + "metadata": {}, + "source": [ + "---------------------------\n", + "# New Features" + ] + }, + { + "cell_type": "markdown", + "id": "b79613bf", + "metadata": {}, + "source": [ + "## Email Address Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d731eff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Count only those words, existing in a minum amount of 100 email adresses\n", + "count_vectorizer = CountVectorizer(min_df=50)\n", + "\n", + "# Fit and transform the text data\n", + "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", + "\n", + "# Convert the matrix to a DataFrame for better readability\n", + "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4314186c", + "metadata": {}, + "outputs": [], + "source": [ + "common_words = pd.DataFrame(count_df.sum()).transpose()\n", + "\n", + "for word in common_words:\n", + " print(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf6ba75", + "metadata": {}, + "outputs": [], + "source": [ + "# Names\n", + "names = [\n", + " 'alex', 'alexander', 'alexandra', 'ali', 'andre', 'andrea', 'andreas',\n", + " 'anja', 'anke', 'anna', \n", + " 'barbara', 'bauer', 'becker',\n", + " 'birgit', \n", + " 
'christian', 'christina',\n", + " 'claudia', 'daniel', 'daniela', 'david',\n", + " 'dirk',\n", + " 'eva', \n", + " 'fischer', 'florian', \n", + " 'frank', \n", + " \n", + " 'heike', \n", + " \n", + " 'jan', 'jana', 'jens', 'joerg',\n", + " 'julia', 'kai', 'karin', 'katharina',\n", + " 'kathrin', 'katja', 'katrin', 'kerstin', \n", + " 'klaus', \n", + " 'lisa', 'manuela', 'marc',\n", + " 'marco', 'maria', 'marion', 'markus', 'martin', 'martina', 'matthias',\n", + " 'melanie', 'meyer', 'michael', 'michaela',\n", + " 'monika','mueller', 'nadine',\n", + " 'nicole', \n", + " 'oliver', \n", + " 'peter', 'petra', 'philipp', \n", + " 'ralf', \n", + " 'richter', 'robert', 'sabine', 'sabrina',\n", + " 'sandra', 'sarah', 'schmidt', 'schmitz', 'schneider',\n", + " 'sebastian', 'silke', 'simon', 'simone', 'sonja', \n", + " 'stefan', 'stefanie', 'steffi', \n", + " 'susanne', 'sven', 'tanja', \n", + " 'thomas','tim', 'tobias', 'uwe',\n", + " 'wagner', 'weber', \n", + " 'weiss', 'werner', 'wolf']\n", + "\n", + "# Weird terms\n", + "weird_terms = ['hallo', 'hello', 'moin', 'sumup']\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa498485", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_common_words = []\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", + " \n", + " indices= count_df[count_df[word]>0].index \n", + " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", + " \n", + " grouped_common_words.append((per_word-per_size).rename(word)) \n", + " \n", + "common_df = pd.concat(grouped_common_words, axis=1)\n", + "common_df = common_df.transpose()\n", + "\n", + "common_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aeb199a", + "metadata": {}, + "outputs": [], + "source": [ + "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", + "\n", + "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", + "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", + "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", + "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", + "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" + ] + }, + { + "cell_type": "markdown", + "id": "dc9ac1c0", + "metadata": {}, + "source": [ + "## Result: Email Address Analysis\n", + "If possible add the common words as one hot vector to the model" + ] + }, + { + "cell_type": "markdown", + "id": "83e84e7c", + "metadata": {}, + "source": [ + "## Financial report (Bundesanzeiger)\n", + "\n", + "#### Note:\n", + "- Execution: The underlying cell needs to be processed in a separate python file\n", + "- Source: It is difficult to find for Germany corresponding APIs for the financial report registers\n", + "- Choice of Source: Bundesanzeiger is the most promising register, due to its restriction for bigger companies. 
Problem is that each access takes at least 6sec (internet connection + captcha + ...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d47b36", + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "from deutschland.bundesanzeiger import Bundesanzeiger\n", + "import pickle\n", + "import time\n", + "\n", + "def access_ba(company,b_bundesanzeiger,):\n", + "\n", + " b_bundesanzeiger.append(True)\n", + " try:\n", + " ba = Bundesanzeiger()\n", + " data = ba.get_reports(company)\n", + " except:\n", + " b_bundesanzeiger[-1] = False\n", + " return\n", + "\n", + " if __name__ == '__main__':\n", + "\n", + " \"\"\"\n", + " with open('list_file.pkl', 'rb') as file:\n", + " loaded_list = pickle.load(file)\n", + " print(loaded_list)\n", + " \"\"\"\n", + "\n", + " pd.set_option('display.max_columns', None)\n", + "\n", + " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", + "\n", + " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", + "\n", + "\n", + " with multiprocessing.Manager() as manager:\n", + "\n", + " b_bundesanzeiger = manager.list()\n", + " content_array = []\n", + " durations = []\n", + "\n", + " for i, company in enumerate(df[\"Company Name\"]):\n", + "\n", + " print(i)\n", + "\n", + " start = time.time()\n", + "\n", + " # Start access_ba as a process\n", + " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", + "\n", + " p.start()\n", + "\n", + " # Wait 8 seconds for access_ba\t\n", + " p.join(8)\n", + "\n", + " # If thread is active\n", + " if p.is_alive():\n", + " print (\"Terminate access_ba\")\n", + "\n", + " # Terminate access_ba\n", + " p.terminate()\n", + " b_bundesanzeiger[-1] = 'killed'\n", + "\n", + " # Cleanup\n", + " p.join()\n", + " i+=1\n", + "\n", + " print(b_bundesanzeiger[-1])\n", + " end = time.time()\n", + " print(end-start)\n", + " print()\n", + " durations.append(end-start)\n", + "\n", + " \"\"\"if i==100:\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + " print(np.mean(np.array(list(b_bundesanzeiger))))\n", + " break\n", + " \"\"\"\n", + "\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + "\n", + " with open('time.pkl', 'wb') as file:\n", + " pickle.dump(durations, file)\n", + "\n", + " df.to_pickle(\"./dataframe_sample.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a6f460", + "metadata": {}, + "outputs": [], + "source": [ + "with open('dataframe_sample.pkl', 'rb') as f:\n", + " df = pickle.load(f)\n", + "\n", + "df = df.reset_index(drop=True)\n", + "\n", + "with open('list_file.pkl', 'rb') as f:\n", + " mynewlist = pickle.load(f)\n", + "\n", + "with open('time.pkl', 'rb') as f:\n", + " time = pickle.load(f)\n", + "\n", + "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", + "\n", + "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", + "df['time'] = df_stats['time']\n", + "\n", + "\n", + "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / 
total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "markdown", + "id": "956536b8", + "metadata": {}, + "source": [ + "## Result: Financial report (Bundesanzeiger)\n", + "- After 8 seconds the connection is killed, because it can take minutes. Most need less than 8 seconds.\n", + "- Results are based on a sample of 100 for each class\n", + "- Difference regarding the True values\n", + "\n", + "| b_bundesanzeiger | False | True | Killed |\n", + "|--------------------|-------|------|--------|\n", + "| MerchantSizeByDPV | | | |\n", + "| XS | 0.59 | 0.24 | 0.17 |\n", + "| S | 0.73 | 0.14 | 0.13 |\n", + "| M | 0.57 | 0.34 | 0.09 |\n", + "| L | 0.53 | 0.30 | 0.17 |\n", + "| XL | 0.45 | 0.41 | 0.14 |\n" + ] + }, + { + "cell_type": "markdown", + "id": "8504a6e0", + "metadata": {}, + "source": [ + "## Existence of company in google places" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b18dea", + "metadata": {}, + "outputs": [], + "source": [ + "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n", + "counts =df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "markdown", + "id": "f42904bc", + "metadata": {}, + "source": [ + "### Result\n", + "- Bigger companies are easier found than smaller companies on google places\n", + "- Average of 77% are found on google_places (but we can not be 100% sure they are correct)" + ] + }, + { + "cell_type": "markdown", + "id": "eb71d595", + "metadata": {}, + "source": [ + "# ML - Model" + ] + }, + { + "cell_type": "markdown", + "id": "4738be7c", + "metadata": {}, + "source": [ + "## Ideas\n", + "1. Imputation due to missing values\n", + "2. SMOTE or GAN / Diffusion Model / ... to create more values against the imbalance\n", + "3. Consider L and XL as outlier (using Outlier Detection)\n", + "4. StratifiedKFold\n", + "5. 
Weighted Classifiers (for example higher weight for class XL using a Random Forest)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f167c71", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# Separate features (X) and target variable (y)\n", + "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n", + "y = table['MerchantSizeByDPV']\n", + "X=table[regional_columns]\n", + "\n", + "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n", + "\n", + "# Split the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "# Create a logistic regression model\n", + "model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n", + "\n", + "# Fit the model on the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Make predictions on the testing data\n", + "y_pred = model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "class_report = classification_report(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e6afdec", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", + "\n", + "# Assuming X and y are your feature matrix and target variable\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "# Create and fit the Random Forest model\n", + "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "rf_model.fit(X_train, y_train)\n", + "\n", + "# Make predictions on the test set\n", + "y_pred = rf_model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "class_report = classification_report(y_test, y_pred)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "4da857f4", + "metadata": {}, + "source": [ + "# Idea: Testing outlier detection to identify XL" + ] + }, + { + "cell_type": "markdown", + "id": "773b81db", + "metadata": {}, + "source": [ + "## Isolation Forest\n", + "Act like XL is an anomaly and we try to identify it" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abea2245", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", + "\n", + "# Assuming X and y are your feature matrix and target variable\n", + "X_train, X_test, y_train, y_test = 
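Ideas 4 and 5 above (StratifiedKFold and weighted classifiers) are listed but not exercised by the logistic-regression and Random-Forest cells. A minimal sketch of how they could be combined, assuming the `X` and `y` defined in the logistic-regression cell and using purely illustrative, untuned class weights, is given here; it is an added illustration, not part of the original notebook:

```python
# Sketch only: StratifiedKFold + class-weighted Random Forest (Ideas 4 and 5 above).
# Assumes X (regional features) and y (MerchantSizeByDPV) from the cell above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Illustrative (untuned) weights: the rarer, larger merchant sizes get a higher weight.
class_weights = {"XS": 1, "S": 2, "M": 4, "L": 8, "XL": 16}

weighted_rf = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weights,
    random_state=42,
)

# StratifiedKFold keeps the class proportions of the imbalanced target identical
# in every fold, so each fold still contains L and XL examples.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(weighted_rf, X, y, cv=cv, scoring="f1_macro")
print(f"Macro F1 per fold: {scores}, mean: {scores.mean():.3f}")
```

Macro F1 is used instead of accuracy so that the small L and XL classes contribute as much to the score as XS.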
train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Define the set of rare classes\n", + "rare_classes = ['XL'] # Replace with the actual class labels you consider rare\n", + " # MAYBE NOT ONLY XL, but also L and M\n", + "# Create a binary target variable indicating whether each instance is rare or not\n", + "y_train_rare = y_train.isin(rare_classes).astype(int)\n", + "y_test_rare = y_test.isin(rare_classes).astype(int)\n", + "\n", + "# Create and fit the Isolation Forest model\n", + "if_model = IsolationForest(contamination='auto')\n", + "if_model.fit(X_train)\n", + "\n", + "# Predict anomalies on the test set\n", + "y_pred_rare = if_model.predict(X_test)\n", + "\n", + "# Convert the predicted labels to binary (1 for anomalies, -1 for normal instances)\n", + "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n", + "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n", + "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,\n", + " xticklabels=[0,1], yticklabels=[0,1])\n", + "plt.xlabel('Predicted Label')\n", + "plt.ylabel('True Label')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_pytorch_p310", + "language": "python", + "name": "conda_pytorch_p310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/fabian_feature_analysis.ipynb.license b/notebooks/fabian_feature_analysis.ipynb.license new file mode 100644 index 0000000..72915b0 --- /dev/null +++ b/notebooks/fabian_feature_analysis.ipynb.license @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Fabian-Paul Utech From eb39e24ea4dfe3be792c4ba47379b88fced8495b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Tue, 6 Feb 2024 17:59:41 +0100 Subject: [PATCH 45/51] supress warnings, fix preprocessing path for historical df Co-authored-by: Simon Zimmermann MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/database/leads/local_repository.py | 3 +++ src/database/leads/repository.py | 13 ++++++++++++- src/database/leads/s3_repository.py | 3 +++ src/demo/demos.py | 10 +++++++--- src/preprocessing/preprocessing.py | 2 +- 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index 973562f..f41ba1d 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -24,6 +24,9 @@ class LocalRepository(Repository): DF_OUTPUT = os.path.abspath( os.path.join(BASE_PATH, "../../data/leads_enriched.csv") ) + DF_HISTORICAL_OUTPUT = os.path.abspath( + os.path.join(BASE_PATH, "../../data/100k_historic_enriched.csv") + ) DF_PREPROCESSED_INPUT = os.path.abspath( os.path.join(BASE_PATH, 
"../../data/preprocessed_data_files/") ) diff --git a/src/database/leads/repository.py b/src/database/leads/repository.py index 25afde1..90c9a78 100644 --- a/src/database/leads/repository.py +++ b/src/database/leads/repository.py @@ -18,12 +18,21 @@ def DF_INPUT(self): pass @property + @abstractmethod def DF_OUTPUT(self): """ Define database path to store dataframe """ pass + @property + @abstractmethod + def DF_HISTORICAL_OUTPUT(self): + """ + Define database path to store historical enriched dataframe (used for preprocessing input) + """ + pass + @property @abstractmethod def REVIEWS(self): @@ -65,7 +74,9 @@ def set_dataframe(self, df): def get_input_path(self): return self.DF_INPUT - def get_output_path(self): + def get_enriched_data_path(self, historical=False): + if historical: + return self.DF_HISTORICAL_OUTPUT return self.DF_OUTPUT @abstractmethod diff --git a/src/database/leads/s3_repository.py b/src/database/leads/s3_repository.py index 3e434ce..912f5b3 100644 --- a/src/database/leads/s3_repository.py +++ b/src/database/leads/s3_repository.py @@ -43,6 +43,9 @@ class S3Repository(Repository): MODELS_BUCKET = "amos--models" DF_INPUT = f"s3://{EVENTS_BUCKET}/leads/enriched.csv" DF_OUTPUT = f"s3://{EVENTS_BUCKET}/leads/enriched.csv" + DF_HISTORICAL_OUTPUT = ( + f"s3://{EVENTS_BUCKET}/historical_data/100k_historic_enriched.csv" + ) DF_PREDICTION_OUTPUT = f"s3://{EVENTS_BUCKET}/leads/leads_predicted_size.csv" DF_PREPROCESSED_INPUT = f"s3://{FEATURES_BUCKET}/preprocessed_data_files/" REVIEWS = f"s3://{EVENTS_BUCKET}/reviews/" diff --git a/src/demo/demos.py b/src/demo/demos.py index 58050cb..4d90acc 100644 --- a/src/demo/demos.py +++ b/src/demo/demos.py @@ -8,6 +8,7 @@ import re +import warnings import pandas as pd import xgboost as xgb @@ -33,6 +34,10 @@ from logger import get_logger from preprocessing import Preprocessing +warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning) +warnings.simplefilter(action="ignore", category=FutureWarning) + + log = get_logger() # Constants and configurations @@ -202,7 +207,7 @@ def pipeline_demo(): steps_info = "\n".join([str(step) for step in steps]) log.info( - f"Running Pipeline with steps:\n{steps_info}\ninput_location={get_database().get_input_path()}\noutput_location={get_database().get_output_path()}" + f"Running Pipeline with steps:\n{steps_info}\ninput_location={get_database().get_input_path()}\noutput_location={get_database().get_enriched_data_path()}" ) pipeline = Pipeline( @@ -272,8 +277,7 @@ def predict_MerchantSize_on_lead_data_demo(): missing_columns = set(historical_columns_order) - set( unlabeled_preprocessed_data.columns ) - for column in missing_columns: - unlabeled_preprocessed_data[column] = 0 + unlabeled_preprocessed_data[list(missing_columns)] = 0 for column in unlabeled_preprocessed_data.columns: if column not in historical_columns_order: diff --git a/src/preprocessing/preprocessing.py b/src/preprocessing/preprocessing.py index d9f9c29..aa91935 100644 --- a/src/preprocessing/preprocessing.py +++ b/src/preprocessing/preprocessing.py @@ -31,7 +31,7 @@ class Preprocessing: def __init__(self, filter_null_data=True, historical_bool=True): data_repo = get_database() - self.data_path = data_repo.get_output_path() + self.data_path = data_repo.get_enriched_data_path(historical=historical_bool) self.preprocessed_df = None self.preprocessed_data_output_path = data_repo.get_preprocessed_data_path( historical_bool From 0c3b36953c07a54718eafa43b7079062fbc2cc3e Mon Sep 17 00:00:00 2001 From: Fabian Utech 
<52272272+ur-tech@users.noreply.github.com> Date: Tue, 6 Feb 2024 18:07:01 +0100 Subject: [PATCH 46/51] Update fabian_feature_analysis.ipynb Signed-off-by: Fabian Utech <52272272+ur-tech@users.noreply.github.com> --- notebooks/fabian_feature_analysis.ipynb | 35 +------------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb index 1b10567..2dc3933 100644 --- a/notebooks/fabian_feature_analysis.ipynb +++ b/notebooks/fabian_feature_analysis.ipynb @@ -1290,40 +1290,7 @@ "outputs": [], "source": [ "# Names\n", - "names = [\n", - " 'alex', 'alexander', 'alexandra', 'ali', 'andre', 'andrea', 'andreas',\n", - " 'anja', 'anke', 'anna', \n", - " 'barbara', 'bauer', 'becker',\n", - " 'birgit', \n", - " 'christian', 'christina',\n", - " 'claudia', 'daniel', 'daniela', 'david',\n", - " 'dirk',\n", - " 'eva', \n", - " 'fischer', 'florian', \n", - " 'frank', \n", - " \n", - " 'heike', \n", - " \n", - " 'jan', 'jana', 'jens', 'joerg',\n", - " 'julia', 'kai', 'karin', 'katharina',\n", - " 'kathrin', 'katja', 'katrin', 'kerstin', \n", - " 'klaus', \n", - " 'lisa', 'manuela', 'marc',\n", - " 'marco', 'maria', 'marion', 'markus', 'martin', 'martina', 'matthias',\n", - " 'melanie', 'meyer', 'michael', 'michaela',\n", - " 'monika','mueller', 'nadine',\n", - " 'nicole', \n", - " 'oliver', \n", - " 'peter', 'petra', 'philipp', \n", - " 'ralf', \n", - " 'richter', 'robert', 'sabine', 'sabrina',\n", - " 'sandra', 'sarah', 'schmidt', 'schmitz', 'schneider',\n", - " 'sebastian', 'silke', 'simon', 'simone', 'sonja', \n", - " 'stefan', 'stefanie', 'steffi', \n", - " 'susanne', 'sven', 'tanja', \n", - " 'thomas','tim', 'tobias', 'uwe',\n", - " 'wagner', 'weber', \n", - " 'weiss', 'werner', 'wolf']\n", + "names = []\n", "\n", "# Weird terms\n", "weird_terms = ['hallo', 'hello', 'moin', 'sumup']\n", From 65e1780fa1967ab5d921b5ed0724656f8dffc27b Mon Sep 17 00:00:00 2001 From: Tims777 Date: Tue, 6 Feb 2024 18:20:56 +0100 Subject: [PATCH 47/51] Add demo_pipeline.json Signed-off-by: Tims777 --- src/demo/pipeline_configs/demo_pipeline.json | 39 +++++++++++++++++++ .../demo_pipeline.json.license | 2 + 2 files changed, 41 insertions(+) create mode 100644 src/demo/pipeline_configs/demo_pipeline.json create mode 100644 src/demo/pipeline_configs/demo_pipeline.json.license diff --git a/src/demo/pipeline_configs/demo_pipeline.json b/src/demo/pipeline_configs/demo_pipeline.json new file mode 100644 index 0000000..85df118 --- /dev/null +++ b/src/demo/pipeline_configs/demo_pipeline.json @@ -0,0 +1,39 @@ +{ + "description": "This config is optimized for demoing our software.", + "config": { + "steps": [ + { + "name": "HashGenerator", + "force_refresh": true + }, + { + "name": "AnalyzeEmails", + "force_refresh": true + }, + { + "name": "PreprocessPhonenumbers", + "force_refresh": true + }, + { + "name": "GooglePlaces", + "force_refresh": true + }, + { + "name": "GooglePlacesDetailed", + "force_refresh": true + }, + { + "name": "GPTReviewSentimentAnalyzer", + "force_refresh": true + }, + { + "name": "SmartReviewInsightsEnhancer", + "force_refresh": true + }, + { + "name": "RegionalAtlas", + "force_refresh": true + } + ] + } +} diff --git a/src/demo/pipeline_configs/demo_pipeline.json.license b/src/demo/pipeline_configs/demo_pipeline.json.license new file mode 100644 index 0000000..875941a --- /dev/null +++ b/src/demo/pipeline_configs/demo_pipeline.json.license @@ -0,0 +1,2 @@ +SPDX-License-Identifier: MIT +SPDX-FileCopyrightText: 2024 
Simon Zimmermann From e92a57b6a94cfe3f3a4a341702a6afa0291ad5d3 Mon Sep 17 00:00:00 2001 From: Fabian Utech <52272272+ur-tech@users.noreply.github.com> Date: Tue, 6 Feb 2024 20:36:39 +0100 Subject: [PATCH 48/51] Update fabian_feature_analysis.ipynb Signed-off-by: Fabian Utech <52272272+ur-tech@users.noreply.github.com> --- notebooks/fabian_feature_analysis.ipynb | 1736 +---------------------- 1 file changed, 1 insertion(+), 1735 deletions(-) diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb index 2dc3933..8b13789 100644 --- a/notebooks/fabian_feature_analysis.ipynb +++ b/notebooks/fabian_feature_analysis.ipynb @@ -1,1735 +1 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "23afeabd", - "metadata": {}, - "source": [ - "# The Dataset: Summary\n", - "1. Missing values exist\n", - "2. Correlations between RegionalAtlas Data\n", - "3. Input: numerical + bool + categorical datatype\n", - "4. Imbalanced Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d3b226f", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n", - "categories_order = ['XS', 'S', 'M', 'L','XL']\n", - "df.dtypes" - ] - }, - { - "cell_type": "markdown", - "id": "18fa38d5", - "metadata": {}, - "source": [ - "--------------------------------\n", - "# Feature Summary\n", - "\n", - "### Not interesting / label\n", - "- Last Name\n", - "- Account Owner\n", - "- Phone\n", - "- domain\n", - "- google_places_place_id_matches_phone_search\n", - "- number_formatted\n", - "- google_places_name\n", - "- MerchantSizeByDPV => **y**\n", - "\n", - "### Only for testing\n", - "- MCC Level: Is only available in the dataset, but it can be used to check if the feature is helpful\n", - "\n", - "### Can be used directly (Categorical / Boolean)\n", - "Regarding the categorical features, the country is important and the price level of the company (google only has ones specific to restaurants). \n", - "\n", - "- *email_valid*\n", - "- number_possible\n", - "- number_valid\n", - "- first_name_in_account\n", - "- last_name_in_account\n", - "- google_places_business_status\n", - "- google_places_place_id_matches_phone_search\n", - "- *number_area*\n", - "- **number_country**\n", - "- **google_places_price_level**\n", - "- google_places_confidence\n", - "\n", - "### Numerical Features\n", - "The most important features overall (not only regarding numerical features) are the number of ratings and the rating itself. Logically if many users rate a company, they most likely bought products at the corresponding company. Therefore the more the better. Interesting is that a rating in the middle range is more likely for larger companies, than the extreme values. \n", - "\n", - "- reviews_sentiment_score\n", - "- **google_places_user_ratings_total**\n", - "- **google_places_rating** \n", - "- google_places_candidate_count_mail \n", - "- google_places_candidate_count_phone\n", - "\n", - "\n", - "### Regionalatlas (Numerical Features)\n", - "Regarding regional information, the population density can be considered the most important feature because the leads that are based in the service sector have a higher chance to attract customers. Additionally the age structure is important. In Germany the age group between 25-44 is the most important one, due to their expenditure behavior and the acceptance of mobile payment. 
Please note that the information is specific to Germany and most likely vary in other countries. Specific to SumUp Leads we also noticed that a large service sector in the region is indication for leads due to the financial model (using card and mobile payment). \n", - "\n", - "- **regional_atlas_pop_density** \n", - "- regional_atlas_pop_development \n", - "- regional_atlas_age_0 \n", - "- regional_atlas_age_1 \n", - "- **regional_atlas_age_2** (pos) \n", - "- **regional_atlas_age_3** (neg) \n", - "- **regional_atlas_age_4** (neg) \n", - "- regional_atlas_pop_avg_age \n", - "- regional_atlas_per_service_sector \n", - "- regional_atlas_per_trade \n", - "- regional_atlas_employment_rate \n", - "- regional_atlas_unemployment_rate \n", - "- regional_atlas_per_long_term_unemployment\n", - "- regional_atlas_investments_p_employee \n", - "- regional_atlas_gross_salary_p_employee \n", - "- regional_atlas_disp_income_p_inhabitant \n", - "- regional_atlas_tot_income_p_taxpayer \n", - "- regional_atlas_gdp_p_employee \n", - "- regional_atlas_gdp_development \n", - "- regional_atlas_gdp_p_inhabitant \n", - "- regional_atlas_gdp_p_workhours \n", - "- regional_atlas_pop_avg_age_zensus \n", - "- regional_atlas_regional_score \n", - "\n", - "\n", - "### Processing needed (Ideas for future)\n", - "- google_places_place_id => b_google_places\n", - "- First Name => Gender\n", - "- google_places_detailed_website => bool website\n", - "- Email (ending / provider / company email address / ...) => CountVectorizer\n", - "- Company Name (GmbH, Restaurant, ...) => CountVectorizer\n", - "- google_places_formatted_address => City\n", - "- google_places_detailed_type => Cluster / one hot encoder" - ] - }, - { - "cell_type": "markdown", - "id": "988d4272", - "metadata": {}, - "source": [ - "----------------\n", - "# Future Ideas \n", - "## Data\n", - "1. Financial reports\n", - " - Very promising to see potential revenue which might need to be combined with merchant category (due to different payment methods)\n", - " - Source: Specific to the country (e.g. Germany has Bundesanzeiger, Handelsregister, ...)\n", - " - Processing: Using an API, either just checking the existence in the corresponding register or detailed information (financial reports) \n", - "2. Neighborhood\n", - " - Gain information about the potential revenue / price of the company based on the neighborhood\n", - " - Source: Google places for location, all other steps / sources mentioned previously for evaluating the neighbors\n", - " - Processing: Clustering of typical neighborhoods \n", - "3. Price & Product / Merch Category\n", - " - Identify price categories and product types which would help identify the merch category\n", - " - Source: Website / social media\n", - " - Processing: website parsing / image detection \n", - "4. User behavior analysis\n", - " - Identify certain user behavior types to see if there are differences\n", - " - Source: Own data\n", - " - Processing: user behavior on the website (duration how long they need to fill the form, ...)\n", - "5. Process analysis\n", - " - Analysis of the process by testing how the interaction looks like and if there is a certain behavior / structure observable that is specific to the size of the company\n", - " - Source: Own data\n", - " - Processing: Information about how long the process takes / how many calls / different numbers or contact / availability\n", - "6. 
Confidence metric:\n", - " - Creating a sophisticated confidence metric based on similar previous results\n", - " - Using multiple features to compare it with the added sources\n", - " \n", - "## Model\n", - "1. Using GAN or other generative models to handle imbalance and missing values\n", - "2. Using outlier detection for smaller classes\n", - "3. Extensive hyperparameter tuning\n", - "4. Weighted models based on importance of the class" - ] - }, - { - "cell_type": "markdown", - "id": "e59d30fb", - "metadata": {}, - "source": [ - "# Dataset Description" - ] - }, - { - "cell_type": "markdown", - "id": "a2bee470", - "metadata": {}, - "source": [ - "## Imbalanced Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f508fadb", - "metadata": {}, - "outputs": [], - "source": [ - "print(len(df))\n", - "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)" - ] - }, - { - "cell_type": "markdown", - "id": "86fdc167", - "metadata": {}, - "source": [ - "## Repetitions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a850c5e7", - "metadata": {}, - "outputs": [], - "source": [ - "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n", - "identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n", - "for col in identifier:\n", - " print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})') " - ] - }, - { - "cell_type": "markdown", - "id": "37ce359c", - "metadata": {}, - "source": [ - "**Repetitions exist except for Email => Email as Identifier**" - ] - }, - { - "cell_type": "markdown", - "id": "e85e1911", - "metadata": {}, - "source": [ - "## Placeholder Values of Regionalatlas\n", - "\n", - "### Problem: Regionatlas, Defined Placeholder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7d07397", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\" Regionalatlas: Placeholder\n", - "2222222222: nichts vorhanden, genau 0\n", - "5555555555: Zahlenwert unbekannt oder geheim zu halten\n", - "6666666666: Tabellenfach gesperrt, da Aussage nicht sinnvoll\n", - "7777777777: keine Angabe, da Zahlenwert nicht sicher genug\n", - "8888888888: Angabe fällt später an\n", - "\"\"\"\n", - "\n", - "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n", - "regional_df = df.filter(like='regional', axis=1).dropna()\n", - "\n", - "# Dictionary to know which columns and indices have problematic values\n", - "rem_dic = {}\n", - "columns = []\n", - "\n", - "filter_df = regional_df.copy()\n", - "\n", - "for exc in exclude_values:\n", - " # Find all columns that have those values we need to exclude\n", - " col = regional_df.loc[:,(np.sum(df == exc,axis=0)>0)].columns.tolist()\n", - "\n", - " columns+=col\n", - " \n", - " \n", - " # Now we can use those columns to find the corresponding rows\n", - " for c in col:\n", - " indices = regional_df.loc[(np.sum(df == exc,axis=1)>0),col].index.tolist() \n", - " \n", - " rem_dic[c] = {str(exc):indices}\n", - " \n", - " filter_df = filter_df[df[c]!=exc]\n", - " print(f'column:{c}, value:{exc}')\n", - " \n", - "print(rem_dic)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16fcbf05", - "metadata": {}, - "outputs": [], - "source": [ - "# Irregular values defined by regionalatlas needs to be translated to nan so we can handle it later on\n", - "import 
numpy as np\n", - "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n", - "\n", - "print(\"Changed the following features, because of irregular values of regionalatlas:\")\n", - "for col in regional_atlas:\n", - " n_irr = (df[col]>=2222222222).sum()\n", - " n = (df[col].notnull()).sum()\n", - " \n", - " if (n_irr>0):\n", - " print(col+': '+str(n_irr)+' out of '+ str(n))\n", - " df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n" - ] - }, - { - "cell_type": "markdown", - "id": "c7ab822c", - "metadata": {}, - "source": [ - "Changed the following features, because of irregular values of regionalatlas:\n", - "- regional_atlas_pop_development: 76 out of 68793\n", - "- regional_atlas_investments_p_employee: 3736 out of 68793\n", - "- regional_atlas_gross_salary_p_employee: 632 out of 68793\n", - "- regional_atlas_tot_income_p_taxpayer: 34 out of 68827" - ] - }, - { - "cell_type": "markdown", - "id": "5411cb6e", - "metadata": {}, - "source": [ - "## Empty Values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "730b574a", - "metadata": {}, - "outputs": [], - "source": [ - "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", - "print(f'Empty: {isna}')\n", - "print(f'Not empty: {df.shape[0]-isna}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "35186540", - "metadata": {}, - "outputs": [], - "source": [ - "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", - "\n", - "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", - "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" - ] - }, - { - "cell_type": "markdown", - "id": "f507c51d", - "metadata": {}, - "source": [ - "#### Different amounts of NaN values\n", - "- regional_atlas_pop_avg_age : 3213\n", - "- regional_atlas_per_service_sector : 3211\n", - "- regional_atlas_per_trade : 3211\n", - "- regional_atlas_unemployment_rate : 3119\n", - "- regional_atlas_disp_income_p_inhabitant : 3211\n", - "- regional_atlas_tot_income_p_taxpayer : 3211\n", - "- regional_atlas_gdp_p_workhours : 3211\n", - "- regional_atlas_pop_avg_age_zensus : 3119" - ] - }, - { - "cell_type": "markdown", - "id": "7f0a7994", - "metadata": {}, - "source": [ - "----------------------\n", - "# Numerical Features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27014d88", - "metadata": {}, - "outputs": [], - "source": [ - "min_max_df = df.agg({\n", - "'google_places_user_ratings_total':[\"min\",\"max\"],\n", - "'google_places_rating':[\"min\",\"max\"],\n", - "'google_places_price_level':[\"min\",\"max\"],\n", - "'reviews_sentiment_score':[\"min\",\"max\"],\n", - "'regional_atlas_age_0':[\"min\",\"max\"],\n", - "'regional_atlas_age_1':[\"min\",\"max\"],\n", - "'regional_atlas_age_2':[\"min\",\"max\"],\n", - "'regional_atlas_age_3':[\"min\",\"max\"],\n", - "'regional_atlas_age_4':[\"min\",\"max\"],\n", - "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", - "'regional_atlas_per_trade':[\"min\",\"max\"],\n", - "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", - "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", - "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", - "'regional_atlas_pop_density':[\"min\",\"max\"],\n", - "'regional_atlas_pop_development':[\"min\",\"max\"],\n", - "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", - "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", - 
"'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", - "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", - "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", - "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", - "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", - "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", - "'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", - "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", - "'regional_atlas_regional_score':[\"min\",\"max\"]\n", - "})\n", - "\n", - "# Apply the function for each column\n", - "for col in min_max_df.columns:\n", - " min_feature = min_max_df[col]['min']\n", - " max_feature = min_max_df[col]['max']\n", - " print(f'{col}: [{min_feature}, {max_feature}]') " - ] - }, - { - "cell_type": "markdown", - "id": "5f90f597", - "metadata": {}, - "source": [ - "## Range of the features\n", - "\n", - "| Variable | Theoretical Range | Practical Range |\n", - "|---------------------------------------------|-------------------|---------------------------|\n", - "| google_places_user_ratings_total | [0, inf] | [0.0, 86141.0] |\n", - "| google_places_rating | [1, 5] | [0.0, 5.0] |\n", - "| google_places_price_level | [1.0, 4.0] | [1.0, 4.0] |\n", - "| reviews_sentiment_score | [-1.0, 1] | [-1.0, 0.95] |\n", - "| regional_atlas_age_0 | [0, 100] | [12.6, 20.1] |\n", - "| regional_atlas_age_1 | [0, 100] | [4.2, 13.1] |\n", - "| regional_atlas_age_2 | [0, 100] | [18.7, 33.7] |\n", - "| regional_atlas_age_3 | [0, 100] | [21.6, 32.9] |\n", - "| regional_atlas_age_4 | [0, 100] | [15.9, 33.5] |\n", - "| regional_atlas_per_service_sector | [0, 100] | [44.4, 94.0] |\n", - "| regional_atlas_per_trade | [0, 100] | [13.8, 48.4] |\n", - "| regional_atlas_employment_rate | [0, 100] | [47.4, 72.4] |\n", - "| regional_atlas_unemployment_rate | [0, 100] | [1.6, 12.0] |\n", - "| regional_atlas_per_long_term_unemployment | [0, 100] | [14.5, 55.5] |\n", - "| regional_atlas_pop_density | [0, inf] | [35.3, 4788.2] |\n", - "| regional_atlas_pop_development | [-inf, inf] | [-180.4, 2567.6] |\n", - "| regional_atlas_pop_avg_age | [0, inf] | [40.7, 51.0] |\n", - "| regional_atlas_investments_p_employee | [0, inf] | [2.4, 51.0] |\n", - "| regional_atlas_gross_salary_p_employee | [0, inf] | [30.1, 90.6] |\n", - "| regional_atlas_disp_income_p_inhabitant | [0, inf] | [17635.0, 36686.0] |\n", - "| regional_atlas_tot_income_p_taxpayer | [0, inf] | [30.0, 74.3] |\n", - "| regional_atlas_gdp_p_employee | [0, inf] | [56707.0, 153485.0] |\n", - "| regional_atlas_gdp_development | [-inf, inf] | [-3.7, 80.0] |\n", - "| regional_atlas_gdp_p_inhabitant | [0, inf] | [17553.0, 158749.0] |\n", - "| regional_atlas_gdp_p_workhours | [0, inf] | [39.3, 114.3] |\n", - "| regional_atlas_pop_avg_age_zensus | [0, inf] | [39.1, 48.6] |\n", - "| regional_atlas_regional_score | [0, inf] | [47.30335218, 10342.70448564]|" - ] - }, - { - "cell_type": "markdown", - "id": "189519a2", - "metadata": {}, - "source": [ - "## Percentiles (for analysis and explanation)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51d78040", - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.stats import percentileofscore\n", - "\n", - "percentile_col = [\n", - "'regional_atlas_age_0',\n", - "'regional_atlas_age_1',\n", - " 'regional_atlas_age_2',\n", - "'regional_atlas_age_3',\n", - "'regional_atlas_age_4',\n", - 
"'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", - "'regional_atlas_pop_development',\n", - "'regional_atlas_pop_avg_age',\n", - "'regional_atlas_per_service_sector',\n", - "'regional_atlas_per_trade',\n", - "'regional_atlas_employment_rate',\n", - "'regional_atlas_unemployment_rate',\n", - "'regional_atlas_per_long_term_unemployment',\n", - "'regional_atlas_investments_p_employee',\n", - "'regional_atlas_gross_salary_p_employee',\n", - "'regional_atlas_disp_income_p_inhabitant',\n", - "'regional_atlas_tot_income_p_taxpayer',\n", - "'regional_atlas_gdp_p_employee',\n", - "'regional_atlas_gdp_development',\n", - "'regional_atlas_gdp_p_inhabitant',\n", - "'regional_atlas_gdp_p_workhours',\n", - "'regional_atlas_pop_avg_age_zensus',\n", - "'regional_atlas_regional_score']\n", - "\n", - "for col in percentile_col:\n", - " no_nan = df[col][df[col].notnull()]\n", - " col_name = col+'_percentiles' \n", - " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "307c0e39", - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Adding the percentiles as columns for analysis and report\n", - "\n", - "for col in percentile_col:\n", - " feature = col+\"_percentiles\"\n", - " not_nan = df[feature].notnull()\n", - "\n", - " classes = df['MerchantSizeByDPV'].unique()\n", - "\n", - " for c in classes:\n", - " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", - " \n", - " # Add labels and title\n", - " plt.xlabel('Value')\n", - " plt.ylabel('Density')\n", - " plt.title('Distribution of '+col)\n", - " plt.legend()\n", - "\n", - " # Show the plot\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "ac1d5880", - "metadata": {}, - "source": [ - "## Features regarding their discriminative power (univariate)\n", - "Features and how they increase the chance to select bigger companies\n", - "1. **Best** [Criteria: Strong trend + difference between classes]\n", - " - google_places_user_ratings_total: number of ratings\n", - " - google_places_rating: middle rating is the best\n", - " - age_2: 25-44\n", - " - age_3: 45-64 (negative)\n", - " - age_4: >65 years old (negative trend)\n", - " - pop_density: the higher, the better\n", - " - per_service_sector: High amount of service sector\n", - " - pop_avg_age: (negative)\n", - " - regional_score: pop_density * employment_rate * disp_income_p_inhabitant / 1000000\n", - "2. **Middle** [Criteria: Small trend + difference between classes]\n", - " - age_0: 0-17\n", - " - age_1: 18-24 \n", - " - pop_development: population development in one year\n", - " - investments_p_employee: Investments per employee in the region\n", - " - gross_salary_p_employee: Gross salary per employee in the region\n", - " - gdp_p_employee: GDP per employee in the region\n", - " - gdp_development: GDP development in the region\n", - " - gdp_p_inhabitant: GDP per inhabitant in the region\n", - " - gdp_p_workhours: GDP per working hours in the region\n", - " - avg_age_zensus: Average age in the region (source is the census)\n", - "3. 
**Worst** [No trend + no real difference between classes]\n", - " - sentiment_score: Score how polarized the google reviews are\n", - " - employment_rate: Employment rate in the region\n", - " - unemployment_rate: Unemployment rate in the region\n", - " - long_term_unemployment: Long term unemployment rate in the region\n", - " - disp_income_p_inhabitant: Disposable income per inhabitant in the region\n", - " - tot_income_p_inhabitant: Total income per inhabitant in the region" - ] - }, - { - "cell_type": "markdown", - "id": "ef03f2da", - "metadata": {}, - "source": [ - "## Bivariate analysis of the numerical google places features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8073b0a8", - "metadata": {}, - "outputs": [], - "source": [ - "for c in classes:\n", - " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", - " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", - "\n", - " plt.xlabel('ratings_total')\n", - " plt.ylabel('rating_avg')\n", - " plt.title('Distribution of '+c)\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "c73f6841", - "metadata": {}, - "source": [ - "## Conclusion\n", - "1. Number of ratings is the most important feature, and is a sign of popularity\n", - "2. Bigger companies have the tendency to have lower ratings while having higher number of ratings\n", - "3. Regions in Germany that are more lucrative:\n", - " - a region that has a high population density, especially bigger cities\n", - " - with a higher percentage of the age group \"25-44\"\n", - " - and have a relatively high service sector share\n", - " - while at the same time having a high regional score\n", - "4. GDP features can be important\n", - "5. 
Employment rates and income are not very important" - ] - }, - { - "cell_type": "markdown", - "id": "983dc6c8", - "metadata": {}, - "source": [ - "----------------------\n", - "# Categorical Features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edb5923a", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "arr_false = {}\n", - "\n", - "for column in df:\n", - " \n", - " if df[column].dtype == bool:\n", - " false_count = np.count_nonzero(df[column] == False)\n", - " arr_false[column] = false_count\n", - " \n", - "print(arr_false)\n", - "# => remove email_valid, because all are positive" - ] - }, - { - "cell_type": "markdown", - "id": "3568fb12", - "metadata": {}, - "source": [ - "### Gender (Out of First Name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30d74633", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Dangerous to come to a conclusion based on Gender.\n", - "Women are as contact persons underrepresented in the XL category.\n", - "Possible reasons:\n", - " - Outside (more likely): Underrepresented in responsible positions with increasing size of the company\n", - " - Inside (less likely): The marketing team convinces men better\n", - "\"\"\"\n", - "import gender_guesser.detector as gender\n", - "gd = gender.Detector()\n", - "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", - "\n", - "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", - "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", - "total_counts = total_counts.rename(columns={'index':group_feature})\n", - "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", - "\n", - "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", - "result['proportion'] = result['count'] / result['total_count']\n", - "\n", - "category_order = ['XS','S','M','L','XL']\n", - "\n", - "\n", - "# Create separate DataFrames for each gender\n", - "# For better depiction .drop(index='XS') and take away XS from category_order\n", - "# andy: androgynous\n", - "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", - "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", - "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", - "mostly_male_data = result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n", - "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n", - "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n", - "\n", - "# Plotting\n", - "plt.plot(category_order, andy_data, label='Andy')\n", - "plt.plot(category_order, unknown_data, label='Unknown')\n", - "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n", - "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n", - "plt.plot(category_order, male_data, label='Male')\n", - "plt.plot(category_order, female_data, label='Female')\n", - "\n", - "# Set labels and title\n", - "plt.xlabel('MerchantSizeByDPV')\n", - "plt.ylabel('Proportion')\n", - "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n", - "\n", - "# Display the plot\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "2315b4ea", - "metadata": {}, - "source": [ - "### 
Conclusion: Gender\n", - "1. Andy\n", - "2. Unknown + Mostly Female, Male + Mostly Male, Female" - ] - }, - { - "cell_type": "markdown", - "id": "07f9b71f", - "metadata": {}, - "source": [ - "## MCC Level (Type of Business)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd4a15f7", - "metadata": {}, - "outputs": [], - "source": [ - "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n", - "grouped = mcc_group.unstack()\n", - "mcc_sum = mcc_group.groupby(level=0).sum()\n", - "\n", - "mcc_df = pd.concat([grouped, sum_test], axis=1)\n", - "tmp = mcc_df[0]\n", - "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n", - "mcc_df['Sum'] = tmp" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e787a6cd", - "metadata": {}, - "outputs": [], - "source": [ - "print('Dropped the rows due to less than 50 examples:')\n", - "print(mcc_df[mcc_df['Sum']<50].index.values)\n", - "mcc_df = mcc_df[mcc_df['Sum']>=50]\n", - "\n", - "# Show every 10 categories (previously ordered by ascending XS), to compare the categories\n", - "# The first categories are the most attractive ones\n", - "for i in range(mcc_df.shape[0]): \n", - " if i % 10 == 0:\n", - " mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()" - ] - }, - { - "cell_type": "markdown", - "id": "9db7b90f", - "metadata": {}, - "source": [ - "### Conclusion: MCC\n", - "For example the most lucrative categories by each group. You have to consider that the first group is more lucrative than the second, etc.\n", - "\n", - "1. Café / Restaurant\n", - "2. Apparel\n", - "3. Book Stores\n", - "4. Pharmacy and Nutrition\n", - "5. Art Dealers and Categories" - ] - }, - { - "cell_type": "markdown", - "id": "ca0dc855", - "metadata": {}, - "source": [ - "## Google_places_detailed_type (Type of Business)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fc010da", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "\n", - "data = df[df['google_places_detailed_type'].notnull()]\n", - "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n", - "test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd76690a", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "\"\"\"\n", - "- isna => category 0\n", - "- remove establishment and point_of_interest => category 1\n", - "- aggregate categories like sublocality? 
, or administrative_area_level_x\n", - "\"\"\"\n", - "\n", - "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n", - "docs = docs.apply(lambda row: ast.literal_eval(row))\n", - "\n", - "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n", - "categories = vectorizer.fit_transform(docs).toarray()\n", - "vectorizer.get_feature_names_out()" - ] - }, - { - "cell_type": "markdown", - "id": "78d3f491", - "metadata": {}, - "source": [ - "## Chi squared test for categorical variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd08fc9b", - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.stats import chi2_contingency\n", - "\n", - "# Create a contingency table for each feature\n", - "contingency_tables = {}\n", - "\n", - "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level', 'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", - "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", - "\n", - "#for feature_column in df.columns[df.columns != 'label']:\n", - "for feature_column in cat_col.columns:\n", - " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", - " contingency_tables[feature_column] = contingency_table\n", - "\n", - "# Perform chi-squared test for each feature\n", - "results = {}\n", - "for feature, table in contingency_tables.items():\n", - " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", - " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", - "\n", - "# Display the results\n", - "for feature, result in results.items():\n", - " print(f\"\\nChi-squared test for {feature}:\")\n", - " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", - " print(f\"P-value: {result['P-value']:.4f}\")\n", - " print(f\"Degrees of freedom: {result['Degrees of Freedom']}\")\n", - "\n", - "# p-value > 0.05 => ignore because hypothesis can't be rejected\n", - "# The higher the statistic the more influential\n", - "# => Ignore: number_area" - ] - }, - { - "cell_type": "markdown", - "id": "533b0983", - "metadata": {}, - "source": [ - "### Conclusion: Chi squared test\n", - "1. Ignore number_area (p-value > 0.05)\n", - "2. Best: \n", - " - MCC Level: Find the category of a merchant\n", - " - google_places_price_level: Get the prices of a company\n", - " - google_places_rating: different sources for number of ratings, but also possible social media likes [popularity measures]\n", - " \n", - "3. Middle: \n", - " - number_country\n", - " - google_places_confidence\n", - " - number_possible\n", - " - number_valid\n", - " - google_places_candidate_count_phone\n", - " - Gender\n", - "4. Worst:\n", - " - google_places_candidate_count_mail\n", - " - last_name_in_account\n", - " - google_places_place_id_matches_phone_search\n", - " - b_google_website\n", - " - first_name_in_account\n", - " - google_places_business_status\n", - "5. 
Invalid:\n", - " - number_area" - ] - }, - { - "cell_type": "markdown", - "id": "66816550", - "metadata": {}, - "source": [ - "----------------------------------\n", - "# Boolean features (Bayesian)" - ] - }, - { - "cell_type": "markdown", - "id": "c7600a68", - "metadata": {}, - "source": [ - "#### Count false per column\n", - "- email_valid: 0 => **not interesting**\n", - "- first_name_in_account: 7659 / 10,000\n", - "- last_name_in_account: 6872 / 10,000\n", - "- number_valid: 339 / 10,000\n", - "- number_possible: 297 / 10,000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c46eb0f4", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "def b_bayesian(df,bin_column,b_value=True):\n", - " \n", - " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", - " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", - " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", - " posterior_B = (prior_A*evidence_A) / prior_B\n", - " \n", - " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", - "\n", - "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", - "\n", - "\n", - "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", - "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", - "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", - "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", - "\n", - "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", - "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", - "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", - "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", - "\n", - "# Ensure the 'Category' column is ordered\n", - "categories_order = ['XS', 'S', 'M', 'L','XL']\n", - "\n", - "# Plot the lines\n", - "plt.figure(figsize=(10, 6))\n", - "\n", - "\n", - "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", - "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", - "plt.plot(categories_order, series_first_name, label='First name in account')\n", - "plt.plot(categories_order, series_last_name, label='Last name in account')\n", - "plt.plot(categories_order, series_possible, label='Number possible')\n", - "plt.plot(categories_order, series_valid, label='Number valid')\n", - "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", - "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", - "#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", - "\n", - "\n", - "plt.title('Bayesian')\n", - "plt.xlabel('Categories')\n", - "plt.ylabel('Percentages')\n", - "plt.legend()\n", - "plt.grid(True)\n", - "\n", - "# Show the plot\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "id": "62ca6202", - "metadata": {}, - "source": [ - "## Conclusion: Boolean features (Bayesian)\n", - "1. Email_valid: not interesting\n", - "2. Number not possible and number invalid have much lower XS values (25% less)\n", - "3. First name and last name in account have minor effects on it, and are a sign for smaller categories\n", - "4. 
Number valid, number possible, and first name / last name not in account are not very different from the overall data" - ] - }, - { - "cell_type": "markdown", - "id": "0d647f05", - "metadata": {}, - "source": [ - "-----------------------------------\n", - "# Quality Evaluation of the Discriminative Quality of Regionalatlas Features" - ] - }, - { - "cell_type": "markdown", - "id": "e43187de", - "metadata": {}, - "source": [ - "## Boxplots and Violinplots for Visual Observation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a18821e", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "import numpy as np\n", - "\n", - "\n", - "class_colors = sns.color_palette(\"colorblind\")[:5]\n", - "regional_df = df.filter(like='regional', axis=1)\n", - "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", - "\n", - "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", - "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", - " \n", - " if column == 'regional_atlas_pop_development': \n", - " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", - "\n", - " elif column == 'regional_atlas_gdp_development':\n", - " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", - " \n", - " else:\n", - " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", - " \n", - " axes.set_title(f'Boxplot of {column}')\n", - " axes.set_xlabel('MerchantSizeByDPV')\n", - " axes.set_ylabel(column) \n", - " \n", - " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", - " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", - " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", - " \n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18949200", - "metadata": {}, - "outputs": [], - "source": [ - "# Same like the boxplots but now with violinplots\n", - "for column in regional_df.filter(like='regional', axis=1).columns: \n", - " if column == 'regional_atlas_pop_development': \n", - " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", - "\n", - " elif column == 'regional_atlas_gdp_development':\n", - " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", - " \n", - " else:\n", - " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", - " \n", - " axes.set_title(f'Boxplot of {column}')\n", - " axes.set_xlabel('MerchantSizeByDPV')\n", - " axes.set_ylabel(column) \n", - " \n", - " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", - " axes.axhline(y=median_value, 
color='red', linestyle='--', label=f'Median (XL)')\n", - " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", - " \n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "2fb6790a", - "metadata": {}, - "source": [ - "## Computing a Heuristic Metric to Identify Quality Groups" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c7b2074", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import preprocessing\n", - "\n", - "# Normalize the features before comparing / dividing them\n", - "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", - "min_max_scaler = preprocessing.MinMaxScaler()\n", - "x_scaled = min_max_scaler.fit_transform(x)\n", - "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", - "\n", - "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", - "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", - "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", - "\n", - "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "id": "26a45107", - "metadata": {}, - "source": [ - "## Result: Quality Evaluation of the Discriminative Quality of Regionalatlas Features\n", - "\n", - "We can see that most of the negative values are related to age. The reason behind this might be the generational related consumer and payment behavior, and on socioeconomical reasons. Every generation in germany had a vastly different working/pension experience (f.e. Generation Silent / baby boomer / Gen Y) and finance/housing situation. It seems like the XL companies tend to be in the big german cities, which have a high population density and a high share of age_2 (25-44), and a flourishing service sector.\n", - "\n", - "1. pop_development: might be related to age_0 (more kids)\n", - "2. Employment rate: bigger cities (with high density) have a higher unemployment rate. But the high density might be able to compensate it.\n", - "3. age_0: minors are part of families, that might not be able to spend money in the service industry (biggest part of SumUp)\n", - "4. pop_avg_age & pop_avg_age_zensus: Different results because of different time the data was collected\n", - "5. age_4 (>64),age_3 (45-64): Are not age_0 group and might not be in the big cities / spent as much money as others in the service industry with card payment\n", - "\n", - "The following table provides information from the regionalatlas dataset. Each row represents a different feature, with corresponding absolute values computed previously and quality class. Higher values mean higher discriminative quality (for Value and Quality Class). 
The visual assessment is done by considering also the distribution identified in the violin plot.\n", - "\n", - "| Feature | Value | Quality Class (metric, boxplot related) | Quality Class (after visual assessment of violin plot) |\n", - "|---------------------------------------------|-------------------------|-------|-------|\n", - "| regional_atlas_pop_density | 0.581410 | 1 | 1 |\n", - "| regional_atlas_age_2 | 0.552632 | 1 | 1 |\n", - "| regional_atlas_age_3 | 0.488372 | 1 | 1 |\n", - "| regional_atlas_per_service_sector | 0.467153 | 1 | 1 |\n", - "| regional_atlas_regional_score | 0.420439 | 1 | 1 |\n", - "| regional_atlas_pop_avg_age_zensus | 0.384615 | 1 | 2 |\n", - "| regional_atlas_age_4 | 0.354839 | 1 | 2 |\n", - "| regional_atlas_investments_p_employee | 0.333333 | 1 | 1 |\n", - "| regional_atlas_pop_avg_age | 0.304348 | 1 | 2 |\n", - "| regional_atlas_age_0 | 0.222222 | 1 | 2 |\n", - "| regional_atlas_age_1 | 0.200000 | 2 | 3 |\n", - "| regional_atlas_unemployment_rate | 0.181818 | 2 | 3 |\n", - "| regional_atlas_gdp_p_inhabitant | 0.175996 | 2 | 2 |\n", - "| regional_atlas_gdp_development | 0.163934 | 2 | 3 |\n", - "| regional_atlas_per_trade | 0.162791 | 2 | 3 |\n", - "| regional_atlas_gdp_p_employee | 0.128667 | 2 | 3 |\n", - "| regional_atlas_employment_rate | 0.093333 | 2 | 3 |\n", - "| regional_atlas_gross_salary_p_employee | 0.060377 | 3 | 3 |\n", - "| regional_atlas_tot_income_p_taxpayer | 0.053691 | 3 | 3 |\n", - "| regional_atlas_pop_development | 0.026786 | 3 | 4 |\n", - "| regional_atlas_per_long_term_unemployment | 0.024096 | 3 | 3 |\n", - "| regional_atlas_gdp_p_workhours | 0.018957 | 4 | 4 |\n", - "| regional_atlas_disp_income_p_inhabitant | 0.000000 | 4 | 4 |\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "a3c6e408", - "metadata": {}, - "source": [ - "# Correlation above 0.89" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "684af78c", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "# Compute a correlation matrix for all float values of our dataframe\n", - "float_cols = df.columns[df.dtypes==float]\n", - "corr_matrix = df[float_cols].corr()\n", - "\n", - "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", - "np.fill_diagonal(corr_matrix.values, 0)\n", - "\n", - "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", - "correlation_threshold = 0.89\n", - "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", - "\n", - "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", - "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", - "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", - "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", - "\n", - "# Print the new correlation matrix and the corresponding plot\n", - "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", - "\n", - "plt.figure(figsize=(12, 10))\n", - "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", - "plt.title('Correlation Matrix Heatmap')\n", - "plt.savefig('correlation_matrix.svg', format='svg')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": 
"7a77e1c4", - "metadata": {}, - "source": [ - "## Result: Correlation Analysis\n", - "Might be recommendable to decide between the following features (based on the discrimination quality):\n", - "1. pop_avg_age and avg_age_zensus are different. Maybe the reason is the method or the time when the data was collected. Remove one of them\n", - "2. age_4 vs. avg_age / avg_age_zensus\n", - "3. gdp_p_workhours vs. gdp_p_employee\n", - "4. age_2 vs. age_3\n", - "5. Might remove Regional Score" - ] - }, - { - "cell_type": "markdown", - "id": "f0ebb3f6", - "metadata": {}, - "source": [ - "---------------------------------------\n", - "# PCA Analysis for Expensive Algorithms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ba9ee1b", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.decomposition import PCA\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "\n", - "reg_df = df.filter(like='regional', axis=1).dropna()\n", - "\n", - "# Standardize the features\n", - "scaler = StandardScaler()\n", - "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", - "\n", - "# Apply PCA\n", - "pca = PCA()\n", - "principal_components = pca.fit_transform(scaled_data)\n", - "\n", - "# Retrieve explained variance ratios\n", - "explained_variance_ratio = pca.explained_variance_ratio_\n", - "\n", - "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", - "\n", - "# Print explained variance ratios\n", - "for i, ratio in enumerate(explained_variance_ratio, 1):\n", - " print(f\"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}\")\n", - "\n", - "# Plot the cumulative explained variance\n", - "cumulative_variance = explained_variance_ratio.cumsum()\n", - "\n", - "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", - "plt.title('Cumulative Explained Variance')\n", - "plt.xlabel('Number of Principal Components')\n", - "plt.ylabel('Cumulative Variance Explained')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "43e567f8", - "metadata": {}, - "source": [ - "---------------------------\n", - "# New Features" - ] - }, - { - "cell_type": "markdown", - "id": "b79613bf", - "metadata": {}, - "source": [ - "## Email Address Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d731eff3", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "# Count only those words, existing in a minum amount of 100 email adresses\n", - "count_vectorizer = CountVectorizer(min_df=50)\n", - "\n", - "# Fit and transform the text data\n", - "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", - "\n", - "# Convert the matrix to a DataFrame for better readability\n", - "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4314186c", - "metadata": {}, - "outputs": [], - "source": [ - "common_words = pd.DataFrame(count_df.sum()).transpose()\n", - "\n", - "for word in common_words:\n", - " print(word)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdf6ba75", - "metadata": {}, - "outputs": [], - "source": [ - "# Names\n", - "names = []\n", - "\n", - "# Weird terms\n", - "weird_terms = ['hallo', 'hello', 'moin', 'sumup']\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa498485", - 
"metadata": {}, - "outputs": [], - "source": [ - "grouped_common_words = []\n", - "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", - "\n", - "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", - " \n", - " indices= count_df[count_df[word]>0].index \n", - " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", - " \n", - " grouped_common_words.append((per_word-per_size).rename(word)) \n", - " \n", - "common_df = pd.concat(grouped_common_words, axis=1)\n", - "common_df = common_df.transpose()\n", - "\n", - "common_df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3aeb199a", - "metadata": {}, - "outputs": [], - "source": [ - "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", - "\n", - "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", - "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", - "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", - "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", - "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" - ] - }, - { - "cell_type": "markdown", - "id": "dc9ac1c0", - "metadata": {}, - "source": [ - "## Result: Email Address Analysis\n", - "If possible add the common words as one hot vector to the model" - ] - }, - { - "cell_type": "markdown", - "id": "83e84e7c", - "metadata": {}, - "source": [ - "## Financial report (Bundesanzeiger)\n", - "\n", - "#### Note:\n", - "- Execution: The underlying cell needs to be processed in a separate python file\n", - "- Source: It is difficult to find for Germany corresponding APIs for the financial report registers\n", - "- Choice of Source: Bundesanzeiger is the most promising register, due to its restriction for bigger companies. 
Problem is that each access takes at least 6sec (internet connection + captcha + ...)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38d47b36", - "metadata": {}, - "outputs": [], - "source": [ - "import multiprocessing\n", - "import time\n", - "import pandas as pd\n", - "import numpy as np\n", - "from deutschland.bundesanzeiger import Bundesanzeiger\n", - "import pickle\n", - "import time\n", - "\n", - "def access_ba(company,b_bundesanzeiger,):\n", - "\n", - " b_bundesanzeiger.append(True)\n", - " try:\n", - " ba = Bundesanzeiger()\n", - " data = ba.get_reports(company)\n", - " except:\n", - " b_bundesanzeiger[-1] = False\n", - " return\n", - "\n", - " if __name__ == '__main__':\n", - "\n", - " \"\"\"\n", - " with open('list_file.pkl', 'rb') as file:\n", - " loaded_list = pickle.load(file)\n", - " print(loaded_list)\n", - " \"\"\"\n", - "\n", - " pd.set_option('display.max_columns', None)\n", - "\n", - " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", - "\n", - " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", - "\n", - "\n", - " with multiprocessing.Manager() as manager:\n", - "\n", - " b_bundesanzeiger = manager.list()\n", - " content_array = []\n", - " durations = []\n", - "\n", - " for i, company in enumerate(df[\"Company Name\"]):\n", - "\n", - " print(i)\n", - "\n", - " start = time.time()\n", - "\n", - " # Start access_ba as a process\n", - " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", - "\n", - " p.start()\n", - "\n", - " # Wait 8 seconds for access_ba\t\n", - " p.join(8)\n", - "\n", - " # If thread is active\n", - " if p.is_alive():\n", - " print (\"Terminate access_ba\")\n", - "\n", - " # Terminate access_ba\n", - " p.terminate()\n", - " b_bundesanzeiger[-1] = 'killed'\n", - "\n", - " # Cleanup\n", - " p.join()\n", - " i+=1\n", - "\n", - " print(b_bundesanzeiger[-1])\n", - " end = time.time()\n", - " print(end-start)\n", - " print()\n", - " durations.append(end-start)\n", - "\n", - " \"\"\"if i==100:\n", - " with open('list_file.pkl', 'wb') as file:\n", - " pickle.dump(list(b_bundesanzeiger), file)\n", - " print(np.mean(np.array(list(b_bundesanzeiger))))\n", - " break\n", - " \"\"\"\n", - "\n", - " with open('list_file.pkl', 'wb') as file:\n", - " pickle.dump(list(b_bundesanzeiger), file)\n", - "\n", - " with open('time.pkl', 'wb') as file:\n", - " pickle.dump(durations, file)\n", - "\n", - " df.to_pickle(\"./dataframe_sample.pkl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8a6f460", - "metadata": {}, - "outputs": [], - "source": [ - "with open('dataframe_sample.pkl', 'rb') as f:\n", - " df = pickle.load(f)\n", - "\n", - "df = df.reset_index(drop=True)\n", - "\n", - "with open('list_file.pkl', 'rb') as f:\n", - " mynewlist = pickle.load(f)\n", - "\n", - "with open('time.pkl', 'rb') as f:\n", - " time = pickle.load(f)\n", - "\n", - "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", - "\n", - "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", - "df['time'] = df_stats['time']\n", - "\n", - "\n", - "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", - "\n", - "desired_value_counts = counts.unstack().fillna(0)\n", - "\n", - "# Compute total counts per category\n", - "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", - "\n", - "# Compute probability for each category\n", - "probabilities = desired_value_counts.apply(lambda x: x / 
total_counts_per_category)\n", - "\n", - "print(probabilities)" - ] - }, - { - "cell_type": "markdown", - "id": "956536b8", - "metadata": {}, - "source": [ - "## Result: Financial report (Bundesanzeiger)\n", - "- After 8 seconds the connection is killed, because it can take minutes. Most need less than 8 seconds.\n", - "- Results are based on a sample of 100 for each class\n", - "- Difference regarding the True values\n", - "\n", - "| b_bundesanzeiger | False | True | Killed |\n", - "|--------------------|-------|------|--------|\n", - "| MerchantSizeByDPV | | | |\n", - "| XS | 0.59 | 0.24 | 0.17 |\n", - "| S | 0.73 | 0.14 | 0.13 |\n", - "| M | 0.57 | 0.34 | 0.09 |\n", - "| L | 0.53 | 0.30 | 0.17 |\n", - "| XL | 0.45 | 0.41 | 0.14 |\n" - ] - }, - { - "cell_type": "markdown", - "id": "8504a6e0", - "metadata": {}, - "source": [ - "## Existence of company in google places" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30b18dea", - "metadata": {}, - "outputs": [], - "source": [ - "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n", - "counts =df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n", - "\n", - "desired_value_counts = counts.unstack().fillna(0)\n", - "\n", - "# Compute total counts per category\n", - "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", - "\n", - "# Compute probability for each category\n", - "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", - "\n", - "print(probabilities)" - ] - }, - { - "cell_type": "markdown", - "id": "f42904bc", - "metadata": {}, - "source": [ - "### Result\n", - "- Bigger companies are easier found than smaller companies on google places\n", - "- Average of 77% are found on google_places (but we can not be 100% sure they are correct)" - ] - }, - { - "cell_type": "markdown", - "id": "eb71d595", - "metadata": {}, - "source": [ - "# ML - Model" - ] - }, - { - "cell_type": "markdown", - "id": "4738be7c", - "metadata": {}, - "source": [ - "## Ideas\n", - "1. Imputation due to missing values\n", - "2. SMOTE or GAN / Diffusion Model / ... to create more values against the imbalance\n", - "3. Consider L and XL as outlier (using Outlier Detection)\n", - "4. StratifiedKFold\n", - "5. 
Weighted Classifiers (for example higher weight for class XL using a Random Forest)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f167c71", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "# Separate features (X) and target variable (y)\n", - "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n", - "y = table['MerchantSizeByDPV']\n", - "X=table[regional_columns]\n", - "\n", - "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n", - "\n", - "# Split the data into training and testing sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", - "\n", - "# Create a logistic regression model\n", - "model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n", - "\n", - "# Fit the model on the training data\n", - "model.fit(X_train, y_train)\n", - "\n", - "# Make predictions on the testing data\n", - "y_pred = model.predict(X_test)\n", - "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "class_report = classification_report(y_test, y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e6afdec", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", - "\n", - "# Assuming X and y are your feature matrix and target variable\n", - "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", - "\n", - "# Create and fit the Random Forest model\n", - "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n", - "rf_model.fit(X_train, y_train)\n", - "\n", - "# Make predictions on the test set\n", - "y_pred = rf_model.predict(X_test)\n", - "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test, y_pred)\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "class_report = classification_report(y_test, y_pred)\n", - "\n", - "# Display evaluation metrics\n", - "print(f'Accuracy: {accuracy:.2f}')\n", - "print(f'Confusion Matrix:\\n{conf_matrix}')\n", - "print(f'Classification Report:\\n{class_report}')\n" - ] - }, - { - "cell_type": "markdown", - "id": "4da857f4", - "metadata": {}, - "source": [ - "# Idea: Testing outlier detection to identify XL" - ] - }, - { - "cell_type": "markdown", - "id": "773b81db", - "metadata": {}, - "source": [ - "## Isolation Forest\n", - "Act like XL is an anomaly and we try to identify it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abea2245", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.ensemble import IsolationForest\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", - "\n", - "# Assuming X and y are your feature matrix and target variable\n", - "X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "# Define the set of rare classes\n", - "rare_classes = ['XL'] # Replace with the actual class labels you consider rare\n", - " # MAYBE NOT ONLY XL, but also L and M\n", - "# Create a binary target variable indicating whether each instance is rare or not\n", - "y_train_rare = y_train.isin(rare_classes).astype(int)\n", - "y_test_rare = y_test.isin(rare_classes).astype(int)\n", - "\n", - "# Create and fit the Isolation Forest model\n", - "if_model = IsolationForest(contamination='auto')\n", - "if_model.fit(X_train)\n", - "\n", - "# Predict anomalies on the test set\n", - "y_pred_rare = if_model.predict(X_test)\n", - "\n", - "# Convert the predicted labels to binary (1 for anomalies, -1 for normal instances)\n", - "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n", - "\n", - "# Evaluate the model\n", - "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n", - "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n", - "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n", - "\n", - "# Display evaluation metrics\n", - "print(f'Accuracy: {accuracy:.2f}')\n", - "print(f'Confusion Matrix:\\n{conf_matrix}')\n", - "print(f'Classification Report:\\n{class_report}')\n", - "\n", - "plt.figure(figsize=(6, 4))\n", - "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,\n", - " xticklabels=[0,1], yticklabels=[0,1])\n", - "plt.xlabel('Predicted Label')\n", - "plt.ylabel('True Label')\n", - "plt.title('Confusion Matrix')\n", - "plt.show()\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_pytorch_p310", - "language": "python", - "name": "conda_pytorch_p310" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + From c05ad761963d47101c0ac08fac260fc9a746a705 Mon Sep 17 00:00:00 2001 From: Fabian Utech Date: Wed, 7 Feb 2024 00:56:00 +0100 Subject: [PATCH 49/51] Needed update to portray result Signed-off-by: Fabian Utech --- notebooks/fabian_feature_analysis.ipynb | 1044 ++++++++++++++++++++++- 1 file changed, 1043 insertions(+), 1 deletion(-) diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb index 8b13789..5bfa4aa 100644 --- a/notebooks/fabian_feature_analysis.ipynb +++ b/notebooks/fabian_feature_analysis.ipynb @@ -1 +1,1043 @@ - +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "9d3b226f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f508fadb", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(df))\n", + "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a850c5e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n", + "identifier = df[['Phone','Email','Company 
Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n", + "for col in identifier:\n", + " print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7d07397", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\" Regionalatlas: Placeholder\n", + "2222222222: nothing available, exactly 0\n", + "5555555555: numeric value unknown or to be kept confidential\n", + "6666666666: table cell blocked, as the statement is not meaningful\n", + "7777777777: no information given, as the numeric value is not reliable enough\n", + "8888888888: value will be reported later\n", + "\"\"\"\n", + "\n", + "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n", + "regional_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Dictionary to know which columns and indices have problematic values\n", + "rem_dic = {}\n", + "columns = []\n", + "\n", + "filter_df = regional_df.copy()\n", + "\n", + "for exc in exclude_values:\n", + " # Find all columns that have those values we need to exclude\n", + " col = regional_df.loc[:,(np.sum(df == exc,axis=0)>0)].columns.tolist()\n", + "\n", + " columns+=col\n", + " \n", + " \n", + " # Now we can use those columns to find the corresponding rows\n", + " for c in col:\n", + " indices = regional_df.loc[(np.sum(df == exc,axis=1)>0),col].index.tolist() \n", + " \n", + " rem_dic[c] = {str(exc):indices}\n", + " \n", + " filter_df = filter_df[df[c]!=exc]\n", + " print(f'column:{c}, value:{exc}')\n", + " \n", + "print(rem_dic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16fcbf05", + "metadata": {}, + "outputs": [], + "source": [ + "# Irregular values defined by Regionalatlas need to be translated to NaN so we can handle them later on\n", + "import numpy as np\n", + "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n", + "\n", + "print(\"Changed the following features, because of irregular values of regionalatlas:\")\n", + "for col in regional_atlas:\n", + " n_irr = (df[col]>=2222222222).sum()\n", + " n = (df[col].notnull()).sum()\n", + " \n", + " if (n_irr>0):\n", + " print(col+': '+str(n_irr)+' out of '+ str(n))\n", + " df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730b574a", + "metadata": {}, + "outputs": [], + "source": [ + "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", + "print(f'Empty: {isna}')\n", + "print(f'Not empty: {df.shape[0]-isna}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35186540", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", + "\n", + "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", + "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27014d88", + "metadata": {}, + "outputs": [], + "source": [ + "min_max_df = df.agg({\n", + "'google_places_user_ratings_total':[\"min\",\"max\"],\n", + "'google_places_rating':[\"min\",\"max\"],\n", + "'google_places_price_level':[\"min\",\"max\"],\n", + "'reviews_sentiment_score':[\"min\",\"max\"],\n", + "'regional_atlas_age_0':[\"min\",\"max\"],\n", + "'regional_atlas_age_1':[\"min\",\"max\"],\n", +
"'regional_atlas_age_2':[\"min\",\"max\"],\n", + "'regional_atlas_age_3':[\"min\",\"max\"],\n", + "'regional_atlas_age_4':[\"min\",\"max\"],\n", + "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", + "'regional_atlas_per_trade':[\"min\",\"max\"],\n", + "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", + "'regional_atlas_pop_density':[\"min\",\"max\"],\n", + "'regional_atlas_pop_development':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", + "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", + "'regional_atlas_regional_score':[\"min\",\"max\"]\n", + "})\n", + "\n", + "# Apply the function for each column\n", + "for col in min_max_df.columns:\n", + " min_feature = min_max_df[col]['min']\n", + " max_feature = min_max_df[col]['max']\n", + " print(f'{col}: [{min_feature}, {max_feature}]') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d78040", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "\n", + "percentile_col = [\n", + "'regional_atlas_age_0',\n", + "'regional_atlas_age_1',\n", + " 'regional_atlas_age_2',\n", + "'regional_atlas_age_3',\n", + "'regional_atlas_age_4',\n", + "'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", + "'regional_atlas_pop_development',\n", + "'regional_atlas_pop_avg_age',\n", + "'regional_atlas_per_service_sector',\n", + "'regional_atlas_per_trade',\n", + "'regional_atlas_employment_rate',\n", + "'regional_atlas_unemployment_rate',\n", + "'regional_atlas_per_long_term_unemployment',\n", + "'regional_atlas_investments_p_employee',\n", + "'regional_atlas_gross_salary_p_employee',\n", + "'regional_atlas_disp_income_p_inhabitant',\n", + "'regional_atlas_tot_income_p_taxpayer',\n", + "'regional_atlas_gdp_p_employee',\n", + "'regional_atlas_gdp_development',\n", + "'regional_atlas_gdp_p_inhabitant',\n", + "'regional_atlas_gdp_p_workhours',\n", + "'regional_atlas_pop_avg_age_zensus',\n", + "'regional_atlas_regional_score']\n", + "\n", + "for col in percentile_col:\n", + " no_nan = df[col][df[col].notnull()]\n", + " col_name = col+'_percentiles' \n", + " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307c0e39", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Adding the percentiles as columns for analysis and report\n", + "\n", + "for col in percentile_col:\n", + " feature = col+\"_percentiles\"\n", + " not_nan = df[feature].notnull()\n", + "\n", + " classes = df['MerchantSizeByDPV'].unique()\n", + "\n", + " for c in classes:\n", + " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", + " \n", + " # Add labels and title\n", + " 
plt.xlabel('Value')\n", + " plt.ylabel('Density')\n", + " plt.title('Distribution of '+col)\n", + " plt.legend()\n", + "\n", + " # Show the plot\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073b0a8", + "metadata": {}, + "outputs": [], + "source": [ + "for c in classes:\n", + " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", + " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", + "\n", + " plt.xlabel('ratings_total')\n", + " plt.ylabel('rating_avg')\n", + " plt.title('Distribution of '+c)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb5923a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "arr_false = {}\n", + "\n", + "for column in df:\n", + " \n", + " if df[column].dtype == bool:\n", + " false_count = np.count_nonzero(df[column] == False)\n", + " arr_false[column] = false_count\n", + " \n", + "print(arr_false)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d74633", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Dangerous to come to a conclusion based on Gender.\n", + "\"\"\"\n", + "import gender_guesser.detector as gender\n", + "gd = gender.Detector()\n", + "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", + "\n", + "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", + "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", + "total_counts = total_counts.rename(columns={'index':group_feature})\n", + "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", + "\n", + "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", + "result['proportion'] = result['count'] / result['total_count']\n", + "\n", + "category_order = ['XS','S','M','L','XL']\n", + "\n", + "\n", + "# Create separate DataFrames for each gender\n", + "# For better depiction .drop(index='XS') and take away XS from category_order\n", + "# andy: androgynous\n", + "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", + "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_male_data = result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n", + "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n", + "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n", + "\n", + "# Plotting\n", + "plt.plot(category_order, andy_data, label='Andy')\n", + "plt.plot(category_order, unknown_data, label='Unknown')\n", + "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n", + "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n", + "plt.plot(category_order, male_data, label='Male')\n", + "plt.plot(category_order, female_data, label='Female')\n", + "\n", + "# Set labels and title\n", + "plt.xlabel('MerchantSizeByDPV')\n", + "plt.ylabel('Proportion')\n", + "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n", + "\n", + "# Display the plot\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4a15f7", + "metadata": {}, + 
"outputs": [], + "source": [ + "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n", + "grouped = mcc_group.unstack()\n", + "mcc_sum = mcc_group.groupby(level=0).sum()\n", + "\n", + "mcc_df = pd.concat([grouped, sum_test], axis=1)\n", + "tmp = mcc_df[0]\n", + "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n", + "mcc_df['Sum'] = tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e787a6cd", + "metadata": {}, + "outputs": [], + "source": [ + "print('Dropped the rows due to less than 50 examples:')\n", + "print(mcc_df[mcc_df['Sum']<50].index.values)\n", + "mcc_df = mcc_df[mcc_df['Sum']>=50]\n", + "\n", + "# Show every 10 categories (previously ordered by ascending XS), to compare the categories\n", + "# The first categories are the most attractive ones\n", + "for i in range(mcc_df.shape[0]): \n", + " if i % 10 == 0:\n", + " mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc010da", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "data = df[df['google_places_detailed_type'].notnull()]\n", + "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd76690a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n", + "docs = docs.apply(lambda row: ast.literal_eval(row))\n", + "\n", + "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n", + "categories = vectorizer.fit_transform(docs).toarray()\n", + "vectorizer.get_feature_names_out()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd08fc9b", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import chi2_contingency\n", + "\n", + "# Create a contingency table for each feature\n", + "contingency_tables = {}\n", + "\n", + "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level', 'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", + "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", + "\n", + "#for feature_column in df.columns[df.columns != 'label']:\n", + "for feature_column in cat_col.columns:\n", + " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", + " contingency_tables[feature_column] = contingency_table\n", + "\n", + "# Perform chi-squared test for each feature\n", + "results = {}\n", + "for feature, table in contingency_tables.items():\n", + " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", + " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", + "\n", + "# Display the results\n", + "for feature, result in results.items():\n", + " print(f\"\\nChi-squared test for {feature}:\")\n", + " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", + " print(f\"P-value: {result['P-value']:.4f}\")\n", + " print(f\"Degrees of freedom: {result['Degrees of 
Freedom']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c46eb0f4", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def b_bayesian(df,bin_column,b_value=True):\n", + " \n", + " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", + " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", + " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", + " posterior_B = (prior_A*evidence_A) / prior_B\n", + " \n", + " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "\n", + "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", + "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", + "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", + "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", + "\n", + "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", + "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", + "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", + "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", + "\n", + "# Ensure the 'Category' column is ordered\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "\n", + "# Plot the lines\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", + "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", + "plt.plot(categories_order, series_first_name, label='First name in account')\n", + "plt.plot(categories_order, series_last_name, label='Last name in account')\n", + "plt.plot(categories_order, series_possible, label='Number possible')\n", + "plt.plot(categories_order, series_valid, label='Number valid')\n", + "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", + "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", + "#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", + "\n", + "\n", + "plt.title('Bayesian')\n", + "plt.xlabel('Categories')\n", + "plt.ylabel('Percentages')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a18821e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "\n", + "class_colors = sns.color_palette(\"colorblind\")[:5]\n", + "regional_df = df.filter(like='regional', axis=1)\n", + "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", + "\n", + "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", + "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", + " \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + 
"\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18949200", + "metadata": {}, + "outputs": [], + "source": [ + "# Same like the boxplots but now with violinplots\n", + "for column in regional_df.filter(like='regional', axis=1).columns: \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7b2074", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "# Normalize the features before comparing / dividing them\n", + "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "x_scaled = min_max_scaler.fit_transform(x)\n", + "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", + "\n", + "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", + "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", + "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", + "\n", + "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684af78c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Compute a correlation matrix for all float values of our 
dataframe\n", + "float_cols = df.columns[df.dtypes==float]\n", + "corr_matrix = df[float_cols].corr()\n", + "\n", + "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", + "np.fill_diagonal(corr_matrix.values, 0)\n", + "\n", + "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", + "correlation_threshold = 0.89\n", + "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", + "\n", + "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", + "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", + "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", + "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", + "\n", + "# Print the new correlation matrix and the corresponding plot\n", + "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Correlation Matrix Heatmap')\n", + "plt.savefig('correlation_matrix.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba9ee1b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "reg_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", + "\n", + "# Apply PCA\n", + "pca = PCA()\n", + "principal_components = pca.fit_transform(scaled_data)\n", + "\n", + "# Retrieve explained variance ratios\n", + "explained_variance_ratio = pca.explained_variance_ratio_\n", + "\n", + "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", + "\n", + "# Print explained variance ratios\n", + "for i, ratio in enumerate(explained_variance_ratio, 1):\n", + " print(f\"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}\")\n", + "\n", + "# Plot the cumulative explained variance\n", + "cumulative_variance = explained_variance_ratio.cumsum()\n", + "\n", + "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", + "plt.title('Cumulative Explained Variance')\n", + "plt.xlabel('Number of Principal Components')\n", + "plt.ylabel('Cumulative Variance Explained')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d731eff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Count only those words, existing in a minum amount of 100 email adresses\n", + "count_vectorizer = CountVectorizer(min_df=50)\n", + "\n", + "# Fit and transform the text data\n", + "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", + "\n", + "# Convert the matrix to a DataFrame for better readability\n", + "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4314186c", + "metadata": {}, + "outputs": [], + "source": [ + "common_words 
= pd.DataFrame(count_df.sum()).transpose()\n", + "\n", + "for word in common_words:\n", + " print(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf6ba75", + "metadata": {}, + "outputs": [], + "source": [ + "# Names\n", + "names = []\n", + "\n", + "# Weird terms\n", + "weird_terms = []\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa498485", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_common_words = []\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", + " \n", + " indices= count_df[count_df[word]>0].index \n", + " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", + " \n", + " grouped_common_words.append((per_word-per_size).rename(word)) \n", + " \n", + "common_df = pd.concat(grouped_common_words, axis=1)\n", + "common_df = common_df.transpose()\n", + "\n", + "common_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aeb199a", + "metadata": {}, + "outputs": [], + "source": [ + "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", + "\n", + "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", + "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", + "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", + "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", + "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d47b36", + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "from deutschland.bundesanzeiger import Bundesanzeiger\n", + "import pickle\n", + "import time\n", + "\n", + "def access_ba(company,b_bundesanzeiger,):\n", + "\n", + " b_bundesanzeiger.append(True)\n", + " try:\n", + " ba = Bundesanzeiger()\n", + " data = ba.get_reports(company)\n", + " except:\n", + " b_bundesanzeiger[-1] = False\n", + " return\n", + "\n", + " if __name__ == '__main__':\n", + "\n", + " \"\"\"\n", + " with open('list_file.pkl', 'rb') as file:\n", + " loaded_list = pickle.load(file)\n", + " print(loaded_list)\n", + " \"\"\"\n", + "\n", + " pd.set_option('display.max_columns', None)\n", + "\n", + " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", + "\n", + " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", + "\n", + "\n", + " with multiprocessing.Manager() as manager:\n", + "\n", + " b_bundesanzeiger = manager.list()\n", + " content_array = []\n", + " durations = []\n", + "\n", + " for i, company in enumerate(df[\"Company Name\"]):\n", + "\n", + " print(i)\n", + "\n", + " start = time.time()\n", + "\n", + " # Start access_ba as a process\n", + " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", + "\n", + " p.start()\n", + "\n", + " # Wait 8 seconds for access_ba\t\n", + " p.join(8)\n", + "\n", + " # If thread is active\n", + " if p.is_alive():\n", + " print 
(\"Terminate access_ba\")\n", + "\n", + " # Terminate access_ba\n", + " p.terminate()\n", + " b_bundesanzeiger[-1] = 'killed'\n", + "\n", + " # Cleanup\n", + " p.join()\n", + " i+=1\n", + "\n", + " print(b_bundesanzeiger[-1])\n", + " end = time.time()\n", + " print(end-start)\n", + " print()\n", + " durations.append(end-start)\n", + "\n", + " \"\"\"if i==100:\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + " print(np.mean(np.array(list(b_bundesanzeiger))))\n", + " break\n", + " \"\"\"\n", + "\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + "\n", + " with open('time.pkl', 'wb') as file:\n", + " pickle.dump(durations, file)\n", + "\n", + " df.to_pickle(\"./dataframe_sample.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a6f460", + "metadata": {}, + "outputs": [], + "source": [ + "with open('dataframe_sample.pkl', 'rb') as f:\n", + " df = pickle.load(f)\n", + "\n", + "df = df.reset_index(drop=True)\n", + "\n", + "with open('list_file.pkl', 'rb') as f:\n", + " mynewlist = pickle.load(f)\n", + "\n", + "with open('time.pkl', 'rb') as f:\n", + " time = pickle.load(f)\n", + "\n", + "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", + "\n", + "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", + "df['time'] = df_stats['time']\n", + "\n", + "\n", + "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b18dea", + "metadata": {}, + "outputs": [], + "source": [ + "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n", + "counts =df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f167c71", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# Separate features (X) and target variable (y)\n", + "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n", + "y = table['MerchantSizeByDPV']\n", + "X=table[regional_columns]\n", + "\n", + "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n", + "\n", + "# Split the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "# Create a logistic regression model\n", + "model = 
LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n", + "\n", + "# Fit the model on the training data\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Make predictions on the testing data\n", + "y_pred = model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "class_report = classification_report(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e6afdec", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", + "\n", + "# Assuming X and y are your feature matrix and target variable\n", + "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n", + "\n", + "# Create and fit the Random Forest model\n", + "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "rf_model.fit(X_train, y_train)\n", + "\n", + "# Make predictions on the test set\n", + "y_pred = rf_model.predict(X_test)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "class_report = classification_report(y_test, y_pred)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abea2245", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", + "\n", + "# Assuming X and y are your feature matrix and target variable\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Define the set of rare classes\n", + "rare_classes = ['XL'] # Replace with the actual class labels you consider rare\n", + " # MAYBE NOT ONLY XL, but also L and M\n", + "# Create a binary target variable indicating whether each instance is rare or not\n", + "y_train_rare = y_train.isin(rare_classes).astype(int)\n", + "y_test_rare = y_test.isin(rare_classes).astype(int)\n", + "\n", + "# Create and fit the Isolation Forest model\n", + "if_model = IsolationForest(contamination='auto')\n", + "if_model.fit(X_train)\n", + "\n", + "# Predict anomalies on the test set\n", + "y_pred_rare = if_model.predict(X_test)\n", + "\n", + "# Convert the predicted labels to binary (1 for anomalies, -1 for normal instances)\n", + "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n", + "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n", + "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "sns.heatmap(conf_matrix, annot=True, 
fmt='d', cmap='Blues', cbar=False,\n", + " xticklabels=[0,1], yticklabels=[0,1])\n", + "plt.xlabel('Predicted Label')\n", + "plt.ylabel('True Label')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2b9f289ec759a9e037b769a2a0b727cb7e2dcc18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucca=20Baumg=C3=A4rtner?= Date: Wed, 7 Feb 2024 09:49:46 +0100 Subject: [PATCH 50/51] adjust log levels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lucca Baumgärtner --- src/bdc/steps/google_places.py | 2 +- src/bdc/steps/helpers/generate_hash_leads.py | 5 ++--- src/database/leads/local_repository.py | 2 +- src/logger/logger.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/bdc/steps/google_places.py b/src/bdc/steps/google_places.py index 96dea27..94dde24 100644 --- a/src/bdc/steps/google_places.py +++ b/src/bdc/steps/google_places.py @@ -228,7 +228,7 @@ def get_first_place_candidate(self, query, input_type) -> (dict, int): return None, 0 if not response["status"] == HTTPStatus.OK.name: - log.warning( + log.debug( f"Failed to fetch data. Status code: {response['status']}", ) return None, 0 diff --git a/src/bdc/steps/helpers/generate_hash_leads.py b/src/bdc/steps/helpers/generate_hash_leads.py index ea66c0f..4ab3ef0 100644 --- a/src/bdc/steps/helpers/generate_hash_leads.py +++ b/src/bdc/steps/helpers/generate_hash_leads.py @@ -7,7 +7,6 @@ import pandas as pd -from bdc.steps.step import Step from database import get_database from logger import get_logger @@ -56,12 +55,12 @@ def hash_check( if lead_hash in lookup_table: # If the hash exists in the lookup table, return the corresponding data - log.info(f"Hash {lead_hash} already exists in the lookup table.") + log.debug(f"Hash {lead_hash} already exists in the lookup table.") try: previous_data = lead_data[fields_tofill] return previous_data except KeyError as e: - log.info( + log.debug( f"Hash is present but data fields {fields_tofill} were not found." 
) lookup_table[lead_hash] = lookup_table[lead_hash][:-1] + [ diff --git a/src/database/leads/local_repository.py b/src/database/leads/local_repository.py index f41ba1d..3bade62 100644 --- a/src/database/leads/local_repository.py +++ b/src/database/leads/local_repository.py @@ -81,7 +81,7 @@ def save_review(self, review, place_id, force_refresh=False): json_file_path = os.path.join(self.REVIEWS, file_name) if os.path.exists(json_file_path): - log.info(f"Reviews for {place_id} already exist") + log.debug(f"Reviews for {place_id} already exist") return with open(json_file_path, "w", encoding="utf-8") as json_file: diff --git a/src/logger/logger.py b/src/logger/logger.py index ccd9eb1..fe08296 100644 --- a/src/logger/logger.py +++ b/src/logger/logger.py @@ -54,7 +54,7 @@ def __init__(self, name, log_dir=None): # Create stream handler for logging to stdout (log all five levels) self.stdout_handler = logging.StreamHandler(sys.stdout) - self.stdout_handler.setLevel(logging.DEBUG) + self.stdout_handler.setLevel(logging.INFO) self.stdout_handler.setFormatter(StdOutFormatter()) self.enable_console_output() From cab53add6d2656362467c82c54b75bd1c0a57bcf Mon Sep 17 00:00:00 2001 From: Fabian Utech Date: Wed, 7 Feb 2024 12:55:42 +0100 Subject: [PATCH 51/51] Add report generation file to the deprecated steps Signed-off-by: Fabian Utech --- deprecated/steps/report.py | 230 +++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 deprecated/steps/report.py diff --git a/deprecated/steps/report.py b/deprecated/steps/report.py new file mode 100644 index 0000000..6eab25c --- /dev/null +++ b/deprecated/steps/report.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2023 Fabian-Paul Utech +# SPDX-FileCopyrightText: 2023 Ahmed Sheta + +import argparse +import os + +import pandas as pd +from reportlab.lib import colors +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle + +report_list = [] + +standard_group_format = { + # 1 pdf per lead (1 row in .csv) + "Lead": [ + "Last Name", + "First Name", + "Company / Account", + "Phone", + "Email", + "Predicted Size", + ], + # "Reviews": [ + # "google_places_user_ratings_total", + # "google_places_rating", + # "google_places_price_level", + # "reviews_sentiment_score", + # ], + #'Region':[] starts with regional_atlas + # Regarding columns names if there are more than one '_' take the split after the second _ +} + +file_list = [] + + +def process_lead(lead): + # Input search string (either specific leads or a whole file) + # Output: pd.series of a lead from leads_enriched.csv + try: + df = pd.read_csv("src/data/dummy_leads_email.csv", delimiter=",") + except FileNotFoundError: + raise FileNotFoundError("File not found.") + if os.path.exists( + os.path.dirname(lead) + ): # If a path was specified (by default the dummy dataset) + df = pd.read_csv(lead, delimiter=",") + return df + elif isinstance(lead, list): # A specified group of leads + rows = df[df["Company / Account"] in lead] + return rows + + elif isinstance(lead, str): # One specified lead + row = df[df["Company / Account"] == lead] + return row + else: + raise ValueError( + "Invalid type for 'lead'. It should be a single string, a list of strings, or a file path." 
+        )
+
+
+def process_format(fmt):
+    if isinstance(fmt, list):  # Transform list to dictionary
+        new_fmt = {}
+
+        for value in fmt:
+            try:
+                key = str(standard_group_format[value])
+            except KeyError:
+                key = "Others"
+            if key in new_fmt:
+                new_fmt[key].append(str(value))
+            else:
+                new_fmt[key] = [str(value)]
+
+        return new_fmt
+    elif isinstance(fmt, dict):
+        return fmt
+    elif fmt is None:
+        return standard_group_format
+    else:
+        raise ValueError(
+            "Invalid type for 'format'. It should be either a list or a dictionary."
+        )
+
+
+def create_pdf(lead, format):
+    """
+    Input: lead: pd.Series
+           format: dict
+    Description: Function to create reports.
+                 A report consists of tables of grouped features.
+    Output: '...'.pdf
+    """
+    doc = SimpleDocTemplate(
+        f"src/data/reports/{lead['Company / Account']}.pdf", pagesize=A4
+    )
+    file_list.append(f"src/data/reports/{lead['Company / Account']}.pdf")
+
+    report_list.append(f"src/data/reports/{lead['Company / Account']}.pdf")
+
+    # Create a headline paragraph with a large font size
+    headline_style = getSampleStyleSheet()["Title"]
+    headline_style.fontSize = 32
+    headline_style.alignment = 0
+
+    headline_paragraph = Paragraph(lead["Company / Account"], headline_style)
+
+    # List for the 'Flowable' objects
+    elements = [headline_paragraph]
+    elements.append(Spacer(1, 50))
+
+    # Styles for tables and paragraphs
+    styles = getSampleStyleSheet()
+
+    groups = format.keys()
+
+    for group in groups:
+        title_paragraph = Paragraph(group, styles["Title"])
+        elements.append(title_paragraph)
+
+        col_names = format[group]
+
+        # Split the columns into header rows of at most five features each
+        split_col = [col_names[i : i + 5] for i in range(0, len(col_names), 5)]
+
+        # Center the table on the page
+        table_style = TableStyle(
+            [
+                ("ALIGN", (0, 0), (-1, -1), "CENTER"),  # center the text
+                (
+                    "VALIGN",
+                    (0, 0),
+                    (-1, -1),
+                    "MIDDLE",
+                ),  # put the text in the middle of the cell
+                ("TEXTCOLOR", (0, 0), (-1, 0), colors.black),
+                ("GRID", (0, 0), (-1, -1), 1, colors.black),
+                (
+                    "SPLITBYROWS",
+                    (0, 0),
+                    (-1, -1),
+                    True,
+                ),  # Ensure rows are not split between pages
+                ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
+            ]
+        )
+
+        for group_columns in split_col:
+            header_row = group_columns
+            data_row = []
+            for column in group_columns:
+                try:
+                    if lead[column] == "nan":
+                        data_row.append("")
+                    else:
+                        data_row.append(str(lead[column]))
+                except KeyError:
+                    data_row.append("")
+
+            table = [header_row, data_row]
+
+            pdf_table = Table(table)
+            pdf_table.setStyle(table_style)
+
+            # Add the table to the elements
+            elements.append(pdf_table)
+
+            # Add an empty line between tables
+            elements.append(Spacer(1, 25))
+
+    """for k,v in tmp_data.items():
+        if isinstance(v, dict):
+
+            ul_items=[]
+            for key,val in v.items():
+                bolded_text = f'{key}:{val}'
+                ul_items.append(Paragraph(bolded_text,styles['Normal']))
+
+            col_index = list(tmp_data.keys()).index(k)
+            table_data[1][col_index] = ul_items"""
+
+    """# Set left alignment for all non-header cells
+    for col in range(len(table_data[0])):
+        table_style.add('FONTNAME', (col, 0), (col, 0), 'Helvetica-Bold')
+        table_style.add('ALIGN', (col, 1), (col, -1), 'LEFT')"""
+
+    # Build the PDF document
+    doc.build(elements)
+
+
+def main():
+    # file_list=[]
+    parser = argparse.ArgumentParser(description="Process lead and format arguments.")
+    parser.add_argument(
+        "--lead",
+        default="src/data/dummy_leads_email.csv",
+        help="Lead argument: a single search-string, a list of strings, or a file path.",
+    )
+    parser.add_argument(
+        "--format", nargs="+", help="Format argument: a list or a dictionary."
+    )
+
+    args = parser.parse_args()
+
+    # Process lead argument (result: either specific row(s) or a table)
+    # Choose the lead(s) via the --lead argument
+    processed_lead = process_lead(args.lead)
+    print("Generate the reports for the following leads:")
+    print(processed_lead)
+
+    # Process format argument (result: format that is a dictionary)
+    processed_format = process_format(args.format)
+
+    # Generate a report for every lead
+
+    for index, lead in processed_lead.iterrows():
+        create_pdf(lead, processed_format)
+
+    print("\nReports saved:")
+    for file in file_list:
+        print(f"{file}")
+
+    print()
+
+
+if __name__ == "__main__":
+    main()