From 6a58037a8c529b99d1519819838ce594806c0f33 Mon Sep 17 00:00:00 2001
From: Fabian Utech
Date: Tue, 6 Feb 2024 17:44:42 +0100
Subject: [PATCH] Add the SBOM generator as a markdown file (shellscript) and my feature analysis

Signed-off-by: Fabian Utech
---
 Documentation/SBOM_generator.md            |   61 +
 notebooks/fabian_feature_analysis.ipynb    | 1768 +++++++++++++++++
 .../fabian_feature_analysis.ipynb.license  |    2 +
 3 files changed, 1831 insertions(+)
 create mode 100644 Documentation/SBOM_generator.md
 create mode 100644 notebooks/fabian_feature_analysis.ipynb
 create mode 100644 notebooks/fabian_feature_analysis.ipynb.license

diff --git a/Documentation/SBOM_generator.md b/Documentation/SBOM_generator.md
new file mode 100644
index 0000000..65d5d4b
--- /dev/null
+++ b/Documentation/SBOM_generator.md
@@ -0,0 +1,61 @@
+# Automatic SBOM generation
+
+```console
+pipenv install
+pipenv shell
+
+pip install pipreqs
+pip install cyclonedx-bom
+pip install pip-licenses
+
+# Create the SBOM (cyclonedx-bom) based on (pipreqs) requirements that are actually imported in the .py files
+
+$sbom = pipreqs --print | cyclonedx-py -r -pb -o - -i -
+
+# Create an XmlDocument object
+$xml = New-Object System.Xml.XmlDocument
+
+# Load XML content into the XmlDocument
+$xml.LoadXml($sbom)
+
+
+# Create an empty CSV file
+$csvPath = "SBOM.csv"
+
+# Initialize an empty array to store rows
+$result = @()
+
+# Iterate through the XML nodes and create rows for each node
+$xml.SelectNodes("//*[local-name()='component']") | ForEach-Object {
+
+    $row = @{
+        "Version" = $_.Version
+        "Context" = $_.Purl
+        "Name" = if ($_.Name -eq 'scikit_learn') { 'scikit-learn' } else { $_.Name }
+    }
+
+    # Get license information
+    $match = pip-licenses --from=mixed --format=csv --with-system --packages $row.Name | ConvertFrom-Csv
+
+    # Add license information to the row
+    $result += [PSCustomObject]@{
+        "Context" = $row.Context
+        "Name" = $row.Name
+        "Version" = $row.Version
+        "License" = $match.License
+    }
+}
+
+# Export the data to the CSV file
+$result | Export-Csv -Path $csvPath -NoTypeInformation
+
+# Create the license file
+$licensePath = $csvPath + '.license'
+@"
+SPDX-License-Identifier: CC-BY-4.0
+SPDX-FileCopyrightText: 2023 Fabian-Paul Utech
+"@ | Out-File -FilePath $licensePath
+
+exit
+
+```
diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb
new file mode 100644
index 0000000..1b10567
--- /dev/null
+++ b/notebooks/fabian_feature_analysis.ipynb
@@ -0,0 +1,1768 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "23afeabd",
+   "metadata": {},
+   "source": [
+    "# The Dataset: Summary\n",
+    "1. Missing values exist\n",
+    "2. Correlations between RegionalAtlas Data\n",
+    "3. Input: numerical + bool + categorical datatype\n",
+    "4. 
Imbalanced Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d3b226f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "18fa38d5", + "metadata": {}, + "source": [ + "--------------------------------\n", + "# Feature Summary\n", + "\n", + "### Not interesting / label\n", + "- Last Name\n", + "- Account Owner\n", + "- Phone\n", + "- domain\n", + "- google_places_place_id_matches_phone_search\n", + "- number_formatted\n", + "- google_places_name\n", + "- MerchantSizeByDPV => **y**\n", + "\n", + "### Only for testing\n", + "- MCC Level: Is only available in the dataset, but it can be used to check if the feature is helpful\n", + "\n", + "### Can be used directly (Categorical / Boolean)\n", + "Regarding the categorical features, the country is important and the price level of the company (google only has ones specific to restaurants). \n", + "\n", + "- *email_valid*\n", + "- number_possible\n", + "- number_valid\n", + "- first_name_in_account\n", + "- last_name_in_account\n", + "- google_places_business_status\n", + "- google_places_place_id_matches_phone_search\n", + "- *number_area*\n", + "- **number_country**\n", + "- **google_places_price_level**\n", + "- google_places_confidence\n", + "\n", + "### Numerical Features\n", + "The most important features overall (not only regarding numerical features) are the number of ratings and the rating itself. Logically if many users rate a company, they most likely bought products at the corresponding company. Therefore the more the better. Interesting is that a rating in the middle range is more likely for larger companies, than the extreme values. \n", + "\n", + "- reviews_sentiment_score\n", + "- **google_places_user_ratings_total**\n", + "- **google_places_rating** \n", + "- google_places_candidate_count_mail \n", + "- google_places_candidate_count_phone\n", + "\n", + "\n", + "### Regionalatlas (Numerical Features)\n", + "Regarding regional information, the population density can be considered the most important feature because the leads that are based in the service sector have a higher chance to attract customers. Additionally the age structure is important. In Germany the age group between 25-44 is the most important one, due to their expenditure behavior and the acceptance of mobile payment. Please note that the information is specific to Germany and most likely vary in other countries. Specific to SumUp Leads we also noticed that a large service sector in the region is indication for leads due to the financial model (using card and mobile payment). 
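(a short illustrative sketch follows)\n",
+    "\n",
+    "*Added illustration (not part of the original analysis):* a minimal pandas sanity check of this rationale, assuming the `regional_atlas_*` columns and the `MerchantSizeByDPV` label used throughout this notebook:\n",
+    "\n",
+    "```python\n",
+    "# Median of a few regional features per merchant size class (illustrative check of the rationale above)\n",
+    "df.groupby('MerchantSizeByDPV')[\n",
+    "    ['regional_atlas_pop_density', 'regional_atlas_age_2', 'regional_atlas_per_service_sector']\n",
+    "].median()\n",
+    "```\n",
+    "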
\n", + "\n", + "- **regional_atlas_pop_density** \n", + "- regional_atlas_pop_development \n", + "- regional_atlas_age_0 \n", + "- regional_atlas_age_1 \n", + "- **regional_atlas_age_2** (pos) \n", + "- **regional_atlas_age_3** (neg) \n", + "- **regional_atlas_age_4** (neg) \n", + "- regional_atlas_pop_avg_age \n", + "- regional_atlas_per_service_sector \n", + "- regional_atlas_per_trade \n", + "- regional_atlas_employment_rate \n", + "- regional_atlas_unemployment_rate \n", + "- regional_atlas_per_long_term_unemployment\n", + "- regional_atlas_investments_p_employee \n", + "- regional_atlas_gross_salary_p_employee \n", + "- regional_atlas_disp_income_p_inhabitant \n", + "- regional_atlas_tot_income_p_taxpayer \n", + "- regional_atlas_gdp_p_employee \n", + "- regional_atlas_gdp_development \n", + "- regional_atlas_gdp_p_inhabitant \n", + "- regional_atlas_gdp_p_workhours \n", + "- regional_atlas_pop_avg_age_zensus \n", + "- regional_atlas_regional_score \n", + "\n", + "\n", + "### Processing needed (Ideas for future)\n", + "- google_places_place_id => b_google_places\n", + "- First Name => Gender\n", + "- google_places_detailed_website => bool website\n", + "- Email (ending / provider / company email address / ...) => CountVectorizer\n", + "- Company Name (GmbH, Restaurant, ...) => CountVectorizer\n", + "- google_places_formatted_address => City\n", + "- google_places_detailed_type => Cluster / one hot encoder" + ] + }, + { + "cell_type": "markdown", + "id": "988d4272", + "metadata": {}, + "source": [ + "----------------\n", + "# Future Ideas \n", + "## Data\n", + "1. Financial reports\n", + " - Very promising to see potential revenue which might need to be combined with merchant category (due to different payment methods)\n", + " - Source: Specific to the country (e.g. Germany has Bundesanzeiger, Handelsregister, ...)\n", + " - Processing: Using an API, either just checking the existence in the corresponding register or detailed information (financial reports) \n", + "2. Neighborhood\n", + " - Gain information about the potential revenue / price of the company based on the neighborhood\n", + " - Source: Google places for location, all other steps / sources mentioned previously for evaluating the neighbors\n", + " - Processing: Clustering of typical neighborhoods \n", + "3. Price & Product / Merch Category\n", + " - Identify price categories and product types which would help identify the merch category\n", + " - Source: Website / social media\n", + " - Processing: website parsing / image detection \n", + "4. User behavior analysis\n", + " - Identify certain user behavior types to see if there are differences\n", + " - Source: Own data\n", + " - Processing: user behavior on the website (duration how long they need to fill the form, ...)\n", + "5. Process analysis\n", + " - Analysis of the process by testing how the interaction looks like and if there is a certain behavior / structure observable that is specific to the size of the company\n", + " - Source: Own data\n", + " - Processing: Information about how long the process takes / how many calls / different numbers or contact / availability\n", + "6. Confidence metric:\n", + " - Creating a sophisticated confidence metric based on similar previous results\n", + " - Using multiple features to compare it with the added sources\n", + " \n", + "## Model\n", + "1. Using GAN or other generative models to handle imbalance and missing values\n", + "2. Using outlier detection for smaller classes\n", + "3. Extensive hyperparameter tuning\n", + "4. 
Weighted models based on importance of the class" + ] + }, + { + "cell_type": "markdown", + "id": "e59d30fb", + "metadata": {}, + "source": [ + "# Dataset Description" + ] + }, + { + "cell_type": "markdown", + "id": "a2bee470", + "metadata": {}, + "source": [ + "## Imbalanced Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f508fadb", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(df))\n", + "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)" + ] + }, + { + "cell_type": "markdown", + "id": "86fdc167", + "metadata": {}, + "source": [ + "## Repetitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a850c5e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n", + "identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n", + "for col in identifier:\n", + " print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})') " + ] + }, + { + "cell_type": "markdown", + "id": "37ce359c", + "metadata": {}, + "source": [ + "**Repetitions exist except for Email => Email as Identifier**" + ] + }, + { + "cell_type": "markdown", + "id": "e85e1911", + "metadata": {}, + "source": [ + "## Placeholder Values of Regionalatlas\n", + "\n", + "### Problem: Regionatlas, Defined Placeholder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7d07397", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\" Regionalatlas: Placeholder\n", + "2222222222: nichts vorhanden, genau 0\n", + "5555555555: Zahlenwert unbekannt oder geheim zu halten\n", + "6666666666: Tabellenfach gesperrt, da Aussage nicht sinnvoll\n", + "7777777777: keine Angabe, da Zahlenwert nicht sicher genug\n", + "8888888888: Angabe fällt später an\n", + "\"\"\"\n", + "\n", + "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n", + "regional_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Dictionary to know which columns and indices have problematic values\n", + "rem_dic = {}\n", + "columns = []\n", + "\n", + "filter_df = regional_df.copy()\n", + "\n", + "for exc in exclude_values:\n", + " # Find all columns that have those values we need to exclude\n", + " col = regional_df.loc[:,(np.sum(df == exc,axis=0)>0)].columns.tolist()\n", + "\n", + " columns+=col\n", + " \n", + " \n", + " # Now we can use those columns to find the corresponding rows\n", + " for c in col:\n", + " indices = regional_df.loc[(np.sum(df == exc,axis=1)>0),col].index.tolist() \n", + " \n", + " rem_dic[c] = {str(exc):indices}\n", + " \n", + " filter_df = filter_df[df[c]!=exc]\n", + " print(f'column:{c}, value:{exc}')\n", + " \n", + "print(rem_dic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16fcbf05", + "metadata": {}, + "outputs": [], + "source": [ + "# Irregular values defined by regionalatlas needs to be translated to nan so we can handle it later on\n", + "import numpy as np\n", + "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n", + "\n", + "print(\"Changed the following features, because of irregular values of regionalatlas:\")\n", + "for col in regional_atlas:\n", + " n_irr = (df[col]>=2222222222).sum()\n", + " n = (df[col].notnull()).sum()\n", + " \n", + " if (n_irr>0):\n", + " print(col+': '+str(n_irr)+' out of '+ 
str(n))\n", + " df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7ab822c", + "metadata": {}, + "source": [ + "Changed the following features, because of irregular values of regionalatlas:\n", + "- regional_atlas_pop_development: 76 out of 68793\n", + "- regional_atlas_investments_p_employee: 3736 out of 68793\n", + "- regional_atlas_gross_salary_p_employee: 632 out of 68793\n", + "- regional_atlas_tot_income_p_taxpayer: 34 out of 68827" + ] + }, + { + "cell_type": "markdown", + "id": "5411cb6e", + "metadata": {}, + "source": [ + "## Empty Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730b574a", + "metadata": {}, + "outputs": [], + "source": [ + "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", + "print(f'Empty: {isna}')\n", + "print(f'Not empty: {df.shape[0]-isna}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35186540", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", + "\n", + "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", + "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" + ] + }, + { + "cell_type": "markdown", + "id": "f507c51d", + "metadata": {}, + "source": [ + "#### Different amounts of NaN values\n", + "- regional_atlas_pop_avg_age : 3213\n", + "- regional_atlas_per_service_sector : 3211\n", + "- regional_atlas_per_trade : 3211\n", + "- regional_atlas_unemployment_rate : 3119\n", + "- regional_atlas_disp_income_p_inhabitant : 3211\n", + "- regional_atlas_tot_income_p_taxpayer : 3211\n", + "- regional_atlas_gdp_p_workhours : 3211\n", + "- regional_atlas_pop_avg_age_zensus : 3119" + ] + }, + { + "cell_type": "markdown", + "id": "7f0a7994", + "metadata": {}, + "source": [ + "----------------------\n", + "# Numerical Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27014d88", + "metadata": {}, + "outputs": [], + "source": [ + "min_max_df = df.agg({\n", + "'google_places_user_ratings_total':[\"min\",\"max\"],\n", + "'google_places_rating':[\"min\",\"max\"],\n", + "'google_places_price_level':[\"min\",\"max\"],\n", + "'reviews_sentiment_score':[\"min\",\"max\"],\n", + "'regional_atlas_age_0':[\"min\",\"max\"],\n", + "'regional_atlas_age_1':[\"min\",\"max\"],\n", + "'regional_atlas_age_2':[\"min\",\"max\"],\n", + "'regional_atlas_age_3':[\"min\",\"max\"],\n", + "'regional_atlas_age_4':[\"min\",\"max\"],\n", + "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", + "'regional_atlas_per_trade':[\"min\",\"max\"],\n", + "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", + "'regional_atlas_pop_density':[\"min\",\"max\"],\n", + "'regional_atlas_pop_development':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", + "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", + 
"'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", + "'regional_atlas_regional_score':[\"min\",\"max\"]\n", + "})\n", + "\n", + "# Apply the function for each column\n", + "for col in min_max_df.columns:\n", + " min_feature = min_max_df[col]['min']\n", + " max_feature = min_max_df[col]['max']\n", + " print(f'{col}: [{min_feature}, {max_feature}]') " + ] + }, + { + "cell_type": "markdown", + "id": "5f90f597", + "metadata": {}, + "source": [ + "## Range of the features\n", + "\n", + "| Variable | Theoretical Range | Practical Range |\n", + "|---------------------------------------------|-------------------|---------------------------|\n", + "| google_places_user_ratings_total | [0, inf] | [0.0, 86141.0] |\n", + "| google_places_rating | [1, 5] | [0.0, 5.0] |\n", + "| google_places_price_level | [1.0, 4.0] | [1.0, 4.0] |\n", + "| reviews_sentiment_score | [-1.0, 1] | [-1.0, 0.95] |\n", + "| regional_atlas_age_0 | [0, 100] | [12.6, 20.1] |\n", + "| regional_atlas_age_1 | [0, 100] | [4.2, 13.1] |\n", + "| regional_atlas_age_2 | [0, 100] | [18.7, 33.7] |\n", + "| regional_atlas_age_3 | [0, 100] | [21.6, 32.9] |\n", + "| regional_atlas_age_4 | [0, 100] | [15.9, 33.5] |\n", + "| regional_atlas_per_service_sector | [0, 100] | [44.4, 94.0] |\n", + "| regional_atlas_per_trade | [0, 100] | [13.8, 48.4] |\n", + "| regional_atlas_employment_rate | [0, 100] | [47.4, 72.4] |\n", + "| regional_atlas_unemployment_rate | [0, 100] | [1.6, 12.0] |\n", + "| regional_atlas_per_long_term_unemployment | [0, 100] | [14.5, 55.5] |\n", + "| regional_atlas_pop_density | [0, inf] | [35.3, 4788.2] |\n", + "| regional_atlas_pop_development | [-inf, inf] | [-180.4, 2567.6] |\n", + "| regional_atlas_pop_avg_age | [0, inf] | [40.7, 51.0] |\n", + "| regional_atlas_investments_p_employee | [0, inf] | [2.4, 51.0] |\n", + "| regional_atlas_gross_salary_p_employee | [0, inf] | [30.1, 90.6] |\n", + "| regional_atlas_disp_income_p_inhabitant | [0, inf] | [17635.0, 36686.0] |\n", + "| regional_atlas_tot_income_p_taxpayer | [0, inf] | [30.0, 74.3] |\n", + "| regional_atlas_gdp_p_employee | [0, inf] | [56707.0, 153485.0] |\n", + "| regional_atlas_gdp_development | [-inf, inf] | [-3.7, 80.0] |\n", + "| regional_atlas_gdp_p_inhabitant | [0, inf] | [17553.0, 158749.0] |\n", + "| regional_atlas_gdp_p_workhours | [0, inf] | [39.3, 114.3] |\n", + "| regional_atlas_pop_avg_age_zensus | [0, inf] | [39.1, 48.6] |\n", + "| regional_atlas_regional_score | [0, inf] | [47.30335218, 10342.70448564]|" + ] + }, + { + "cell_type": "markdown", + "id": "189519a2", + "metadata": {}, + "source": [ + "## Percentiles (for analysis and explanation)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d78040", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "\n", + "percentile_col = [\n", + "'regional_atlas_age_0',\n", + "'regional_atlas_age_1',\n", + " 'regional_atlas_age_2',\n", + "'regional_atlas_age_3',\n", + "'regional_atlas_age_4',\n", + "'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", + "'regional_atlas_pop_development',\n", + "'regional_atlas_pop_avg_age',\n", + "'regional_atlas_per_service_sector',\n", + "'regional_atlas_per_trade',\n", + "'regional_atlas_employment_rate',\n", + "'regional_atlas_unemployment_rate',\n", + "'regional_atlas_per_long_term_unemployment',\n", + "'regional_atlas_investments_p_employee',\n", + 
"'regional_atlas_gross_salary_p_employee',\n", + "'regional_atlas_disp_income_p_inhabitant',\n", + "'regional_atlas_tot_income_p_taxpayer',\n", + "'regional_atlas_gdp_p_employee',\n", + "'regional_atlas_gdp_development',\n", + "'regional_atlas_gdp_p_inhabitant',\n", + "'regional_atlas_gdp_p_workhours',\n", + "'regional_atlas_pop_avg_age_zensus',\n", + "'regional_atlas_regional_score']\n", + "\n", + "for col in percentile_col:\n", + " no_nan = df[col][df[col].notnull()]\n", + " col_name = col+'_percentiles' \n", + " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307c0e39", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Adding the percentiles as columns for analysis and report\n", + "\n", + "for col in percentile_col:\n", + " feature = col+\"_percentiles\"\n", + " not_nan = df[feature].notnull()\n", + "\n", + " classes = df['MerchantSizeByDPV'].unique()\n", + "\n", + " for c in classes:\n", + " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", + " \n", + " # Add labels and title\n", + " plt.xlabel('Value')\n", + " plt.ylabel('Density')\n", + " plt.title('Distribution of '+col)\n", + " plt.legend()\n", + "\n", + " # Show the plot\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ac1d5880", + "metadata": {}, + "source": [ + "## Features regarding their discriminative power (univariate)\n", + "Features and how they increase the chance to select bigger companies\n", + "1. **Best** [Criteria: Strong trend + difference between classes]\n", + " - google_places_user_ratings_total: number of ratings\n", + " - google_places_rating: middle rating is the best\n", + " - age_2: 25-44\n", + " - age_3: 45-64 (negative)\n", + " - age_4: >65 years old (negative trend)\n", + " - pop_density: the higher, the better\n", + " - per_service_sector: High amount of service sector\n", + " - pop_avg_age: (negative)\n", + " - regional_score: pop_density * employment_rate * disp_income_p_inhabitant / 1000000\n", + "2. **Middle** [Criteria: Small trend + difference between classes]\n", + " - age_0: 0-17\n", + " - age_1: 18-24 \n", + " - pop_development: population development in one year\n", + " - investments_p_employee: Investments per employee in the region\n", + " - gross_salary_p_employee: Gross salary per employee in the region\n", + " - gdp_p_employee: GDP per employee in the region\n", + " - gdp_development: GDP development in the region\n", + " - gdp_p_inhabitant: GDP per inhabitant in the region\n", + " - gdp_p_workhours: GDP per working hours in the region\n", + " - avg_age_zensus: Average age in the region (source is the census)\n", + "3. 
**Worst** [No trend + no real difference between classes]\n", + " - sentiment_score: Score how polarized the google reviews are\n", + " - employment_rate: Employment rate in the region\n", + " - unemployment_rate: Unemployment rate in the region\n", + " - long_term_unemployment: Long term unemployment rate in the region\n", + " - disp_income_p_inhabitant: Disposable income per inhabitant in the region\n", + " - tot_income_p_inhabitant: Total income per inhabitant in the region" + ] + }, + { + "cell_type": "markdown", + "id": "ef03f2da", + "metadata": {}, + "source": [ + "## Bivariate analysis of the numerical google places features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073b0a8", + "metadata": {}, + "outputs": [], + "source": [ + "for c in classes:\n", + " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", + " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", + "\n", + " plt.xlabel('ratings_total')\n", + " plt.ylabel('rating_avg')\n", + " plt.title('Distribution of '+c)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c73f6841", + "metadata": {}, + "source": [ + "## Conclusion\n", + "1. Number of ratings is the most important feature, and is a sign of popularity\n", + "2. Bigger companies have the tendency to have lower ratings while having higher number of ratings\n", + "3. Regions in Germany that are more lucrative:\n", + " - a region that has a high population density, especially bigger cities\n", + " - with a higher percentage of the age group \"25-44\"\n", + " - and have a relatively high service sector share\n", + " - while at the same time having a high regional score\n", + "4. GDP features can be important\n", + "5. 
Employment rates and income are not very important" + ] + }, + { + "cell_type": "markdown", + "id": "983dc6c8", + "metadata": {}, + "source": [ + "----------------------\n", + "# Categorical Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb5923a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "arr_false = {}\n", + "\n", + "for column in df:\n", + " \n", + " if df[column].dtype == bool:\n", + " false_count = np.count_nonzero(df[column] == False)\n", + " arr_false[column] = false_count\n", + " \n", + "print(arr_false)\n", + "# => remove email_valid, because all are positive" + ] + }, + { + "cell_type": "markdown", + "id": "3568fb12", + "metadata": {}, + "source": [ + "### Gender (Out of First Name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d74633", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Dangerous to come to a conclusion based on Gender.\n", + "Women are as contact persons underrepresented in the XL category.\n", + "Possible reasons:\n", + " - Outside (more likely): Underrepresented in responsible positions with increasing size of the company\n", + " - Inside (less likely): The marketing team convinces men better\n", + "\"\"\"\n", + "import gender_guesser.detector as gender\n", + "gd = gender.Detector()\n", + "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", + "\n", + "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", + "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", + "total_counts = total_counts.rename(columns={'index':group_feature})\n", + "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", + "\n", + "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", + "result['proportion'] = result['count'] / result['total_count']\n", + "\n", + "category_order = ['XS','S','M','L','XL']\n", + "\n", + "\n", + "# Create separate DataFrames for each gender\n", + "# For better depiction .drop(index='XS') and take away XS from category_order\n", + "# andy: androgynous\n", + "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", + "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_male_data = result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n", + "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n", + "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n", + "\n", + "# Plotting\n", + "plt.plot(category_order, andy_data, label='Andy')\n", + "plt.plot(category_order, unknown_data, label='Unknown')\n", + "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n", + "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n", + "plt.plot(category_order, male_data, label='Male')\n", + "plt.plot(category_order, female_data, label='Female')\n", + "\n", + "# Set labels and title\n", + "plt.xlabel('MerchantSizeByDPV')\n", + "plt.ylabel('Proportion')\n", + "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n", + "\n", + "# Display the plot\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2315b4ea", + "metadata": {}, + "source": [ + "### 
Conclusion: Gender\n", + "1. Andy\n", + "2. Unknown + Mostly Female, Male + Mostly Male, Female" + ] + }, + { + "cell_type": "markdown", + "id": "07f9b71f", + "metadata": {}, + "source": [ + "## MCC Level (Type of Business)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4a15f7", + "metadata": {}, + "outputs": [], + "source": [ + "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n", + "grouped = mcc_group.unstack()\n", + "mcc_sum = mcc_group.groupby(level=0).sum()\n", + "\n", + "mcc_df = pd.concat([grouped, sum_test], axis=1)\n", + "tmp = mcc_df[0]\n", + "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n", + "mcc_df['Sum'] = tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e787a6cd", + "metadata": {}, + "outputs": [], + "source": [ + "print('Dropped the rows due to less than 50 examples:')\n", + "print(mcc_df[mcc_df['Sum']<50].index.values)\n", + "mcc_df = mcc_df[mcc_df['Sum']>=50]\n", + "\n", + "# Show every 10 categories (previously ordered by ascending XS), to compare the categories\n", + "# The first categories are the most attractive ones\n", + "for i in range(mcc_df.shape[0]): \n", + " if i % 10 == 0:\n", + " mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()" + ] + }, + { + "cell_type": "markdown", + "id": "9db7b90f", + "metadata": {}, + "source": [ + "### Conclusion: MCC\n", + "For example the most lucrative categories by each group. You have to consider that the first group is more lucrative than the second, etc.\n", + "\n", + "1. Café / Restaurant\n", + "2. Apparel\n", + "3. Book Stores\n", + "4. Pharmacy and Nutrition\n", + "5. Art Dealers and Categories" + ] + }, + { + "cell_type": "markdown", + "id": "ca0dc855", + "metadata": {}, + "source": [ + "## Google_places_detailed_type (Type of Business)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc010da", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "data = df[df['google_places_detailed_type'].notnull()]\n", + "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd76690a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "\"\"\"\n", + "- isna => category 0\n", + "- remove establishment and point_of_interest => category 1\n", + "- aggregate categories like sublocality? 
, or administrative_area_level_x\n", + "\"\"\"\n", + "\n", + "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n", + "docs = docs.apply(lambda row: ast.literal_eval(row))\n", + "\n", + "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n", + "categories = vectorizer.fit_transform(docs).toarray()\n", + "vectorizer.get_feature_names_out()" + ] + }, + { + "cell_type": "markdown", + "id": "78d3f491", + "metadata": {}, + "source": [ + "## Chi squared test for categorical variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd08fc9b", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import chi2_contingency\n", + "\n", + "# Create a contingency table for each feature\n", + "contingency_tables = {}\n", + "\n", + "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level', 'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", + "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", + "\n", + "#for feature_column in df.columns[df.columns != 'label']:\n", + "for feature_column in cat_col.columns:\n", + " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", + " contingency_tables[feature_column] = contingency_table\n", + "\n", + "# Perform chi-squared test for each feature\n", + "results = {}\n", + "for feature, table in contingency_tables.items():\n", + " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", + " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", + "\n", + "# Display the results\n", + "for feature, result in results.items():\n", + " print(f\"\\nChi-squared test for {feature}:\")\n", + " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", + " print(f\"P-value: {result['P-value']:.4f}\")\n", + " print(f\"Degrees of freedom: {result['Degrees of Freedom']}\")\n", + "\n", + "# p-value > 0.05 => ignore because hypothesis can't be rejected\n", + "# The higher the statistic the more influential\n", + "# => Ignore: number_area" + ] + }, + { + "cell_type": "markdown", + "id": "533b0983", + "metadata": {}, + "source": [ + "### Conclusion: Chi squared test\n", + "1. Ignore number_area (p-value > 0.05)\n", + "2. Best: \n", + " - MCC Level: Find the category of a merchant\n", + " - google_places_price_level: Get the prices of a company\n", + " - google_places_rating: different sources for number of ratings, but also possible social media likes [popularity measures]\n", + " \n", + "3. Middle: \n", + " - number_country\n", + " - google_places_confidence\n", + " - number_possible\n", + " - number_valid\n", + " - google_places_candidate_count_phone\n", + " - Gender\n", + "4. Worst:\n", + " - google_places_candidate_count_mail\n", + " - last_name_in_account\n", + " - google_places_place_id_matches_phone_search\n", + " - b_google_website\n", + " - first_name_in_account\n", + " - google_places_business_status\n", + "5. 
Invalid:\n", + " - number_area" + ] + }, + { + "cell_type": "markdown", + "id": "66816550", + "metadata": {}, + "source": [ + "----------------------------------\n", + "# Boolean features (Bayesian)" + ] + }, + { + "cell_type": "markdown", + "id": "c7600a68", + "metadata": {}, + "source": [ + "#### Count false per column\n", + "- email_valid: 0 => **not interesting**\n", + "- first_name_in_account: 7659 / 10,000\n", + "- last_name_in_account: 6872 / 10,000\n", + "- number_valid: 339 / 10,000\n", + "- number_possible: 297 / 10,000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c46eb0f4", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def b_bayesian(df,bin_column,b_value=True):\n", + " \n", + " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", + " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", + " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", + " posterior_B = (prior_A*evidence_A) / prior_B\n", + " \n", + " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "\n", + "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", + "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", + "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", + "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", + "\n", + "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", + "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", + "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", + "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", + "\n", + "# Ensure the 'Category' column is ordered\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "\n", + "# Plot the lines\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", + "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", + "plt.plot(categories_order, series_first_name, label='First name in account')\n", + "plt.plot(categories_order, series_last_name, label='Last name in account')\n", + "plt.plot(categories_order, series_possible, label='Number possible')\n", + "plt.plot(categories_order, series_valid, label='Number valid')\n", + "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", + "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", + "#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", + "\n", + "\n", + "plt.title('Bayesian')\n", + "plt.xlabel('Categories')\n", + "plt.ylabel('Percentages')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "62ca6202", + "metadata": {}, + "source": [ + "## Conclusion: Boolean features (Bayesian)\n", + "1. Email_valid: not interesting\n", + "2. Number not possible and number invalid have much lower XS values (25% less)\n", + "3. First name and last name in account have minor effects on it, and are a sign for smaller categories\n", + "4. 
Number valid, number possible, and first name / last name not in account are not very different from the overall data" + ] + }, + { + "cell_type": "markdown", + "id": "0d647f05", + "metadata": {}, + "source": [ + "-----------------------------------\n", + "# Quality Evaluation of the Discriminative Quality of Regionalatlas Features" + ] + }, + { + "cell_type": "markdown", + "id": "e43187de", + "metadata": {}, + "source": [ + "## Boxplots and Violinplots for Visual Observation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a18821e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "\n", + "class_colors = sns.color_palette(\"colorblind\")[:5]\n", + "regional_df = df.filter(like='regional', axis=1)\n", + "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", + "\n", + "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", + "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", + " \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18949200", + "metadata": {}, + "outputs": [], + "source": [ + "# Same like the boxplots but now with violinplots\n", + "for column in regional_df.filter(like='regional', axis=1).columns: \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, 
color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2fb6790a", + "metadata": {}, + "source": [ + "## Computing a Heuristic Metric to Identify Quality Groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c7b2074", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "# Normalize the features before comparing / dividing them\n", + "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "x_scaled = min_max_scaler.fit_transform(x)\n", + "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", + "\n", + "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", + "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", + "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", + "\n", + "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "26a45107", + "metadata": {}, + "source": [ + "## Result: Quality Evaluation of the Discriminative Quality of Regionalatlas Features\n", + "\n", + "We can see that most of the negative values are related to age. The reason behind this might be the generational related consumer and payment behavior, and on socioeconomical reasons. Every generation in germany had a vastly different working/pension experience (f.e. Generation Silent / baby boomer / Gen Y) and finance/housing situation. It seems like the XL companies tend to be in the big german cities, which have a high population density and a high share of age_2 (25-44), and a flourishing service sector.\n", + "\n", + "1. pop_development: might be related to age_0 (more kids)\n", + "2. Employment rate: bigger cities (with high density) have a higher unemployment rate. But the high density might be able to compensate it.\n", + "3. age_0: minors are part of families, that might not be able to spend money in the service industry (biggest part of SumUp)\n", + "4. pop_avg_age & pop_avg_age_zensus: Different results because of different time the data was collected\n", + "5. age_4 (>64),age_3 (45-64): Are not age_0 group and might not be in the big cities / spent as much money as others in the service industry with card payment\n", + "\n", + "The following table provides information from the regionalatlas dataset. Each row represents a different feature, with corresponding absolute values computed previously and quality class. Higher values mean higher discriminative quality (for Value and Quality Class). 
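\n",
+    "\n",
+    "*Added restatement (for reference only, mirroring the cell above):* per min-max-normalised feature $x$ the heuristic is\n",
+    "$$\\mathrm{score}(x) = \\frac{\\mathrm{median}_{\\mathrm{XL}}(x) - \\mathrm{median}_{\\mathrm{XS}}(x)}{Q_{3,\\mathrm{XL}}(x) - Q_{1,\\mathrm{XL}}(x)},$$\n",
+    "and the table lists its absolute value.\n",
+    "\n",
+    "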
The visual assessment is done by considering also the distribution identified in the violin plot.\n", + "\n", + "| Feature | Value | Quality Class (metric, boxplot related) | Quality Class (after visual assessment of violin plot) |\n", + "|---------------------------------------------|-------------------------|-------|-------|\n", + "| regional_atlas_pop_density | 0.581410 | 1 | 1 |\n", + "| regional_atlas_age_2 | 0.552632 | 1 | 1 |\n", + "| regional_atlas_age_3 | 0.488372 | 1 | 1 |\n", + "| regional_atlas_per_service_sector | 0.467153 | 1 | 1 |\n", + "| regional_atlas_regional_score | 0.420439 | 1 | 1 |\n", + "| regional_atlas_pop_avg_age_zensus | 0.384615 | 1 | 2 |\n", + "| regional_atlas_age_4 | 0.354839 | 1 | 2 |\n", + "| regional_atlas_investments_p_employee | 0.333333 | 1 | 1 |\n", + "| regional_atlas_pop_avg_age | 0.304348 | 1 | 2 |\n", + "| regional_atlas_age_0 | 0.222222 | 1 | 2 |\n", + "| regional_atlas_age_1 | 0.200000 | 2 | 3 |\n", + "| regional_atlas_unemployment_rate | 0.181818 | 2 | 3 |\n", + "| regional_atlas_gdp_p_inhabitant | 0.175996 | 2 | 2 |\n", + "| regional_atlas_gdp_development | 0.163934 | 2 | 3 |\n", + "| regional_atlas_per_trade | 0.162791 | 2 | 3 |\n", + "| regional_atlas_gdp_p_employee | 0.128667 | 2 | 3 |\n", + "| regional_atlas_employment_rate | 0.093333 | 2 | 3 |\n", + "| regional_atlas_gross_salary_p_employee | 0.060377 | 3 | 3 |\n", + "| regional_atlas_tot_income_p_taxpayer | 0.053691 | 3 | 3 |\n", + "| regional_atlas_pop_development | 0.026786 | 3 | 4 |\n", + "| regional_atlas_per_long_term_unemployment | 0.024096 | 3 | 3 |\n", + "| regional_atlas_gdp_p_workhours | 0.018957 | 4 | 4 |\n", + "| regional_atlas_disp_income_p_inhabitant | 0.000000 | 4 | 4 |\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "a3c6e408", + "metadata": {}, + "source": [ + "# Correlation above 0.89" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684af78c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Compute a correlation matrix for all float values of our dataframe\n", + "float_cols = df.columns[df.dtypes==float]\n", + "corr_matrix = df[float_cols].corr()\n", + "\n", + "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", + "np.fill_diagonal(corr_matrix.values, 0)\n", + "\n", + "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", + "correlation_threshold = 0.89\n", + "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", + "\n", + "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", + "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", + "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", + "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", + "\n", + "# Print the new correlation matrix and the corresponding plot\n", + "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Correlation Matrix Heatmap')\n", + "plt.savefig('correlation_matrix.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": 
"7a77e1c4", + "metadata": {}, + "source": [ + "## Result: Correlation Analysis\n", + "Might be recommendable to decide between the following features (based on the discrimination quality):\n", + "1. pop_avg_age and avg_age_zensus are different. Maybe the reason is the method or the time when the data was collected. Remove one of them\n", + "2. age_4 vs. avg_age / avg_age_zensus\n", + "3. gdp_p_workhours vs. gdp_p_employee\n", + "4. age_2 vs. age_3\n", + "5. Might remove Regional Score" + ] + }, + { + "cell_type": "markdown", + "id": "f0ebb3f6", + "metadata": {}, + "source": [ + "---------------------------------------\n", + "# PCA Analysis for Expensive Algorithms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba9ee1b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "reg_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", + "\n", + "# Apply PCA\n", + "pca = PCA()\n", + "principal_components = pca.fit_transform(scaled_data)\n", + "\n", + "# Retrieve explained variance ratios\n", + "explained_variance_ratio = pca.explained_variance_ratio_\n", + "\n", + "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", + "\n", + "# Print explained variance ratios\n", + "for i, ratio in enumerate(explained_variance_ratio, 1):\n", + " print(f\"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}\")\n", + "\n", + "# Plot the cumulative explained variance\n", + "cumulative_variance = explained_variance_ratio.cumsum()\n", + "\n", + "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", + "plt.title('Cumulative Explained Variance')\n", + "plt.xlabel('Number of Principal Components')\n", + "plt.ylabel('Cumulative Variance Explained')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "43e567f8", + "metadata": {}, + "source": [ + "---------------------------\n", + "# New Features" + ] + }, + { + "cell_type": "markdown", + "id": "b79613bf", + "metadata": {}, + "source": [ + "## Email Address Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d731eff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Count only those words, existing in a minum amount of 100 email adresses\n", + "count_vectorizer = CountVectorizer(min_df=50)\n", + "\n", + "# Fit and transform the text data\n", + "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", + "\n", + "# Convert the matrix to a DataFrame for better readability\n", + "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4314186c", + "metadata": {}, + "outputs": [], + "source": [ + "common_words = pd.DataFrame(count_df.sum()).transpose()\n", + "\n", + "for word in common_words:\n", + " print(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf6ba75", + "metadata": {}, + "outputs": [], + "source": [ + "# Names\n", + "names = [\n", + " 'alex', 'alexander', 'alexandra', 'ali', 'andre', 'andrea', 'andreas',\n", + " 'anja', 'anke', 'anna', \n", + " 'barbara', 'bauer', 'becker',\n", + " 'birgit', \n", + " 
'christian', 'christina',\n", + " 'claudia', 'daniel', 'daniela', 'david',\n", + " 'dirk',\n", + " 'eva', \n", + " 'fischer', 'florian', \n", + " 'frank', \n", + " \n", + " 'heike', \n", + " \n", + " 'jan', 'jana', 'jens', 'joerg',\n", + " 'julia', 'kai', 'karin', 'katharina',\n", + " 'kathrin', 'katja', 'katrin', 'kerstin', \n", + " 'klaus', \n", + " 'lisa', 'manuela', 'marc',\n", + " 'marco', 'maria', 'marion', 'markus', 'martin', 'martina', 'matthias',\n", + " 'melanie', 'meyer', 'michael', 'michaela',\n", + " 'monika','mueller', 'nadine',\n", + " 'nicole', \n", + " 'oliver', \n", + " 'peter', 'petra', 'philipp', \n", + " 'ralf', \n", + " 'richter', 'robert', 'sabine', 'sabrina',\n", + " 'sandra', 'sarah', 'schmidt', 'schmitz', 'schneider',\n", + " 'sebastian', 'silke', 'simon', 'simone', 'sonja', \n", + " 'stefan', 'stefanie', 'steffi', \n", + " 'susanne', 'sven', 'tanja', \n", + " 'thomas','tim', 'tobias', 'uwe',\n", + " 'wagner', 'weber', \n", + " 'weiss', 'werner', 'wolf']\n", + "\n", + "# Weird terms\n", + "weird_terms = ['hallo', 'hello', 'moin', 'sumup']\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa498485", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_common_words = []\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", + " \n", + " indices= count_df[count_df[word]>0].index \n", + " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", + " \n", + " grouped_common_words.append((per_word-per_size).rename(word)) \n", + " \n", + "common_df = pd.concat(grouped_common_words, axis=1)\n", + "common_df = common_df.transpose()\n", + "\n", + "common_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aeb199a", + "metadata": {}, + "outputs": [], + "source": [ + "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", + "\n", + "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", + "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", + "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", + "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", + "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" + ] + }, + { + "cell_type": "markdown", + "id": "dc9ac1c0", + "metadata": {}, + "source": [ + "## Result: Email Address Analysis\n", + "If possible add the common words as one hot vector to the model" + ] + }, + { + "cell_type": "markdown", + "id": "83e84e7c", + "metadata": {}, + "source": [ + "## Financial report (Bundesanzeiger)\n", + "\n", + "#### Note:\n", + "- Execution: The underlying cell needs to be processed in a separate python file\n", + "- Source: It is difficult to find for Germany corresponding APIs for the financial report registers\n", + "- Choice of Source: Bundesanzeiger is the most promising register, due to its restriction for bigger companies. 
Problem is that each access takes at least 6sec (internet connection + captcha + ...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d47b36", + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "from deutschland.bundesanzeiger import Bundesanzeiger\n", + "import pickle\n", + "import time\n", + "\n", + "def access_ba(company,b_bundesanzeiger,):\n", + "\n", + " b_bundesanzeiger.append(True)\n", + " try:\n", + " ba = Bundesanzeiger()\n", + " data = ba.get_reports(company)\n", + " except:\n", + " b_bundesanzeiger[-1] = False\n", + " return\n", + "\n", + " if __name__ == '__main__':\n", + "\n", + " \"\"\"\n", + " with open('list_file.pkl', 'rb') as file:\n", + " loaded_list = pickle.load(file)\n", + " print(loaded_list)\n", + " \"\"\"\n", + "\n", + " pd.set_option('display.max_columns', None)\n", + "\n", + " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", + "\n", + " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", + "\n", + "\n", + " with multiprocessing.Manager() as manager:\n", + "\n", + " b_bundesanzeiger = manager.list()\n", + " content_array = []\n", + " durations = []\n", + "\n", + " for i, company in enumerate(df[\"Company Name\"]):\n", + "\n", + " print(i)\n", + "\n", + " start = time.time()\n", + "\n", + " # Start access_ba as a process\n", + " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", + "\n", + " p.start()\n", + "\n", + " # Wait 8 seconds for access_ba\t\n", + " p.join(8)\n", + "\n", + " # If thread is active\n", + " if p.is_alive():\n", + " print (\"Terminate access_ba\")\n", + "\n", + " # Terminate access_ba\n", + " p.terminate()\n", + " b_bundesanzeiger[-1] = 'killed'\n", + "\n", + " # Cleanup\n", + " p.join()\n", + " i+=1\n", + "\n", + " print(b_bundesanzeiger[-1])\n", + " end = time.time()\n", + " print(end-start)\n", + " print()\n", + " durations.append(end-start)\n", + "\n", + " \"\"\"if i==100:\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + " print(np.mean(np.array(list(b_bundesanzeiger))))\n", + " break\n", + " \"\"\"\n", + "\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + "\n", + " with open('time.pkl', 'wb') as file:\n", + " pickle.dump(durations, file)\n", + "\n", + " df.to_pickle(\"./dataframe_sample.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a6f460", + "metadata": {}, + "outputs": [], + "source": [ + "with open('dataframe_sample.pkl', 'rb') as f:\n", + " df = pickle.load(f)\n", + "\n", + "df = df.reset_index(drop=True)\n", + "\n", + "with open('list_file.pkl', 'rb') as f:\n", + " mynewlist = pickle.load(f)\n", + "\n", + "with open('time.pkl', 'rb') as f:\n", + " time = pickle.load(f)\n", + "\n", + "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", + "\n", + "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", + "df['time'] = df_stats['time']\n", + "\n", + "\n", + "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / 
+  {
+   "cell_type": "markdown",
+   "id": "956536b8",
+   "metadata": {},
+   "source": [
+    "## Result: Financial report (Bundesanzeiger)\n",
+    "- The lookup is killed after 8 seconds because a single request can take minutes; most finish in under 8 seconds.\n",
+    "- Results are based on a sample of 100 leads per class\n",
+    "- The classes differ mainly in the share of True values\n",
+    "\n",
+    "Share of `b_bundesanzeiger` outcomes per class:\n",
+    "\n",
+    "| MerchantSizeByDPV | False | True | Killed |\n",
+    "|-------------------|-------|------|--------|\n",
+    "| XS                | 0.59  | 0.24 | 0.17   |\n",
+    "| S                 | 0.73  | 0.14 | 0.13   |\n",
+    "| M                 | 0.57  | 0.34 | 0.09   |\n",
+    "| L                 | 0.53  | 0.30 | 0.17   |\n",
+    "| XL                | 0.45  | 0.41 | 0.14   |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8504a6e0",
+   "metadata": {},
+   "source": [
+    "## Existence of the company in Google Places"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30b18dea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n",
+    "counts = df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n",
+    "\n",
+    "desired_value_counts = counts.unstack().fillna(0)\n",
+    "\n",
+    "# Compute total counts per category\n",
+    "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n",
+    "\n",
+    "# Compute probability for each category\n",
+    "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n",
+    "\n",
+    "print(probabilities)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f42904bc",
+   "metadata": {},
+   "source": [
+    "### Result\n",
+    "- Bigger companies are found on Google Places more often than smaller companies\n",
+    "- On average 77% of the leads are found on Google Places (but we cannot be 100% sure the matches are correct)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb71d595",
+   "metadata": {},
+   "source": [
+    "# ML - Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4738be7c",
+   "metadata": {},
+   "source": [
+    "## Ideas\n",
+    "1. Imputation of missing values (a sketch follows directly below)\n",
+    "2. SMOTE or GAN / Diffusion Model / ... to generate additional samples against the class imbalance\n",
+    "3. Consider L and XL as outliers (using outlier detection)\n",
+    "4. StratifiedKFold (see the cross-validation sketch after the Random Forest model)\n",
+    "5. Weighted classifiers (for example a higher weight for class XL in a Random Forest)"
+   ]
+  },
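+  {
+   "cell_type": "markdown",
+   "id": "a1e0b003",
+   "metadata": {},
+   "source": [
+    "### Sketch: imputation (idea 1)\n",
+    "A minimal sketch for idea 1, assuming `regional_columns` as defined earlier in the notebook. The models below simply drop rows with missing RegionalAtlas values; this shows how median imputation could keep them. The variable names `regional_table`, `X_imputed` and `y_imputed` are only illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1e0b004",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.impute import SimpleImputer\n",
+    "\n",
+    "# Keep all rows and fill missing RegionalAtlas values with the column median,\n",
+    "# instead of dropping incomplete rows as in the cells below\n",
+    "regional_table = df[regional_columns + ['MerchantSizeByDPV']]\n",
+    "imputer = SimpleImputer(strategy='median')\n",
+    "X_imputed = pd.DataFrame(imputer.fit_transform(regional_table[regional_columns]),\n",
+    "                         columns=regional_columns, index=regional_table.index)\n",
+    "y_imputed = regional_table['MerchantSizeByDPV']\n",
+    "\n",
+    "print(X_imputed.isna().sum().sum())  # 0 missing values left"
+   ]
+  },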
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f167c71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "\n",
+    "# Separate features (X) and target variable (y)\n",
+    "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n",
+    "y = table['MerchantSizeByDPV']\n",
+    "X = table[regional_columns]\n",
+    "\n",
+    "# Note: resampling before the split lets synthetic samples leak into the test set;\n",
+    "# resampling only the training split would be the cleaner setup\n",
+    "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n",
+    "\n",
+    "# Split the data into training and testing sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Create a logistic regression model\n",
+    "model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n",
+    "\n",
+    "# Fit the model on the training data\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# Make predictions on the testing data\n",
+    "y_pred = model.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+    "class_report = classification_report(y_test, y_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e6afdec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+    "\n",
+    "# Assuming X and y are your feature matrix and target variable\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Create and fit the Random Forest model\n",
+    "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+    "rf_model.fit(X_train, y_train)\n",
+    "\n",
+    "# Make predictions on the test set\n",
+    "y_pred = rf_model.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+    "class_report = classification_report(y_test, y_pred)\n",
+    "\n",
+    "# Display evaluation metrics\n",
+    "print(f'Accuracy: {accuracy:.2f}')\n",
+    "print(f'Confusion Matrix:\\n{conf_matrix}')\n",
+    "print(f'Classification Report:\\n{class_report}')\n"
+   ]
+  },
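+  {
+   "cell_type": "markdown",
+   "id": "a1e0b005",
+   "metadata": {},
+   "source": [
+    "### Sketch: stratified cross-validation with class weights (ideas 4 and 5)\n",
+    "A minimal sketch, assuming `X` and `y` as built above (regional features with missing rows dropped). The weight of 5 for class XL is an arbitrary illustrative choice, not a tuned value."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1e0b006",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
+    "\n",
+    "# Give the rare XL class a higher weight; all other classes keep weight 1\n",
+    "class_weights = {'XS': 1, 'S': 1, 'M': 1, 'L': 1, 'XL': 5}\n",
+    "weighted_rf = RandomForestClassifier(n_estimators=100, class_weight=class_weights, random_state=42)\n",
+    "\n",
+    "# Stratified folds keep the MerchantSizeByDPV class distribution in every fold\n",
+    "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
+    "scores = cross_val_score(weighted_rf, X, y, cv=cv, scoring='f1_macro')\n",
+    "print(scores.mean(), scores.std())"
+   ]
+  },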
+  {
+   "cell_type": "markdown",
+   "id": "4da857f4",
+   "metadata": {},
+   "source": [
+    "# Idea: Testing outlier detection to identify XL"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "773b81db",
+   "metadata": {},
+   "source": [
+    "## Isolation Forest\n",
+    "Treat XL as an anomaly and try to identify it"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abea2245",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+    "\n",
+    "# Assuming X and y are your feature matrix and target variable\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Define the set of rare classes\n",
+    "rare_classes = ['XL']  # Replace with the actual class labels you consider rare\n",
+    "                       # Maybe not only XL, but also L and M\n",
+    "\n",
+    "# Create a binary target variable indicating whether each instance is rare or not\n",
+    "y_train_rare = y_train.isin(rare_classes).astype(int)\n",
+    "y_test_rare = y_test.isin(rare_classes).astype(int)\n",
+    "\n",
+    "# Create and fit the Isolation Forest model\n",
+    "if_model = IsolationForest(contamination='auto')\n",
+    "if_model.fit(X_train)\n",
+    "\n",
+    "# Predict anomalies on the test set\n",
+    "y_pred_rare = if_model.predict(X_test)\n",
+    "\n",
+    "# Convert the predicted labels to binary (1 for anomalies, 0 for normal instances)\n",
+    "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n",
+    "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n",
+    "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n",
+    "\n",
+    "# Display evaluation metrics\n",
+    "print(f'Accuracy: {accuracy:.2f}')\n",
+    "print(f'Confusion Matrix:\\n{conf_matrix}')\n",
+    "print(f'Classification Report:\\n{class_report}')\n",
+    "\n",
+    "plt.figure(figsize=(6, 4))\n",
+    "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,\n",
+    "            xticklabels=[0, 1], yticklabels=[0, 1])\n",
+    "plt.xlabel('Predicted Label')\n",
+    "plt.ylabel('True Label')\n",
+    "plt.title('Confusion Matrix')\n",
+    "plt.show()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_pytorch_p310",
+   "language": "python",
+   "name": "conda_pytorch_p310"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/fabian_feature_analysis.ipynb.license b/notebooks/fabian_feature_analysis.ipynb.license
new file mode 100644
index 0000000..72915b0
--- /dev/null
+++ b/notebooks/fabian_feature_analysis.ipynb.license
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2023 Fabian-Paul Utech