diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb
index 8b13789..5bfa4aa 100644
--- a/notebooks/fabian_feature_analysis.ipynb
+++ b/notebooks/fabian_feature_analysis.ipynb
@@ -1 +1,1043 @@
-
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d3b226f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n",
+ "categories_order = ['XS', 'S', 'M', 'L','XL']\n",
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f508fadb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(len(df))\n",
+ "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a850c5e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n",
+ "identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n",
+ "for col in identifier:\n",
+ "    print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a7d07397",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\" Regionalatlas placeholder values:\n",
+ "2222222222: nothing available, exactly 0\n",
+ "5555555555: value unknown or to be kept confidential\n",
+ "6666666666: table cell blocked because the statement is not meaningful\n",
+ "7777777777: no value given because it is not reliable enough\n",
+ "8888888888: value will only become available later\n",
+ "\"\"\"\n",
+ "import numpy as np\n",
+ "\n",
+ "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n",
+ "regional_df = df.filter(like='regional', axis=1).dropna()\n",
+ "\n",
+ "# Dictionary to know which columns and indices have problematic values\n",
+ "rem_dic = {}\n",
+ "columns = []\n",
+ "\n",
+ "filter_df = regional_df.copy()\n",
+ "\n",
+ "for exc in exclude_values:\n",
+ "    # Find all columns that contain the placeholder value we need to exclude\n",
+ "    col = regional_df.loc[:,(np.sum(regional_df == exc,axis=0)>0)].columns.tolist()\n",
+ "\n",
+ "    columns+=col\n",
+ "\n",
+ "    # Now we can use those columns to find the corresponding rows\n",
+ "    for c in col:\n",
+ "        indices = regional_df.loc[regional_df[c] == exc, c].index.tolist()\n",
+ "\n",
+ "        rem_dic[c] = {str(exc):indices}\n",
+ "\n",
+ "        filter_df = filter_df[filter_df[c]!=exc]\n",
+ "        print(f'column:{c}, value:{exc}')\n",
+ "\n",
+ "print(rem_dic)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16fcbf05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Irregular placeholder values defined by Regionalatlas need to be translated to NaN so we can handle them later on\n",
+ "import numpy as np\n",
+ "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n",
+ "\n",
+ "print(\"Changed the following features because of irregular Regionalatlas values:\")\n",
+ "for col in regional_atlas:\n",
+ "    n_irr = (df[col]>=2222222222).sum()\n",
+ "    n = (df[col].notnull()).sum()\n",
+ "\n",
+ "    if (n_irr>0):\n",
+ "        print(col+': '+str(n_irr)+' out of '+ str(n))\n",
+ "        df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "730b574a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
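+ "# Count how many leads have the google_places_place_id_matches_phone_search flag missing vs. present\n",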
+ "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", + "print(f'Empty: {isna}')\n", + "print(f'Not empty: {df.shape[0]-isna}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35186540", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", + "\n", + "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", + "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27014d88", + "metadata": {}, + "outputs": [], + "source": [ + "min_max_df = df.agg({\n", + "'google_places_user_ratings_total':[\"min\",\"max\"],\n", + "'google_places_rating':[\"min\",\"max\"],\n", + "'google_places_price_level':[\"min\",\"max\"],\n", + "'reviews_sentiment_score':[\"min\",\"max\"],\n", + "'regional_atlas_age_0':[\"min\",\"max\"],\n", + "'regional_atlas_age_1':[\"min\",\"max\"],\n", + "'regional_atlas_age_2':[\"min\",\"max\"],\n", + "'regional_atlas_age_3':[\"min\",\"max\"],\n", + "'regional_atlas_age_4':[\"min\",\"max\"],\n", + "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", + "'regional_atlas_per_trade':[\"min\",\"max\"],\n", + "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", + "'regional_atlas_pop_density':[\"min\",\"max\"],\n", + "'regional_atlas_pop_development':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", + "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", + "'regional_atlas_regional_score':[\"min\",\"max\"]\n", + "})\n", + "\n", + "# Apply the function for each column\n", + "for col in min_max_df.columns:\n", + " min_feature = min_max_df[col]['min']\n", + " max_feature = min_max_df[col]['max']\n", + " print(f'{col}: [{min_feature}, {max_feature}]') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d78040", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "\n", + "percentile_col = [\n", + "'regional_atlas_age_0',\n", + "'regional_atlas_age_1',\n", + " 'regional_atlas_age_2',\n", + "'regional_atlas_age_3',\n", + "'regional_atlas_age_4',\n", + "'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", + "'regional_atlas_pop_development',\n", + "'regional_atlas_pop_avg_age',\n", + "'regional_atlas_per_service_sector',\n", + "'regional_atlas_per_trade',\n", + "'regional_atlas_employment_rate',\n", + "'regional_atlas_unemployment_rate',\n", + "'regional_atlas_per_long_term_unemployment',\n", + "'regional_atlas_investments_p_employee',\n", + "'regional_atlas_gross_salary_p_employee',\n", + "'regional_atlas_disp_income_p_inhabitant',\n", + "'regional_atlas_tot_income_p_taxpayer',\n", + "'regional_atlas_gdp_p_employee',\n", + 
"'regional_atlas_gdp_development',\n", + "'regional_atlas_gdp_p_inhabitant',\n", + "'regional_atlas_gdp_p_workhours',\n", + "'regional_atlas_pop_avg_age_zensus',\n", + "'regional_atlas_regional_score']\n", + "\n", + "for col in percentile_col:\n", + " no_nan = df[col][df[col].notnull()]\n", + " col_name = col+'_percentiles' \n", + " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307c0e39", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Adding the percentiles as columns for analysis and report\n", + "\n", + "for col in percentile_col:\n", + " feature = col+\"_percentiles\"\n", + " not_nan = df[feature].notnull()\n", + "\n", + " classes = df['MerchantSizeByDPV'].unique()\n", + "\n", + " for c in classes:\n", + " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", + " \n", + " # Add labels and title\n", + " plt.xlabel('Value')\n", + " plt.ylabel('Density')\n", + " plt.title('Distribution of '+col)\n", + " plt.legend()\n", + "\n", + " # Show the plot\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073b0a8", + "metadata": {}, + "outputs": [], + "source": [ + "for c in classes:\n", + " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", + " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", + "\n", + " plt.xlabel('ratings_total')\n", + " plt.ylabel('rating_avg')\n", + " plt.title('Distribution of '+c)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb5923a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "arr_false = {}\n", + "\n", + "for column in df:\n", + " \n", + " if df[column].dtype == bool:\n", + " false_count = np.count_nonzero(df[column] == False)\n", + " arr_false[column] = false_count\n", + " \n", + "print(arr_false)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d74633", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Dangerous to come to a conclusion based on Gender.\n", + "\"\"\"\n", + "import gender_guesser.detector as gender\n", + "gd = gender.Detector()\n", + "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", + "\n", + "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", + "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", + "total_counts = total_counts.rename(columns={'index':group_feature})\n", + "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", + "\n", + "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", + "result['proportion'] = result['count'] / result['total_count']\n", + "\n", + "category_order = ['XS','S','M','L','XL']\n", + "\n", + "\n", + "# Create separate DataFrames for each gender\n", + "# For better depiction .drop(index='XS') and take away XS from category_order\n", + "# andy: androgynous\n", + "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", + "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_male_data = 
result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "\n",
+ "# Plotting\n",
+ "plt.plot(category_order, andy_data, label='Andy')\n",
+ "plt.plot(category_order, unknown_data, label='Unknown')\n",
+ "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n",
+ "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n",
+ "plt.plot(category_order, male_data, label='Male')\n",
+ "plt.plot(category_order, female_data, label='Female')\n",
+ "\n",
+ "# Set labels and title\n",
+ "plt.xlabel('MerchantSizeByDPV')\n",
+ "plt.ylabel('Proportion')\n",
+ "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n",
+ "\n",
+ "# Display the plot\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd4a15f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Share of each MerchantSizeByDPV class per MCC level\n",
+ "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n",
+ "grouped = mcc_group.unstack()\n",
+ "mcc_sum = mcc_group.groupby(level=0).sum()\n",
+ "\n",
+ "mcc_df = pd.concat([grouped, mcc_sum], axis=1)\n",
+ "tmp = mcc_df[0]\n",
+ "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n",
+ "mcc_df['Sum'] = tmp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e787a6cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('Dropped MCC levels with fewer than 50 examples:')\n",
+ "print(mcc_df[mcc_df['Sum']<50].index.values)\n",
+ "mcc_df = mcc_df[mcc_df['Sum']>=50]\n",
+ "\n",
+ "# Step through the categories (previously ordered by ascending XS share) and plot 5 of them at every 10th position, to compare the categories\n",
+ "# The first categories are the most attractive ones\n",
+ "for i in range(mcc_df.shape[0]):\n",
+ "    if i % 10 == 0:\n",
+ "        mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5fc010da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "\n",
+ "# Count how often each Google Places detailed type occurs\n",
+ "data = df[df['google_places_detailed_type'].notnull()]\n",
+ "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n",
+ "test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd76690a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "\n",
+ "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n",
+ "docs = docs.apply(lambda row: ast.literal_eval(row))\n",
+ "\n",
+ "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n",
+ "categories = vectorizer.fit_transform(docs).toarray()\n",
+ "vectorizer.get_feature_names_out()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd08fc9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Create a contingency table for each feature\n",
+ "contingency_tables = {}\n",
+ "\n",
+ "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level',
'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", + "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", + "\n", + "#for feature_column in df.columns[df.columns != 'label']:\n", + "for feature_column in cat_col.columns:\n", + " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", + " contingency_tables[feature_column] = contingency_table\n", + "\n", + "# Perform chi-squared test for each feature\n", + "results = {}\n", + "for feature, table in contingency_tables.items():\n", + " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", + " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", + "\n", + "# Display the results\n", + "for feature, result in results.items():\n", + " print(f\"\\nChi-squared test for {feature}:\")\n", + " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", + " print(f\"P-value: {result['P-value']:.4f}\")\n", + " print(f\"Degrees of freedom: {result['Degrees of Freedom']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c46eb0f4", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def b_bayesian(df,bin_column,b_value=True):\n", + " \n", + " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", + " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", + " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", + " posterior_B = (prior_A*evidence_A) / prior_B\n", + " \n", + " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "\n", + "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", + "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", + "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", + "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", + "\n", + "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", + "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", + "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", + "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", + "\n", + "# Ensure the 'Category' column is ordered\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "\n", + "# Plot the lines\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", + "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", + "plt.plot(categories_order, series_first_name, label='First name in account')\n", + "plt.plot(categories_order, series_last_name, label='Last name in account')\n", + "plt.plot(categories_order, series_possible, label='Number possible')\n", + "plt.plot(categories_order, series_valid, label='Number valid')\n", + "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", + "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", + 
"#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", + "\n", + "\n", + "plt.title('Bayesian')\n", + "plt.xlabel('Categories')\n", + "plt.ylabel('Percentages')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a18821e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "\n", + "class_colors = sns.color_palette(\"colorblind\")[:5]\n", + "regional_df = df.filter(like='regional', axis=1)\n", + "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", + "\n", + "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", + "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", + " \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18949200", + "metadata": {}, + "outputs": [], + "source": [ + "# Same like the boxplots but now with violinplots\n", + "for column in regional_df.filter(like='regional', axis=1).columns: \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "4c7b2074", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "# Normalize the features before comparing / dividing them\n", + "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "x_scaled = min_max_scaler.fit_transform(x)\n", + "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", + "\n", + "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", + "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", + "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", + "\n", + "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684af78c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Compute a correlation matrix for all float values of our dataframe\n", + "float_cols = df.columns[df.dtypes==float]\n", + "corr_matrix = df[float_cols].corr()\n", + "\n", + "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", + "np.fill_diagonal(corr_matrix.values, 0)\n", + "\n", + "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", + "correlation_threshold = 0.89\n", + "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", + "\n", + "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", + "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", + "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", + "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", + "\n", + "# Print the new correlation matrix and the corresponding plot\n", + "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Correlation Matrix Heatmap')\n", + "plt.savefig('correlation_matrix.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba9ee1b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "reg_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", + "\n", + "# Apply PCA\n", + "pca = PCA()\n", + "principal_components = pca.fit_transform(scaled_data)\n", + "\n", + "# Retrieve explained variance ratios\n", + "explained_variance_ratio = pca.explained_variance_ratio_\n", + "\n", + "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", + "\n", + "# Print explained variance ratios\n", + "for i, ratio in enumerate(explained_variance_ratio, 1):\n", + " print(f\"Principal Component {i}: Explained 
Variance Ratio = {ratio:.4f}\")\n", + "\n", + "# Plot the cumulative explained variance\n", + "cumulative_variance = explained_variance_ratio.cumsum()\n", + "\n", + "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", + "plt.title('Cumulative Explained Variance')\n", + "plt.xlabel('Number of Principal Components')\n", + "plt.ylabel('Cumulative Variance Explained')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d731eff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Count only those words, existing in a minum amount of 100 email adresses\n", + "count_vectorizer = CountVectorizer(min_df=50)\n", + "\n", + "# Fit and transform the text data\n", + "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", + "\n", + "# Convert the matrix to a DataFrame for better readability\n", + "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4314186c", + "metadata": {}, + "outputs": [], + "source": [ + "common_words = pd.DataFrame(count_df.sum()).transpose()\n", + "\n", + "for word in common_words:\n", + " print(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf6ba75", + "metadata": {}, + "outputs": [], + "source": [ + "# Names\n", + "names = []\n", + "\n", + "# Weird terms\n", + "weird_terms = []\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa498485", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_common_words = []\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", + " \n", + " indices= count_df[count_df[word]>0].index \n", + " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", + " \n", + " grouped_common_words.append((per_word-per_size).rename(word)) \n", + " \n", + "common_df = pd.concat(grouped_common_words, axis=1)\n", + "common_df = common_df.transpose()\n", + "\n", + "common_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aeb199a", + "metadata": {}, + "outputs": [], + "source": [ + "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", + "\n", + "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", + "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", + "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", + "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", + "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d47b36", + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "from deutschland.bundesanzeiger import Bundesanzeiger\n", + "import pickle\n", + "import time\n", + "\n", + "def access_ba(company,b_bundesanzeiger,):\n", + 
"\n", + " b_bundesanzeiger.append(True)\n", + " try:\n", + " ba = Bundesanzeiger()\n", + " data = ba.get_reports(company)\n", + " except:\n", + " b_bundesanzeiger[-1] = False\n", + " return\n", + "\n", + " if __name__ == '__main__':\n", + "\n", + " \"\"\"\n", + " with open('list_file.pkl', 'rb') as file:\n", + " loaded_list = pickle.load(file)\n", + " print(loaded_list)\n", + " \"\"\"\n", + "\n", + " pd.set_option('display.max_columns', None)\n", + "\n", + " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", + "\n", + " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", + "\n", + "\n", + " with multiprocessing.Manager() as manager:\n", + "\n", + " b_bundesanzeiger = manager.list()\n", + " content_array = []\n", + " durations = []\n", + "\n", + " for i, company in enumerate(df[\"Company Name\"]):\n", + "\n", + " print(i)\n", + "\n", + " start = time.time()\n", + "\n", + " # Start access_ba as a process\n", + " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", + "\n", + " p.start()\n", + "\n", + " # Wait 8 seconds for access_ba\t\n", + " p.join(8)\n", + "\n", + " # If thread is active\n", + " if p.is_alive():\n", + " print (\"Terminate access_ba\")\n", + "\n", + " # Terminate access_ba\n", + " p.terminate()\n", + " b_bundesanzeiger[-1] = 'killed'\n", + "\n", + " # Cleanup\n", + " p.join()\n", + " i+=1\n", + "\n", + " print(b_bundesanzeiger[-1])\n", + " end = time.time()\n", + " print(end-start)\n", + " print()\n", + " durations.append(end-start)\n", + "\n", + " \"\"\"if i==100:\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + " print(np.mean(np.array(list(b_bundesanzeiger))))\n", + " break\n", + " \"\"\"\n", + "\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + "\n", + " with open('time.pkl', 'wb') as file:\n", + " pickle.dump(durations, file)\n", + "\n", + " df.to_pickle(\"./dataframe_sample.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a6f460", + "metadata": {}, + "outputs": [], + "source": [ + "with open('dataframe_sample.pkl', 'rb') as f:\n", + " df = pickle.load(f)\n", + "\n", + "df = df.reset_index(drop=True)\n", + "\n", + "with open('list_file.pkl', 'rb') as f:\n", + " mynewlist = pickle.load(f)\n", + "\n", + "with open('time.pkl', 'rb') as f:\n", + " time = pickle.load(f)\n", + "\n", + "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", + "\n", + "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", + "df['time'] = df_stats['time']\n", + "\n", + "\n", + "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b18dea", + "metadata": {}, + "outputs": [], + "source": [ + "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n", + "counts =df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category 
= counts.groupby('MerchantSizeByDPV').sum()\n",
+ "\n",
+ "# Compute probability for each category\n",
+ "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n",
+ "\n",
+ "print(probabilities)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f167c71",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+ "from imblearn.over_sampling import SMOTE\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Use the Regionalatlas features as predictors\n",
+ "regional_columns = [col for col in df if col.startswith('regional_atlas')]\n",
+ "\n",
+ "# Separate features (X) and target variable (y)\n",
+ "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n",
+ "y = table['MerchantSizeByDPV']\n",
+ "X = table[regional_columns]\n",
+ "\n",
+ "# Oversample the minority classes with SMOTE\n",
+ "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n",
+ "\n",
+ "# Split the data into training and testing sets\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Create a logistic regression model\n",
+ "model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n",
+ "\n",
+ "# Fit the model on the training data\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# Make predictions on the testing data\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "class_report = classification_report(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e6afdec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+ "\n",
+ "# Assuming X and y are your feature matrix and target variable\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Create and fit the Random Forest model\n",
+ "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+ "rf_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Make predictions on the test set\n",
+ "y_pred = rf_model.predict(X_test)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "class_report = classification_report(y_test, y_pred)\n",
+ "\n",
+ "# Display evaluation metrics\n",
+ "print(f'Accuracy: {accuracy:.2f}')\n",
+ "print(f'Confusion Matrix:\\n{conf_matrix}')\n",
+ "print(f'Classification Report:\\n{class_report}')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abea2245",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.ensemble import IsolationForest\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+ "\n",
+ "# Assuming X and y are your feature matrix and target variable\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Define the set of rare classes\n",
+ "rare_classes = ['XL'] # Replace with the actual class labels you 
consider rare\n", + " # MAYBE NOT ONLY XL, but also L and M\n", + "# Create a binary target variable indicating whether each instance is rare or not\n", + "y_train_rare = y_train.isin(rare_classes).astype(int)\n", + "y_test_rare = y_test.isin(rare_classes).astype(int)\n", + "\n", + "# Create and fit the Isolation Forest model\n", + "if_model = IsolationForest(contamination='auto')\n", + "if_model.fit(X_train)\n", + "\n", + "# Predict anomalies on the test set\n", + "y_pred_rare = if_model.predict(X_test)\n", + "\n", + "# Convert the predicted labels to binary (1 for anomalies, -1 for normal instances)\n", + "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n", + "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n", + "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,\n", + " xticklabels=[0,1], yticklabels=[0,1])\n", + "plt.xlabel('Predicted Label')\n", + "plt.ylabel('True Label')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}