diff --git a/notebooks/fabian_feature_analysis.ipynb b/notebooks/fabian_feature_analysis.ipynb
index 8b13789..5bfa4aa 100644
--- a/notebooks/fabian_feature_analysis.ipynb
+++ b/notebooks/fabian_feature_analysis.ipynb
@@ -1 +1,1043 @@
-
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d3b226f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "df = pd.read_csv(\"s3://amos-training-data/100k_historic_enriched.csv\")\n",
+ "categories_order = ['XS', 'S', 'M', 'L','XL']\n",
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f508fadb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(len(df))\n",
+ "df.groupby(by=['MerchantSizeByDPV']).size()/len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a850c5e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check if there are repetitions (> 0 => leads that exist multiple times according to the identifier)\n",
+ "identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]\n",
+ "for col in identifier:\n",
+ "    print(f'{col}: {len(df[col].unique())} ({1-len(df[col].unique())/df[col].count()})')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a7d07397",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\" Regionalatlas placeholder values:\n",
+ "2222222222: nothing available, exactly 0\n",
+ "5555555555: value unknown or to be kept confidential\n",
+ "6666666666: table cell blocked because the statement is not meaningful\n",
+ "7777777777: no value given because it is not reliable enough\n",
+ "8888888888: value will only become available later\n",
+ "\"\"\"\n",
+ "import numpy as np\n",
+ "\n",
+ "exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]\n",
+ "regional_df = df.filter(like='regional', axis=1).dropna()\n",
+ "\n",
+ "# Dictionary to know which columns and indices have problematic values\n",
+ "rem_dic = {}\n",
+ "columns = []\n",
+ "\n",
+ "filter_df = regional_df.copy()\n",
+ "\n",
+ "for exc in exclude_values:\n",
+ "    # Find all columns that contain the placeholder value we need to exclude\n",
+ "    col = regional_df.loc[:,(np.sum(regional_df == exc,axis=0)>0)].columns.tolist()\n",
+ "\n",
+ "    columns+=col\n",
+ "\n",
+ "    # Now we can use those columns to find the corresponding rows\n",
+ "    for c in col:\n",
+ "        indices = regional_df.loc[regional_df[c] == exc, c].index.tolist()\n",
+ "\n",
+ "        rem_dic[c] = {str(exc):indices}\n",
+ "\n",
+ "        filter_df = filter_df[filter_df[c]!=exc]\n",
+ "        print(f'column:{c}, value:{exc}')\n",
+ "\n",
+ "print(rem_dic)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16fcbf05",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Irregular placeholder values defined by Regionalatlas need to be translated to NaN so we can handle them later on\n",
+ "import numpy as np\n",
+ "regional_atlas = [col for col in df if col.startswith('regional_atlas')]\n",
+ "\n",
+ "print(\"Changed the following features because of irregular Regionalatlas values:\")\n",
+ "for col in regional_atlas:\n",
+ "    n_irr = (df[col]>=2222222222).sum()\n",
+ "    n = (df[col].notnull()).sum()\n",
+ "\n",
+ "    if (n_irr>0):\n",
+ "        print(col+': '+str(n_irr)+' out of '+ str(n))\n",
+ "        df[col] = np.where(df[col] >= 2222222222, np.nan, df[col])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "730b574a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
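+ "# Count how many leads have the google_places_place_id_matches_phone_search flag missing vs. present\n",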
+ "isna = sum(df['google_places_place_id_matches_phone_search'].isna())\n", + "print(f'Empty: {isna}')\n", + "print(f'Not empty: {df.shape[0]-isna}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35186540", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.groupby('MerchantSizeByDPV').count()['number_area'].reindex(categories_order))\n", + "\n", + "tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()\n", + "(tmp / (sum(tmp['Last Name'].values)/129))['First Name'].reindex(categories_order)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27014d88", + "metadata": {}, + "outputs": [], + "source": [ + "min_max_df = df.agg({\n", + "'google_places_user_ratings_total':[\"min\",\"max\"],\n", + "'google_places_rating':[\"min\",\"max\"],\n", + "'google_places_price_level':[\"min\",\"max\"],\n", + "'reviews_sentiment_score':[\"min\",\"max\"],\n", + "'regional_atlas_age_0':[\"min\",\"max\"],\n", + "'regional_atlas_age_1':[\"min\",\"max\"],\n", + "'regional_atlas_age_2':[\"min\",\"max\"],\n", + "'regional_atlas_age_3':[\"min\",\"max\"],\n", + "'regional_atlas_age_4':[\"min\",\"max\"],\n", + "'regional_atlas_per_service_sector':[\"min\",\"max\"],\n", + "'regional_atlas_per_trade':[\"min\",\"max\"],\n", + "'regional_atlas_employment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_unemployment_rate':[\"min\",\"max\"],\n", + "'regional_atlas_per_long_term_unemployment':[\"min\",\"max\"],\n", + "'regional_atlas_pop_density':[\"min\",\"max\"],\n", + "'regional_atlas_pop_development':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age':[\"min\",\"max\"],\n", + "'regional_atlas_investments_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gross_salary_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_disp_income_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_tot_income_p_taxpayer':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_employee':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_development':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_inhabitant':[\"min\",\"max\"],\n", + "'regional_atlas_gdp_p_workhours':[\"min\",\"max\"],\n", + "'regional_atlas_pop_avg_age_zensus':[\"min\",\"max\"],\n", + "'regional_atlas_regional_score':[\"min\",\"max\"]\n", + "})\n", + "\n", + "# Apply the function for each column\n", + "for col in min_max_df.columns:\n", + " min_feature = min_max_df[col]['min']\n", + " max_feature = min_max_df[col]['max']\n", + " print(f'{col}: [{min_feature}, {max_feature}]') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d78040", + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "\n", + "percentile_col = [\n", + "'regional_atlas_age_0',\n", + "'regional_atlas_age_1',\n", + " 'regional_atlas_age_2',\n", + "'regional_atlas_age_3',\n", + "'regional_atlas_age_4',\n", + "'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',\n", + "'regional_atlas_pop_development',\n", + "'regional_atlas_pop_avg_age',\n", + "'regional_atlas_per_service_sector',\n", + "'regional_atlas_per_trade',\n", + "'regional_atlas_employment_rate',\n", + "'regional_atlas_unemployment_rate',\n", + "'regional_atlas_per_long_term_unemployment',\n", + "'regional_atlas_investments_p_employee',\n", + "'regional_atlas_gross_salary_p_employee',\n", + "'regional_atlas_disp_income_p_inhabitant',\n", + "'regional_atlas_tot_income_p_taxpayer',\n", + "'regional_atlas_gdp_p_employee',\n", + 
"'regional_atlas_gdp_development',\n", + "'regional_atlas_gdp_p_inhabitant',\n", + "'regional_atlas_gdp_p_workhours',\n", + "'regional_atlas_pop_avg_age_zensus',\n", + "'regional_atlas_regional_score']\n", + "\n", + "for col in percentile_col:\n", + " no_nan = df[col][df[col].notnull()]\n", + " col_name = col+'_percentiles' \n", + " df[col_name] = no_nan.apply(lambda x: percentileofscore(no_nan, x))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307c0e39", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Adding the percentiles as columns for analysis and report\n", + "\n", + "for col in percentile_col:\n", + " feature = col+\"_percentiles\"\n", + " not_nan = df[feature].notnull()\n", + "\n", + " classes = df['MerchantSizeByDPV'].unique()\n", + "\n", + " for c in classes:\n", + " sns.kdeplot(df[not_nan][df[not_nan]['MerchantSizeByDPV']==c][feature], fill=False, label=c)\n", + " \n", + " # Add labels and title\n", + " plt.xlabel('Value')\n", + " plt.ylabel('Density')\n", + " plt.title('Distribution of '+col)\n", + " plt.legend()\n", + "\n", + " # Show the plot\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8073b0a8", + "metadata": {}, + "outputs": [], + "source": [ + "for c in classes:\n", + " tmp = df[not_nan][df[not_nan]['MerchantSizeByDPV']==c]\n", + " sns.kdeplot(x=tmp['google_places_user_ratings_total_percentiles'], y=tmp['google_places_rating_percentiles'], fill=False, label=c)\n", + "\n", + " plt.xlabel('ratings_total')\n", + " plt.ylabel('rating_avg')\n", + " plt.title('Distribution of '+c)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edb5923a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "arr_false = {}\n", + "\n", + "for column in df:\n", + " \n", + " if df[column].dtype == bool:\n", + " false_count = np.count_nonzero(df[column] == False)\n", + " arr_false[column] = false_count\n", + " \n", + "print(arr_false)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d74633", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Dangerous to come to a conclusion based on Gender.\n", + "\"\"\"\n", + "import gender_guesser.detector as gender\n", + "gd = gender.Detector()\n", + "df['Gender'] = df['First Name'].apply(str.capitalize).map(lambda x: gd.get_gender(x))\n", + "\n", + "group_feature = 'Gender' # MerchantSizeByDPV or Gender\n", + "total_counts = df[group_feature].value_counts().reset_index(name='total_count')\n", + "total_counts = total_counts.rename(columns={'index':group_feature})\n", + "grouped_counts = df.groupby(['Gender', 'MerchantSizeByDPV']).size().reset_index(name='count')\n", + "\n", + "result = pd.merge(grouped_counts, total_counts, on=group_feature)\n", + "result['proportion'] = result['count'] / result['total_count']\n", + "\n", + "category_order = ['XS','S','M','L','XL']\n", + "\n", + "\n", + "# Create separate DataFrames for each gender\n", + "# For better depiction .drop(index='XS') and take away XS from category_order\n", + "# andy: androgynous\n", + "andy_data = result[result['Gender'] == 'andy'].set_index('MerchantSizeByDPV')['proportion']\n", + "unknown_data = result[result['Gender'] == 'unknown'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_female_data = result[result['Gender'] == 'mostly_female'].set_index('MerchantSizeByDPV')['proportion']\n", + "mostly_male_data = 
result[result['Gender'] == 'mostly_male'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "male_data = result[result['Gender'] == 'male'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "female_data = result[result['Gender'] == 'female'].set_index('MerchantSizeByDPV')['proportion']\n",
+ "\n",
+ "# Plotting\n",
+ "plt.plot(category_order, andy_data, label='Andy')\n",
+ "plt.plot(category_order, unknown_data, label='Unknown')\n",
+ "plt.plot(category_order, mostly_female_data, label='Mostly Female')\n",
+ "plt.plot(category_order, mostly_male_data, label='Mostly Male')\n",
+ "plt.plot(category_order, male_data, label='Male')\n",
+ "plt.plot(category_order, female_data, label='Female')\n",
+ "\n",
+ "# Set labels and title\n",
+ "plt.xlabel('MerchantSizeByDPV')\n",
+ "plt.ylabel('Proportion')\n",
+ "plt.title('Proportion of MerchantSizeByDPV for Each Gender')\n",
+ "\n",
+ "# Display the plot\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd4a15f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Share of each MerchantSizeByDPV class per MCC level\n",
+ "mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()\n",
+ "grouped = mcc_group.unstack()\n",
+ "mcc_sum = mcc_group.groupby(level=0).sum()\n",
+ "\n",
+ "mcc_df = pd.concat([grouped, mcc_sum], axis=1)\n",
+ "tmp = mcc_df[0]\n",
+ "mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)\n",
+ "mcc_df['Sum'] = tmp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e787a6cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('Dropped MCC levels with fewer than 50 examples:')\n",
+ "print(mcc_df[mcc_df['Sum']<50].index.values)\n",
+ "mcc_df = mcc_df[mcc_df['Sum']>=50]\n",
+ "\n",
+ "# Step through the categories (previously ordered by ascending XS share) and plot 5 of them at every 10th position, to compare the categories\n",
+ "# The first categories are the most attractive ones\n",
+ "for i in range(mcc_df.shape[0]):\n",
+ "    if i % 10 == 0:\n",
+ "        mcc_df.drop([0,'Sum','XS'],axis=1)[i:(i+5)].transpose().plot.line()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5fc010da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "\n",
+ "# Count how often each Google Places detailed type occurs\n",
+ "data = df[df['google_places_detailed_type'].notnull()]\n",
+ "test = pd.Series([x for item in data.google_places_detailed_type for x in ast.literal_eval(item)]).value_counts()\n",
+ "test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd76690a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "\n",
+ "docs = df['google_places_detailed_type'][df['google_places_detailed_type'].notna()]\n",
+ "docs = docs.apply(lambda row: ast.literal_eval(row))\n",
+ "\n",
+ "vectorizer = CountVectorizer(analyzer=lambda x: x) # , min_df = 50\n",
+ "categories = vectorizer.fit_transform(docs).toarray()\n",
+ "vectorizer.get_feature_names_out()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd08fc9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2_contingency\n",
+ "\n",
+ "# Create a contingency table for each feature\n",
+ "contingency_tables = {}\n",
+ "\n",
+ "cat_col = df[['google_places_candidate_count_mail','google_places_candidate_count_phone','google_places_rating','google_places_price_level','google_places_confidence','MCC Level',
'Gender','number_area','first_name_in_account','last_name_in_account','google_places_business_status','number_country','number_valid','number_possible','google_places_place_id_matches_phone_search']].fillna('no_data')\n", + "cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()\n", + "\n", + "#for feature_column in df.columns[df.columns != 'label']:\n", + "for feature_column in cat_col.columns:\n", + " contingency_table = pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])\n", + " contingency_tables[feature_column] = contingency_table\n", + "\n", + "# Perform chi-squared test for each feature\n", + "results = {}\n", + "for feature, table in contingency_tables.items():\n", + " chi2_stat, p_value, dof, expected = chi2_contingency(table)\n", + " results[feature] = {'Chi-squared stat': chi2_stat, 'P-value': p_value, 'Degrees of Freedom': dof}\n", + "\n", + "# Display the results\n", + "for feature, result in results.items():\n", + " print(f\"\\nChi-squared test for {feature}:\")\n", + " print(f\"Chi-squared statistic: {result['Chi-squared stat']:.2f}\")\n", + " print(f\"P-value: {result['P-value']:.4f}\")\n", + " print(f\"Degrees of freedom: {result['Degrees of Freedom']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c46eb0f4", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def b_bayesian(df,bin_column,b_value=True):\n", + " \n", + " prior_A = df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0] \n", + " prior_B = df[df[bin_column]==b_value].shape[0] / df[bin_column].shape[0]\n", + " evidence_A = df[df[bin_column]==b_value].groupby('MerchantSizeByDPV').count()[bin_column] / df.groupby('MerchantSizeByDPV').count()[bin_column]\n", + " posterior_B = (prior_A*evidence_A) / prior_B\n", + " \n", + " return posterior_B.reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "\n", + "series_not_possible =b_bayesian(df,'number_possible',False)-per_size\n", + "series_invalid = b_bayesian(df,'number_valid',False)-per_size\n", + "series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size\n", + "series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size\n", + "\n", + "series_possible =b_bayesian(df,'number_possible',True)-per_size\n", + "series_valid = b_bayesian(df,'number_valid',True)-per_size\n", + "series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size\n", + "series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size\n", + "\n", + "# Ensure the 'Category' column is ordered\n", + "categories_order = ['XS', 'S', 'M', 'L','XL']\n", + "\n", + "# Plot the lines\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "\n", + "plt.plot(categories_order, series_not_possible, label='Number not possible', marker='o')\n", + "plt.plot(categories_order, series_invalid, label='Number invalid', marker='d')\n", + "plt.plot(categories_order, series_first_name, label='First name in account')\n", + "plt.plot(categories_order, series_last_name, label='Last name in account')\n", + "plt.plot(categories_order, series_possible, label='Number possible')\n", + "plt.plot(categories_order, series_valid, label='Number valid')\n", + "plt.plot(categories_order, series_no_first_name, label='First name not in account')\n", + "plt.plot(categories_order, series_no_last_name, label='Last name not in account')\n", + 
"#plt.plot(categories_order, per_size, label='Percentage of merchant size', marker='s',c='black')\n", + "\n", + "\n", + "plt.title('Bayesian')\n", + "plt.xlabel('Categories')\n", + "plt.ylabel('Percentages')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "\n", + "# Show the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a18821e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import numpy as np\n", + "\n", + "\n", + "class_colors = sns.color_palette(\"colorblind\")[:5]\n", + "regional_df = df.filter(like='regional', axis=1)\n", + "regional_df['MerchantSizeByDPV'] = df['MerchantSizeByDPV']\n", + "\n", + "# Plot boxplots for each column with different MerchantSizeByDPV boxplots next to each other\n", + "for i, column in enumerate(regional_df.columns[:-1]): # Exclude the last column ('MerchantSizeByDPV') \n", + " \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.boxplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18949200", + "metadata": {}, + "outputs": [], + "source": [ + "# Same like the boxplots but now with violinplots\n", + "for column in regional_df.filter(like='regional', axis=1).columns: \n", + " if column == 'regional_atlas_pop_development': \n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_pop_development']<2000],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + "\n", + " elif column == 'regional_atlas_gdp_development':\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df[regional_df['regional_atlas_gdp_development']<60],palette=class_colors, order=['XS', 'S','M','L','XL']) \n", + " \n", + " else:\n", + " axes = sns.violinplot(x=\"MerchantSizeByDPV\", hue =\"MerchantSizeByDPV\", y=column, data=regional_df,palette=class_colors, order=['XS', 'S','M','L','XL'])\n", + " \n", + " axes.set_title(f'Boxplot of {column}')\n", + " axes.set_xlabel('MerchantSizeByDPV')\n", + " axes.set_ylabel(column) \n", + " \n", + " median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()\n", + " axes.axhline(y=median_value, color='red', linestyle='--', label=f'Median (XL)')\n", + " axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')\n", + " \n", + " plt.show()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "4c7b2074", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "# Normalize the features before comparing / dividing them\n", + "x = regional_df.drop('MerchantSizeByDPV', axis = 1).values #returns a numpy array\n", + "min_max_scaler = preprocessing.MinMaxScaler()\n", + "x_scaled = min_max_scaler.fit_transform(x)\n", + "norm_regio = pd.DataFrame(x_scaled, columns=regional_df.drop('MerchantSizeByDPV', axis = 1).columns)\n", + "\n", + "# Compute the stats of the normalized regional data, to find a heuristic to evaluate the features' discriminative magnitudes\n", + "df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()\n", + "df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()\n", + "\n", + "((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "684af78c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Compute a correlation matrix for all float values of our dataframe\n", + "float_cols = df.columns[df.dtypes==float]\n", + "corr_matrix = df[float_cols].corr()\n", + "\n", + "# The diagonal values (correlation of each feature with itself) should be considered 0, to filter them out\n", + "np.fill_diagonal(corr_matrix.values, 0)\n", + "\n", + "# Create a new DataFrame that transforms all values to 0 that are below a value of defined by variable \"correlation_threshold\" \n", + "correlation_threshold = 0.89\n", + "filtered_correlation_df = corr_matrix.applymap(lambda x: x if abs(x) >= correlation_threshold else 0)\n", + "\n", + "# Identify the rows and columns that not only consists of 0 values (after filtering)\n", + "non_zero_rows = filtered_correlation_df.index[~(filtered_correlation_df == 0).all(axis=1)]\n", + "non_zero_columns = filtered_correlation_df.columns[~(filtered_correlation_df == 0).all(axis=0)]\n", + "new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]\n", + "\n", + "# Print the new correlation matrix and the corresponding plot\n", + "print(f\"New Correlation Matrix (values greater than {correlation_threshold}):\")\n", + "\n", + "plt.figure(figsize=(12, 10))\n", + "heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Correlation Matrix Heatmap')\n", + "plt.savefig('correlation_matrix.svg', format='svg')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba9ee1b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "\n", + "reg_df = df.filter(like='regional', axis=1).dropna()\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "scaled_data = scaler.fit_transform(reg_df.drop('MerchantSizeByDPV', axis=1))\n", + "\n", + "# Apply PCA\n", + "pca = PCA()\n", + "principal_components = pca.fit_transform(scaled_data)\n", + "\n", + "# Retrieve explained variance ratios\n", + "explained_variance_ratio = pca.explained_variance_ratio_\n", + "\n", + "components = pd.DataFrame(pca.components_, columns=filter_df.columns)\n", + "\n", + "# Print explained variance ratios\n", + "for i, ratio in enumerate(explained_variance_ratio, 1):\n", + " print(f\"Principal Component {i}: Explained 
Variance Ratio = {ratio:.4f}\")\n", + "\n", + "# Plot the cumulative explained variance\n", + "cumulative_variance = explained_variance_ratio.cumsum()\n", + "\n", + "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')\n", + "plt.title('Cumulative Explained Variance')\n", + "plt.xlabel('Number of Principal Components')\n", + "plt.ylabel('Cumulative Variance Explained')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d731eff3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Count only those words, existing in a minum amount of 100 email adresses\n", + "count_vectorizer = CountVectorizer(min_df=50)\n", + "\n", + "# Fit and transform the text data\n", + "count_matrix = count_vectorizer.fit_transform(df['Email'])\n", + "\n", + "# Convert the matrix to a DataFrame for better readability\n", + "count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4314186c", + "metadata": {}, + "outputs": [], + "source": [ + "common_words = pd.DataFrame(count_df.sum()).transpose()\n", + "\n", + "for word in common_words:\n", + " print(word)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdf6ba75", + "metadata": {}, + "outputs": [], + "source": [ + "# Names\n", + "names = []\n", + "\n", + "# Weird terms\n", + "weird_terms = []\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa498485", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_common_words = []\n", + "per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])\n", + "\n", + "for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]\n", + " \n", + " indices= count_df[count_df[word]>0].index \n", + " per_word = (df.loc[indices].groupby('MerchantSizeByDPV').count()['Email']/len(df.loc[indices])).reindex(index=['XS', 'S', 'M', 'L','XL']) \n", + " \n", + " grouped_common_words.append((per_word-per_size).rename(word)) \n", + " \n", + "common_df = pd.concat(grouped_common_words, axis=1)\n", + "common_df = common_df.transpose()\n", + "\n", + "common_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aeb199a", + "metadata": {}, + "outputs": [], + "source": [ + "# The min/mean/max probability decrease (-) or increase (+) by a value of x with the existence of a certain common word\n", + "\n", + "print(f'{np.min(common_df[\"XS\"])}, {np.mean(common_df[\"XS\"])},{np.max(common_df[\"XS\"])}')\n", + "print(f'{np.min(common_df[\"S\"])}, {np.mean(common_df[\"S\"])},{np.max(common_df[\"S\"])}')\n", + "print(f'{np.min(common_df[\"M\"])}, {np.mean(common_df[\"M\"])},{np.max(common_df[\"M\"])}')\n", + "print(f'{np.min(common_df[\"L\"])}, {np.mean(common_df[\"L\"])},{np.max(common_df[\"L\"])}')\n", + "print(f'{np.min(common_df[\"XL\"])}, {np.mean(common_df[\"XL\"])},{np.max(common_df[\"XL\"])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d47b36", + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "from deutschland.bundesanzeiger import Bundesanzeiger\n", + "import pickle\n", + "import time\n", + "\n", + "def access_ba(company,b_bundesanzeiger,):\n", + 
"\n", + " b_bundesanzeiger.append(True)\n", + " try:\n", + " ba = Bundesanzeiger()\n", + " data = ba.get_reports(company)\n", + " except:\n", + " b_bundesanzeiger[-1] = False\n", + " return\n", + "\n", + " if __name__ == '__main__':\n", + "\n", + " \"\"\"\n", + " with open('list_file.pkl', 'rb') as file:\n", + " loaded_list = pickle.load(file)\n", + " print(loaded_list)\n", + " \"\"\"\n", + "\n", + " pd.set_option('display.max_columns', None)\n", + "\n", + " historic = pd.read_csv('historic.csv',sep = ',')#_enriched\n", + "\n", + " df = historic.groupby('MerchantSizeByDPV').apply(lambda x: x.sample(100))\n", + "\n", + "\n", + " with multiprocessing.Manager() as manager:\n", + "\n", + " b_bundesanzeiger = manager.list()\n", + " content_array = []\n", + " durations = []\n", + "\n", + " for i, company in enumerate(df[\"Company Name\"]):\n", + "\n", + " print(i)\n", + "\n", + " start = time.time()\n", + "\n", + " # Start access_ba as a process\n", + " p = multiprocessing.Process(target=access_ba, name=\"access_ba\", args=(company,b_bundesanzeiger))\n", + "\n", + " p.start()\n", + "\n", + " # Wait 8 seconds for access_ba\t\n", + " p.join(8)\n", + "\n", + " # If thread is active\n", + " if p.is_alive():\n", + " print (\"Terminate access_ba\")\n", + "\n", + " # Terminate access_ba\n", + " p.terminate()\n", + " b_bundesanzeiger[-1] = 'killed'\n", + "\n", + " # Cleanup\n", + " p.join()\n", + " i+=1\n", + "\n", + " print(b_bundesanzeiger[-1])\n", + " end = time.time()\n", + " print(end-start)\n", + " print()\n", + " durations.append(end-start)\n", + "\n", + " \"\"\"if i==100:\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + " print(np.mean(np.array(list(b_bundesanzeiger))))\n", + " break\n", + " \"\"\"\n", + "\n", + " with open('list_file.pkl', 'wb') as file:\n", + " pickle.dump(list(b_bundesanzeiger), file)\n", + "\n", + " with open('time.pkl', 'wb') as file:\n", + " pickle.dump(durations, file)\n", + "\n", + " df.to_pickle(\"./dataframe_sample.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a6f460", + "metadata": {}, + "outputs": [], + "source": [ + "with open('dataframe_sample.pkl', 'rb') as f:\n", + " df = pickle.load(f)\n", + "\n", + "df = df.reset_index(drop=True)\n", + "\n", + "with open('list_file.pkl', 'rb') as f:\n", + " mynewlist = pickle.load(f)\n", + "\n", + "with open('time.pkl', 'rb') as f:\n", + " time = pickle.load(f)\n", + "\n", + "df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': time})\n", + "\n", + "df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']\n", + "df['time'] = df_stats['time']\n", + "\n", + "\n", + "counts =df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()\n", + "\n", + "# Compute probability for each category\n", + "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n", + "\n", + "print(probabilities)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30b18dea", + "metadata": {}, + "outputs": [], + "source": [ + "df['b_google_places'] = df[\"google_places_place_id\"].notnull()\n", + "counts =df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()\n", + "\n", + "desired_value_counts = counts.unstack().fillna(0)\n", + "\n", + "# Compute total counts per category\n", + "total_counts_per_category 
= counts.groupby('MerchantSizeByDPV').sum()\n",
+ "\n",
+ "# Compute probability for each category\n",
+ "probabilities = desired_value_counts.apply(lambda x: x / total_counts_per_category)\n",
+ "\n",
+ "print(probabilities)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f167c71",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
+ "from imblearn.over_sampling import SMOTE\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Use the Regionalatlas features as predictors\n",
+ "regional_columns = [col for col in df if col.startswith('regional_atlas')]\n",
+ "\n",
+ "# Separate features (X) and target variable (y)\n",
+ "table = df[regional_columns+['MerchantSizeByDPV']].dropna()\n",
+ "y = table['MerchantSizeByDPV']\n",
+ "X = table[regional_columns]\n",
+ "\n",
+ "# Oversample the minority classes with SMOTE\n",
+ "X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)\n",
+ "\n",
+ "# Split the data into training and testing sets\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Create a logistic regression model\n",
+ "model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')\n",
+ "\n",
+ "# Fit the model on the training data\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "# Make predictions on the testing data\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "class_report = classification_report(y_test, y_pred)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e6afdec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+ "\n",
+ "# Assuming X and y are your feature matrix and target variable\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Create and fit the Random Forest model\n",
+ "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+ "rf_model.fit(X_train, y_train)\n",
+ "\n",
+ "# Make predictions on the test set\n",
+ "y_pred = rf_model.predict(X_test)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "class_report = classification_report(y_test, y_pred)\n",
+ "\n",
+ "# Display evaluation metrics\n",
+ "print(f'Accuracy: {accuracy:.2f}')\n",
+ "print(f'Confusion Matrix:\\n{conf_matrix}')\n",
+ "print(f'Classification Report:\\n{class_report}')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abea2245",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn.ensemble import IsolationForest\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
+ "\n",
+ "# Assuming X and y are your feature matrix and target variable\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Define the set of rare classes\n",
+ "rare_classes = ['XL'] # Replace with the actual class labels you 
consider rare\n", + " # MAYBE NOT ONLY XL, but also L and M\n", + "# Create a binary target variable indicating whether each instance is rare or not\n", + "y_train_rare = y_train.isin(rare_classes).astype(int)\n", + "y_test_rare = y_test.isin(rare_classes).astype(int)\n", + "\n", + "# Create and fit the Isolation Forest model\n", + "if_model = IsolationForest(contamination='auto')\n", + "if_model.fit(X_train)\n", + "\n", + "# Predict anomalies on the test set\n", + "y_pred_rare = if_model.predict(X_test)\n", + "\n", + "# Convert the predicted labels to binary (1 for anomalies, -1 for normal instances)\n", + "y_pred_rare_binary = (y_pred_rare == -1).astype(int)\n", + "\n", + "# Evaluate the model\n", + "accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)\n", + "conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)\n", + "class_report = classification_report(y_test_rare, y_pred_rare_binary)\n", + "\n", + "# Display evaluation metrics\n", + "print(f'Accuracy: {accuracy:.2f}')\n", + "print(f'Confusion Matrix:\\n{conf_matrix}')\n", + "print(f'Classification Report:\\n{class_report}')\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,\n", + " xticklabels=[0,1], yticklabels=[0,1])\n", + "plt.xlabel('Predicted Label')\n", + "plt.ylabel('True Label')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}