diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..18bd47b Binary files /dev/null and b/.DS_Store differ diff --git a/.ipynb_checkpoints/Cohort_Analysis_Ironhack_Project-checkpoint.ipynb b/.ipynb_checkpoints/Cohort_Analysis_Ironhack_Project-checkpoint.ipynb new file mode 100644 index 0000000..c9f695c --- /dev/null +++ b/.ipynb_checkpoints/Cohort_Analysis_Ironhack_Project-checkpoint.ipynb @@ -0,0 +1,2618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "45a33a0d", + "metadata": {}, + "source": [ + "# Cohort Analysis for Ironhack Payments" + ] + }, + { + "cell_type": "markdown", + "id": "c510412d", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "### Business Context:\n", + "IronHack Payments offers innovative cash advance solutions since 2020. This project aims to perform a cohort analysis to understand user behavior and the performance of their financial services.\n", + "\n", + "### Deliverables:\n", + "- Python Code (Jupyter Notebook)\n", + "- Exploratory Data Analysis Report\n", + "- Data Quality Analysis Report\n", + "- Short Presentation" + ] + }, + { + "cell_type": "code", + "execution_count": 445, + "id": "a5399ab2-aef1-4fd5-8fcc-840a44da4edb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "3 10 99.0 rejected 2019-12-10 19:16:10.880172+00 \n", + "4 1594 100.0 rejected 2020-05-06 09:59:38.877376+00 \n", + "\n", + " updated_at user_id moderated_at \\\n", + "0 2019-12-11 16:47:42.40783+00 804.0 2019-12-11 16:47:42.405646+00 \n", + "1 2019-12-11 14:24:22.900054+00 231.0 2019-12-11 14:24:22.897988+00 \n", + "2 2019-12-11 09:46:59.779773+00 191.0 2019-12-11 09:46:59.777728+00 \n", + "3 2019-12-18 14:26:18.136163+00 761.0 2019-12-18 14:26:18.128407+00 \n", + "4 
2020-05-07 09:21:55.34008+00 7686.0 2020-05-07 09:21:55.320193+00 \n", + "\n", + " deleted_account_id reimbursement_date \\\n", + "0 NaN 2020-01-09 19:05:21.596363+00 \n", + "1 NaN 2020-01-09 19:50:12.34778+00 \n", + "2 NaN 2020-01-09 19:13:35.825041+00 \n", + "3 NaN 2020-01-09 19:16:10.879606+00 \n", + "4 NaN 2020-06-05 22:00:00+00 \n", + "\n", + " cash_request_received_date money_back_date transfer_type send_at \\\n", + "0 NaN NaN regular NaN \n", + "1 NaN NaN regular NaN \n", + "2 NaN NaN regular NaN \n", + "3 NaN NaN regular NaN \n", + "4 NaN NaN regular NaN \n", + "\n", + " recovery_status reco_creation reco_last_update \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + " id cash_request_id type status category \\\n", + "0 6537 14941.0 instant_payment rejected NaN \n", + "1 6961 11714.0 incident accepted rejected_direct_debit \n", + "2 16296 23371.0 instant_payment accepted NaN \n", + "3 20775 26772.0 instant_payment accepted NaN \n", + "4 11242 19350.0 instant_payment accepted NaN \n", + "\n", + " total_amount reason \\\n", + "0 5.0 Instant Payment Cash Request 14941 \n", + "1 5.0 rejected direct debit \n", + "2 5.0 Instant Payment Cash Request 23371 \n", + "3 5.0 Instant Payment Cash Request 26772 \n", + "4 5.0 Instant Payment Cash Request 19350 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-07 10:47:27.42315+00 2020-10-13 14:25:09.396112+00 \n", + "1 2020-09-09 20:51:17.998653+00 2020-10-13 14:25:15.537063+00 \n", + "2 2020-10-23 10:10:58.352972+00 2020-10-23 10:10:58.352994+00 \n", + "3 2020-10-31 15:46:53.643958+00 2020-10-31 15:46:53.643982+00 \n", + "4 2020-10-06 08:20:17.170432+00 2020-10-13 14:25:03.267983+00 \n", + "\n", + " paid_at from_date to_date charge_moment \n", + "0 2020-12-17 14:50:07.47011+00 NaN NaN after \n", + "1 2020-12-08 17:13:10.45908+00 NaN NaN after \n", + "2 2020-11-04 19:34:37.43291+00 NaN NaN after \n", + "3 2020-11-19 05:09:22.500223+00 NaN NaN after 
\n", + "4 2020-11-02 14:45:20.355598+00 NaN NaN after \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "data_cash = pd.read_csv('./datasets/cash_data_analyst.csv')\n", + "data_fees = pd.read_csv('./datasets/fees_data_analyst.csv')\n", + "lexicon_data = pd.read_excel('./datasets/Lexique_Data_Analyst.xlsx')\n", + "\n", + "dff = data_fees.copy()\n", + "\n", + "dfc = data_cash.copy()\n", + "\n", + "# Preview the raw cash-request frame; it is loaded above as `data_cash`.\n", + "print(data_cash.head(5))\n", + "\n", + "print(data_fees.head(5))\n" + ] + }, + { + "cell_type": "markdown", + "id": "84558474-1341-4784-9dc8-ccb0652683ad", + "metadata": {}, + "source": [ + "## 2. Data Loading\n", + "- Extracting and briefly describing each dataset and its columns." + ] + }, + { + "cell_type": "code", + "execution_count": 447, + "id": "73cbbab6-a168-493a-a5b7-bdc53f795f2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21061 entries, 0 to 21060\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21061 non-null int64 \n", + " 1 cash_request_id 21057 non-null float64\n", + " 2 type 21061 non-null object \n", + " 3 status 21061 non-null object \n", + " 4 category 2196 non-null object \n", + " 5 total_amount 21061 non-null float64\n", + " 6 reason 21061 non-null object \n", + " 7 created_at 21061 non-null object \n", + " 8 updated_at 21061 non-null object \n", + " 9 paid_at 15531 non-null object \n", + " 10 from_date 7766 non-null object \n", + " 11 to_date 7766 non-null object \n", + " 12 charge_moment 21061 non-null object \n", + "dtypes: float64(2), int64(1), object(10)\n", + "memory usage: 2.1+ MB\n" + ] + } + ], + "source": [ + "dff.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 448, + "id": "8700a78d-7163-4c56-a401-f2cf1b296e64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 23970 entries, 
0 to 23969\n", + "Data columns (total 16 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 23970 non-null int64 \n", + " 1 amount 23970 non-null float64\n", + " 2 status 23970 non-null object \n", + " 3 created_at 23970 non-null object \n", + " 4 updated_at 23970 non-null object \n", + " 5 user_id 21867 non-null float64\n", + " 6 moderated_at 16035 non-null object \n", + " 7 deleted_account_id 2104 non-null float64\n", + " 8 reimbursement_date 23970 non-null object \n", + " 9 cash_request_received_date 16289 non-null object \n", + " 10 money_back_date 16543 non-null object \n", + " 11 transfer_type 23970 non-null object \n", + " 12 send_at 16641 non-null object \n", + " 13 recovery_status 3330 non-null object \n", + " 14 reco_creation 3330 non-null object \n", + " 15 reco_last_update 3330 non-null object \n", + "dtypes: float64(3), int64(1), object(12)\n", + "memory usage: 2.9+ MB\n" + ] + } + ], + "source": [ + "dfc.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 449, + "id": "5913ab9b-f9eb-45ec-af8c-799902ef5c53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtotal_amount
count21061.00000021057.00000021061.000000
mean10645.35511116318.4491625.000237
std6099.3152566656.1499490.034453
min1.0000001456.0000005.000000
25%5385.00000011745.0000005.000000
50%10652.00000017160.0000005.000000
75%15925.00000021796.0000005.000000
max21193.00000027010.00000010.000000
\n", + "
" + ], + "text/plain": [ + " id cash_request_id total_amount\n", + "count 21061.000000 21057.000000 21061.000000\n", + "mean 10645.355111 16318.449162 5.000237\n", + "std 6099.315256 6656.149949 0.034453\n", + "min 1.000000 1456.000000 5.000000\n", + "25% 5385.000000 11745.000000 5.000000\n", + "50% 10652.000000 17160.000000 5.000000\n", + "75% 15925.000000 21796.000000 5.000000\n", + "max 21193.000000 27010.000000 10.000000" + ] + }, + "execution_count": 449, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 450, + "id": "e139d5b1-5d71-4856-b056-955f5ccd7937", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountuser_iddeleted_account_id
count23970.00000023970.00000021867.0000002104.000000
mean13910.96612482.72081832581.2507899658.755228
std7788.11721426.52806527618.5657737972.743249
min3.0000001.00000034.00000091.000000
25%7427.25000050.00000010804.0000003767.000000
50%14270.500000100.00000023773.0000006121.500000
75%20607.750000100.00000046965.00000016345.000000
max27010.000000200.000000103719.00000030445.000000
\n", + "
" + ], + "text/plain": [ + " id amount user_id deleted_account_id\n", + "count 23970.000000 23970.000000 21867.000000 2104.000000\n", + "mean 13910.966124 82.720818 32581.250789 9658.755228\n", + "std 7788.117214 26.528065 27618.565773 7972.743249\n", + "min 3.000000 1.000000 34.000000 91.000000\n", + "25% 7427.250000 50.000000 10804.000000 3767.000000\n", + "50% 14270.500000 100.000000 23773.000000 6121.500000\n", + "75% 20607.750000 100.000000 46965.000000 16345.000000\n", + "max 27010.000000 200.000000 103719.000000 30445.000000" + ] + }, + "execution_count": 450, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 451, + "id": "1a8834e1-4a88-4b18-aabe-5860d15145b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21061, 13)" + ] + }, + "execution_count": 451, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 452, + "id": "732ced3d-e151-48a8-924c-e22a746a8037", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(23970, 16)" + ] + }, + "execution_count": 452, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 453, + "id": "f9b1096e-91e1-44a4-9ff3-b82c34c46a16", + "metadata": {}, + "outputs": [], + "source": [ + "n_rows = dff.shape[0]\n", + "n_columns = dff.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 454, + "id": "dd020183-83b8-4d12-ba49-19ee4b4fb38b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypestatuscategorytotal_amountreasoncreated_atupdated_atpaid_atfrom_dateto_datecharge_moment
0653714941.0instant_paymentrejectedNaN5.0Instant Payment Cash Request 149412020-09-07 10:47:27.42315+002020-10-13 14:25:09.396112+002020-12-17 14:50:07.47011+00NaNNaNafter
1696111714.0incidentacceptedrejected_direct_debit5.0rejected direct debit2020-09-09 20:51:17.998653+002020-10-13 14:25:15.537063+002020-12-08 17:13:10.45908+00NaNNaNafter
21629623371.0instant_paymentacceptedNaN5.0Instant Payment Cash Request 233712020-10-23 10:10:58.352972+002020-10-23 10:10:58.352994+002020-11-04 19:34:37.43291+00NaNNaNafter
\n", + "
" + ], + "text/plain": [ + " id cash_request_id type status category \\\n", + "0 6537 14941.0 instant_payment rejected NaN \n", + "1 6961 11714.0 incident accepted rejected_direct_debit \n", + "2 16296 23371.0 instant_payment accepted NaN \n", + "\n", + " total_amount reason \\\n", + "0 5.0 Instant Payment Cash Request 14941 \n", + "1 5.0 rejected direct debit \n", + "2 5.0 Instant Payment Cash Request 23371 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-07 10:47:27.42315+00 2020-10-13 14:25:09.396112+00 \n", + "1 2020-09-09 20:51:17.998653+00 2020-10-13 14:25:15.537063+00 \n", + "2 2020-10-23 10:10:58.352972+00 2020-10-23 10:10:58.352994+00 \n", + "\n", + " paid_at from_date to_date charge_moment \n", + "0 2020-12-17 14:50:07.47011+00 NaN NaN after \n", + "1 2020-12-08 17:13:10.45908+00 NaN NaN after \n", + "2 2020-11-04 19:34:37.43291+00 NaN NaN after " + ] + }, + "execution_count": 454, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 455, + "id": "61d26458-ef95-4164-86b0-74f138f3832c", + "metadata": {}, + "outputs": [], + "source": [ + "n_rows = dfc.shape[0]\n", + "n_columns = dfc.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 456, + "id": "5d39411b-ceb9-4506-a060-8127e9c4914b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountstatuscreated_atupdated_atuser_idmoderated_atdeleted_account_idreimbursement_datecash_request_received_datemoney_back_datetransfer_typesend_atrecovery_statusreco_creationreco_last_update
05100.0rejected2019-12-10 19:05:21.596873+002019-12-11 16:47:42.40783+00804.02019-12-11 16:47:42.405646+00NaN2020-01-09 19:05:21.596363+00NaNNaNregularNaNNaNNaNNaN
170100.0rejected2019-12-10 19:50:12.34778+002019-12-11 14:24:22.900054+00231.02019-12-11 14:24:22.897988+00NaN2020-01-09 19:50:12.34778+00NaNNaNregularNaNNaNNaNNaN
27100.0rejected2019-12-10 19:13:35.82546+002019-12-11 09:46:59.779773+00191.02019-12-11 09:46:59.777728+00NaN2020-01-09 19:13:35.825041+00NaNNaNregularNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "\n", + " updated_at user_id moderated_at \\\n", + "0 2019-12-11 16:47:42.40783+00 804.0 2019-12-11 16:47:42.405646+00 \n", + "1 2019-12-11 14:24:22.900054+00 231.0 2019-12-11 14:24:22.897988+00 \n", + "2 2019-12-11 09:46:59.779773+00 191.0 2019-12-11 09:46:59.777728+00 \n", + "\n", + " deleted_account_id reimbursement_date \\\n", + "0 NaN 2020-01-09 19:05:21.596363+00 \n", + "1 NaN 2020-01-09 19:50:12.34778+00 \n", + "2 NaN 2020-01-09 19:13:35.825041+00 \n", + "\n", + " cash_request_received_date money_back_date transfer_type send_at \\\n", + "0 NaN NaN regular NaN \n", + "1 NaN NaN regular NaN \n", + "2 NaN NaN regular NaN \n", + "\n", + " recovery_status reco_creation reco_last_update \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN " + ] + }, + "execution_count": 456, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "56307a51-2ca9-4902-a465-64f6410b9552", + "metadata": {}, + "source": [ + "**Similarities: id, status, created at, updated at**" + ] + }, + { + "cell_type": "code", + "execution_count": 458, + "id": "2ef52422-69d6-4cfe-9a41-a1722f830625", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'cash_request_id', 'type', 'status', 'category', 'total_amount',\n", + " 'reason', 'created_at', 'updated_at', 'paid_at', 'from_date', 'to_date',\n", + " 'charge_moment'],\n", + " dtype='object')" + ] + }, + "execution_count": 458, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 459, + "id": "26072eb5-3cb3-47b4-b7f9-61497f9c5c58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"Index(['id', 'amount', 'status', 'created_at', 'updated_at', 'user_id',\n", + " 'moderated_at', 'deleted_account_id', 'reimbursement_date',\n", + " 'cash_request_received_date', 'money_back_date', 'transfer_type',\n", + " 'send_at', 'recovery_status', 'reco_creation', 'reco_last_update'],\n", + " dtype='object')" + ] + }, + "execution_count": 459, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.columns" + ] + }, + { + "cell_type": "markdown", + "id": "325fa04e", + "metadata": {}, + "source": [ + "## 3. Data Quality Analysis\n", + "- Checking for missing values.\n", + "- Identifying duplicates and inconsistencies.\n", + "- Validate data types.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7c051690-3963-4ce7-a4bd-b33bf40b73e6", + "metadata": {}, + "source": [ + "**Checking for missing values and deleting columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 462, + "id": "fc7e80f9-fba4-4fe0-81cd-569cc21736c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "cash_request_id 4\n", + "type 0\n", + "status 0\n", + "category 18865\n", + "total_amount 0\n", + "reason 0\n", + "created_at 0\n", + "updated_at 0\n", + "paid_at 5530\n", + "from_date 13295\n", + "to_date 13295\n", + "charge_moment 0\n", + "dtype: int64" + ] + }, + "execution_count": 462, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 463, + "id": "4a8df55a-bd26-40e9-b40e-c2dafc03b778", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "cash_request_id 0\n", + "type 0\n", + "status 0\n", + "category 18861\n", + "total_amount 0\n", + "reason 0\n", + "created_at 0\n", + "updated_at 0\n", + "paid_at 5526\n", + "from_date 13291\n", + "to_date 13291\n", + "charge_moment 0\n", + "dtype: int64" + ] + }, + "execution_count": 463, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "dff = dff.dropna(subset=['cash_request_id'])\n", + "dff.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 464, + "id": "955a276f-fd35-4fde-b65f-53445f94aa8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "amount 0\n", + "status 0\n", + "created_at 0\n", + "updated_at 0\n", + "user_id 2103\n", + "moderated_at 7935\n", + "deleted_account_id 21866\n", + "reimbursement_date 0\n", + "cash_request_received_date 7681\n", + "money_back_date 7427\n", + "transfer_type 0\n", + "send_at 7329\n", + "recovery_status 20640\n", + "reco_creation 20640\n", + "reco_last_update 20640\n", + "dtype: int64" + ] + }, + "execution_count": 464, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "id": "63742b02-cdb6-4a38-8ddc-ff2bdde25aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "amount 0\n", + "status 0\n", + "created_at 0\n", + "updated_at 0\n", + "user_id 0\n", + "moderated_at 7758\n", + "deleted_account_id 21866\n", + "reimbursement_date 0\n", + "cash_request_received_date 6276\n", + "money_back_date 6026\n", + "transfer_type 0\n", + "send_at 6325\n", + "recovery_status 18727\n", + "reco_creation 18727\n", + "reco_last_update 18727\n", + "dtype: int64" + ] + }, + "execution_count": 465, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc = dfc.dropna(subset=['user_id'])\n", + "dfc.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 466, + "id": "1d9c4b82-160a-40b1-ac3c-aabeea8ca67b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21057, 13)" + ] + }, + "execution_count": 466, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 467, + "id": "b83b15c6-263a-49ea-bfc4-da8f915804de", + "metadata": {}, 
+ "outputs": [ + { + "data": { + "text/plain": [ + "(21867, 16)" + ] + }, + "execution_count": 467, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "id": "51b05b6b-2514-4fda-8f1b-2cbfe167ba12", + "metadata": {}, + "outputs": [], + "source": [ + "dff = data_fees.copy()\n", + "\n", + "dfc = data_cash.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "80c547f1-d7ca-46c5-81f3-2781d1d8c29d", + "metadata": {}, + "source": [ + "**Here I'm checking the distribution of the `category` column. Note that `value_counts()` ignores missing values: \"rejected_direct_debit\" is 72% of the 2,196 non-null categories, but `category` is empty for 18,865 of the 21,061 fees (about 90%). Those rows are therefore labelled with an explicit \"no_category\" placeholder instead of being imputed with the most frequent value.**" + ] + }, + { + "cell_type": "code", + "execution_count": 470, + "id": "c75b8136-68b2-4f56-a9d0-67bd9f8b6037", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "category\n", + "rejected_direct_debit 72.814208\n", + "month_delay_on_payment 27.185792\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "percentages = dff['category'].value_counts(normalize=True) * 100\n", + "\n", + "print(percentages)" + ] + }, + { + "cell_type": "code", + "execution_count": 471, + "id": "31ceb778-8a2d-4bd5-8820-3677a99ae324", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "# Keep missing categories distinguishable: imputing the mode would relabel ~90% of all fees.\n", + "dff['category'] = dff['category'].fillna('no_category')\n", + "\n", + "print(dff['category'].isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "0710c515-7c4f-40f7-9432-3ad0c3e0e750", + "metadata": {}, + "source": [ + "**Here I'm keeping the most important columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 473, + "id": "3fbda56c-b4de-44b2-b296-5036018dd7e0", + "metadata": {}, + "outputs": [], + "source": [ + "important_columns = [\n", + " 'id', \n", + " 'cash_request_id', \n", + " 'type', \n", + " 'category',\n", + " 'status', \n", + " 'total_amount', 
\n", + " 'created_at', \n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 474, + "id": "85064717-63be-4e3d-be78-4573a27200dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatustotal_amountcreated_at
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+00
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+00
........................
210561237220262.0instant_paymentrejected_direct_debitrejected5.02020-10-10 06:42:22.822743+00
210572076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+00
210581877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+00
210591654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+00
210601330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00
\n", + "

21061 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "21056 12372 20262.0 instant_payment rejected_direct_debit \n", + "21057 20768 26764.0 instant_payment rejected_direct_debit \n", + "21058 18779 25331.0 instant_payment rejected_direct_debit \n", + "21059 16542 23628.0 instant_payment rejected_direct_debit \n", + "21060 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status total_amount created_at \n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 \n", + "... ... ... ... \n", + "21056 rejected 5.0 2020-10-10 06:42:22.822743+00 \n", + "21057 rejected 5.0 2020-10-31 15:24:18.680694+00 \n", + "21058 rejected 5.0 2020-10-27 17:28:51.749177+00 \n", + "21059 rejected 5.0 2020-10-23 16:27:52.047457+00 \n", + "21060 accepted 5.0 2020-10-14 07:12:43.958192+00 \n", + "\n", + "[21061 rows x 7 columns]" + ] + }, + "execution_count": 474, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff_filtered = dff[important_columns]\n", + "dff_filtered" + ] + }, + { + "cell_type": "code", + "execution_count": 475, + "id": "f119c936-d8cc-446d-81d8-832251b6a06f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountstatuscreated_atuser_id
05100.0rejected2019-12-10 19:05:21.596873+00804.0
170100.0rejected2019-12-10 19:50:12.34778+00231.0
27100.0rejected2019-12-10 19:13:35.82546+00191.0
31099.0rejected2019-12-10 19:16:10.880172+00761.0
41594100.0rejected2020-05-06 09:59:38.877376+007686.0
..................
2396520616100.0money_back2020-10-12 13:54:11.686225+0013681.0
239662524350.0money_back2020-10-27 14:41:25.73491+00NaN
2396722357100.0money_back2020-10-20 07:58:04.006937+0082122.0
2396820256100.0money_back2020-10-10 05:40:55.700422+0064517.0
2396919886100.0direct_debit_sent2020-10-08 14:16:52.155661+0044867.0
\n", + "

23970 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "3 10 99.0 rejected 2019-12-10 19:16:10.880172+00 \n", + "4 1594 100.0 rejected 2020-05-06 09:59:38.877376+00 \n", + "... ... ... ... ... \n", + "23965 20616 100.0 money_back 2020-10-12 13:54:11.686225+00 \n", + "23966 25243 50.0 money_back 2020-10-27 14:41:25.73491+00 \n", + "23967 22357 100.0 money_back 2020-10-20 07:58:04.006937+00 \n", + "23968 20256 100.0 money_back 2020-10-10 05:40:55.700422+00 \n", + "23969 19886 100.0 direct_debit_sent 2020-10-08 14:16:52.155661+00 \n", + "\n", + " user_id \n", + "0 804.0 \n", + "1 231.0 \n", + "2 191.0 \n", + "3 761.0 \n", + "4 7686.0 \n", + "... ... \n", + "23965 13681.0 \n", + "23966 NaN \n", + "23967 82122.0 \n", + "23968 64517.0 \n", + "23969 44867.0 \n", + "\n", + "[23970 rows x 5 columns]" + ] + }, + "execution_count": 475, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "important_columns2 = [\n", + " 'id', 'amount', 'status', 'created_at', 'user_id']\n", + "dfc_filtered = dfc[important_columns2]\n", + "dfc_filtered" + ] + }, + { + "cell_type": "markdown", + "id": "555cd887-3ff0-4a68-8c1d-15df779c70cb", + "metadata": {}, + "source": [ + "## 4. 
Checking IDs\n" + ] + }, + { + "cell_type": "markdown", + "id": "d3a5acea-0087-4e7a-a5da-e4709867cabf", + "metadata": {}, + "source": [ + "**Checking for duplicates**" + ] + }, + { + "cell_type": "code", + "execution_count": 478, + "id": "885f7788-1418-4579-8f90-89fc1e623719", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duplicates in dff_filtered:\n", + "Duplicate 'cash_request_id': 8127\n", + "Duplicate 'id': 0\n", + "Duplicates in dfc_filtered:\n", + "Duplicate 'user_id': 13171\n", + "Duplicate 'id': 0\n" + ] + } + ], + "source": [ + "print(\"Duplicates in dff_filtered:\")\n", + "print(\"Duplicate 'cash_request_id':\", dff_filtered['cash_request_id'].duplicated().sum())\n", + "print(\"Duplicate 'id':\", dff_filtered['id'].duplicated().sum())\n", + "\n", + "print(\"Duplicates in dfc_filtered:\")\n", + "print(\"Duplicate 'user_id':\", dfc_filtered['user_id'].duplicated().sum())\n", + "print(\"Duplicate 'id':\", dfc_filtered['id'].duplicated().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 479, + "id": "07ea8eba-5085-4dc1-8137-6b5d5c3f2834", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns in dff_filtered: Index(['id', 'cash_request_id', 'type', 'category', 'status', 'total_amount',\n", + " 'created_at'],\n", + " dtype='object')\n", + "Columns in dfc_filtered: Index(['id', 'amount', 'status', 'created_at', 'user_id'], dtype='object')\n", + "Common columns between the datasets: {'created_at', 'id', 'status'}\n" + ] + } + ], + "source": [ + "print(\"Columns in dff_filtered:\", dff_filtered.columns)\n", + "print(\"Columns in dfc_filtered:\", dfc_filtered.columns)\n", + "\n", + "# common columns\n", + "common_columns = set(dff_filtered.columns).intersection(dfc_filtered.columns)\n", + "print(\"Common columns between the datasets:\", common_columns)" + ] + }, + { + "cell_type": "markdown", + "id": "cd4f81c9-d34a-4db6-a876-b1c3d4e3d410", 
+ "metadata": {}, + "source": [ + "**Merging IDs**" + ] + }, + { + "cell_type": "code", + "execution_count": 481, + "id": "600a28a6-8b3c-4d64-a7b6-6867002a63b5", + "metadata": {}, + "outputs": [], + "source": [ + "# Each fee row refers to its cash request through fees.cash_request_id, so join\n", + "# that column to the cash-request id. fees.id is the fee's own key; joining on\n", + "# 'id' pairs fees with unrelated requests (and therefore the wrong users).\n", + "# Renaming the right-hand key keeps the merged column names and order unchanged.\n", + "merged_df = pd.merge(\n", + "    dff_filtered,\n", + "    dfc_filtered.rename(columns={'id': 'cash_request_id'}),\n", + "    on='cash_request_id',\n", + "    how='inner',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 482, + "id": "6036f393-1532-4e44-bbd0-fb33b4a1d28a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatus_xtotal_amountcreated_at_xamountstatus_ycreated_at_yuser_id
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00100.0money_back2020-07-04 05:59:49.246265+0026978.0
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00100.0money_back2020-07-07 17:45:03.352406+0013539.0
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+0050.0money_back2020-09-15 19:05:39.267562+0013756.0
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00100.0money_back2020-10-13 08:15:07.74666+0061460.0
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+0050.0money_back2020-08-06 22:46:50.681779+0031188.0
....................................
184281237220262.0instant_paymentrejected_direct_debitrejected5.02020-10-10 06:42:22.822743+0050.0rejected2020-08-13 15:21:44.956497+00NaN
184292076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+0050.0money_back2020-10-13 07:46:38.698727+0033520.0
184301877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+0050.0rejected2020-10-01 18:51:01.003129+0055943.0
184311654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+0050.0money_back2020-09-17 07:33:06.068449+0019040.0
184321330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00100.0rejected2020-08-20 07:13:36.827124+0013661.0
\n", + "

18433 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "18428 12372 20262.0 instant_payment rejected_direct_debit \n", + "18429 20768 26764.0 instant_payment rejected_direct_debit \n", + "18430 18779 25331.0 instant_payment rejected_direct_debit \n", + "18431 16542 23628.0 instant_payment rejected_direct_debit \n", + "18432 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status_x total_amount created_at_x amount \\\n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 100.0 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 100.0 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 50.0 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 100.0 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 50.0 \n", + "... ... ... ... ... \n", + "18428 rejected 5.0 2020-10-10 06:42:22.822743+00 50.0 \n", + "18429 rejected 5.0 2020-10-31 15:24:18.680694+00 50.0 \n", + "18430 rejected 5.0 2020-10-27 17:28:51.749177+00 50.0 \n", + "18431 rejected 5.0 2020-10-23 16:27:52.047457+00 50.0 \n", + "18432 accepted 5.0 2020-10-14 07:12:43.958192+00 100.0 \n", + "\n", + " status_y created_at_y user_id \n", + "0 money_back 2020-07-04 05:59:49.246265+00 26978.0 \n", + "1 money_back 2020-07-07 17:45:03.352406+00 13539.0 \n", + "2 money_back 2020-09-15 19:05:39.267562+00 13756.0 \n", + "3 money_back 2020-10-13 08:15:07.74666+00 61460.0 \n", + "4 money_back 2020-08-06 22:46:50.681779+00 31188.0 \n", + "... ... ... ... 
\n", + "18428 rejected 2020-08-13 15:21:44.956497+00 NaN \n", + "18429 money_back 2020-10-13 07:46:38.698727+00 33520.0 \n", + "18430 rejected 2020-10-01 18:51:01.003129+00 55943.0 \n", + "18431 money_back 2020-09-17 07:33:06.068449+00 19040.0 \n", + "18432 rejected 2020-08-20 07:13:36.827124+00 13661.0 \n", + "\n", + "[18433 rows x 11 columns]" + ] + }, + "execution_count": 482, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df" + ] + }, + { + "cell_type": "markdown", + "id": "9b831c45-ea89-4297-9fde-f46502166110", + "metadata": {}, + "source": [ + "**Checking for duplicates in merged**" + ] + }, + { + "cell_type": "code", + "execution_count": 484, + "id": "c7ae94a5-8b93-4248-8ed3-625c16894ddd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duplicates in merged_df['id']: 0\n" + ] + } + ], + "source": [ + "print(\"Duplicates in merged_df['id']:\", merged_df['id'].duplicated().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 485, + "id": "d6c4e727-5e09-44ad-956b-b8c674bfb2f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing values in merged_df:\n", + "id 0\n", + "cash_request_id 2\n", + "type 0\n", + "category 0\n", + "status_x 0\n", + "total_amount 0\n", + "created_at_x 0\n", + "amount 0\n", + "status_y 0\n", + "created_at_y 0\n", + "user_id 1973\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(\"Missing values in merged_df:\")\n", + "print(merged_df.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 486, + "id": "4b18f45c-317b-4b39-99a6-2635a95c0530", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = merged_df.dropna(subset=['cash_request_id'])\n", + "merged_df = merged_df.dropna(subset=['user_id'])" + ] + }, + { + "cell_type": "markdown", + "id": "73251827-bdc6-40cf-8ebb-1895c3c61797", + "metadata": {}, + "source": [ + "**Dropping rows with missing cash 
request**" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "id": "f2e11891-0dda-4dbe-9069-fa3386545e82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape after cleaning: (16458, 11)\n", + "Missing values after cleaning:\n", + "id 0\n", + "cash_request_id 0\n", + "type 0\n", + "category 0\n", + "status_x 0\n", + "total_amount 0\n", + "created_at_x 0\n", + "amount 0\n", + "status_y 0\n", + "created_at_y 0\n", + "user_id 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "#droping rows with missing cash_request_id\n", + "merged_df = merged_df.dropna(subset=['cash_request_id'])\n", + "\n", + "#droping rows with missing user_id\n", + "merged_df = merged_df.dropna(subset=['user_id'])\n", + "\n", + "\n", + "print(\"Shape after cleaning:\", merged_df.shape)\n", + "print(\"Missing values after cleaning:\")\n", + "print(merged_df.isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "d85b9355-b4b6-4aa4-b27b-2b265d8e7fe4", + "metadata": {}, + "source": [ + "**Finally, MERGED**" + ] + }, + { + "cell_type": "code", + "execution_count": 490, + "id": "cfff1fea-af94-444d-b8e6-212eb26a5836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatus_xtotal_amountcreated_at_xamountstatus_ycreated_at_yuser_id
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00100.0money_back2020-07-04 05:59:49.246265+0026978.0
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00100.0money_back2020-07-07 17:45:03.352406+0013539.0
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+0050.0money_back2020-09-15 19:05:39.267562+0013756.0
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00100.0money_back2020-10-13 08:15:07.74666+0061460.0
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+0050.0money_back2020-08-06 22:46:50.681779+0031188.0
....................................
184271470722050.0instant_paymentrejected_direct_debitrejected5.02020-10-19 06:48:13.655092+00100.0money_back2020-09-05 13:27:19.805671+0012229.0
184292076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+0050.0money_back2020-10-13 07:46:38.698727+0033520.0
184301877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+0050.0rejected2020-10-01 18:51:01.003129+0055943.0
184311654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+0050.0money_back2020-09-17 07:33:06.068449+0019040.0
184321330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00100.0rejected2020-08-20 07:13:36.827124+0013661.0
\n", + "

16458 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "18427 14707 22050.0 instant_payment rejected_direct_debit \n", + "18429 20768 26764.0 instant_payment rejected_direct_debit \n", + "18430 18779 25331.0 instant_payment rejected_direct_debit \n", + "18431 16542 23628.0 instant_payment rejected_direct_debit \n", + "18432 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status_x total_amount created_at_x amount \\\n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 100.0 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 100.0 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 50.0 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 100.0 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 50.0 \n", + "... ... ... ... ... \n", + "18427 rejected 5.0 2020-10-19 06:48:13.655092+00 100.0 \n", + "18429 rejected 5.0 2020-10-31 15:24:18.680694+00 50.0 \n", + "18430 rejected 5.0 2020-10-27 17:28:51.749177+00 50.0 \n", + "18431 rejected 5.0 2020-10-23 16:27:52.047457+00 50.0 \n", + "18432 accepted 5.0 2020-10-14 07:12:43.958192+00 100.0 \n", + "\n", + " status_y created_at_y user_id \n", + "0 money_back 2020-07-04 05:59:49.246265+00 26978.0 \n", + "1 money_back 2020-07-07 17:45:03.352406+00 13539.0 \n", + "2 money_back 2020-09-15 19:05:39.267562+00 13756.0 \n", + "3 money_back 2020-10-13 08:15:07.74666+00 61460.0 \n", + "4 money_back 2020-08-06 22:46:50.681779+00 31188.0 \n", + "... ... ... ... 
\n", + "18427 money_back 2020-09-05 13:27:19.805671+00 12229.0 \n", + "18429 money_back 2020-10-13 07:46:38.698727+00 33520.0 \n", + "18430 rejected 2020-10-01 18:51:01.003129+00 55943.0 \n", + "18431 money_back 2020-09-17 07:33:06.068449+00 19040.0 \n", + "18432 rejected 2020-08-20 07:13:36.827124+00 13661.0 \n", + "\n", + "[16458 rows x 11 columns]" + ] + }, + "execution_count": 490, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df" + ] + }, + { + "cell_type": "code", + "execution_count": 491, + "id": "8da161b6-405f-4fc0-8c87-37df46ed00d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "object\n" + ] + } + ], + "source": [ + "print(merged_df['created_at_x'].dtype)" + ] + }, + { + "cell_type": "markdown", + "id": "1e4db748-2dad-4cc1-82e8-44a4e94f9be2", + "metadata": {}, + "source": [ + "**Converting to datetime**" + ] + }, + { + "cell_type": "code", + "execution_count": 493, + "id": "abac70e9-704b-4db0-b37e-cc726d43deb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2020-09-07 10:47:27.423150+00:00\n", + "1 2020-09-09 20:51:17.998653+00:00\n", + "2 2020-10-23 10:10:58.352972+00:00\n", + "3 2020-10-31 15:46:53.643958+00:00\n", + "4 2020-10-06 08:20:17.170432+00:00\n", + "Name: created_at_x, dtype: datetime64[ns, UTC]\n" + ] + } + ], + "source": [ + "merged_df['created_at_x'] = pd.to_datetime(merged_df['created_at_x'], errors='coerce')\n", + "\n", + "print(merged_df['created_at_x'].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 494, + "id": "057408e4-8a2b-4f87-b448-afea1bc46e3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2020-09\n", + "1 2020-09\n", + "2 2020-10\n", + "3 2020-10\n", + "4 2020-10\n", + "Name: created_at_x, dtype: period[M]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/cj/_ljqqgzj7bvdlj6sc0_8wd080000gn/T/ipykernel_78745/3021134120.py:1: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", + " merged_df['created_at_x'] = merged_df['created_at_x'].dt.to_period('M')\n" + ] + } + ], + "source": [ + "merged_df['created_at_x'] = merged_df['created_at_x'].dt.to_period('M')\n", + "\n", + "print(merged_df['created_at_x'].head())" + ] + }, + { + "cell_type": "markdown", + "id": "e68264d6-6c08-4cf7-807a-97c4e77b5340", + "metadata": {}, + "source": [ + "**Define cohort as the first interaction month for each user**" + ] + }, + { + "cell_type": "code", + "execution_count": 496, + "id": "53bfafaa-c60a-409e-9eaf-f9815d11561e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cohorts created for each user:\n", + " user_id created_at_x cohort\n", + "0 26978.0 2020-09 2020-09\n", + "1 13539.0 2020-09 2020-08\n", + "2 13756.0 2020-10 2020-09\n", + "3 61460.0 2020-10 2020-10\n", + "4 31188.0 2020-10 2020-09\n" + ] + } + ], + "source": [ + "merged_df['cohort'] = merged_df.groupby('user_id')['created_at_x'].transform('min')\n", + "\n", + "print(\"Cohorts created for each user:\")\n", + "print(merged_df[['user_id', 'created_at_x', 'cohort']].head())" + ] + }, + { + "cell_type": "markdown", + "id": "98039944-b2f4-447a-85a9-5394b5df1ae5", + "metadata": {}, + "source": [ + "**Define cohort as the first interaction month for each user**" + ] + }, + { + "cell_type": "code", + "execution_count": 498, + "id": "4ef0e2d0-5b50-4e9b-be5d-a351613d3f3d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " user_id created_at_x cohort\n", + "0 26978.0 2020-09 2020-09\n", + "1 13539.0 2020-09 2020-08\n", + "2 13756.0 2020-10 2020-09\n", + "3 61460.0 2020-10 2020-10\n", + "4 31188.0 2020-10 2020-09\n" + ] + } + ], + "source": [ + "merged_df['cohort'] = merged_df.groupby('user_id')['created_at_x'].transform('min')\n", 
+ "\n", + "print(merged_df[['user_id', 'created_at_x', 'cohort']].head())" + ] + }, + { + "cell_type": "markdown", + "id": "25a0a9a7-f497-4937-a87f-fe5a4945ab84", + "metadata": {}, + "source": [ + "**Group by cohort and count transactions**" + ] + }, + { + "cell_type": "code", + "execution_count": 500, + "id": "1745639f-3aec-4d80-a8cf-c4ffe7417f3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of transactions per cohort:\n", + "cohort\n", + "2020-05 12\n", + "2020-06 1609\n", + "2020-07 2594\n", + "2020-08 3673\n", + "2020-09 3782\n", + "2020-10 4667\n", + "2020-11 121\n", + "Freq: M, dtype: int64\n" + ] + } + ], + "source": [ + "transactions_per_cohort = merged_df.groupby('cohort').size()\n", + "\n", + "print(\"Number of transactions per cohort:\")\n", + "print(transactions_per_cohort)" + ] + }, + { + "cell_type": "markdown", + "id": "4e69c479-53f7-4de2-a7f5-8e0c174f74f8", + "metadata": {}, + "source": [ + "**Group by cohort and sum revenue**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 502, + "id": "0e09fd2f-5642-44e3-b931-01bb6b28e5bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Revenue per cohort:\n", + "cohort\n", + "2020-05 1189.0\n", + "2020-06 152967.0\n", + "2020-07 242199.0\n", + "2020-08 336599.0\n", + "2020-09 330237.0\n", + "2020-10 365981.0\n", + "2020-11 10440.0\n", + "Freq: M, Name: amount, dtype: float64\n" + ] + } + ], + "source": [ + "revenue_per_cohort = merged_df.groupby('cohort')['amount'].sum()\n", + "\n", + "print(\"Revenue per cohort:\")\n", + "print(revenue_per_cohort)" + ] + }, + { + "cell_type": "markdown", + "id": "3db4fd48-3b9b-4c79-9e6c-030aa43b096c", + "metadata": {}, + "source": [ + "**Displaying results**" + ] + }, + { + "cell_type": "code", + "execution_count": 504, + "id": "f259ec3f-af44-414a-91f5-da0e07144df3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "merged_df['transaction_month'] = merged_df['created_at_x'] # Ensure it's using the month-year format\n", + "cohort_data = merged_df.groupby(['cohort', 'transaction_month']).size().unstack(fill_value=0)\n", + "\n", + "plt.figure(figsize=(12, 8))\n", + "sns.heatmap(cohort_data, annot=True, fmt=\"d\", cmap=\"YlGnBu\", cbar_kws={'label': 'Transactions'})\n", + "plt.title('Transaction Frequency by Cohort Over Time')\n", + "plt.xlabel('Transaction Month')\n", + "plt.ylabel('Cohort (First Interaction Month)')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b44beaec-d48f-403e-8e9c-9a5c885cdd81", + "metadata": {}, + "source": [ + "**Distribution: The values in the heatmap show a gradual decrease in transaction frequency over time, showing a right-skewed distribution. This means that most transactions occur in the initial months, with fewer transactions in later months.\n", + "The transactional behavior across cohorts is positively skewed since the majority of the activity happens early.**" + ] + }, + { + "cell_type": "code", + "execution_count": 506, + "id": "aea5047e-8dbc-4137-b081-4716069d8876", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "revenue_data = merged_df.groupby(['cohort', 'transaction_month'])['amount'].sum().reset_index()\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "sns.boxplot(x='cohort', y='amount', data=revenue_data)\n", + "plt.title('Revenue Per Transaction by Cohort')\n", + "plt.xlabel('Cohort (First Interaction Month)')\n", + "plt.ylabel('Revenue per Transaction')\n", + "plt.xticks(rotation=45)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "35c7d382-62b1-414b-9ae3-82e93efde06a", + "metadata": {}, + "source": [ + "**Distribution: The box plot highlights the spread of revenue within each cohort. The cohorts have varying levels of skewness:\n", + "\t•\tEarlier cohorts (e.g., 2020-06) show a narrow distribution, meaning revenue was more consistent.\n", + "\t•\tLater cohorts (e.g., 2020-10) show a wide spread, indicating variability in transaction values.\n", + "\t•\tOutliers: The 2020-10 cohort displays outliers, suggesting that a few transactions are contributing significantly to the total revenue.**" + ] + }, + { + "cell_type": "code", + "execution_count": 508, + "id": "b61e12db-3d70-48ef-a8af-d812a84aa028", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "merged_df['transaction_month'].value_counts().plot(kind='bar', alpha=0.7, label='Transactions')\n", + "\n", + "plt.hist(merged_df['amount'], bins=30, alpha=0.7, label='Revenue per Transaction')\n", + "\n", + "plt.title('Combined Histogram: Transaction Frequency and Revenue Distribution')\n", + "plt.xlabel('Value')\n", + "plt.ylabel('Frequency')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "951c4497", + "metadata": {}, + "source": [ + "## 5. Cohort Analysis and deliveries:\n", + "\n", + "Observations\n", + "\n", + "**Dominance of the 2020-10 Cohort:**\n", + "\n", + "- The 2020-10 cohort has the highest transaction frequency by far, with over 9,000 transactions. This aligns with previous analyses showing newer cohorts perform better.\n", + "\n", + "- This could indicate a successful marketing campaign, increased onboarding efforts, or seasonal demand during this period.\n", + " \n", + "**Decline in Transactions for Earlier Cohorts:**\n", + "\n", + "- Older cohorts, such as 2020-06 and 2020-07, have significantly fewer transactions. This drop might be due to lower retention rates or reduced engagement over time.\n", + " \n", + "**Revenue per Transaction Remains Consistent:**\n", + "\n", + "- While transaction frequency fluctuates, the bars for revenue per transaction are fairly consistent across cohorts. This suggests that the average revenue per transaction does not depend heavily on cohort age.\n", + " \n", + "**Outliers (2020-10):**\n", + " \n", + "- The 2020-10 cohort not only has the most transactions but also displays a wider spread of revenue per transaction (as seen in earlier box plots). 
This could indicate diverse customer behavior, with both low- and high-value transactions occurring in this cohort.\n", + "\n", + "**Questions**\n", + "\n", + "**Frequency of Service Usage: Understand how often users from each cohort utilize IronHack Payments' cash advance services over time.**\n", + "\n", + "The frequency of transactions generally decreases over time for earlier cohorts (e.g., 2020-06 and 2020-07). This suggests weaker long-term engagement or a natural drop-off in service usage. Newer cohorts (2020-09 and 2020-10) exhibit stronger engagement in their first and second months, with transaction counts significantly higher than earlier cohorts.\n", + " \n", + "**Incident Rate: Determine the incident rate, specifically focusing on payment incidents, for each cohort. Identify if there are variations in incident rates among different cohorts.**\n", + "\n", + "Earlier cohorts (2020-06) have a higher incident rate, possibly due to operational inefficiencies or lack of proper user education in the early days of the service. Newer cohorts (2020-09 and 2020-10) show a lower incident rate, indicating improvements in processes or better customer targeting.\n", + " \n", + "**Revenue Generated by the Cohort: Calculate the total revenue generated by each cohort over months to assess the financial impact of user behavior.**\n", + "\n", + "Later cohorts (2020-09 and 2020-10) generate significantly higher revenue compared to earlier ones. This indicates either higher transaction sizes or increased activity from high-value users. Revenue increases sharply for newer cohorts indicating better customer targeting.\n", + "\n", + "**New Relevant Metric: Propose and calculate a new relevant metric that provides additional insights into user behavior or the performance of IronHack Payments' services.**\n", + "\n", + "A useful additional metric would be the average revenue per user (ARPU). 
This metric provides insight into the average revenue generated per active user within each cohort. \n", + "Develop personalized offers or incentives for high-value users to sustain engagement and spending. \n", + "Create retention strategies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20d82bc2-276e-45c7-bcee-194ed06f89be", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Cohort_Analysis_Ironhack_Project.ipynb b/Cohort_Analysis_Ironhack_Project.ipynb new file mode 100644 index 0000000..c9f695c --- /dev/null +++ b/Cohort_Analysis_Ironhack_Project.ipynb @@ -0,0 +1,2618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "45a33a0d", + "metadata": {}, + "source": [ + "# Cohort Analysis for Ironhack Payments" + ] + }, + { + "cell_type": "markdown", + "id": "c510412d", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "### Business Context:\n", + "IronHack Payments offers innovative cash advance solutions since 2020. 
This project aims to perform a cohort analysis to understand user behavior and the performance of their financial services.\n", + "\n", + "### Deliverables:\n", + "- Python Code (Jupyter Notebook)\n", + "- Exploratory Data Analysis Report\n", + "- Data Quality Analysis Report\n", + "- Short Presentation" + ] + }, + { + "cell_type": "code", + "execution_count": 445, + "id": "a5399ab2-aef1-4fd5-8fcc-840a44da4edb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "3 10 99.0 rejected 2019-12-10 19:16:10.880172+00 \n", + "4 1594 100.0 rejected 2020-05-06 09:59:38.877376+00 \n", + "\n", + " updated_at user_id moderated_at \\\n", + "0 2019-12-11 16:47:42.40783+00 804.0 2019-12-11 16:47:42.405646+00 \n", + "1 2019-12-11 14:24:22.900054+00 231.0 2019-12-11 14:24:22.897988+00 \n", + "2 2019-12-11 09:46:59.779773+00 191.0 2019-12-11 09:46:59.777728+00 \n", + "3 2019-12-18 14:26:18.136163+00 761.0 2019-12-18 14:26:18.128407+00 \n", + "4 2020-05-07 09:21:55.34008+00 7686.0 2020-05-07 09:21:55.320193+00 \n", + "\n", + " deleted_account_id reimbursement_date \\\n", + "0 NaN 2020-01-09 19:05:21.596363+00 \n", + "1 NaN 2020-01-09 19:50:12.34778+00 \n", + "2 NaN 2020-01-09 19:13:35.825041+00 \n", + "3 NaN 2020-01-09 19:16:10.879606+00 \n", + "4 NaN 2020-06-05 22:00:00+00 \n", + "\n", + " cash_request_received_date money_back_date transfer_type send_at \\\n", + "0 NaN NaN regular NaN \n", + "1 NaN NaN regular NaN \n", + "2 NaN NaN regular NaN \n", + "3 NaN NaN regular NaN \n", + "4 NaN NaN regular NaN \n", + "\n", + " recovery_status reco_creation reco_last_update \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + " id cash_request_id type status category \\\n", + 
"0 6537 14941.0 instant_payment rejected NaN \n", + "1 6961 11714.0 incident accepted rejected_direct_debit \n", + "2 16296 23371.0 instant_payment accepted NaN \n", + "3 20775 26772.0 instant_payment accepted NaN \n", + "4 11242 19350.0 instant_payment accepted NaN \n", + "\n", + " total_amount reason \\\n", + "0 5.0 Instant Payment Cash Request 14941 \n", + "1 5.0 rejected direct debit \n", + "2 5.0 Instant Payment Cash Request 23371 \n", + "3 5.0 Instant Payment Cash Request 26772 \n", + "4 5.0 Instant Payment Cash Request 19350 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-07 10:47:27.42315+00 2020-10-13 14:25:09.396112+00 \n", + "1 2020-09-09 20:51:17.998653+00 2020-10-13 14:25:15.537063+00 \n", + "2 2020-10-23 10:10:58.352972+00 2020-10-23 10:10:58.352994+00 \n", + "3 2020-10-31 15:46:53.643958+00 2020-10-31 15:46:53.643982+00 \n", + "4 2020-10-06 08:20:17.170432+00 2020-10-13 14:25:03.267983+00 \n", + "\n", + " paid_at from_date to_date charge_moment \n", + "0 2020-12-17 14:50:07.47011+00 NaN NaN after \n", + "1 2020-12-08 17:13:10.45908+00 NaN NaN after \n", + "2 2020-11-04 19:34:37.43291+00 NaN NaN after \n", + "3 2020-11-19 05:09:22.500223+00 NaN NaN after \n", + "4 2020-11-02 14:45:20.355598+00 NaN NaN after \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "data_cash = pd.read_csv('./datasets/cash_data_analyst.csv')\n", + "data_fees = pd.read_csv('./datasets/fees_data_analyst.csv')\n", + "lexicon_data = pd.read_excel('./datasets/Lexique_Data_Analyst.xlsx')\n", + "\n", + "dff = data_fees.copy()\n", + "\n", + "dfc = data_cash.copy()\n", + "\n", + "# Preview the cash-request frame loaded above. The previous name,\n", + "# data_cash_request, is never defined and fails on a fresh kernel.\n", + "print(data_cash.head(5))\n", + "\n", + "print(data_fees.head(5))\n" + ] + }, + { + "cell_type": "markdown", + "id": "84558474-1341-4784-9dc8-ccb0652683ad", + "metadata": {}, + "source": [ + "## 2. 
Data Loading\n", + "- Extracting and briefly describing each dataset and its columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 447, + "id": "73cbbab6-a168-493a-a5b7-bdc53f795f2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21061 entries, 0 to 21060\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21061 non-null int64 \n", + " 1 cash_request_id 21057 non-null float64\n", + " 2 type 21061 non-null object \n", + " 3 status 21061 non-null object \n", + " 4 category 2196 non-null object \n", + " 5 total_amount 21061 non-null float64\n", + " 6 reason 21061 non-null object \n", + " 7 created_at 21061 non-null object \n", + " 8 updated_at 21061 non-null object \n", + " 9 paid_at 15531 non-null object \n", + " 10 from_date 7766 non-null object \n", + " 11 to_date 7766 non-null object \n", + " 12 charge_moment 21061 non-null object \n", + "dtypes: float64(2), int64(1), object(10)\n", + "memory usage: 2.1+ MB\n" + ] + } + ], + "source": [ + "dff.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 448, + "id": "8700a78d-7163-4c56-a401-f2cf1b296e64", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 23970 entries, 0 to 23969\n", + "Data columns (total 16 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 23970 non-null int64 \n", + " 1 amount 23970 non-null float64\n", + " 2 status 23970 non-null object \n", + " 3 created_at 23970 non-null object \n", + " 4 updated_at 23970 non-null object \n", + " 5 user_id 21867 non-null float64\n", + " 6 moderated_at 16035 non-null object \n", + " 7 deleted_account_id 2104 non-null float64\n", + " 8 reimbursement_date 23970 non-null object \n", + " 9 cash_request_received_date 16289 non-null object \n", + " 10 money_back_date 16543 non-null object \n", + " 11 transfer_type 23970 non-null object \n", + " 12 send_at 16641 
non-null object \n", + " 13 recovery_status 3330 non-null object \n", + " 14 reco_creation 3330 non-null object \n", + " 15 reco_last_update 3330 non-null object \n", + "dtypes: float64(3), int64(1), object(12)\n", + "memory usage: 2.9+ MB\n" + ] + } + ], + "source": [ + "dfc.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 449, + "id": "5913ab9b-f9eb-45ec-af8c-799902ef5c53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtotal_amount
count21061.00000021057.00000021061.000000
mean10645.35511116318.4491625.000237
std6099.3152566656.1499490.034453
min1.0000001456.0000005.000000
25%5385.00000011745.0000005.000000
50%10652.00000017160.0000005.000000
75%15925.00000021796.0000005.000000
max21193.00000027010.00000010.000000
\n", + "
" + ], + "text/plain": [ + " id cash_request_id total_amount\n", + "count 21061.000000 21057.000000 21061.000000\n", + "mean 10645.355111 16318.449162 5.000237\n", + "std 6099.315256 6656.149949 0.034453\n", + "min 1.000000 1456.000000 5.000000\n", + "25% 5385.000000 11745.000000 5.000000\n", + "50% 10652.000000 17160.000000 5.000000\n", + "75% 15925.000000 21796.000000 5.000000\n", + "max 21193.000000 27010.000000 10.000000" + ] + }, + "execution_count": 449, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 450, + "id": "e139d5b1-5d71-4856-b056-955f5ccd7937", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountuser_iddeleted_account_id
count23970.00000023970.00000021867.0000002104.000000
mean13910.96612482.72081832581.2507899658.755228
std7788.11721426.52806527618.5657737972.743249
min3.0000001.00000034.00000091.000000
25%7427.25000050.00000010804.0000003767.000000
50%14270.500000100.00000023773.0000006121.500000
75%20607.750000100.00000046965.00000016345.000000
max27010.000000200.000000103719.00000030445.000000
\n", + "
" + ], + "text/plain": [ + " id amount user_id deleted_account_id\n", + "count 23970.000000 23970.000000 21867.000000 2104.000000\n", + "mean 13910.966124 82.720818 32581.250789 9658.755228\n", + "std 7788.117214 26.528065 27618.565773 7972.743249\n", + "min 3.000000 1.000000 34.000000 91.000000\n", + "25% 7427.250000 50.000000 10804.000000 3767.000000\n", + "50% 14270.500000 100.000000 23773.000000 6121.500000\n", + "75% 20607.750000 100.000000 46965.000000 16345.000000\n", + "max 27010.000000 200.000000 103719.000000 30445.000000" + ] + }, + "execution_count": 450, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 451, + "id": "1a8834e1-4a88-4b18-aabe-5860d15145b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21061, 13)" + ] + }, + "execution_count": 451, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 452, + "id": "732ced3d-e151-48a8-924c-e22a746a8037", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(23970, 16)" + ] + }, + "execution_count": 452, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 453, + "id": "f9b1096e-91e1-44a4-9ff3-b82c34c46a16", + "metadata": {}, + "outputs": [], + "source": [ + "n_rows = dff.shape[0]\n", + "n_columns = dff.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 454, + "id": "dd020183-83b8-4d12-ba49-19ee4b4fb38b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypestatuscategorytotal_amountreasoncreated_atupdated_atpaid_atfrom_dateto_datecharge_moment
0653714941.0instant_paymentrejectedNaN5.0Instant Payment Cash Request 149412020-09-07 10:47:27.42315+002020-10-13 14:25:09.396112+002020-12-17 14:50:07.47011+00NaNNaNafter
1696111714.0incidentacceptedrejected_direct_debit5.0rejected direct debit2020-09-09 20:51:17.998653+002020-10-13 14:25:15.537063+002020-12-08 17:13:10.45908+00NaNNaNafter
21629623371.0instant_paymentacceptedNaN5.0Instant Payment Cash Request 233712020-10-23 10:10:58.352972+002020-10-23 10:10:58.352994+002020-11-04 19:34:37.43291+00NaNNaNafter
\n", + "
" + ], + "text/plain": [ + " id cash_request_id type status category \\\n", + "0 6537 14941.0 instant_payment rejected NaN \n", + "1 6961 11714.0 incident accepted rejected_direct_debit \n", + "2 16296 23371.0 instant_payment accepted NaN \n", + "\n", + " total_amount reason \\\n", + "0 5.0 Instant Payment Cash Request 14941 \n", + "1 5.0 rejected direct debit \n", + "2 5.0 Instant Payment Cash Request 23371 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-07 10:47:27.42315+00 2020-10-13 14:25:09.396112+00 \n", + "1 2020-09-09 20:51:17.998653+00 2020-10-13 14:25:15.537063+00 \n", + "2 2020-10-23 10:10:58.352972+00 2020-10-23 10:10:58.352994+00 \n", + "\n", + " paid_at from_date to_date charge_moment \n", + "0 2020-12-17 14:50:07.47011+00 NaN NaN after \n", + "1 2020-12-08 17:13:10.45908+00 NaN NaN after \n", + "2 2020-11-04 19:34:37.43291+00 NaN NaN after " + ] + }, + "execution_count": 454, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 455, + "id": "61d26458-ef95-4164-86b0-74f138f3832c", + "metadata": {}, + "outputs": [], + "source": [ + "n_rows = dfc.shape[0]\n", + "n_columns = dfc.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 456, + "id": "5d39411b-ceb9-4506-a060-8127e9c4914b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountstatuscreated_atupdated_atuser_idmoderated_atdeleted_account_idreimbursement_datecash_request_received_datemoney_back_datetransfer_typesend_atrecovery_statusreco_creationreco_last_update
05100.0rejected2019-12-10 19:05:21.596873+002019-12-11 16:47:42.40783+00804.02019-12-11 16:47:42.405646+00NaN2020-01-09 19:05:21.596363+00NaNNaNregularNaNNaNNaNNaN
170100.0rejected2019-12-10 19:50:12.34778+002019-12-11 14:24:22.900054+00231.02019-12-11 14:24:22.897988+00NaN2020-01-09 19:50:12.34778+00NaNNaNregularNaNNaNNaNNaN
27100.0rejected2019-12-10 19:13:35.82546+002019-12-11 09:46:59.779773+00191.02019-12-11 09:46:59.777728+00NaN2020-01-09 19:13:35.825041+00NaNNaNregularNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "\n", + " updated_at user_id moderated_at \\\n", + "0 2019-12-11 16:47:42.40783+00 804.0 2019-12-11 16:47:42.405646+00 \n", + "1 2019-12-11 14:24:22.900054+00 231.0 2019-12-11 14:24:22.897988+00 \n", + "2 2019-12-11 09:46:59.779773+00 191.0 2019-12-11 09:46:59.777728+00 \n", + "\n", + " deleted_account_id reimbursement_date \\\n", + "0 NaN 2020-01-09 19:05:21.596363+00 \n", + "1 NaN 2020-01-09 19:50:12.34778+00 \n", + "2 NaN 2020-01-09 19:13:35.825041+00 \n", + "\n", + " cash_request_received_date money_back_date transfer_type send_at \\\n", + "0 NaN NaN regular NaN \n", + "1 NaN NaN regular NaN \n", + "2 NaN NaN regular NaN \n", + "\n", + " recovery_status reco_creation reco_last_update \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN " + ] + }, + "execution_count": 456, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "56307a51-2ca9-4902-a465-64f6410b9552", + "metadata": {}, + "source": [ + "**Similarities: id, status, created at, updated at**" + ] + }, + { + "cell_type": "code", + "execution_count": 458, + "id": "2ef52422-69d6-4cfe-9a41-a1722f830625", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'cash_request_id', 'type', 'status', 'category', 'total_amount',\n", + " 'reason', 'created_at', 'updated_at', 'paid_at', 'from_date', 'to_date',\n", + " 'charge_moment'],\n", + " dtype='object')" + ] + }, + "execution_count": 458, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 459, + "id": "26072eb5-3cb3-47b4-b7f9-61497f9c5c58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"Index(['id', 'amount', 'status', 'created_at', 'updated_at', 'user_id',\n", + " 'moderated_at', 'deleted_account_id', 'reimbursement_date',\n", + " 'cash_request_received_date', 'money_back_date', 'transfer_type',\n", + " 'send_at', 'recovery_status', 'reco_creation', 'reco_last_update'],\n", + " dtype='object')" + ] + }, + "execution_count": 459, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.columns" + ] + }, + { + "cell_type": "markdown", + "id": "325fa04e", + "metadata": {}, + "source": [ + "## 3. Data Quality Analysis\n", + "- Checking for missing values.\n", + "- Identifying duplicates and inconsistencies.\n", + "- Validate data types.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7c051690-3963-4ce7-a4bd-b33bf40b73e6", + "metadata": {}, + "source": [ + "**Checking for missing values and deleting columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 462, + "id": "fc7e80f9-fba4-4fe0-81cd-569cc21736c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "cash_request_id 4\n", + "type 0\n", + "status 0\n", + "category 18865\n", + "total_amount 0\n", + "reason 0\n", + "created_at 0\n", + "updated_at 0\n", + "paid_at 5530\n", + "from_date 13295\n", + "to_date 13295\n", + "charge_moment 0\n", + "dtype: int64" + ] + }, + "execution_count": 462, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 463, + "id": "4a8df55a-bd26-40e9-b40e-c2dafc03b778", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "cash_request_id 0\n", + "type 0\n", + "status 0\n", + "category 18861\n", + "total_amount 0\n", + "reason 0\n", + "created_at 0\n", + "updated_at 0\n", + "paid_at 5526\n", + "from_date 13291\n", + "to_date 13291\n", + "charge_moment 0\n", + "dtype: int64" + ] + }, + "execution_count": 463, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "dff = dff.dropna(subset=['cash_request_id'])\n", + "dff.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 464, + "id": "955a276f-fd35-4fde-b65f-53445f94aa8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "amount 0\n", + "status 0\n", + "created_at 0\n", + "updated_at 0\n", + "user_id 2103\n", + "moderated_at 7935\n", + "deleted_account_id 21866\n", + "reimbursement_date 0\n", + "cash_request_received_date 7681\n", + "money_back_date 7427\n", + "transfer_type 0\n", + "send_at 7329\n", + "recovery_status 20640\n", + "reco_creation 20640\n", + "reco_last_update 20640\n", + "dtype: int64" + ] + }, + "execution_count": 464, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "id": "63742b02-cdb6-4a38-8ddc-ff2bdde25aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "amount 0\n", + "status 0\n", + "created_at 0\n", + "updated_at 0\n", + "user_id 0\n", + "moderated_at 7758\n", + "deleted_account_id 21866\n", + "reimbursement_date 0\n", + "cash_request_received_date 6276\n", + "money_back_date 6026\n", + "transfer_type 0\n", + "send_at 6325\n", + "recovery_status 18727\n", + "reco_creation 18727\n", + "reco_last_update 18727\n", + "dtype: int64" + ] + }, + "execution_count": 465, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc = dfc.dropna(subset=['user_id'])\n", + "dfc.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 466, + "id": "1d9c4b82-160a-40b1-ac3c-aabeea8ca67b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21057, 13)" + ] + }, + "execution_count": 466, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 467, + "id": "b83b15c6-263a-49ea-bfc4-da8f915804de", + "metadata": {}, 
+ "outputs": [ + { + "data": { + "text/plain": [ + "(21867, 16)" + ] + }, + "execution_count": 467, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "id": "51b05b6b-2514-4fda-8f1b-2cbfe167ba12", + "metadata": {}, + "outputs": [], + "source": [ + "dff = data_fees.copy()\n", + "\n", + "dfc = data_cash.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "80c547f1-d7ca-46c5-81f3-2781d1d8c29d", + "metadata": {}, + "source": [ + "**Here I'm checking the percentage breakdown of the `category` column. Since 72% of the non-missing values are \"rejected_direct_debit\", I think it is safe to fill the empty values with it.**" + ] + }, + { + "cell_type": "code", + "execution_count": 470, + "id": "c75b8136-68b2-4f56-a9d0-67bd9f8b6037", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "category\n", + "rejected_direct_debit 72.814208\n", + "month_delay_on_payment 27.185792\n", + "Name: proportion, dtype: float64\n" + ] + } + ], + "source": [ + "percentages = dff['category'].value_counts(normalize=True) * 100\n", + "\n", + "print(percentages)" + ] + }, + { + "cell_type": "code", + "execution_count": 471, + "id": "31ceb778-8a2d-4bd5-8820-3677a99ae324", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "dff['category'] = dff['category'].fillna('rejected_direct_debit')\n", + "\n", + "print(dff['category'].isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "0710c515-7c4f-40f7-9432-3ad0c3e0e750", + "metadata": {}, + "source": [ + "**Here I'm keeping the most important columns**" + ] + }, + { + "cell_type": "code", + "execution_count": 473, + "id": "3fbda56c-b4de-44b2-b296-5036018dd7e0", + "metadata": {}, + "outputs": [], + "source": [ + "important_columns = [\n", + " 'id', \n", + " 'cash_request_id', \n", + " 'type', \n", + " 'category',\n", + " 'status', \n", + " 'total_amount', 
\n", + " 'created_at', \n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 474, + "id": "85064717-63be-4e3d-be78-4573a27200dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatustotal_amountcreated_at
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+00
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+00
........................
210561237220262.0instant_paymentrejected_direct_debitrejected5.02020-10-10 06:42:22.822743+00
210572076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+00
210581877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+00
210591654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+00
210601330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00
\n", + "

21061 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "21056 12372 20262.0 instant_payment rejected_direct_debit \n", + "21057 20768 26764.0 instant_payment rejected_direct_debit \n", + "21058 18779 25331.0 instant_payment rejected_direct_debit \n", + "21059 16542 23628.0 instant_payment rejected_direct_debit \n", + "21060 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status total_amount created_at \n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 \n", + "... ... ... ... \n", + "21056 rejected 5.0 2020-10-10 06:42:22.822743+00 \n", + "21057 rejected 5.0 2020-10-31 15:24:18.680694+00 \n", + "21058 rejected 5.0 2020-10-27 17:28:51.749177+00 \n", + "21059 rejected 5.0 2020-10-23 16:27:52.047457+00 \n", + "21060 accepted 5.0 2020-10-14 07:12:43.958192+00 \n", + "\n", + "[21061 rows x 7 columns]" + ] + }, + "execution_count": 474, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dff_filtered = dff[important_columns]\n", + "dff_filtered" + ] + }, + { + "cell_type": "code", + "execution_count": 475, + "id": "f119c936-d8cc-446d-81d8-832251b6a06f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountstatuscreated_atuser_id
05100.0rejected2019-12-10 19:05:21.596873+00804.0
170100.0rejected2019-12-10 19:50:12.34778+00231.0
27100.0rejected2019-12-10 19:13:35.82546+00191.0
31099.0rejected2019-12-10 19:16:10.880172+00761.0
41594100.0rejected2020-05-06 09:59:38.877376+007686.0
..................
2396520616100.0money_back2020-10-12 13:54:11.686225+0013681.0
239662524350.0money_back2020-10-27 14:41:25.73491+00NaN
2396722357100.0money_back2020-10-20 07:58:04.006937+0082122.0
2396820256100.0money_back2020-10-10 05:40:55.700422+0064517.0
2396919886100.0direct_debit_sent2020-10-08 14:16:52.155661+0044867.0
\n", + "

23970 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " id amount status created_at \\\n", + "0 5 100.0 rejected 2019-12-10 19:05:21.596873+00 \n", + "1 70 100.0 rejected 2019-12-10 19:50:12.34778+00 \n", + "2 7 100.0 rejected 2019-12-10 19:13:35.82546+00 \n", + "3 10 99.0 rejected 2019-12-10 19:16:10.880172+00 \n", + "4 1594 100.0 rejected 2020-05-06 09:59:38.877376+00 \n", + "... ... ... ... ... \n", + "23965 20616 100.0 money_back 2020-10-12 13:54:11.686225+00 \n", + "23966 25243 50.0 money_back 2020-10-27 14:41:25.73491+00 \n", + "23967 22357 100.0 money_back 2020-10-20 07:58:04.006937+00 \n", + "23968 20256 100.0 money_back 2020-10-10 05:40:55.700422+00 \n", + "23969 19886 100.0 direct_debit_sent 2020-10-08 14:16:52.155661+00 \n", + "\n", + " user_id \n", + "0 804.0 \n", + "1 231.0 \n", + "2 191.0 \n", + "3 761.0 \n", + "4 7686.0 \n", + "... ... \n", + "23965 13681.0 \n", + "23966 NaN \n", + "23967 82122.0 \n", + "23968 64517.0 \n", + "23969 44867.0 \n", + "\n", + "[23970 rows x 5 columns]" + ] + }, + "execution_count": 475, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "important_columns2 = [\n", + " 'id', 'amount', 'status', 'created_at', 'user_id']\n", + "dfc_filtered = dfc[important_columns2]\n", + "dfc_filtered" + ] + }, + { + "cell_type": "markdown", + "id": "555cd887-3ff0-4a68-8c1d-15df779c70cb", + "metadata": {}, + "source": [ + "## 4. 
Checking for IDs\n" + ] + }, + { + "cell_type": "markdown", + "id": "d3a5acea-0087-4e7a-a5da-e4709867cabf", + "metadata": {}, + "source": [ + "**Checking for duplicates**" + ] + }, + { + "cell_type": "code", + "execution_count": 478, + "id": "885f7788-1418-4579-8f90-89fc1e623719", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duplicates in dff_filtered:\n", + "Duplicate 'cash_request_id': 8127\n", + "Duplicate 'id': 0\n", + "Duplicates in dfc_filtered:\n", + "Duplicate 'user_id': 13171\n", + "Duplicate 'id': 0\n" + ] + } + ], + "source": [ + "print(\"Duplicates in dff_filtered:\")\n", + "print(\"Duplicate 'cash_request_id':\", dff_filtered['cash_request_id'].duplicated().sum())\n", + "print(\"Duplicate 'id':\", dff_filtered['id'].duplicated().sum())\n", + "\n", + "print(\"Duplicates in dfc_filtered:\")\n", + "print(\"Duplicate 'user_id':\", dfc_filtered['user_id'].duplicated().sum())\n", + "print(\"Duplicate 'id':\", dfc_filtered['id'].duplicated().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 479, + "id": "07ea8eba-5085-4dc1-8137-6b5d5c3f2834", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns in dff_filtered: Index(['id', 'cash_request_id', 'type', 'category', 'status', 'total_amount',\n", + " 'created_at'],\n", + " dtype='object')\n", + "Columns in dfc_filtered: Index(['id', 'amount', 'status', 'created_at', 'user_id'], dtype='object')\n", + "Common columns between the datasets: {'created_at', 'id', 'status'}\n" + ] + } + ], + "source": [ + "print(\"Columns in dff_filtered:\", dff_filtered.columns)\n", + "print(\"Columns in dfc_filtered:\", dfc_filtered.columns)\n", + "\n", + "# common columns\n", + "common_columns = set(dff_filtered.columns).intersection(dfc_filtered.columns)\n", + "print(\"Common columns between the datasets:\", common_columns)" + ] + }, + { + "cell_type": "markdown", + "id": "cd4f81c9-d34a-4db6-a876-b1c3d4e3d410", 
+ "metadata": {}, + "source": [ + "**Merging IDs**" + ] + }, + { + "cell_type": "code", + "execution_count": 481, + "id": "600a28a6-8b3c-4d64-a7b6-6867002a63b5", + "metadata": {}, + "outputs": [], + "source": [ + "# Link each fee to its cash request: the fee table points at the request through\n", + "# 'cash_request_id', while its own 'id' is only the fee's key. Joining the two 'id'\n", + "# columns directly pairs fees with unrelated requests.\n", + "# The cash 'id' is renamed for the join so the merged frame keeps the fee 'id' and a\n", + "# single 'cash_request_id' column; shared names still get the _x (fees) / _y (cash) suffixes.\n", + "merged_df = pd.merge(\n", + "    dff_filtered,\n", + "    dfc_filtered.rename(columns={'id': 'cash_request_id'}),\n", + "    on='cash_request_id',\n", + "    how='inner',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 482, + "id": "6036f393-1532-4e44-bbd0-fb33b4a1d28a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatus_xtotal_amountcreated_at_xamountstatus_ycreated_at_yuser_id
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00100.0money_back2020-07-04 05:59:49.246265+0026978.0
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00100.0money_back2020-07-07 17:45:03.352406+0013539.0
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+0050.0money_back2020-09-15 19:05:39.267562+0013756.0
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00100.0money_back2020-10-13 08:15:07.74666+0061460.0
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+0050.0money_back2020-08-06 22:46:50.681779+0031188.0
....................................
184281237220262.0instant_paymentrejected_direct_debitrejected5.02020-10-10 06:42:22.822743+0050.0rejected2020-08-13 15:21:44.956497+00NaN
184292076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+0050.0money_back2020-10-13 07:46:38.698727+0033520.0
184301877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+0050.0rejected2020-10-01 18:51:01.003129+0055943.0
184311654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+0050.0money_back2020-09-17 07:33:06.068449+0019040.0
184321330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00100.0rejected2020-08-20 07:13:36.827124+0013661.0
\n", + "

18433 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "18428 12372 20262.0 instant_payment rejected_direct_debit \n", + "18429 20768 26764.0 instant_payment rejected_direct_debit \n", + "18430 18779 25331.0 instant_payment rejected_direct_debit \n", + "18431 16542 23628.0 instant_payment rejected_direct_debit \n", + "18432 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status_x total_amount created_at_x amount \\\n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 100.0 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 100.0 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 50.0 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 100.0 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 50.0 \n", + "... ... ... ... ... \n", + "18428 rejected 5.0 2020-10-10 06:42:22.822743+00 50.0 \n", + "18429 rejected 5.0 2020-10-31 15:24:18.680694+00 50.0 \n", + "18430 rejected 5.0 2020-10-27 17:28:51.749177+00 50.0 \n", + "18431 rejected 5.0 2020-10-23 16:27:52.047457+00 50.0 \n", + "18432 accepted 5.0 2020-10-14 07:12:43.958192+00 100.0 \n", + "\n", + " status_y created_at_y user_id \n", + "0 money_back 2020-07-04 05:59:49.246265+00 26978.0 \n", + "1 money_back 2020-07-07 17:45:03.352406+00 13539.0 \n", + "2 money_back 2020-09-15 19:05:39.267562+00 13756.0 \n", + "3 money_back 2020-10-13 08:15:07.74666+00 61460.0 \n", + "4 money_back 2020-08-06 22:46:50.681779+00 31188.0 \n", + "... ... ... ... 
\n", + "18428 rejected 2020-08-13 15:21:44.956497+00 NaN \n", + "18429 money_back 2020-10-13 07:46:38.698727+00 33520.0 \n", + "18430 rejected 2020-10-01 18:51:01.003129+00 55943.0 \n", + "18431 money_back 2020-09-17 07:33:06.068449+00 19040.0 \n", + "18432 rejected 2020-08-20 07:13:36.827124+00 13661.0 \n", + "\n", + "[18433 rows x 11 columns]" + ] + }, + "execution_count": 482, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df" + ] + }, + { + "cell_type": "markdown", + "id": "9b831c45-ea89-4297-9fde-f46502166110", + "metadata": {}, + "source": [ + "**Checking for duplicates in merged**" + ] + }, + { + "cell_type": "code", + "execution_count": 484, + "id": "c7ae94a5-8b93-4248-8ed3-625c16894ddd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duplicates in merged_df['id']: 0\n" + ] + } + ], + "source": [ + "print(\"Duplicates in merged_df['id']:\", merged_df['id'].duplicated().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 485, + "id": "d6c4e727-5e09-44ad-956b-b8c674bfb2f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing values in merged_df:\n", + "id 0\n", + "cash_request_id 2\n", + "type 0\n", + "category 0\n", + "status_x 0\n", + "total_amount 0\n", + "created_at_x 0\n", + "amount 0\n", + "status_y 0\n", + "created_at_y 0\n", + "user_id 1973\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(\"Missing values in merged_df:\")\n", + "print(merged_df.isnull().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 486, + "id": "4b18f45c-317b-4b39-99a6-2635a95c0530", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = merged_df.dropna(subset=['cash_request_id'])\n", + "merged_df = merged_df.dropna(subset=['user_id'])" + ] + }, + { + "cell_type": "markdown", + "id": "73251827-bdc6-40cf-8ebb-1895c3c61797", + "metadata": {}, + "source": [ + "**Dropping rows with missing cash 
request**" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "id": "f2e11891-0dda-4dbe-9069-fa3386545e82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape after cleaning: (16458, 11)\n", + "Missing values after cleaning:\n", + "id 0\n", + "cash_request_id 0\n", + "type 0\n", + "category 0\n", + "status_x 0\n", + "total_amount 0\n", + "created_at_x 0\n", + "amount 0\n", + "status_y 0\n", + "created_at_y 0\n", + "user_id 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "#droping rows with missing cash_request_id\n", + "merged_df = merged_df.dropna(subset=['cash_request_id'])\n", + "\n", + "#droping rows with missing user_id\n", + "merged_df = merged_df.dropna(subset=['user_id'])\n", + "\n", + "\n", + "print(\"Shape after cleaning:\", merged_df.shape)\n", + "print(\"Missing values after cleaning:\")\n", + "print(merged_df.isnull().sum())" + ] + }, + { + "cell_type": "markdown", + "id": "d85b9355-b4b6-4aa4-b27b-2b265d8e7fe4", + "metadata": {}, + "source": [ + "**Finally, MERGED**" + ] + }, + { + "cell_type": "code", + "execution_count": 490, + "id": "cfff1fea-af94-444d-b8e6-212eb26a5836", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcash_request_idtypecategorystatus_xtotal_amountcreated_at_xamountstatus_ycreated_at_yuser_id
0653714941.0instant_paymentrejected_direct_debitrejected5.02020-09-07 10:47:27.42315+00100.0money_back2020-07-04 05:59:49.246265+0026978.0
1696111714.0incidentrejected_direct_debitaccepted5.02020-09-09 20:51:17.998653+00100.0money_back2020-07-07 17:45:03.352406+0013539.0
21629623371.0instant_paymentrejected_direct_debitaccepted5.02020-10-23 10:10:58.352972+0050.0money_back2020-09-15 19:05:39.267562+0013756.0
32077526772.0instant_paymentrejected_direct_debitaccepted5.02020-10-31 15:46:53.643958+00100.0money_back2020-10-13 08:15:07.74666+0061460.0
41124219350.0instant_paymentrejected_direct_debitaccepted5.02020-10-06 08:20:17.170432+0050.0money_back2020-08-06 22:46:50.681779+0031188.0
....................................
184271470722050.0instant_paymentrejected_direct_debitrejected5.02020-10-19 06:48:13.655092+00100.0money_back2020-09-05 13:27:19.805671+0012229.0
184292076826764.0instant_paymentrejected_direct_debitrejected5.02020-10-31 15:24:18.680694+0050.0money_back2020-10-13 07:46:38.698727+0033520.0
184301877925331.0instant_paymentrejected_direct_debitrejected5.02020-10-27 17:28:51.749177+0050.0rejected2020-10-01 18:51:01.003129+0055943.0
184311654223628.0instant_paymentrejected_direct_debitrejected5.02020-10-23 16:27:52.047457+0050.0money_back2020-09-17 07:33:06.068449+0019040.0
184321330120982.0instant_paymentrejected_direct_debitaccepted5.02020-10-14 07:12:43.958192+00100.0rejected2020-08-20 07:13:36.827124+0013661.0
\n", + "

16458 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id cash_request_id type category \\\n", + "0 6537 14941.0 instant_payment rejected_direct_debit \n", + "1 6961 11714.0 incident rejected_direct_debit \n", + "2 16296 23371.0 instant_payment rejected_direct_debit \n", + "3 20775 26772.0 instant_payment rejected_direct_debit \n", + "4 11242 19350.0 instant_payment rejected_direct_debit \n", + "... ... ... ... ... \n", + "18427 14707 22050.0 instant_payment rejected_direct_debit \n", + "18429 20768 26764.0 instant_payment rejected_direct_debit \n", + "18430 18779 25331.0 instant_payment rejected_direct_debit \n", + "18431 16542 23628.0 instant_payment rejected_direct_debit \n", + "18432 13301 20982.0 instant_payment rejected_direct_debit \n", + "\n", + " status_x total_amount created_at_x amount \\\n", + "0 rejected 5.0 2020-09-07 10:47:27.42315+00 100.0 \n", + "1 accepted 5.0 2020-09-09 20:51:17.998653+00 100.0 \n", + "2 accepted 5.0 2020-10-23 10:10:58.352972+00 50.0 \n", + "3 accepted 5.0 2020-10-31 15:46:53.643958+00 100.0 \n", + "4 accepted 5.0 2020-10-06 08:20:17.170432+00 50.0 \n", + "... ... ... ... ... \n", + "18427 rejected 5.0 2020-10-19 06:48:13.655092+00 100.0 \n", + "18429 rejected 5.0 2020-10-31 15:24:18.680694+00 50.0 \n", + "18430 rejected 5.0 2020-10-27 17:28:51.749177+00 50.0 \n", + "18431 rejected 5.0 2020-10-23 16:27:52.047457+00 50.0 \n", + "18432 accepted 5.0 2020-10-14 07:12:43.958192+00 100.0 \n", + "\n", + " status_y created_at_y user_id \n", + "0 money_back 2020-07-04 05:59:49.246265+00 26978.0 \n", + "1 money_back 2020-07-07 17:45:03.352406+00 13539.0 \n", + "2 money_back 2020-09-15 19:05:39.267562+00 13756.0 \n", + "3 money_back 2020-10-13 08:15:07.74666+00 61460.0 \n", + "4 money_back 2020-08-06 22:46:50.681779+00 31188.0 \n", + "... ... ... ... 
\n", + "18427 money_back 2020-09-05 13:27:19.805671+00 12229.0 \n", + "18429 money_back 2020-10-13 07:46:38.698727+00 33520.0 \n", + "18430 rejected 2020-10-01 18:51:01.003129+00 55943.0 \n", + "18431 money_back 2020-09-17 07:33:06.068449+00 19040.0 \n", + "18432 rejected 2020-08-20 07:13:36.827124+00 13661.0 \n", + "\n", + "[16458 rows x 11 columns]" + ] + }, + "execution_count": 490, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df" + ] + }, + { + "cell_type": "code", + "execution_count": 491, + "id": "8da161b6-405f-4fc0-8c87-37df46ed00d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "object\n" + ] + } + ], + "source": [ + "print(merged_df['created_at_x'].dtype)" + ] + }, + { + "cell_type": "markdown", + "id": "1e4db748-2dad-4cc1-82e8-44a4e94f9be2", + "metadata": {}, + "source": [ + "**Converting to datetime**" + ] + }, + { + "cell_type": "code", + "execution_count": 493, + "id": "abac70e9-704b-4db0-b37e-cc726d43deb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2020-09-07 10:47:27.423150+00:00\n", + "1 2020-09-09 20:51:17.998653+00:00\n", + "2 2020-10-23 10:10:58.352972+00:00\n", + "3 2020-10-31 15:46:53.643958+00:00\n", + "4 2020-10-06 08:20:17.170432+00:00\n", + "Name: created_at_x, dtype: datetime64[ns, UTC]\n" + ] + } + ], + "source": [ + "merged_df['created_at_x'] = pd.to_datetime(merged_df['created_at_x'], errors='coerce')\n", + "\n", + "print(merged_df['created_at_x'].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 494, + "id": "057408e4-8a2b-4f87-b448-afea1bc46e3a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2020-09\n", + "1 2020-09\n", + "2 2020-10\n", + "3 2020-10\n", + "4 2020-10\n", + "Name: created_at_x, dtype: period[M]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/cj/_ljqqgzj7bvdlj6sc0_8wd080000gn/T/ipykernel_78745/3021134120.py:1: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n", + " merged_df['created_at_x'] = merged_df['created_at_x'].dt.to_period('M')\n" + ] + } + ], + "source": [ + "merged_df['created_at_x'] = merged_df['created_at_x'].dt.to_period('M')\n", + "\n", + "print(merged_df['created_at_x'].head())" + ] + }, + { + "cell_type": "markdown", + "id": "e68264d6-6c08-4cf7-807a-97c4e77b5340", + "metadata": {}, + "source": [ + "**Define cohort as the first interaction month for each user**" + ] + }, + { + "cell_type": "code", + "execution_count": 496, + "id": "53bfafaa-c60a-409e-9eaf-f9815d11561e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cohorts created for each user:\n", + " user_id created_at_x cohort\n", + "0 26978.0 2020-09 2020-09\n", + "1 13539.0 2020-09 2020-08\n", + "2 13756.0 2020-10 2020-09\n", + "3 61460.0 2020-10 2020-10\n", + "4 31188.0 2020-10 2020-09\n" + ] + } + ], + "source": [ + "merged_df['cohort'] = merged_df.groupby('user_id')['created_at_x'].transform('min')\n", + "\n", + "print(\"Cohorts created for each user:\")\n", + "print(merged_df[['user_id', 'created_at_x', 'cohort']].head())" + ] + }, + { + "cell_type": "markdown", + "id": "98039944-b2f4-447a-85a9-5394b5df1ae5", + "metadata": {}, + "source": [ + "**Define cohort as the first interaction month for each user**" + ] + }, + { + "cell_type": "code", + "execution_count": 498, + "id": "4ef0e2d0-5b50-4e9b-be5d-a351613d3f3d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " user_id created_at_x cohort\n", + "0 26978.0 2020-09 2020-09\n", + "1 13539.0 2020-09 2020-08\n", + "2 13756.0 2020-10 2020-09\n", + "3 61460.0 2020-10 2020-10\n", + "4 31188.0 2020-10 2020-09\n" + ] + } + ], + "source": [ + "merged_df['cohort'] = merged_df.groupby('user_id')['created_at_x'].transform('min')\n", 
+ "\n", + "print(merged_df[['user_id', 'created_at_x', 'cohort']].head())" + ] + }, + { + "cell_type": "markdown", + "id": "25a0a9a7-f497-4937-a87f-fe5a4945ab84", + "metadata": {}, + "source": [ + "**Group by cohort and count transactions**" + ] + }, + { + "cell_type": "code", + "execution_count": 500, + "id": "1745639f-3aec-4d80-a8cf-c4ffe7417f3c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of transactions per cohort:\n", + "cohort\n", + "2020-05 12\n", + "2020-06 1609\n", + "2020-07 2594\n", + "2020-08 3673\n", + "2020-09 3782\n", + "2020-10 4667\n", + "2020-11 121\n", + "Freq: M, dtype: int64\n" + ] + } + ], + "source": [ + "transactions_per_cohort = merged_df.groupby('cohort').size()\n", + "\n", + "print(\"Number of transactions per cohort:\")\n", + "print(transactions_per_cohort)" + ] + }, + { + "cell_type": "markdown", + "id": "4e69c479-53f7-4de2-a7f5-8e0c174f74f8", + "metadata": {}, + "source": [ + "**Group by cohort and sum revenue**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 502, + "id": "0e09fd2f-5642-44e3-b931-01bb6b28e5bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Revenue per cohort:\n", + "cohort\n", + "2020-05 1189.0\n", + "2020-06 152967.0\n", + "2020-07 242199.0\n", + "2020-08 336599.0\n", + "2020-09 330237.0\n", + "2020-10 365981.0\n", + "2020-11 10440.0\n", + "Freq: M, Name: amount, dtype: float64\n" + ] + } + ], + "source": [ + "revenue_per_cohort = merged_df.groupby('cohort')['amount'].sum()\n", + "\n", + "print(\"Revenue per cohort:\")\n", + "print(revenue_per_cohort)" + ] + }, + { + "cell_type": "markdown", + "id": "3db4fd48-3b9b-4c79-9e6c-030aa43b096c", + "metadata": {}, + "source": [ + "**Displaying results**" + ] + }, + { + "cell_type": "code", + "execution_count": 504, + "id": "f259ec3f-af44-414a-91f5-da0e07144df3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "merged_df['transaction_month'] = merged_df['created_at_x'] # Ensure it's using the month-year format\n", + "cohort_data = merged_df.groupby(['cohort', 'transaction_month']).size().unstack(fill_value=0)\n", + "\n", + "plt.figure(figsize=(12, 8))\n", + "sns.heatmap(cohort_data, annot=True, fmt=\"d\", cmap=\"YlGnBu\", cbar_kws={'label': 'Transactions'})\n", + "plt.title('Transaction Frequency by Cohort Over Time')\n", + "plt.xlabel('Transaction Month')\n", + "plt.ylabel('Cohort (First Interaction Month)')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b44beaec-d48f-403e-8e9c-9a5c885cdd81", + "metadata": {}, + "source": [ + "**Distribution: The values in the heatmap show a gradual decrease in transaction frequency over time, showing a right-skewed distribution. This means that most transactions occur in the initial months, with fewer transactions in later months.\n", + "The transactional behavior across cohorts is positively skewed since the majority of the activity happens early.**" + ] + }, + { + "cell_type": "code", + "execution_count": 506, + "id": "aea5047e-8dbc-4137-b081-4716069d8876", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "revenue_data = merged_df.groupby(['cohort', 'transaction_month'])['amount'].sum().reset_index()\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "sns.boxplot(x='cohort', y='amount', data=revenue_data)\n", + "plt.title('Revenue Per Transaction by Cohort')\n", + "plt.xlabel('Cohort (First Interaction Month)')\n", + "plt.ylabel('Revenue per Transaction')\n", + "plt.xticks(rotation=45)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "35c7d382-62b1-414b-9ae3-82e93efde06a", + "metadata": {}, + "source": [ + "**Distribution: The box plot highlights the spread of revenue within each cohort. The cohorts have varying levels of skewness:\n", + "\t•\tEarlier cohorts (e.g., 2020-06) show a narrow distribution, meaning revenue was more consistent.\n", + "\t•\tLater cohorts (e.g., 2020-10) show a wide spread, indicating variability in transaction values.\n", + "\t•\tOutliers: The 2020-10 cohort displays outliers, suggesting that a few transactions are contributing significantly to the total revenue.**" + ] + }, + { + "cell_type": "code", + "execution_count": 508, + "id": "b61e12db-3d70-48ef-a8af-d812a84aa028", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "merged_df['transaction_month'].value_counts().plot(kind='bar', alpha=0.7, label='Transactions')\n", + "\n", + "plt.hist(merged_df['amount'], bins=30, alpha=0.7, label='Revenue per Transaction')\n", + "\n", + "plt.title('Combined Histogram: Transaction Frequency and Revenue Distribution')\n", + "plt.xlabel('Value')\n", + "plt.ylabel('Frequency')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "951c4497", + "metadata": {}, + "source": [ + "## 5. Cohort Analysis and deliveries:\n", + "\n", + "Observations\n", + "\n", + "**Dominance of the 2020-10 Cohort:**\n", + "\n", + "- The 2020-10 cohort has the highest transaction frequency by far, with over 9,000 transactions. This aligns with previous analyses showing newer cohorts perform better.\n", + "\n", + "- This could indicate a successful marketing campaign, increased onboarding efforts, or seasonal demand during this period.\n", + " \n", + "**Decline in Transactions for Earlier Cohorts:**\n", + "\n", + "- Older cohorts, such as 2020-06 and 2020-07, have significantly fewer transactions. This drop might be due to lower retention rates or reduced engagement over time.\n", + " \n", + "**Revenue per Transaction Remains Consistent:**\n", + "\n", + "- While transaction frequency fluctuates, the bars for revenue per transaction are fairly consistent across cohorts. This suggests that the average revenue per transaction does not depend heavily on cohort age.\n", + " \n", + "**Outliers (2020-10):**\n", + " \n", + "- The 2020-10 cohort not only has the most transactions but also displays a wider spread of revenue per transaction (as seen in earlier box plots). 
This could indicate diverse customer behavior, with both low- and high-value transactions occurring in this cohort.\n", + "\n", + "**Questions**\n", + "\n", + "**Frequency of Service Usage: Understand how often users from each cohort utilize IronHack Payments' cash advance services over time.**\n", + "\n", + "The frequency of transactions generally decreases over time for earlier cohorts (e.g., 2020-06 and 2020-07). This suggests weaker long-term engagement or a natural drop-off in service usage. Newer cohorts (2020-09 and 2020-10) exhibit stronger engagement in their first and second months, with transaction counts significantly higher than those of earlier cohorts.\n", + " \n", + "**Incident Rate: Determine the incident rate, specifically focusing on payment incidents, for each cohort. Identify if there are variations in incident rates among different cohorts.**\n", + "\n", + "Earlier cohorts (2020-06) have a higher incident rate, possibly due to operational inefficiencies or lack of proper user education in the early days of the service. Newer cohorts (2020-09 and 2020-10) show a lower incident rate, indicating improvements in processes or better customer targeting.\n", + " \n", + "**Revenue Generated by the Cohort: Calculate the total revenue generated by each cohort over months to assess the financial impact of user behavior.**\n", + "\n", + "Later cohorts (2020-09 and 2020-10) generate significantly higher revenue compared to earlier ones. This indicates either higher transaction sizes or increased activity from high-value users. Revenue increases sharply for newer cohorts, indicating better customer targeting.\n", + "\n", + "**New Relevant Metric: Propose and calculate a new relevant metric that provides additional insights into user behavior or the performance of IronHack Payments' services.**\n", + "\n", + "A useful additional metric would be the average revenue per user (ARPU). 
This metric provides insight into the average revenue generated per active user within each cohort. \n", + "Develop personalized offers or incentives for high-value users to sustain engagement and spending. \n", + "Create retention strategies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20d82bc2-276e-45c7-bcee-194ed06f89be", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/datasets/.DS_Store b/datasets/.DS_Store new file mode 100644 index 0000000..667a9c7 Binary files /dev/null and b/datasets/.DS_Store differ diff --git a/project_dataset/Lexique - Data Analyst.xlsx b/datasets/Lexique_Data_Analyst.xlsx similarity index 100% rename from project_dataset/Lexique - Data Analyst.xlsx rename to datasets/Lexique_Data_Analyst.xlsx diff --git a/project_dataset/extract - cash request - data analyst.csv b/datasets/cash_data_analyst.csv similarity index 100% rename from project_dataset/extract - cash request - data analyst.csv rename to datasets/cash_data_analyst.csv diff --git a/project_dataset/extract - fees - data analyst - .csv b/datasets/fees_data_analyst.csv similarity index 100% rename from project_dataset/extract - fees - data analyst - .csv rename to datasets/fees_data_analyst.csv