-
Notifications
You must be signed in to change notification settings - Fork 0
/
Explore Data.py
115 lines (87 loc) · 3.48 KB
/
Explore Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#Preparation by importing libraries
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from scipy.stats import shapiro
import matplotlib.patches as mpatches
# Display settings for the whole script.
# Let pandas print up to 500 columns before truncating wide dataframes.
pd.set_option('display.max_columns', 500)
# Draw every matplotlib figure with the 'ggplot' style sheet.
plt.style.use('ggplot')
# Palette passed to the seaborn plots that use hue="Class":
# class 0 is drawn in green, class 1 in red.
color_dict = {0: 'green', 1: 'red'}
# Load the CSV (path is relative to the working directory) into a dataframe.
filename = r"creditcard.csv"
df = pd.read_csv(filename)
# Sanity-check the import: preview the first rows, then list each column's dtype.
print(df.head())
print(df.dtypes)
# Cast 'Class' from int64 to a categorical dtype.
# (Direct assignment replaces a one-element loop whose body had lost its
# indentation and therefore could not be parsed.)
df['Class'] = df['Class'].astype('category')
# Scatterplots for a hand-picked sample of feature pairs, coloured by Class,
# to eyeball data balance and class overlap. One figure is shown per pair.
feature_pairs = [
    ("V18", "V26"),
    ("V1", "V19"),
    ("V11", "V25"),
    ("V4", "V12"),
    ("V21", "V14"),
    ("V15", "V25"),
]
for x_feature, y_feature in feature_pairs:
    sn.scatterplot(x=x_feature, y=y_feature, hue="Class", data=df,
                   palette=color_dict)
    plt.show()
# Observations:
#  - Some scatterplots show high class overlap, whereas others show more
#    delineation between the classes.
#  - This suggests that a feature subset can be defined such that the model
#    performs better than with the full dataset.
# Total number of NaN cells across the whole dataframe
# (per-column counts first, then their grand total).
nan_per_column = df.isnull().sum()
print(nan_per_column.sum())
# Basic descriptive statistics of the dataset.
summary = df.describe()
print(summary)
# Observations recorded from this output:
#  - The count of each feature is the same, confirming that there are no
#    missing values.
#  - Max Time is 172,792, i.e. the data covers about two days from the first
#    transaction recorded.
#  - The 50th percentile of Time is 84,692, which indicates that the data is
#    spread fairly evenly across the two days.
#  - Amount ranges from a minimum of $0 to a maximum of 25,691, with an
#    average of $88.
#  - The 75th percentile of Amount is $77 (below the average), which
#    indicates that the field is positively skewed, with a few high-value
#    outliers.
#  - The average Class is 0.001727, which confirms that the dataset is
#    heavily imbalanced.
# NOTE(review): 'Class' is categorical by this point, so describe() may not
# report its mean -- that figure presumably comes from a run made before the
# cast; confirm.
# Distribution overview: one histogram per column.
df.hist()
plt.show()
# Observation: at first glance, all fields seem to be unimodal, with
# different means.

# Drill down into 'Time' and then 'Amount', one histogram panel per Class.
for field in ('Time', 'Amount'):
    df.hist(column=field, by='Class')
    plt.show()
# Observations:
#  - Time: the per-class view is bimodal, which suggests that there are fewer
#    transactions at night. The bimodality is more distinct for
#    non-fraudulent transactions, which indicates a cycle in regular
#    transactions that fraudulent transactions may not follow.
#  - Amount: high positive skew with a few extreme values, as the percentile
#    distribution also indicated.

# Amount by Class once more, as a boxplot that hides the points beyond the
# whiskers (outliers) so the bulk of the distribution is readable.
df.boxplot(column='Amount', by="Class", showfliers=False)
plt.show()
# Draw the pairwise scatter-plot matrix for the dataframe and display it.
scatter_axes = scatter_matrix(frame=df)
plt.show()
# Pairwise correlation matrix of the dataframe, rounded to 4 decimal places.
# NOTE(review): 'Class' is cast to category earlier in the script; whether
# corr() includes, drops, or rejects that column depends on the pandas
# version -- confirm against the version in use.
corr = df.corr().round(4)
# Dump the matrix to a text file. The `with` block guarantees the handle is
# flushed and closed, where the previous inline open() left it dangling.
with open("corr_output.txt", "w") as corr_file:
    print(corr, file=corr_file)
# Visualise the correlation matrix as a heatmap without per-cell annotations.
sn.heatmap(corr, annot=False)
plt.show()