-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.py
111 lines (91 loc) · 4.37 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
def train_knn(x_train, y_train, cols):
    """Fit a k-nearest-neighbours classifier on the training data.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels; must be a 1-dimensional vector.
        cols: Collection of column titles. Only its length is used here: it
            sets the number of neighbours (k). The features in ``x_train``
            are NOT subset by these titles inside this function.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    # NOTE(review): k is tied to the number of column titles -- confirm this
    # is the intended choice of neighbourhood size.
    neighbour_count = len(cols)
    classifier = KNeighborsClassifier(n_neighbors=neighbour_count)
    classifier.fit(x_train, y_train)
    return classifier
def get_accuracy(x_test, y_test, model):
    """Score ``model`` on the test set with scikit-learn's ``accuracy_score``.

    Args:
        x_test: Test feature matrix passed to ``model.predict``.
        y_test: True labels for ``x_test``.
        model: Any fitted object exposing a ``predict`` method.

    Returns:
        The value returned by ``accuracy_score(y_test, predictions)``.
    """
    from sklearn.metrics import accuracy_score

    predictions = model.predict(x_test)
    return accuracy_score(y_test, predictions)
def get_accuracy_rounded(x_test, y_test, model):
    """Score ``model`` on the test set after rounding its predictions.

    Same as ``accuracy_score(y_test, model.predict(x_test))`` except that
    ``.round()`` is applied to the predictions first (presumably so that
    continuous outputs can be compared against discrete labels -- verify
    against the callers).

    Args:
        x_test: Test feature matrix passed to ``model.predict``.
        y_test: True labels for ``x_test``.
        model: Any fitted object whose ``predict`` result supports ``.round()``.

    Returns:
        The value returned by ``accuracy_score`` on the rounded predictions.
    """
    from sklearn.metrics import accuracy_score

    rounded_predictions = model.predict(x_test).round()
    return accuracy_score(y_test, rounded_predictions)
def scaling(x_train, x_test):
    """Min-max scale the train and test features with one shared scaler.

    The scaler learns each feature's minimum and maximum from ``x_train``
    only; the same mapping is then applied to ``x_test``. Training values
    therefore land in [0, 1], while test values may fall slightly outside
    that range when they exceed the range seen during training.

    Args:
        x_train: Training feature matrix (used to fit the scaler).
        x_test: Test feature matrix (only transformed, never fitted on).

    Returns:
        Tuple ``(scaled_train, scaled_test)``.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(x_train)
    # Re-fitting on the test set would rescale it with its own min/max and
    # make train and test features incomparable, so only transform here.
    scaled_test = scaler.transform(x_test)
    return scaled_train, scaled_test
def standardising(x_train, x_test):
    """Standardise the train and test features (z-score normalisation).

    Each feature is rescaled as ``(x - mu) / sigma``, where ``mu`` is the
    mean and ``sigma`` the standard deviation of that feature in
    ``x_train``. The training output therefore has mean 0 and standard
    deviation 1 per feature; the test set is mapped with the same training
    statistics so both sets live on the same scale.

    Args:
        x_train: Training feature matrix (used to estimate mu and sigma).
        x_test: Test feature matrix (transformed with the training mu/sigma).

    Returns:
        Tuple ``(standardised_train, standardised_test)``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    standardised_train = scaler.fit_transform(x_train)
    # Standardising the test set with its own statistics would shift it
    # relative to the training data, so reuse the fitted scaler.
    standardised_test = scaler.transform(x_test)
    return standardised_train, standardised_test
def train_linear_regression(x_train, y_train):
    """Fit an ordinary ``LinearRegression`` with default settings.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    from sklearn.linear_model import LinearRegression

    # ``fit`` returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(x_train, y_train)
def train_logistic_regression(x_train, y_train):
    """Fit an L2-penalised ``LogisticRegression`` with ``C=0.01``.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels; must be a 1-dimensional vector.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(C=0.01, penalty='l2')
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(x_train, y_train)
def train_svm_linear_svc(x_train, y_train):
    """Fit a support-vector classifier with a linear kernel and ``C=0.01``.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``SVC`` estimator.
    """
    from sklearn.svm import SVC

    classifier = SVC(C=0.01, kernel='linear')
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(x_train, y_train)
def encoding_categorical(x_train, x_test):
    """Label-encode every object-dtype column of the train and test frames.

    For each column of ``x_test`` whose dtype is ``object``, a
    ``LabelEncoder`` is fitted on the values of that column from BOTH frames
    (so every level seen in either set gets a code) and the column is
    replaced by its integer codes in both outputs. Other columns are passed
    through unchanged.

    The inputs are not modified: encoded copies are returned.

    Args:
        x_train: Training DataFrame; must contain every object column of
            ``x_test``.
        x_test: Test DataFrame; its columns decide what gets encoded.

    Returns:
        Tuple ``(encoded_train, encoded_test)`` of new DataFrames.
    """
    from sklearn.preprocessing import LabelEncoder

    # Copy first: assigning into the originals would silently rewrite the
    # callers' data frames.
    encoded_train = x_train.copy()
    encoded_test = x_test.copy()

    for col in x_test.columns.values:
        # Encoding only categorical variables
        if not x_test[col].dtypes == 'object':
            continue
        # Use the whole data to form an exhaustive list of levels.
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        all_values = pd.concat([x_train[col], x_test[col]])
        encoder = LabelEncoder()
        encoder.fit(all_values.values)
        encoded_train[col] = encoder.transform(x_train[col])
        encoded_test[col] = encoder.transform(x_test[col])

    return encoded_train, encoded_test
def one_hot_encoding_categorical(x_train, x_test):
    """Append one-hot indicator columns for every object-dtype column.

    For each column of ``x_test`` whose dtype is ``object``, a
    ``OneHotEncoder`` is fitted on that column's values from BOTH frames (so
    every level seen in either set gets an indicator) and one new column per
    level, named ``"<column>_<level>"``, is concatenated to the right of each
    output frame. The original categorical columns are kept in the output,
    and the input frames are not modified.

    Args:
        x_train: Training DataFrame; must contain every object column of
            ``x_test``.
        x_test: Test DataFrame; its columns decide what gets encoded.

    Returns:
        Tuple ``(train_with_indicators, test_with_indicators)``.
    """
    from sklearn.preprocessing import OneHotEncoder

    # Newer scikit-learn renamed ``sparse`` to ``sparse_output``; fall back
    # to the old keyword on releases that do not know the new one.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    train_out = x_train
    test_out = x_test
    for col in x_test.columns.values:
        # Encoding only categorical variables
        if not x_test[col].dtypes == 'object':
            continue

        # Exhaustive list of levels across both sets. pd.concat replaces
        # DataFrame.append, which pandas 2.0 removed.
        all_values = pd.concat([x_train[[col]], x_test[[col]]])
        encoder.fit(all_values)

        # Name the indicator columns from the encoder's own category order.
        # (Frequency order from value_counts does not match the order of the
        # encoder's output columns and would mislabel them.)
        indicator_names = [
            "{}_{}".format(col, level) for level in encoder.categories_[0]
        ]

        # Reuse each frame's index so the side-by-side concat aligns rows.
        train_indicators = pd.DataFrame(
            encoder.transform(x_train[[col]]),
            columns=indicator_names,
            index=x_train.index,
        )
        train_out = pd.concat([train_out, train_indicators], axis=1)

        test_indicators = pd.DataFrame(
            encoder.transform(x_test[[col]]),
            columns=indicator_names,
            index=x_test.index,
        )
        test_out = pd.concat([test_out, test_indicators], axis=1)

    return train_out, test_out