-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
98 lines (71 loc) · 2.93 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import sklearn
import warnings
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
warnings.filterwarnings("ignore")
# Load the balanced training split and the held-out test split, pool them,
# and re-split 80/20 below with train_test_split.
train_df = pd.read_csv('~/Desktop/Projects/Disease Prediction Model/MultipleDiseaseDataset/Blood_samples_dataset_balanced_2(f).csv')
test_df = pd.read_csv('~/Desktop/Projects/Disease Prediction Model/MultipleDiseaseDataset/blood_samples_dataset_test.csv')
le = LabelEncoder()
sc = StandardScaler()
df = pd.concat([train_df, test_df], ignore_index=True)
# Encode the string disease labels to integer class ids once, in place.
df['Disease'] = le.fit_transform(df['Disease'])
# Preprocessing Work
print(df)
df.info()
print(df.isnull())
print(df.isnull().sum())
# First 24 columns are the blood-sample features; column 24 is the target,
# already integer-encoded above. Taking it as a 1-D array avoids the
# previous redundant second fit_transform on a 2-D column slice (which
# LabelEncoder does not accept cleanly and which the warnings filter hid).
X = df.iloc[:, :24]
y = df.iloc[:, 24].to_numpy()
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 1)
# Fit the scaler on the training fold only, then apply that SAME fitted
# transformation to the test fold. Re-fitting on Xtest (the old code) leaks
# test-set statistics and scales the two folds inconsistently.
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)
print("\n")
# K Nearest Neighbors
# Conventional heuristic for k: the square root of the training-set size.
# Kept for reference, but trial and error showed k=2 scores higher, so the
# classifier below uses 2 instead.
k = round(math.sqrt(len(ytrain)))
knn = KNeighborsClassifier(n_neighbors=2)
knn_pred = knn.fit(Xtrain, ytrain).predict(Xtest)
knn_accuracy = 100 * metrics.accuracy_score(ytest, knn_pred)
print('K Nearest Neighbors Algorithm Accuracy Metric:', knn_accuracy, '%')
#sns.pairplot(data = df, hue = 'Disease', palette = 'Set2')
#plt.show()
# Support Vector Machine
# The RBF kernel yields a higher accuracy score here than the linear kernel.
svm = SVC(kernel="rbf", random_state=0)
svm_pred = svm.fit(Xtrain, ytrain).predict(Xtest)
svm_accuracy = 100 * metrics.accuracy_score(ytest, svm_pred)
print('Support Vector Machine Algorithm Accuracy Metric:', svm_accuracy, '%')
# Logistic Regression (default hyperparameters)
lreg = LogisticRegression()
lreg_pred = lreg.fit(Xtrain, ytrain).predict(Xtest)
lreg_accuracy = 100 * metrics.accuracy_score(ytest, lreg_pred)
print('Logistic Regression Algorithm Accuracy Metric:', lreg_accuracy, '%')
# Gaussian Naive Bayes (default hyperparameters)
gnb = GaussianNB()
gnb_pred = gnb.fit(Xtrain, ytrain).predict(Xtest)
gnb_accuracy = 100 * metrics.accuracy_score(ytest, gnb_pred)
print('Gaussian Naive Bayes Algorithm Accuracy Metric:', gnb_accuracy, '%')
# Random Forest Classifier
# NOTE(review): hyperparameters (n_estimators=12, entropy criterion) were
# marked "Fix later" by the author — revisit/tune; kept as-is here.
rfc = RandomForestClassifier(n_estimators=12, criterion="entropy", random_state=3)
rfc_pred = rfc.fit(Xtrain, ytrain).predict(Xtest)
rfc_accuracy = 100 * metrics.accuracy_score(ytest, rfc_pred)
print('Random Forest Classifier Algorithm Accuracy Metric:', rfc_accuracy, '%')