# nba_classification.py
# -*- coding: utf-8 -*-
"""P2.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1EpNcHvSjV2pjfe2b8ARWaPmJP3HTqg8o
"""
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
nba_data = pd.read_csv('nba_stats.csv')
# missing_data = nba_data.isnull().sum()
# print("Missing values in each column:\n", missing_data)
# One-hot encode the position labels so they can enter the correlation matrix
pos_encoded = pd.get_dummies(nba_data['Pos'], prefix='Pos')
data_encoded = pd.concat([nba_data.drop('Pos', axis=1), pos_encoded], axis=1)
# numeric_only guards against any non-numeric columns (e.g. player names) in the CSV
correlation_matrix = data_encoded.corr(numeric_only=True)
position_columns = pos_encoded.columns
# Exploratory: list the features most correlated with each position
for position in position_columns:
    sorted_correlations = correlation_matrix[position].abs().sort_values(ascending=False)
    print(f"\nFeatures most correlated with {position}:\n{sorted_correlations.head()}")
min_corr_threshold = 0.1
# Calculating the maximum correlation of each feature with any position column
max_corr_positions = correlation_matrix.drop(pos_encoded.columns, axis=0)[pos_encoded.columns].abs().max(axis=1)
features_selected = max_corr_positions[max_corr_positions > min_corr_threshold].index.tolist()
X = data_encoded[features_selected]
y = nba_data['Pos']
print("Selected features and their max correlation with positions:")
print(max_corr_positions[features_selected])
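
# Sketch of an alternative filter (not part of the original pipeline): scikit-learn's
# ANOVA F-test can rank the same candidate features; k=10 is an illustrative choice,
# and this assumes the numeric columns contain no missing values.
from sklearn.feature_selection import SelectKBest, f_classif
candidates = data_encoded.drop(columns=pos_encoded.columns).select_dtypes(include='number')
selector = SelectKBest(f_classif, k=min(10, candidates.shape[1])).fit(candidates, y)
print("SelectKBest alternative:", list(candidates.columns[selector.get_support()]))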
# splitting the dataset (train_test_split is already imported above)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# scaling the features: fit on the training split only, then apply to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# model training: RBF-kernel SVM
svm = SVC(kernel='rbf', C=1, random_state=0).fit(X_train_scaled, y_train)
# making predictions on the training and test sets
y_train_pred = svm.predict(X_train_scaled)
y_test_pred = svm.predict(X_test_scaled)
# getting the accuracies on the training and validation sets
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Training set accuracy: {train_accuracy:.10f}")
print(f"Validation set accuracy: {test_accuracy:.10f}")
# Creating a confusion matrix using pd.crosstab
conf_mat_train = pd.crosstab(index=y_train, columns=y_train_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print("\nConfusion Matrix for the Training Set:\n")
print(conf_mat_train)
conf_mat_test = pd.crosstab(index=y_test, columns=y_test_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print("\nConfusion Matrix for the Validation Set:\n")
print(conf_mat_test)
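
# Per-class precision/recall/F1 for the validation split, using the
# classification_report import above (a small readability addition).
print("\nClassification Report for the Validation Set:\n")
print(classification_report(y_test, y_test_pred))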
"""## **Part 2**
"""
# testing on dummy_test.csv
dummy_test_set = pd.read_csv('dummy_test.csv')
# use the same features used in the model training
X_dummy = dummy_test_set[features_selected]
y_dummy = dummy_test_set['Pos'] # get actual labels from test data
# scaling
X_dummy_scaled = scaler.transform(X_dummy)
y_dummy_pred = svm.predict(X_dummy_scaled) # predicting the target variable using the trained SVM model
# Calculate accuracy for the dummy test set
accuracy_dummy = accuracy_score(y_dummy, y_dummy_pred)
print("Accuracy on dummy test set:", accuracy_dummy)
# Create a confusion matrix for the dummy test set
conf_mat_dummy = pd.crosstab(index=y_dummy, columns=y_dummy_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
print("\nConfusion Matrix for the Dummy Test Set:\n")
print(conf_mat_dummy)
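
# Minimal visualization sketch using the seaborn/matplotlib imports above;
# the figure size and colormap are illustrative choices, not from the original run.
labels = sorted(y_dummy.unique())
cm = confusion_matrix(y_dummy, y_dummy_pred, labels=labels)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix: Dummy Test Set')
plt.show()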
# for index, (value1, value2) in enumerate(zip(y_dummy, y_dummy_pred)):
# print(index, value1 + ' '+ value2)
"""## **Part 3**"""
# Pipeline so the scaler is refit inside each fold; a fresh SVC with the same
# settings is used here (cross_val_score clones estimators before fitting anyway)
method = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=1, random_state=0))
])
# stratified K-Fold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# cross validation
cv_scores = cross_val_score(method, X, y, cv=kfold, scoring='accuracy')
# accuracy of each fold
print("Accuracies of each fold:")
for index, score in enumerate(cv_scores, 1):
    print(f"Fold {index}: {score:.10f}")
# accuracy across all folds
print(f"\nAverage accuracy across all folds: {np.mean(cv_scores):.10f}")