clustering.py
# Copyright (c) 2021 Shivangi0503
#
# This file is licensed under MIT License
# See https://github.com/Shivangi0503/Wine_Clustering_KMeans/blob/main/LICENSE for more information.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans


def pca_embeddings(df_scaled):
    """Reduce the dimensionality of the wine dataset with Principal Component
    Analysis (PCA), from the original 13 features down to 2 components.

    :param df_scaled: standardized feature DataFrame
    :return: the 2-D PCA projection and the fitted PCA object (used later for
        plotting and for transforming the cluster centroids)
    """
    pca_2 = PCA(n_components=2)
    pca_2_result = pca_2.fit_transform(df_scaled)
    print('Explained variation per principal component: {}'.format(pca_2.explained_variance_ratio_))
    print('Cumulative variance explained by 2 principal components: {:.2%}'.format(
        np.sum(pca_2.explained_variance_ratio_)))

    # Absolute loadings of every feature on the two components (from pca_2.components_)
    dataset_pca = pd.DataFrame(abs(pca_2.components_), columns=df_scaled.columns, index=['PC_1', 'PC_2'])
    print('\n\n', dataset_pca)

    # Features with an absolute loading above 0.3 are reported as the most important ones.
    print("\n*************** Most important features *************************")
    print('As per PC 1:\n', (dataset_pca[dataset_pca > 0.3].iloc[0]).dropna())
    print('\n\nAs per PC 2:\n', (dataset_pca[dataset_pca > 0.3].iloc[1]).dropna())
    print("\n******************************************************************")
    return pca_2_result, pca_2
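

# A minimal usage sketch added for illustration: the helper below, its name,
# and the CSV path 'wine-clustering.csv' are assumptions, not part of the
# original file. It shows the preprocessing pca_embeddings() expects, since
# PCA directions are scale-sensitive and the function reads df_scaled.columns.
def load_and_scale(csv_path='wine-clustering.csv'):
    """Read the raw wine features and return a standardized DataFrame."""
    df = pd.read_csv(csv_path)
    # Keep the DataFrame wrapper so the column names survive scaling and can
    # be used to label the PCA loadings.
    return pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)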
def kmean_hyper_param_tuning(data):
    """Tune the n_clusters hyperparameter of KMeans by selecting the candidate
    with the highest silhouette score.

    :param data: dimensionality-reduced data after applying PCA
    :return: best number of clusters for the model (used for KMeans n_clusters)
    """
    # candidate values for the number of clusters
    parameters = [2, 3, 4, 5]

    # instantiating ParameterGrid, passing the candidate cluster counts as input
    parameter_grid = ParameterGrid({'n_clusters': parameters})

    best_score = -1
    best_grid = None
    kmeans_model = KMeans()  # instantiating the KMeans model
    silhouette_scores = []

    # evaluation based on silhouette_score
    for p in parameter_grid:
        kmeans_model.set_params(**p)  # set the current hyperparameter
        kmeans_model.fit(data)  # fit the model; this finds clusters for parameter p

        ss = metrics.silhouette_score(data, kmeans_model.labels_)  # calculate silhouette score
        silhouette_scores += [ss]  # store all the scores
        print('Parameter:', p, 'Score:', ss)

        # keep the parameter setting with the best score
        if ss > best_score:
            best_score = ss
            best_grid = p

    # plotting the silhouette scores
    plt.bar(range(len(silhouette_scores)), list(silhouette_scores), align='center', color='#722f59', width=0.5)
    plt.xticks(range(len(silhouette_scores)), list(parameters))
    plt.title('Silhouette Score', fontweight='bold')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.show()

    return best_grid['n_clusters']
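

# A hedged helper, added for illustration (the original file may wire this up
# differently): refit KMeans with the tuned cluster count on the scaled
# features, then project the centroids into the 2-D PCA plane so that
# visualizing_results() can draw them next to the points.
def fit_final_kmeans(df_scaled, n_clusters, pca_2):
    """Fit the final model and return (labels, centroids in PCA coordinates)."""
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(df_scaled)
    # cluster_centers_ lives in the original (scaled) feature space; the
    # fitted PCA object maps it onto the same plane as the PCA projection.
    centroids_pca = pca_2.transform(kmeans.cluster_centers_)
    return kmeans.labels_, centroids_pca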
def visualizing_results(pca_result, label, centroids_pca):
    """Visualize the clusters in the 2-D PCA plane.

    :param pca_result: PCA-projected data
    :param label: KMeans cluster labels
    :param centroids_pca: KMeans centroids transformed into PCA coordinates
    """
    TYPES = ['INFJ', 'INTJ', 'ENTJ', 'INFP', 'INTP', 'ENFP', 'ENTP', 'ENFJ',
             'ISFJ', 'ISTJ', 'ISTP', 'ISFP', 'ESFJ', 'ESTJ', 'ESTP', 'ESFP']

    # ------------------ Using Matplotlib for plotting -----------------------
    x = pca_result[:, 0]
    y = pca_result[:, 1]

    _, ax = plt.subplots()
    ax.scatter(x, y, c=label, alpha=0.5, s=200)

    # Mark the cluster centroids; without this the centroids_pca parameter
    # would go unused.
    ax.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', s=150, c='red')

    # Annotate the first len(TYPES) points with the personality-type labels
    # (the slice guards against pca_result having fewer rows than TYPES).
    for i, txt in enumerate(TYPES[:len(x)]):
        ax.annotate(txt, (x[i], y[i]))

    plt.title('Personality clusters')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()
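

# A minimal end-to-end driver, added as a sketch of how the pieces above fit
# together; the step ordering mirrors the docstrings, but the exact wiring
# (and the helpers load_and_scale / fit_final_kmeans) is an assumption.
def main():
    # 1-2. Load the raw wine features and standardize them.
    df_scaled = load_and_scale()

    # 3. Reduce to two principal components for tuning and plotting.
    pca_result, pca_2 = pca_embeddings(df_scaled)

    # 4. Pick n_clusters by silhouette score on the PCA-reduced data.
    best_k = kmean_hyper_param_tuning(pca_result)
    print('Optimum number of clusters:', best_k)

    # 5. Fit the final model and visualize the clusters with their centroids.
    labels, centroids_pca = fit_final_kmeans(df_scaled, best_k, pca_2)
    visualizing_results(pca_result, labels, centroids_pca)


if __name__ == '__main__':
    main()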