clustering in python.txt

>>Hierarchy

from scipy.cluster.hierarchy import linkage, fcluster
from matplotlib import pyplot as plt
import seaborn as sns, pandas as pd

# Fifteen sample points, kept as parallel lists of x and y values.
x_coordinates = [
    89.1, 93.1, 86.6, 98.5, 86.4, 9.5, 15.2, 3.4,
    18.4, 20.3, 44.2, 56.8, 49.2, 62.5, 44.8,
]
y_coordinates = [
    87.2, 96.1, 95.6, 92.4, 92.4, 57.7, 49.4,
    47.3, 59.1, 55.5, 25.6, 2.1, 18.9, 24.1, 18.3,
]

df = pd.DataFrame(dict(x_coordinates=x_coordinates, y_coordinates=y_coordinates))

# Build the Ward linkage matrix, then cut the hierarchy into at most
# three flat clusters and store the label of each point.
Z = linkage(df, method='ward')
df['cluster_labels'] = fcluster(Z, t=3, criterion='maxclust')

# Draw the points, coloured by the cluster each one was assigned to.
sns.scatterplot(
    data=df,
    x='x_coordinates',
    y='y_coordinates',
    hue='cluster_labels',
)
plt.show()

>>K Means

from scipy.cluster.vq import kmeans, vq
from matplotlib import pyplot as plt
import seaborn as sns, pandas as pd

# scipy's kmeans picks its starting centroids with NumPy's random generator,
# so the seed must be set on numpy.random. Seeding the stdlib `random` module
# does not influence the result, and random.seed(1000, 2000) would pass 2000
# as the `version` argument rather than as part of the seed.
from numpy import random
random.seed([1000, 2000])

# Fifteen sample points, kept as parallel lists of x and y values.
x_coordinates = [89.1, 93.1, 86.6, 98.5, 86.4, 9.5, 15.2, 3.4,
                 18.4, 20.3, 44.2, 56.8, 49.2, 62.5, 44.8]

y_coordinates = [87.2, 96.1, 95.6, 92.4, 92.4, 57.7, 49.4,
                 47.3, 59.1, 55.5, 25.6, 2.1, 18.9, 24.1, 18.3]

df = pd.DataFrame({'x_coordinates': x_coordinates,
                   'y_coordinates': y_coordinates})

# kmeans returns (cluster centers, distortion); only the centers are needed.
centroids, _ = kmeans(df, 3)
# vq returns (labels, per-point distances). The right-hand side is evaluated
# before the new column is added, so df still holds only the two coordinates.
df['cluster_labels'], _ = vq(df, centroids)

# Scatter plot of the sample points with one colour per k-means label.
sns.scatterplot(
    hue='cluster_labels',
    x='x_coordinates',
    y='y_coordinates',
    data=df,
)
plt.show()
 

>>Preparing data for clustering

1) variables have incomparable units
2) variables with the same units have vastly different scales and variances

normalization is rescaling data to standard deviation of 1
x_new=x/std_dev(x)
whiten is used to normalize an np.ndarray


from scipy.cluster.vq import whiten
from matplotlib import pyplot as plt

data = [5, 1, 3, 3, 2, 3, 3, 8, 1, 2, 2, 3, 5]

# whiten rescales the observations by dividing them by their standard deviation.
scaled_data = whiten(data)
print(scaled_data)

# Plot the raw and the rescaled series on the same axes.
plt.plot(data, label='original')
plt.plot(scaled_data, label='scaled')
# The labels above are only drawn once a legend is requested, and the figure
# is only rendered by plt.show().
plt.legend()
plt.show()


>>>>>

# Add a whitened copy of the wage and value columns
for raw_column, scaled_column in (('eur_wage', 'scaled_wage'),
                                  ('eur_value', 'scaled_value')):
    fifa[scaled_column] = whiten(fifa[raw_column])

# Scatter plot of the two scaled columns against each other
fifa.plot(kind='scatter', x='scaled_wage', y='scaled_value')
plt.show()

# Summary statistics of the scaled columns
scaled_summary = fifa[['scaled_wage', 'scaled_value']].describe()
print(scaled_summary)

>>>>hierarchical clustering

scipy.cluster.hierarchy.linkage(observations, method='single', metric='euclidean', optimal_ordering=False)

method=how to calculate the proximity of clusters
metric=distance metric
optimal_ordering = order data points

method=single (two closest data objects)
complete (based on two farthest objects)
average (based on arithmetic mean of all objects)
centroid ( based on the geometric mean of all objects)
median (based on the median of all objects)
ward (based on the sum of squares)

from scipy.cluster.hierarchy import linkage, fcluster
from matplotlib import pyplot as plt
import seaborn as sns, pandas as pd

# Sample data: fifteen (x, y) points stored column-wise.
x_coordinates = [89.1, 93.1, 86.6, 98.5, 86.4,
                 9.5, 15.2, 3.4, 18.4, 20.3,
                 44.2, 56.8, 49.2, 62.5, 44.8]
y_coordinates = [87.2, 96.1, 95.6, 92.4, 92.4,
                 57.7, 49.4, 47.3, 59.1, 55.5,
                 25.6, 2.1, 18.9, 24.1, 18.3]

df = pd.DataFrame({
    'x_coordinates': x_coordinates,
    'y_coordinates': y_coordinates,
})

# linkage() returns the linkage matrix describing every merge step.
Z = linkage(df, 'ward')

# fcluster arguments: the linkage matrix, the threshold (here the maximum
# number of clusters) and the criterion that says how to read the threshold.
df['cluster_labels'] = fcluster(Z, 3, criterion='maxclust')

sns.scatterplot(data=df, x='x_coordinates', y='y_coordinates',
                hue='cluster_labels')
plt.show()


>>>>>>

# Import the fcluster and linkage functions
from scipy.cluster.hierarchy import fcluster, linkage

# Ward linkage (Euclidean metric) on the two scaled coordinate columns
scaled_columns = ['x_scaled', 'y_scaled']
distance_matrix = linkage(comic_con[scaled_columns],
                          metric='euclidean', method='ward')

# Cut the hierarchy into at most two clusters and store the labels
comic_con['cluster_labels'] = fcluster(distance_matrix, 2,
                                       criterion='maxclust')

# Scatter plot coloured by cluster label
sns.scatterplot(data=comic_con, x='x_scaled', y='y_scaled',
                hue='cluster_labels')
plt.show()


>>>> visualizing clusters

spot trends in data

from matplotlib import pyplot as plt

# Five labelled points: three in group 'A', two in group 'B'.
df = pd.DataFrame({
    'x': [2, 3, 5, 6, 2],
    'y': [1, 1, 5, 5, 2],
    'labels': ['A', 'A', 'B', 'B', 'A'],
})

# Colour used for each label value.
colors = {'A': 'red', 'B': 'blue'}

# Translate every row's label into its colour, then plot with pandas.
df.plot.scatter(
    x='x',
    y='y',
    c=df['labels'].apply(lambda label: colors[label]),
)
plt.show()

>>seaborn

from matplotlib import pyplot as plt
import seaborn as sns

# Same five labelled points; seaborn picks one colour per label via `hue`.
df = pd.DataFrame(dict(
    x=[2, 3, 5, 6, 2],
    y=[1, 1, 5, 5, 2],
    labels=['A', 'A', 'B', 'B', 'A'],
))

sns.scatterplot(data=df, x='x', y='y', hue='labels')
plt.show()

>>
# Import the seaborn module
import seaborn as sns

# Scatter plot of the scaled coordinates, one colour per cluster label
sns.scatterplot(data=comic_con, x='x_scaled', y='y_scaled',
                hue='cluster_labels')
plt.show()

>>Dendrograms

from scipy.cluster.hierarchy import dendrogram

# Ward linkage (Euclidean metric) over the two whitened columns.
whitened_columns = ['x_whiten', 'y_whiten']
Z = linkage(df[whitened_columns], metric='euclidean', method='ward')

# Draw the merge tree described by the linkage matrix.
dn = dendrogram(Z)
plt.show()

1). in the dendrogram the y represents the distance to the centroid of the cluster
2). the vertical lines represents the distance between the two child clusters
3) draw a horizontal line and the number of intersections tell you the number of clusters at that stage.

>>measuring speed in hierarchical clustering

measure the speed of .linkage() method
use randomly generated points

from scipy.cluster.hierarchy import linkage
import pandas as pd
import random, timeit

points = 100
# Each column is a random permutation of the integers 0 .. points-1.
df = pd.DataFrame({
    'x': random.sample(range(0, points), points),
    'y': random.sample(range(0, points), points)})

# `%timeit` is an IPython magic and a syntax error in a plain Python script.
# The timeit module imported above performs the same measurement anywhere.
runs = 20
elapsed = timeit.timeit(
    lambda: linkage(df[['x', 'y']], method='ward', metric='euclidean'),
    number=runs,
)
print('{:.3f} ms per linkage() call (mean of {} runs)'.format(
    elapsed / runs * 1e3, runs))


>>>


# Ward linkage on the two scaled skill columns
skill_columns = ['scaled_sliding_tackle', 'scaled_aggression']
distance_matrix = linkage(fifa[skill_columns], method='ward')

# Cut the hierarchy into at most three clusters, one label per row
fifa['cluster_labels'] = fcluster(distance_matrix, 3, criterion='maxclust')

# Mean of each scaled column within every cluster
cluster_means = fifa[['scaled_sliding_tackle', 'scaled_aggression',
                      'cluster_labels']].groupby('cluster_labels').mean()
print(cluster_means)

# Seaborn scatter plot coloured by cluster label
sns.scatterplot(data=fifa, x='scaled_sliding_tackle', y='scaled_aggression',
                hue='cluster_labels')
plt.show()

>>>KMeans clustering

K means runs significantly faster on large datasets

1) generate cluster centers
2) generate labels

kmeans(obs, k_or_guess, iter, thresh, check_finite)

obs: whiten standardized observations
k_or_guess: number of clusters
iter: number of iterations the default is 20
thresh: threshold (10^-5)
check_finite: whether to check if observations contain only finite number (default is true)

kmeans returns cluster centers and distortion

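the distortion returned by scipy's kmeans is the mean (non-squared) Euclidean distance between the observations and their nearest cluster centers (the textbook k-means distortion is the sum of squared distances)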

use the vq to generate cluster labels

vq(obs, code_book, check_finite=True)

obs: standardize observations
code_book: cluster centers
check_finite: whether to check if observations contain only finite numbers where the default is True

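vq returns the cluster label of each observation and a list of distortions (the distance from each observation to its nearest cluster center)

The mean of the vq distortions should approximately equal the kmeans distortion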


from scipy.cluster.vq import kmeans, vq

# Column name fixed from 'scale_x' to 'scaled_x' so the clustering uses the
# same columns that are plotted below.
cluster_centers, distortion = kmeans(df[['scaled_x', 'scaled_y']], 3)

# vq returns two values (labels and per-point distances); unpacking them into
# a single target raised "too many values to unpack".
df['cluster_labels'], _ = vq(df[['scaled_x', 'scaled_y']], cluster_centers)

sns.scatterplot(x='scaled_x', y='scaled_y', hue='cluster_labels', data=df)
plt.show()


>>

from scipy.cluster.vq import kmeans, vq

# Two cluster centers computed from the scaled coordinates
scaled_columns = ['x_scaled', 'y_scaled']
cluster_centers, distortion = kmeans(comic_con[scaled_columns], 2)

# Label every row with its nearest center; keep the per-row distances too
labels, distortion_list = vq(comic_con[scaled_columns], cluster_centers)
comic_con['cluster_labels'] = labels

# Scatter plot coloured by cluster label
sns.scatterplot(data=comic_con, x='x_scaled', y='y_scaled',
                hue='cluster_labels')
plt.show()

>>how to find the right k
>>elbow method
>>distortion decreases with an increasing number of clusters



from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt

# Sum of squared errors (the model's inertia_) for every candidate k.
sse = {}

for k in range(1, 11):
    # Named `model` rather than `kmeans` so that the scipy.cluster.vq.kmeans
    # function used by the other snippets in these notes is not shadowed.
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(data_normalized)
    sse[k] = model.inertia_


plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('sse')
# The dictionary is `sse`; the original `see.keys()` raised a NameError.
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()

num_clusters = range(1, 11)

# One distortion value from scipy's kmeans for every candidate k.
distortions = []
for k in num_clusters:
    centroids, distortion = kmeans(df[['scaled_x', 'scaled_y']], k)
    distortions.append(distortion)

elbow_plot_data = pd.DataFrame({'num_clusters': num_clusters,
                                'distortions': distortions})

# The original call was missing the comma between the x and y arguments.
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot_data)
plt.show()

>>

distortions = []
num_clusters = range(2, 7)

# Create a list of distortions from the kmeans function
for i in num_clusters:
    cluster_centers, distortion = kmeans(
        uniform_data[['x_scaled', 'y_scaled']], i)

    # Append the scalar returned by kmeans; the original appended the list
    # `distortions` to itself.
    distortions.append(distortion)

# Create a data frame with two lists - number of clusters and distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Create a line plot of num_clusters and distortions
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot)
plt.xticks(num_clusters)
plt.show()

>>Limitations

impact of seeds
bias towards equal sized clusters

from numpy import random

random.seed(12)

1) clustering is exploratory phase of analysis

>>
# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Three cluster centers computed from the scaled coordinates
scaled_columns = ['x_scaled', 'y_scaled']
cluster_centers, distortion = kmeans(mouse[scaled_columns], 3)

# Label every row with its nearest center; keep the per-row distances too
labels, distortion_list = vq(mouse[scaled_columns], cluster_centers)
mouse['cluster_labels'] = labels

# Scatter plot coloured by cluster label
sns.scatterplot(data=mouse, x='x_scaled', y='y_scaled',
                hue='cluster_labels')
plt.show()

>>

# Seed numpy's random module before clustering
random.seed([1000, 2000])

# Three k-means centers from the two scaled columns (distortion is discarded)
def_phy_columns = ['scaled_def', 'scaled_phy']
cluster_centers, _ = kmeans(fifa[def_phy_columns], 3)

# Label every row with its nearest center
labels, _ = vq(fifa[def_phy_columns], cluster_centers)
fifa['cluster_labels'] = labels

# Per-cluster mean of the two scaled columns
cluster_means = fifa[['scaled_def', 'scaled_phy',
                      'cluster_labels']].groupby('cluster_labels').mean()
print(cluster_means)

sns.scatterplot(data=fifa, x='scaled_def', y='scaled_phy',
                hue='cluster_labels')
plt.show()

>>>>clustering on images

1. converting image to pixels using matplotlib.image.imread
2. display colors of cluster centers using matplotlib.pyplot.imshow


import matplotlib.image as img
from scipy.cluster.vq import kmeans, vq, whiten
import seaborn as sns, pandas as pd
from matplotlib import pyplot as plt

import matplotlib.image as img
from scipy.cluster.vq import kmeans, vq, whiten
import seaborn as sns, pandas as pd
from matplotlib import pyplot as plt

image = img.imread('construction_gear.png')
image.shape

# Split every pixel into its three colour channels.
# The unpacking below expects exactly three values per pixel.
r = []
g = []
b = []

for row in image:
    for pixel in row:
        temp_r, temp_g, temp_b = pixel
        r.append(temp_r)
        g.append(temp_g)
        b.append(temp_b)

# Channels are kept in red, green, blue order everywhere below, so that each
# component of a cluster center is un-scaled with the standard deviation of
# its own channel and the resulting tuples are true (R, G, B) colours.
pixels = pd.DataFrame({
    'red': r,
    'green': g,
    'blue': b
})

pixels['scaled_red'] = whiten(pixels['red'])
pixels['scaled_green'] = whiten(pixels['green'])
pixels['scaled_blue'] = whiten(pixels['blue'])

scaled_channels = ['scaled_red', 'scaled_green', 'scaled_blue']

# Elbow plot: kmeans distortion for 1 to 5 clusters.
num_clusters = range(1, 6)

distortions = []
for k in num_clusters:
    centroids, distortion = kmeans(pixels[scaled_channels], k)
    distortions.append(distortion)

elbow_plot_data = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot_data)
plt.show()

# Final clustering with three clusters. The scalar distortion gets its own
# name so the `distortions` list built above is not overwritten.
cluster_centers, distortion = kmeans(pixels[scaled_channels], 3)

colors = []

# whiten divides by the population standard deviation (ddof=0), so the same
# definition is used to undo the scaling.
r_std, g_std, b_std = pixels[['red', 'green', 'blue']].std(ddof=0)

# NOTE(review): matplotlib's imread returns floats in [0, 1] for PNG files,
# in which case the division by 255 below would not be needed - confirm
# against the actual image.
for cluster_center in cluster_centers:
    scaled_r, scaled_g, scaled_b = cluster_center
    colors.append((
        scaled_r * r_std / 255,
        scaled_g * g_std / 255,
        # Fixed: the blue component was multiplied by g_std.
        scaled_b * b_std / 255
    ))

# Wrapping the list gives a 1 x k x 3 image: one row of k colour swatches.
plt.imshow([colors])
plt.show()


>>

# Import image class of matplotlib
import matplotlib.image as img

# Read batman image and print dimensions
batman_image = img.imread('batman.jpg')
print(batman_image.shape)

# Start from empty lists so that pixels appended by an earlier snippet are
# not mixed into this image's channels.
r, g, b = [], [], []

# Store RGB values of all pixels in lists r, g and b
for row in batman_image:
    for temp_r, temp_g, temp_b in row:
        r.append(temp_r)
        g.append(temp_g)
        b.append(temp_b)

# NOTE(review): batman_df is not built in this snippet; presumably it holds
# the r, g, b lists plus their whitened 'scaled_*' columns - confirm.

distortions = []
num_clusters = range(1, 7)

# Same red, green, blue order as the unpacking of the cluster centers below.
# The original clustered on red, blue, green and then unpacked as r, g, b,
# which swapped the green and blue components.
scaled_channels = ['scaled_red', 'scaled_green', 'scaled_blue']

# Create a list of distortions from the kmeans function
for i in num_clusters:
    cluster_centers, distortion = kmeans(batman_df[scaled_channels], i)
    distortions.append(distortion)

# Create a data frame with two lists, num_clusters and distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Create a line plot of num_clusters and distortions
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot)
plt.xticks(num_clusters)
plt.show()

# Get standard deviations of each color
r_std, g_std, b_std = batman_df[['red', 'green', 'blue']].std()

# NOTE: cluster_centers at this point are the ones from the last pass of the
# loop above (6 clusters); re-run kmeans with the chosen k if that is not
# what is wanted.
colors = []

for cluster_center in cluster_centers:
    scaled_r, scaled_g, scaled_b = cluster_center
    # Convert each standardized value to scaled value
    colors.append((
        scaled_r * r_std / 255,
        scaled_g * g_std / 255,
        scaled_b * b_std / 255
    ))

# Display colors of cluster centers. Wrapping the list gives a 1 x k x 3
# image, one swatch per center; a bare k x 3 list would be drawn as a scalar
# image through the colormap.
plt.imshow([colors])
plt.show()

>> Document clustering : concepts

(tf-idf = term frequency - inverse document frequency)

1. clean data before processing
2. determine the importance of the terms in a document (tf-idf matrix)
3. cluster the tf-idf matrix
4. find top terms, documents in each cluster

clean and tokenize data

from nltk.tokenize import word_tokenize
import re

def remove_noise(text, stop_word=None):
    """Tokenize *text* and return its cleaned, lower-cased tokens.

    Every token produced by ``word_tokenize`` has all characters other than
    ASCII letters and digits stripped. A token is kept only if more than one
    character remains and its lower-cased form is not a stop word.

    Args:
        text: The string to tokenize.
        stop_word: Optional iterable of words to drop. When omitted, a
            module-level ``stop_words`` collection is used if one exists
            (the original body always read that global); otherwise nothing
            is filtered out.

    Returns:
        A list of the surviving tokens, lower-cased, in their original order.
    """
    if stop_word is None:
        # The default used to be a shared mutable list that the body never
        # read; fall back to the global the body did read so that existing
        # one-argument callers (e.g. tokenizer=remove_noise) behave the same.
        stop_word = globals().get('stop_words', ())
    excluded = frozenset(stop_word)

    cleaned_tokens = []
    for token in word_tokenize(text):
        token = re.sub(r'[^A-Za-z0-9]+', '', token).lower()
        if len(token) > 1 and token not in excluded:
            cleaned_tokens.append(token)
    return cleaned_tokens

remove_noise('it is lovely weather we are having. I hope the weather continues.')

['lovely','weather','hope','weather','continues']


>>>
from sklearn.feature_extraction.text import TfidfVectorizer

# The class is spelled TfidfVectorizer (as imported above); the original
# `TFidfVectorizer` raised a NameError.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=50,
                                   min_df=0.2, tokenizer=remove_noise)

# Learn the vocabulary and build the (sparse) tf-idf matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

>>kmeans does not support sparse matrices

use .todense() to convert to a matrix

cluster_centers, distortion = kmeans(tfidf_matrix.todense(), num_clusters)

>>>
1. cluster centers list with a size equal to the number of terms
2. each value in the cluster center is its importance
3. create a dictionary and print top terms

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out()
else:
    terms = tfidf_vectorizer.get_feature_names()

for i in range(num_clusters):
    # Pair every term with its weight in this cluster center, then print the
    # three terms with the highest weight.
    center_terms = dict(zip(terms, list(cluster_centers[i])))
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    print(sorted_terms[:3])


>>normalized words (run, ran, running->run)
>>.todense() may not work with large datasets


# Import TfidfVectorizer class from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.75, max_features=50,
                                   min_df=0.1, tokenizer=remove_noise)

# Use the .fit_transform() method on the list plots
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)

num_clusters = 2

# Generate cluster centers through the kmeans function
# NOTE(review): .todense() returns a numpy.matrix, which newer SciPy releases
# may reject; .toarray() is the plain-ndarray equivalent - confirm against
# the installed version.
cluster_centers, distortion = kmeans(tfidf_matrix.todense(), num_clusters)


# Generate terms from the tfidf_vectorizer object.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out()
else:
    terms = tfidf_vectorizer.get_feature_names()

for i in range(num_clusters):
    # Sort the terms and print top 3 terms
    center_terms = dict(zip(terms, list(cluster_centers[i])))
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    print(sorted_terms[:3])

>>>Clustering with multiple features

# Mean of each scaled feature within every cluster.
# Fixes: missing closing parenthesis, and the column is 'cluster_labels'
# (as in the other fifa snippets), not 'clusters_labels'.
print(fifa.groupby('cluster_labels')[['scaled_heading_accuracy', 'scaled_volleys', 'scaled_finishing']].mean())

# Number of rows in every cluster.
# Fixes: stray opening quote and the invalid `.['ID']` indexing.
print(fifa.groupby('cluster_labels')['ID'].count())


>>visualizing cluster centers

# Bar chart of the per-cluster mean of each scaled feature.
# The original ended a line with a dangling '.', which is a syntax error, and
# grouped by 'clusters_labels' instead of the 'cluster_labels' column.
cluster_means = fifa.groupby('cluster_labels')[
    ['scaled_heading_accuracy', 'scaled_volleys', 'scaled_finishing']].mean()
cluster_means.plot(kind='bar')

plt.show()


# Print the names of the first 5 players found in each cluster
for cluster in fifa['cluster_labels'].unique():
    members = fifa[fifa['cluster_labels'] == cluster]
    print(cluster, members['name'].values[:5])

>>feature reduction
1) factor analysis

by_cluster = fifa.groupby('cluster_labels')

# Number of rows in each cluster
print(by_cluster['ID'].count())

# Average wage within each cluster
print(by_cluster['eur_wage'].mean())

>>>>

# Two k-means centers computed from the scaled feature columns
cluster_centers, _ = kmeans(fifa[scaled_features], 2)

# Label every row with its nearest center
labels, _ = vq(fifa[scaled_features], cluster_centers)
fifa['cluster_labels'] = labels

# Per-cluster mean of every scaled feature: printed, then drawn as bars
print(fifa.groupby('cluster_labels')[scaled_features].mean())
fifa.groupby('cluster_labels')[scaled_features].mean().plot(kind='bar', legend=True)
plt.show()

# Names of the first 5 players found in each cluster
for cluster in fifa['cluster_labels'].unique():
    members = fifa[fifa['cluster_labels'] == cluster]
    print(cluster, members['name'].values[:5])