-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathhclust_examples.R
70 lines (47 loc) · 1.56 KB
/
hclust_examples.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ---- Protein data: average-linkage hierarchical clustering ----
# Read the data set; the first CSV column is used as the row names.
protein <- read.csv("../data/protein.csv", row.names = 1)

# Standardize each column: subtract its mean and divide by its standard
# deviation, so that every variable contributes on the same scale.
protein_scaled <- scale(protein, center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between the rows of the standardized data.
protein_distance_matrix <- dist(protein_scaled, method = "euclidean")

# Agglomerative clustering with average linkage.
hier_protein <- hclust(protein_distance_matrix, method = "average")

# Draw the dendrogram (cex shrinks the leaf labels).
plot(hier_protein, cex = 0.8)

# Cut the tree so that exactly 5 clusters remain, then count the
# number of observations assigned to each cluster.
cluster1 <- cutree(hier_protein, k = 5)
summary(factor(cluster1))

# List the members of the first three clusters.
which(cluster1 == 1)
which(cluster1 == 2)
which(cluster1 == 3)
# Using max ("complete") linkage instead: the distance between two clusters
# is the largest pairwise distance between their members.
# (method = "single" would be min / nearest-neighbour linkage, which is not
# what this section is meant to demonstrate.)
hier_protein2 <- hclust(protein_distance_matrix, method = "complete")

# Plot the dendrogram (cex shrinks the leaf labels)
plot(hier_protein2, cex = 0.8)

# Cut the tree into 5 clusters and count the members of each
cluster2 <- cutree(hier_protein2, k = 5)
summary(factor(cluster2))

# Examine the cluster members
which(cluster2 == 1)
which(cluster2 == 2)
which(cluster2 == 3)
## ---- Cars data: complete-linkage hierarchical clustering ----
cars <- read.csv("../data/cars.csv", header = TRUE)
summary(cars)

# Drop the first nine columns and standardize the remaining ones
# (presumably the dropped columns are identifiers / non-feature fields --
# TODO confirm against the data file).
X <- scale(cars[, -(1:9)], center = TRUE, scale = TRUE)

# scale() stores the column means and standard deviations it used as
# attributes on its result; keep them so values can be mapped back to the
# original units.
mu <- attr(X, "scaled:center")
sigma <- attr(X, "scaled:scale")

# Pairwise distances between cars (dist() defaults to Euclidean).
distance_between_cars <- dist(X)

# Agglomerative clustering with complete linkage.
h1 <- hclust(distance_between_cars, method = "complete")

# Cut the tree so that 10 clusters remain and count the members of each.
cluster1 <- cutree(h1, k = 10)
summary(factor(cluster1))

# Inspect which cars ended up in cluster 2.
which(cluster1 == 2)

# Draw the dendrogram; the small cex keeps the leaf labels compact.
plot(h1, cex = 0.3)