-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathcars.R
57 lines (42 loc) · 1.44 KB
/
cars.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
library(tidyverse)
library(ClusterR) # for kmeans++
library(foreach)
library(mosaic)
# Load the cars data and print a per-column summary.
cars <- read.csv("../data/cars.csv", header = TRUE)
summary(cars)

# Drop the first nine columns, then center and scale what is left.
X <- scale(cars[, -seq_len(9)], center = TRUE, scale = TRUE)

# scale() records the per-column centers and scales it used as named
# attributes on its result; keep them so values can be mapped back to the
# original units later.
mu <- attr(X, "scaled:center")
sigma <- attr(X, "scaled:scale")
# Run k-means with 6 clusters and 25 random starts.
# NOTE: no seed is set here, so the numeric cluster labels (1..6) are
# arbitrary and can be permuted from one run to the next.
clust1 <- kmeans(X, 6, nstart = 25)

# What are the clusters?
# Centers in standardized units -- not super helpful on their own.
# (The element is named `centers`; spell it out rather than relying on
# `$` partial matching.)
clust1$centers %>% round(2)

# Map a few centers back to the original units by undoing the scaling:
# multiply by the column scales, then add the column centers back.
clust1$centers[1, ] * sigma + mu
clust1$centers[2, ] * sigma + mu
clust1$centers[4, ] * sigma + mu

# Which cars are in which clusters? One listing per cluster, all six.
which(clust1$cluster == 1)
which(clust1$cluster == 2)
which(clust1$cluster == 3)
which(clust1$cluster == 4)
which(clust1$cluster == 5)
which(clust1$cluster == 6)
# A few plots with cluster membership shown.
# qplot() was deprecated in ggplot2 3.4.0, so both scatterplots are built
# with ggplot() + geom_point() instead.
# The color mapping uses clust1$cluster directly, which relies on the
# cluster assignments being in the same row order as `cars`.
ggplot(cars) +
  geom_point(aes(Weight, Length, color = factor(clust1$cluster)))

ggplot(cars) +
  geom_point(aes(Horsepower, CityMPG, color = factor(clust1$cluster)))
# Using kmeans++ initialization (ClusterR), again with 6 clusters and
# 25 initializations.
clust2 <- KMeans_rcpp(X, clusters = 6, num_init = 25)

# Map a few centroids back to the original units by undoing the scaling.
clust2$centroids[1, ] * sigma + mu
clust2$centroids[2, ] * sigma + mu
clust2$centroids[4, ] * sigma + mu

# Which cars are in which clusters?
# KMeans_rcpp() stores the assignments in `clusters`. Spell the name out:
# assigning through the abbreviated `clust2$cluster` would create a second,
# separate `cluster` element instead of labelling the real one.
names(clust2$clusters) <- rownames(X) # some annoying preprocessing
which(clust2$clusters == 1)
which(clust2$clusters == 2)
which(clust2$clusters == 3)