%% Introduction
% Principal component analysis (PCA) is a widely used technique for
% dimension reduction. It applies a linear transformation to the features
% of the original dataset to reduce their dimension. The steps of PCA are:
%
% 1. feature scaling
%
% 2. compute the covariance matrix of the scaled features
%
% 3. calculate the eigenvalues and eigenvectors of the covariance matrix
%
% 4. select the leading eigenvalues and eigenvectors
%
% 5. compute the dimension-reduced data
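%
% In matrix form, if $X$ is the $n \times p$ matrix of standardized
% features and $W_k$ collects the $k$ leading eigenvectors of its
% covariance matrix as columns, the reduced data are the $n \times k$
% matrix $Z = XW_k$.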
%% Data access and data description
% *wine data*
%
% The wine data are the results of 13 chemical tests on 3 different types of wine grown in the same region of Italy.
% In this tutorial, we use 2 of the 3 wine types for the PCA analysis.
% The 13 features of wine are:
% 1) Alcohol
% 2) Malic acid
% 3) Ash
% 4) Alcalinity of ash
% 5) Magnesium
% 6) Total phenols
% 7) Flavanoids
% 8) Nonflavanoid phenols
% 9) Proanthocyanins
% 10) Color intensity
% 11) Hue
% 12) OD280/OD315 of diluted wines
% 13) Proline
%
% *data access*
clear all;
mydata = csvread('data_wine.csv', 1, 0);
features = mydata(:, 2:end);
labels = mydata(:, 1);
%%
% The first 6 rows of the data are:
mydata(1:6, 1:14)
%% Implementation in MATLAB
% *Step 1.* Feature scaling, using the standardization method:
%
% $x'_i = \frac{x_i-\bar{x}}{\sigma_x}$
features_scale = zscore(features);
features_scale(1:6, :)
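%%
% As a sanity check (a minimal sketch, assuming MATLAB R2016b+ for
% implicit expansion), the same standardization can be done by hand and
% compared against |zscore|:
features_manual = (features - mean(features)) ./ std(features);
max(abs(features_manual(:) - features_scale(:)))   % should be ~0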
%%
% *Step 2.* Compute the covariance matrix of the scaled features.
covMatrix = cov(features_scale)
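%%
% Because the features were standardized to unit variance, this covariance
% matrix coincides with the correlation matrix of the raw features, which
% gives a quick check of Step 2:
max(max(abs(covMatrix - corrcoef(features))))   % should be ~0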
%%
% *Step 3.* Calculate the eigenvalues and eigenvectors of the covariance matrix.
[eigenVector, eigenValue] = eig(covMatrix);
eigenValue = diag(eigenValue)
eigenVector
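%%
% A quick verification of the decomposition: each eigenpair should satisfy
% $Av = \lambda v$, so the residual below is numerically zero.
norm(covMatrix*eigenVector - eigenVector*diag(eigenValue))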
%%
% *Step 4.* Select the leading eigenvalues and eigenvectors.
% Suppose we want to retain more than 80% of the total variance:
information = 0.8;
[sortedValue, index] = sort(eigenValue, 'descend');
figure(1)
plot(100*cumsum(sortedValue)/sum(sortedValue), 'mo--', 'LineWidth', 2)
xlabel('Number of principal components')
ylabel('Cumulative variance retained (%)')
for i = 1:length(sortedValue)
    if sum(sortedValue(1:i))/sum(sortedValue) > information
        eigenIDs = index(1:i);
        break;
    end
end
eigenValue_pca = eigenValue(eigenIDs)
eigenVector_pca = eigenVector(:, eigenIDs)
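%%
% Equivalently, the selection loop can be replaced by a one-line
% vectorized version (a sketch; |k| below equals the final |i| of the
% loop above):
k = find(cumsum(sortedValue)/sum(sortedValue) > information, 1);
isequal(eigenIDs, index(1:k))   % displays logical 1 (true)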
%%
% *Step 5.* Compute the dimension-reduced data.
features_pca = features_scale*eigenVector_pca;
features_pca(1:6, :)
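%%
% For comparison (a sketch assuming the Statistics and Machine Learning
% Toolbox), the built-in |pca| function performs Steps 2-5 in one call.
% |latent| matches |sortedValue|, and the scores match |features_pca| up
% to the sign of each component:
[coeff, score, latent] = pca(features_scale);
score(1:6, 1:length(eigenIDs))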
%% Example: iris data
clear all
load fisheriris.mat
features = meas(:, 1:3);
labels = [ones(1, 50) 2*ones(1, 50) 3*ones(1, 50)]';
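%%
% Equivalently (a sketch assuming the Statistics and Machine Learning
% Toolbox), the numeric labels can be derived from the |species| cell
% array that |fisheriris| provides, instead of hard-coding group sizes:
labels = grp2idx(species);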
%%
% plot original 3-D data
figure(2)
scatter3(features(:, 1), features(:, 2), features(:, 3), 50, labels, 'filled')
xlabel('sepal length')
ylabel('sepal width')
zlabel('petal length')
title('Kinds of Iris')
%%
% Perform PCA to reduce the 3-D data to 2-D, and plot the new 2-D data.
% |GetPCAFeature| wraps Steps 1-5 above (see the sketch at the end of this
% script).
features_pca = GetPCAFeature(features, 0.8);
figure(3)
scatter(features_pca(:, 1), features_pca(:, 2), 50, labels, 'filled')
xlabel('PCA1')
ylabel('PCA2')
title('Kinds of Iris after PCA')
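%%
% |GetPCAFeature| is a helper function provided in this repository. Below
% is a minimal sketch of what such a helper might look like, assembled
% from Steps 1-5 above (the repository's actual implementation may
% differ). It is written as a local function (requires R2016b+ for local
% functions in scripts) and named |getPCAFeatureSketch| so that it does
% not shadow the real helper:
function newFeatures = getPCAFeatureSketch(features, information)
    % Step 1: standardize each feature to zero mean and unit variance.
    featuresScale = zscore(features);
    % Steps 2-3: eigendecomposition of the covariance matrix.
    [V, D] = eig(cov(featuresScale));
    % Step 4: keep the fewest leading components whose cumulative share
    % of the total variance exceeds the requested fraction.
    [sortedValue, index] = sort(diag(D), 'descend');
    k = find(cumsum(sortedValue)/sum(sortedValue) > information, 1);
    % Step 5: project the standardized data onto those eigenvectors.
    newFeatures = featuresScale * V(:, index(1:k));
end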