-
Notifications
You must be signed in to change notification settings - Fork 0
/
pca.py
78 lines (55 loc) · 2.01 KB
/
pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import numpy as np
'''
scaling the dataset
'''
def scaling(dataset):
    """Center the columns of ``dataset`` and rescale the squared deviations.

    The mean taken along axis 0 is subtracted from ``dataset``; the squared
    deviations are then divided by the variance of the whole centered array
    (a single scalar, not a per-column variance).

    Returns:
        An array shaped like ``dataset`` holding ``deviation**2 / variance``.
    """
    print('scaling ...')
    centered = dataset - np.mean(dataset, axis=0)
    return centered ** 2 / np.var(centered)
'''
compute the eigenvalues and eigenvectors
'''
def calcul_eig(train_data):
    """Eigen-decompose the covariance of the scaled training data.

    ``train_data`` is passed through ``scaling`` and the covariance of the
    result is computed with columns treated as variables (``rowvar=0``).

    Returns:
        The ``(eigvals, eigVects)`` pair produced by ``np.linalg.eig``. The
        covariance is wrapped in an ``np.matrix`` before decomposition, as
        the original code did, so the eigenvector result keeps the matrix
        type for existing callers.
    """
    print('calculing eig...')
    var_percent = scaling(train_data)
    covMat = np.cov(var_percent, rowvar=0)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same function and
    # is available in both old and new NumPy releases.
    eigvals, eigVects = np.linalg.eig(np.asmatrix(covMat))
    return eigvals, eigVects
'''
analyse the data and count the leading components needed to cover 95% of the variance
'''
def analyse_data(eigvals, eigVects, taux=0.95):
    """Count the leading eigenvalues needed to reach a share of the total.

    Eigenvalues are visited from largest to smallest and accumulated until
    the running sum divided by the sum of all eigenvalues is at least
    ``taux``. A progress line is printed for every component visited before
    the one that reaches the threshold.

    Args:
        eigvals: array of eigenvalues.
        eigVects: not used by this function; kept in the signature.
        taux: share of the eigenvalue total to reach (default 0.95).

    Returns:
        The number of components visited, including the one that reached
        the threshold (or all of them if it is never reached).
    """
    print('analysing data...')
    order = np.argsort(-eigvals)  # indices from largest to smallest eigenvalue
    total = sum(eigvals)
    running = 0
    count = 0
    for rank, idx in enumerate(order, start=1):
        share = eigvals[idx]
        running += share
        count = rank
        if running / total >= taux:
            break
        print('main: {:.0f}'.format(rank), ', sqrs: {:.2f}'.format((share / total * 100).real), '%, sum : {:.2f}'.format((running / total * 100).real), '% ')
    return count
'''
cut some features to reduce the dimension
'''
def cut_feature(train_data, test_data, num, eigvals, eigVects):
    """Project train and test data onto the ``num`` leading eigenvectors.

    Args:
        train_data: training samples; its columns must line up with the rows
            of ``eigVects`` (presumably shaped (samples, features)).
        test_data: test samples with the same column layout.
        num: number of eigenvectors to keep.
        eigvals: eigenvalues used to rank the eigenvectors.
        eigVects: eigenvectors stored as columns; may be an ``np.matrix`` or
            a plain ``ndarray``.

    Returns:
        A ``(pca_train, pca_test)`` pair holding the real parts of the two
        projections.
    """
    print('cutting features ...')
    # Ascending argsort read backwards from the end: indices of the ``num``
    # largest eigenvalues, largest first.
    top = np.argsort(eigvals)[:-(num + 1):-1]
    redEigVects = eigVects[:, top]
    # np.dot is always a matrix product. The previous ``*`` was only a matrix
    # product when eigVects was an np.matrix and multiplied element-wise for
    # a plain ndarray.
    pca_train = np.dot(train_data, redEigVects)
    pca_test = np.dot(test_data, redEigVects)
    print('pca train', pca_train.shape, 'pca test', pca_test.shape)
    return np.real(pca_train), np.real(pca_test)
def pca_func(train_data, test_data):
    """Reduce train and test data using eigenvectors derived from the train set.

    The eigen-decomposition comes from ``calcul_eig`` on the training data,
    ``analyse_data`` picks how many components reach a 0.95 share, and
    ``cut_feature`` projects both datasets onto those components.

    Returns:
        The ``(train_pca, test_pca)`` pair returned by ``cut_feature``.
    """
    # np.float was only an alias of the builtin float; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    train_arr = np.array(train_data).astype(float)
    eigvals, eigVects = calcul_eig(train_arr)
    num = analyse_data(eigvals, eigVects, 0.95)
    print('num', num)
    test_arr = np.array(test_data).astype(float)
    train_pca, test_pca = cut_feature(train_arr, test_arr, num, eigvals, eigVects)
    return train_pca, test_pca