# 离散数据.py
import math

import numpy as np
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

# Note: this script targets the TensorFlow 1.x API (tf.placeholder, tf.Session,
# tf.contrib.layers) and the standalone keras package.
def loadDataSet(frx):
    """Load a tab-separated file: the first row holds gene names, the last column the label."""
    Data = []
    # Open the file holding the feature matrix X
    frx = open(frx)
    lines = frx.readlines()
    # The first row holds the gene names
    name = lines[0].strip().split('\t')
    for line in range(1, len(lines)):
        curLine = lines[line].strip().split('\t')
        # Convert strings to floats
        fltLine = list(map(float, curLine))
        Data.append(fltLine)
    frx.close()
    # Convert to a matrix
    Data = np.mat(Data)
    X = Data[:, :-1]
    Y = Data[:, -1]
    m, n = X.shape
    # Split into training and test sets
    indices = np.arange(m)
    # A fixed random_state makes the train/test split reproducible across runs
    X_train, X_test, Y_train, Y_test, index1, index2 = train_test_split(X, Y, indices, test_size=0.2, random_state=42)
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
    return X_train, X_test, Y_train, Y_test, name, index1
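# A minimal sketch of the input format loadDataSet() expects, inferred from the
# parsing above. The file name, gene names, and values below are hypothetical
# toy data, not the real GY-DATA files.
def _demo_loadDataSet():
    with open('./demo_input.txt', 'w') as f:
        # Header row: tab-separated gene names; the last column is the label.
        f.write('GRMZM2G066734\tGRMZM2G012455\tlabel\n')
        # One sample per row: float features, class label last.
        f.write('0.13\t1.52\t0\n')
        f.write('0.88\t0.04\t1\n')
        f.write('0.55\t0.91\t0\n')
        f.write('0.21\t0.37\t1\n')
        f.write('0.73\t0.66\t0\n')
    return loadDataSet('./demo_input.txt')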
# Create placeholder objects
def create_placeholders(n_x, n_y):
    """
    A placeholder is TensorFlow's placeholder node, created with tf.placeholder.
    It behaves like a constant whose value the user supplies when run() is called;
    think of it as a formal parameter. Unlike a constant it cannot be used
    directly: the caller must feed in concrete values.
    """
    X = tf.placeholder(tf.float32, shape=[n_x, None], name="X")
    Y = tf.placeholder(tf.float32, shape=[n_y, None], name="Y")
    return X, Y
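# A minimal sketch of how these placeholders are consumed at run time (TF 1.x);
# the shapes and values here are illustrative toy numbers only.
def _demo_placeholders():
    tf.reset_default_graph()
    X, Y = create_placeholders(n_x=4, n_y=2)
    total = tf.reduce_sum(X) + tf.reduce_sum(Y)
    with tf.Session() as sess:
        # Concrete values arrive through feed_dict: 3 samples, laid out as
        # (features, samples) / (classes, samples) to match the shapes above.
        print(sess.run(total, feed_dict={X: np.ones((4, 3)), Y: np.zeros((2, 3))}))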
# Initialize parameters
def initialize_parameters(n):
    tf.set_random_seed(1)
    # Z0 holds the coefficients for X_feature and X_knockoffs
    Z0 = tf.get_variable("Z0", shape=[1, 2 * n],
                         initializer=tf.contrib.layers.xavier_initializer(seed=1))
    Zb0 = tf.get_variable("Zb0", shape=[1, 2 * n], initializer=tf.zeros_initializer())
    W0 = tf.get_variable("W0", shape=[1, n], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b0 = tf.get_variable("b0", shape=[1, n], initializer=tf.zeros_initializer())
    W1 = tf.get_variable("W1", shape=[n, n], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b1 = tf.get_variable("b1", shape=[n, 1], initializer=tf.zeros_initializer())
    W2 = tf.get_variable("W2", shape=[n, n], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable("b2", shape=[n, 1], initializer=tf.zeros_initializer())
    W3 = tf.get_variable("W3", shape=[2, n], initializer=tf.contrib.layers.xavier_initializer(seed=1))
    b3 = tf.get_variable("b3", shape=[2, 1], initializer=tf.zeros_initializer())
    parameters = {"Z0": Z0,
                  "Zb0": Zb0,
                  "W0": W0,
                  "b0": b0,
                  "W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3,
                  }
    return parameters
# Forward propagation
def forward_propagation(X, parameters, n, lambd):
    Z0 = parameters['Z0']
    Zb0 = parameters['Zb0']
    W0 = parameters['W0']
    b0 = parameters['b0']
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W3 = parameters['W3']
    b3 = parameters['b3']
    # L2 regularization (disabled)
    # tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(lambd)(W1))
    # tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(lambd)(W3))
    # Weight every input column: the first n columns are the real features,
    # the last n columns are their knockoffs
    X_temp = tf.multiply(Z0, tf.transpose(X))
    X0 = X_temp[:, 0:n]
    X1 = X_temp[:, n:2 * n]
    X0 = tf.add(X0, X1)
    # x0*w0 + x0_knockoffs*w0' + x1*w1 + x1_knockoffs*w1' + ...
    A0 = tf.add(tf.multiply(W0, X0), b0)
    A0 = tf.transpose(A0)
    Z1 = tf.add(tf.matmul(W1, A0), b1)  # Z1 = np.dot(W1, A0) + b1
    # Dropout can be enabled here if the model overfits
    # Z1 = tf.nn.dropout(Z1, keep_prob=0.65)
    A1 = tf.nn.relu(Z1)  # A1 = relu(Z1)
    # Optional second hidden layer (disabled); controls the number of hidden layers
    # Z2 = tf.add(tf.matmul(W2, A1), b2)  # Z2 = np.dot(W2, A1) + b2
    # Z2 = tf.nn.dropout(Z2, keep_prob=0.65)
    # A2 = tf.nn.relu(Z2)
    Z3 = tf.add(tf.matmul(W3, A1), b3)  # Z3 = np.dot(W3, A1) + b3
    return Z3
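# Shape walkthrough for forward_propagation, assuming m samples and n real
# features, so X is [2n, m] with the knockoff columns stacked after the real ones:
#   tf.transpose(X)                     -> [m, 2n]
#   X_temp = Z0 * X^T                   -> [m, 2n]  (per-column knockoff gates Z0)
#   X0 = X_temp[:, :n] + X_temp[:, n:]  -> [m, n]   (feature + knockoff merged)
#   A0 = (W0 * X0 + b0)^T               -> [n, m]
#   Z1 = W1 @ A0 + b1                   -> [n, m],  A1 = relu(Z1)
#   Z3 = W3 @ A1 + b3                   -> [2, m]   (two-class logits, one column per sample)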
def compute_cost(Z3, Y):
    # tf.nn.softmax_cross_entropy_with_logits applies the softmax along the last
    # axis, so the (classes, samples) tensors must be transposed to (samples, classes)
    logits = tf.transpose(Z3)
    labels = tf.transpose(Y)
    # Two-class softmax cross entropy
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    # cost = -tf.reduce_mean(labels * tf.log(tf.clip_by_value(logits, 1e-10, 1.0)))
    # cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
    tf.add_to_collection('losses', cost)
    cost = tf.add_n(tf.get_collection('losses'))
    return cost
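# A minimal numpy cross-check of the loss orientation above, with toy logits for
# two samples: the softmax must run over the class axis, which is why Z3 and Y
# are transposed to (samples, classes) before the TF call.
def _demo_compute_cost():
    z = np.array([[2.0, -1.0],    # class-0 logits, one column per sample
                  [0.5, 1.5]])    # class-1 logits
    y = np.array([[1.0, 0.0],
                  [0.0, 1.0]])    # one-hot labels, same (classes, samples) layout
    logits = z.T                  # -> (samples, classes)
    labels = y.T
    p = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # row-wise softmax
    print(-np.mean(np.sum(labels * np.log(p), axis=1)))             # mean cross entropy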
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    m = X.shape[1]  # number of training examples
    mini_batches = []
    np.random.seed(seed)
    # Step 1: shuffle
    permutation = list(np.random.permutation(m))  # a random permutation of 0..m-1
    shuffled_X = X[:, permutation]  # reorder the columns according to the permutation
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))
    # Step 2: partition
    num_complete_minibatches = int(math.floor(m / mini_batch_size))  # number of full batches
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size: k * mini_batch_size + mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size: k * mini_batch_size + mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    # If m is an exact multiple of mini_batch_size we are done; otherwise the
    # leftover examples go into one final, smaller batch
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size: m]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    return mini_batches
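# A small sketch of the batching behaviour with toy shapes: 10 samples and a
# batch size of 4 yield two full batches plus one leftover batch of 2.
def _demo_random_mini_batches():
    X = np.arange(30).reshape(3, 10)   # (features, samples)
    Y = np.arange(20).reshape(2, 10)   # (classes, samples)
    batches = random_mini_batches(X, Y, mini_batch_size=4, seed=0)
    print(len(batches))                     # 3
    print([bx.shape for bx, _ in batches])  # [(3, 4), (3, 4), (3, 2)]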
def model(X_train, Y_train, X_test, Y_test, learning_rate=0.0001,
          minibatch_size=32, num_epochs=500, print_cost=True):
    """
    Arguments:
    X_train -- training set, (number of input features, number of examples)
    Y_train -- training labels, (output dimension, number of examples)
    X_test -- test set, (number of input features, number of examples)
    Y_test -- test labels, (output dimension, number of examples)
    learning_rate -- learning rate for the parameter updates
    minibatch_size -- number of examples per mini-batch; one batch is used per training step
    num_epochs -- number of training iterations
    print_cost -- whether to print the cost every 100 epochs
    """
    tf.set_random_seed(1)
    seed = 3
    (n_x, m) = X_train.shape
    n_y = Y_train.shape[0]
    costs = []
    # Create the placeholders (tensors)
    X, Y = create_placeholders(n_x, n_y)
    """The operations below only define the computation graph; no numbers flow
    through it until the graph is run inside a session."""
    # Initialize the parameters
    parameters = initialize_parameters(int(n_x / 2))
    # Forward propagation
    Z3 = forward_propagation(X, parameters, int(n_x / 2), 0.04)
    # Compute the cost
    cost = compute_cost(Z3, Y)
    # Backpropagation: define the TensorFlow optimizer, here AdamOptimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
    # Initialize all variables
    init = tf.global_variables_initializer()
    # Pad the test set with zero knockoffs so it matches the training layout:
    # X_test arrives as (real features, samples); stack zero rows to reach
    # (2 * real features, samples)
    X_test = np.r_[X_test, np.zeros(X_test.shape)]
    X_test = X_test.astype(np.float32)
    Y_test = Y_test.astype(np.float32)
    print(X_test.shape)
    print(Y_test.shape)
    # Start a session to run the TensorFlow graph
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(num_epochs):
            # With many samples, train on mini-batches
            epoch_cost = 0  # cost of the current epoch
            num_minibatches = int(m / minibatch_size)  # number of mini-batches in the training set
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)
            for minibatch in minibatches:
                (minibatch_X, minibatch_Y) = minibatch
                _, minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
                epoch_cost += minibatch_cost / num_minibatches
            # Full-batch training alternative:
            # epoch_cost = sess.run([optimizer, cost], feed_dict={X: X_train, Y: Y_train})
            # test_cost = sess.run(cost, feed_dict={X: X_test, Y: Y_test})
            # epoch_cost = epoch_cost[1]
            # Print the cost every 100 epochs
            if print_cost == True and epoch % 100 == 0:
                print("Cost after epoch %i: %f" % (epoch, epoch_cost))
                # print("test_cost: ", test_cost)
            if print_cost == True and epoch % 5 == 0:
                costs.append(epoch_cost)
        # Save the trained parameter values in a variable
        parameters = sess.run(parameters)
        print("Parameters have been trained!")
        # Network outputs after training
        Z3 = sess.run(Z3, feed_dict={X: X_train, Y: Y_train})
        print(sess.run(cost, feed_dict={X: X_train, Y: Y_train}))
        # Cost on the test set
        cost = sess.run(cost, feed_dict={X: X_test, Y: Y_test})
        print('test cost: ', cost)
        correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))  # tf.argmax finds the index of the max in each column
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))  # tf.cast converts the data type
        print("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
    return parameters
def analyse(parameters, n):
    # Note: relies on the module-level `name` list loaded in __main__
    Z0 = parameters['Z0']
    W0 = parameters['W0']
    b0 = parameters['b0']
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W3 = parameters['W3']
    # Compute feature importances
    # W = np.matmul(W3, W2)
    W = np.matmul(W3, W1)
    # W = W3
    Z1 = np.multiply(Z0[:, 0:n], W0)      # weighted real features
    Z2 = np.multiply(Z0[:, n:2 * n], W0)  # weighted knockoffs
    Z11 = np.multiply(Z1, W)
    Z22 = np.multiply(Z2, W)
    # print(Z0)
    res = {}   # statistic (W0*Z0)^2: real feature vs. knockoff
    res1 = {}  # statistic Z0[i]^2 - Z0[n+i]^2
    res2 = {}  # statistic that also includes the downstream weights W
    for i in range(0, n):
        res[i] = Z1[0][i] * Z1[0][i] - Z2[0][i] * Z2[0][i]
    for i in range(0, n):
        res1[i] = Z0[0][i] * Z0[0][i] - Z0[0][n + i] * Z0[0][n + i]
    for i in range(0, n):
        res2[i] = Z11[0][i] * Z11[0][i] - Z22[0][i] * Z22[0][i]
    # Result without knockoffs:
    # for i in range(0, 215):
    #     res2[i] = W0[0][i]
    res = sorted(res.items(), key=lambda d: d[1], reverse=True)
    res1 = sorted(res1.items(), key=lambda d: d[1], reverse=True)
    res2 = sorted(res2.items(), key=lambda d: d[1], reverse=True)
    # (Z0-Z0')^2*W0
    # Ranking printout for res, kept for reference:
    """
    rank = 1
    for key in res:
        # print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        if (name[key[0]] == 'GRMZM5G872256' or name[key[0]] == 'GRMZM2G066734' or name[key[0]] == 'GRMZM2G012455' or
                name[key[0]] == 'GRMZM2G138589' or
                name[key[0]] == 'GRMZM2G004528' or name[key[0]] == 'GRMZM5G870176' or name[key[0]] == 'GRMZM2G015040'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        if (name[key[0]] == 'GRMZM2G324886' or name[key[0]] == 'GRMZM2G150906' or name[key[0]] == 'GRMZM2G158232' or
                name[key[0]] == 'GRMZM2G082780' or name[key[0]] == 'GRMZM2G039454'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        rank += 1
    """
    # Ranking printout for res1, kept for reference:
    """
    rank = 1
    for key in res1:
        # print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        if (name[key[0]] == 'GRMZM5G872256' or name[key[0]] == 'GRMZM2G066734' or name[key[0]] == 'GRMZM2G012455' or
                name[key[0]] == 'GRMZM2G138589' or name[key[0]] == 'GRMZM2G004528' or name[key[0]] == 'GRMZM5G870176' or
                name[key[0]] == 'GRMZM2G015040'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        if (name[key[0]] == 'GRMZM2G324886' or name[key[0]] == 'GRMZM2G150906' or name[key[0]] == 'GRMZM2G158232' or
                name[key[0]] == 'GRMZM2G082780' or name[key[0]] == 'GRMZM2G039454'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        rank += 1
    """
    fr = open('./result.txt', 'a')
    rank = 1
    for key in res2:
        print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        fr.write(str(rank) + ' ' + str(name[key[0]]) + ' ' + str(key[1]) + '\n')
        if (name[key[0]] == 'GRMZM5G872256' or name[key[0]] == 'GRMZM2G066734' or name[key[0]] == 'GRMZM2G012455' or
                name[key[0]] == 'GRMZM2G138589' or name[key[0]] == 'GRMZM2G004528' or name[key[0]] == 'GRMZM5G870176' or
                name[key[0]] == 'GRMZM2G015040'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        if (name[key[0]] == 'GRMZM2G324886' or name[key[0]] == 'GRMZM2G150906' or name[key[0]] == 'GRMZM2G158232' or
                name[key[0]] == 'GRMZM2G082780' or name[key[0]] == 'GRMZM2G039454'):
            print(rank, '(', key[0], ' ,', name[key[0]], ' ,', key[1])
        rank += 1
    fr.close()
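# A minimal numeric sketch of the knockoff-style statistic ranked in res2 above:
# each feature's squared importance is compared against the squared importance
# of its knockoff copy, so a large positive value suggests the real feature
# carries signal beyond what a null (knockoff) feature would. Toy numbers only.
def _demo_knockoff_statistic():
    feature_weight = np.array([0.9, 0.1])   # learned weights on two real features
    knockoff_weight = np.array([0.2, 0.3])  # learned weights on their knockoffs
    stat = feature_weight ** 2 - knockoff_weight ** 2
    print(stat)  # [0.77, -0.08]: feature 0 looks informative, feature 1 does not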
if __name__ == "__main__":
    # Load the training and test sets
    X_train_orig, X_test_orig, Y_train_orig, Y_test_orig, name, index1 = loadDataSet('./GY-DATA/1-001.txt')
    # Load the knockoff data generated for X
    knockoffX = []
    fr = open("./GY-DATA/1-001-fixed.txt")
    rank = 0
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        rank = rank + 1
        fltLine = list(map(float, curLine))
        knockoffX.append(fltLine)
    fr.close()
    knockoffX = np.mat(knockoffX)
    # Keep only the rows that belong to the training split
    knockoffX = knockoffX[index1, :]
    print('knockoffX.shape: ', knockoffX.shape)
    X_train_orig = np.hstack((X_train_orig, knockoffX))
    print('X_train_orig.shape: ', X_train_orig.shape)
    m, n = X_train_orig.shape
    # Flatten the training and test sets to (features, samples)
    X_train_flatten = X_train_orig.reshape(X_train_orig.shape[0], -1).T
    X_test_flatten = X_test_orig.reshape(X_test_orig.shape[0], -1).T
    X_train = normalize(X_train_flatten)
    X_test = normalize(X_test_flatten)
    Y_train = to_categorical(Y_train_orig)
    Y_test = to_categorical(Y_test_orig)
    print(X_train.shape, Y_train.shape)
    parameters = model(X_train, Y_train.T, X_test, Y_test.T)
    analyse(parameters, int(n / 2))