-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcnn_ddsm.py
288 lines (219 loc) · 9.92 KB
/
cnn_ddsm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
"""
cnn_ddsm.py
@author: wronk
Classify breast cancer images using convolutional neural network.
"""
import os
from os import path as op
import tensorflow as tf
import numpy as np
from numpy.random import choice
import cv2
import json
# Root directory of the DDSM image data; must be set in the environment.
data_dir = os.environ['DDSM_DATA']
# The three diagnosis classes; the per-class lists below are indexed in this order.
diagnosis_classes = ['normal', 'benign', 'cancer']
train_batch_sizes = [3, 3, 3]  # Training batch sizes (per diagnosis class)
valid_batch_sizes = [10, 10, 10]  # Validation batch sizes (per diagnosis class)
#n_base_classes = [3, 3, 3]
# Number of batch folders on disk per class (folders named e.g. 'normal_01').
n_base_classes = [12, 14, 15]
data_split_props = [0.8, 0.0, 0.2]  # Training, validation, test
case_base_str = 'case'  # Substring identifying per-case directories
# File-name suffixes for the four mammogram views of one case.
base_str_img = ['LEFT_CC.png', 'LEFT_MLO.png',
                'RIGHT_CC.png', 'RIGHT_MLO.png']
IMG_SIZE = (800, 400)  # n_rows x n_cols
assert sum(data_split_props) == 1., "Data proportions must add to 1."
# Load image labels (y/n malignant)
with open(op.join(data_dir, 'pathology_labels.json')) as json_file:
    loaded_diag_dict = json.load(json_file)
# Canonical ordering of the preprocessed view names (e.g. 'LEFT_CC_preproc');
# used below to map an image file name to its slot among the 4 views.
base_views = [name + '_preproc' for name in loaded_diag_dict['base_views']]
diagnosis_data = {}    # diag_class -> array of shape (n_cases, 4, rows, cols)
diagnosis_labels = {}  # diag_class -> array of shape (n_cases, 4)
# Load every case of every batch folder for each diagnosis class into
# diagnosis_data / diagnosis_labels (one row of 4 views per case).
for di, diag_class in enumerate(diagnosis_classes):
    print('\nLoading diagnosis class: ' + diag_class)
    batch_folds = [diag_class + '_%02i' % batch_num
                   for batch_num in range(1, n_base_classes[di] + 1)]
    batch_data = []
    batch_label = []
    for batch_fold in batch_folds:
        print(' Loading batch: ' + batch_fold)
        # Get individual case directory names
        batch_dir = op.join(data_dir, batch_fold)
        case_list = [c for c in os.listdir(batch_dir)
                     if op.isdir(op.join(batch_dir, c)) and case_base_str in c]
        case_list.sort()
        # Loop through each individual case
        for case_fold in case_list:
            if case_fold in loaded_diag_dict['skip_cases']:
                print('Skipping case %s' % case_fold)
                continue
            img_dir = op.join(batch_dir, case_fold)
            img_list = [temp_img_fname for temp_img_fname in os.listdir(img_dir)
                        if '.png' in temp_img_fname]
            case_labels = loaded_diag_dict[diag_class][batch_fold][case_fold]
            # One (4, rows, cols) stack per case; -1 marks a missing view.
            # BUGFIX: allocating per case (and appending only for cases that
            # are not skipped) no longer leaves all -1 junk rows for cases
            # listed in 'skip_cases', which the previous preallocated
            # per-batch arrays did.
            case_arr = -1 * np.ones((4, IMG_SIZE[0], IMG_SIZE[1]))
            label_arr = -1 * np.ones(4)
            for img_fname in img_list:
                # Load each image as grayscale
                img_fpath = op.join(img_dir, img_fname)
                img = cv2.imread(img_fpath, 0)
                # Map file name (e.g. 'LEFT_CC_preproc.png') to its canonical
                # view slot so images and labels stay aligned.
                img_base_view = img_fname.split('.')[-2]
                img_4_index = base_views.index(img_base_view)
                # BUGFIX: the image was previously stored at the (arbitrary)
                # os.listdir enumeration index while its label was stored at
                # the canonical view index; store both at the view index.
                case_arr[img_4_index, :, :] = img
                label_arr[img_4_index] = int(case_labels[img_4_index])
            batch_data.append(case_arr)
            batch_label.append(label_arr)
    diagnosis_data[diag_class] = np.asarray(batch_data)
    diagnosis_labels[diag_class] = np.asarray(batch_label)
####################################
# Construct validation and test data
####################################
def split_data(data, splits):
    """Randomly partition ``data`` along axis 0 into ``len(splits)`` pieces.

    Rows are shuffled once, then sliced so each piece holds the
    corresponding proportion of rows (boundaries truncated toward zero).
    Returns the list of sub-arrays.
    """
    n_samples = data.shape[0]
    shuffled = choice(range(n_samples), size=n_samples, replace=False)
    # Cumulative boundaries: [0, p0*n, (p0+p1)*n, ..., n]
    bounds = [int(sum(splits[:k]) * n_samples) for k in range(len(splits))]
    bounds.append(n_samples)
    return [data[shuffled[lo:hi]] for lo, hi in zip(bounds[:-1], bounds[1:])]
# Report the run configuration before splitting
print '\nTraining batch sizes: %s, Testing batch sizes: %s' % (
    train_batch_sizes, valid_batch_sizes)
print 'Data split proportions (train, valid, test): %s' % str(data_split_props)
print 'Image size: %s' % str(IMG_SIZE)
# Split each diagnosis class independently into train/valid/test images
train_data, valid_data, test_data = {}, {}, {}
for diag_class in diagnosis_classes:
    train_data[diag_class], valid_data[diag_class], test_data[diag_class] = \
        split_data(diagnosis_data[diag_class], data_split_props)
# Free the unsplit images; diagnosis_labels is kept (used later by get_batch)
# NOTE(review): only the images are shuffled/split here — diagnosis_labels
# stays in original order, so indices into the split data no longer line up
# with the labels (see matching note in get_batch). Verify before training.
del diagnosis_data
#####################
# CNN helper funks
#####################
def weight_variable(shape, name):
    """Create a weight Variable initialized from a truncated normal (std 0.1)."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial, name=name)
def bias_variable(shape, name):
    """Create a bias Variable initialized to the constant 0.1."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial, name=name)
def conv2d(x, W):
    """2-D convolution of ``x`` with filter ``W``: unit stride, 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool(x, pool_size):
    """Max-pool ``x`` with a square window and matching stride ('SAME' padding)."""
    window = [1, pool_size, pool_size, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')
def create_ccn_model(layer_sizes, fullC_size, pool_size, filt_size, act_func,
                     img_size, n_classes):
    """Build the conv-net graph and return its input/output tensors.

    Parameters
    ----------
    layer_sizes : list of int
        Channel counts per conv layer, including the input's channel
        count as the first entry (the caller prepends 1 for grayscale).
    fullC_size : int
        Number of units in the fully connected layer.
    pool_size : int
        Square max-pool window (and stride) after each conv layer.
    filt_size : int
        Square convolution filter edge length.
    act_func : callable
        Activation applied after each conv and the first FC layer.
    img_size : tuple
        (n_rows, n_cols) of the input images.
    n_classes : int
        Number of output classes.

    Returns
    -------
    (x_image, y_conv, keep_prob) : input placeholder, output logits,
    dropout keep-probability placeholder.
    """
    W_list, b_list, h_list = [], [], []
    # Input placeholder: batch of single-channel images
    x_image = tf.placeholder(tf.float32,
                             shape=[None, img_size[0], img_size[1], 1])
    # Add conv + pool layers one by one
    for li, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        W_list.append(weight_variable([filt_size, filt_size, n_in, n_out],
                                      name='W%i' % li))
        b_list.append(bias_variable([n_out], name='b%i' % li))
        layer_input = x_image if li == 0 else h_list[-1]
        # BUGFIX: honor the act_func parameter (it was accepted but relu was
        # hard-coded); callers passing tf.nn.relu see identical behavior.
        conv_temp = act_func(conv2d(layer_input, W_list[-1]) + b_list[-1])
        h_list.append(max_pool(conv_temp, pool_size))
    # First fully connected layer: flatten the last pooled feature map
    last_h_shape = [dim.value for dim in h_list[-1].get_shape()[1:]]
    n_fc_vars = int(np.prod(last_h_shape))
    W_fc1 = weight_variable([n_fc_vars, fullC_size], name='W_fc1')
    b_fc1 = bias_variable([fullC_size], name='b_fc1')
    h_pool_last_flat = tf.reshape(h_list[-1], [-1, n_fc_vars])
    h_fc1 = act_func(tf.matmul(h_pool_last_flat, W_fc1) + b_fc1)
    # Apply dropout to the fully connected activations
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # Second fully connected layer (output): raw logits — softmax is
    # applied inside the loss, not here.
    W_fc2 = weight_variable([fullC_size, n_classes], name='W_fc2')
    b_fc2 = bias_variable([n_classes], name='b_fc2')
    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    return x_image, y_conv, keep_prob
def get_batch(data, batch_sizes):
    """Helper to randomly return samples from dataset"""
    batch_x, batch_y = [], []
    # Get a designated number of samples from each diagnosis class
    # TODO: might need to equalize images since each cancer case usually only
    # has one breast with cancer and one w/out
    for di, diag_class in enumerate(diagnosis_classes):
        # Sample case indices without replacement from this class's split
        rand_inds = choice(range(data[diag_class].shape[0]),
                           size=batch_sizes[di], replace=False)
        batch_x.extend(data[diag_class][rand_inds])
        # NOTE(review): rand_inds index into the shuffled/split `data`, but
        # labels are taken from the full, unsplit diagnosis_labels — these
        # indices likely do not refer to the same cases. Fix requires
        # splitting images and labels jointly; verify before trusting labels.
        batch_y.extend(diagnosis_labels[diag_class][rand_inds])
    # Reshape for feeding tensorflow. Each row is now an observation
    # (the 4 views of a case become 4 separate observations).
    batch_x_arr = np.array(batch_x).reshape(-1, IMG_SIZE[0], IMG_SIZE[1], 1)
    batch_y_arr = np.array(batch_y).reshape(-1)
    return batch_x_arr, batch_y_arr
#####################
# CNN model params
#####################
layers_sizes = [32, 32, 16, 16]  # Feature-map (channel) counts per conv layer
fullC_size = 512                 # Units in the fully connected layer
act_func = tf.nn.relu            # Activation passed to the model builder
pool_size = 3                    # Max-pool window/stride
filt_size = 3                    # Square conv filter edge length
dropout_keep_p = 0.5             # Dropout keep probability during training
n_classes = 2                    # Output classes (y/n malignant)
# Training params
n_training_batches = 20000       # Number of training steps to run
######################
# Construct CNN
######################
layers_sizes.insert(0, 1)  # Prepend input channel count (grayscale) for convenience during construction
x_train, y_conv, keep_prob = create_ccn_model(
    layers_sizes, fullC_size, pool_size, filt_size, act_func, IMG_SIZE,
    n_classes)
# Integer class label per observation in a batch
y_labels = tf.placeholder(tf.int64, shape=[None], name='y_labels')
# Add objective function and defining training scheme
# NOTE(review): the positional (logits, labels) argument order and the
# summary/init calls below are the pre-1.0 TensorFlow API.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    y_conv, y_labels))
train_step = tf.train.AdamOptimizer(1e-2).minimize(loss)
# Accuracy: fraction of observations whose argmax logit matches the label
is_pred_correct = tf.equal(tf.arg_max(y_conv, 1), y_labels)
accuracy = tf.reduce_mean(tf.cast(is_pred_correct, tf.float32))
# Attach summaries for TensorBoard
tf.scalar_summary('loss', loss)
tf.scalar_summary('accuracy', accuracy)
merged_summaries = tf.merge_all_summaries()
#saver = tf.train.Saver() # create saver for saving network weights
init = tf.initialize_all_variables()
sess = tf.Session()
train_writer = tf.train.SummaryWriter('./train_summaries', sess.graph)
sess.run(init)
######################
# Train CNN
######################
# Run n_training_batches SGD steps, logging loss/accuracy each step and
# periodically evaluating on held-out data with dropout disabled.
for ti in range(n_training_batches):
    # Get data for training step
    batch_x, batch_y = get_batch(train_data, train_batch_sizes)
    feed_dict = {x_train: batch_x, y_labels: batch_y,
                 keep_prob: dropout_keep_p}
    _, obj, acc, summary = sess.run([train_step, loss, accuracy,
                                     merged_summaries], feed_dict)
    train_writer.add_summary(summary, ti)
    print("\titer: %03d, cost: %.2f, acc: %.2f" % (ti, obj, acc))
    # Sometimes compute validation accuracy (keep_prob=1.0 disables dropout)
    if ti % 5 == 0:
        # XXX HACK: for now, use only test data
        valid_x, valid_y = get_batch(test_data, valid_batch_sizes)
        valid_acc = accuracy.eval(feed_dict={x_train: valid_x,
                                             y_labels: valid_y,
                                             keep_prob: 1.0},
                                  session=sess)
        # Chance rate: accuracy of always predicting class 0
        chance_acc = 1. - np.mean(valid_y)
        # BUGFIX(consistency): use print() call syntax like the training-loop
        # print above, so the script does not mix Python 2 print statements
        # with the function form.
        print('Validation accuracy: %0.2f, Relative change: %0.2f' % (
            valid_acc, valid_acc - chance_acc))
sess.close()