AttributeError: 'NoneType' object has no attribute 'taskgraph' #27
You can use tf.train.MonitoredTrainingSession instead of tf.Session.
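For reference, here is a minimal sketch of that substitution in plain TensorFlow 1.x; the toy loss, learning rate, and step count are illustrative assumptions, not code from this thread:

import tensorflow as tf

# Toy objective so the sketch is self-contained: drive w toward 3.0.
w = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
loss = tf.square(w - 3.0)
global_step = tf.train.get_or_create_global_step()
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss, global_step=global_step)

# StopAtStepHook ends the training loop once the given global step is reached.
hooks = [tf.train.StopAtStepHook(last_step=100)]

# MonitoredTrainingSession runs the variable initializers itself, so there is
# no explicit tf.global_variables_initializer() call as with a bare tf.Session.
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)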
Thanks for your reply! I modified the code as follows:

import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
import epl
import os

def conv_bn_relu(inputs, filters, kernel_size, stride, training):
    # Standard conv -> batch norm -> ReLU building block.
    conv = tf.layers.conv2d(inputs, filters, kernel_size, strides=stride, padding='SAME', use_bias=False)
    bn = tf.layers.batch_normalization(conv, training=training)
    relu = tf.nn.relu(bn)
    return relu

def bottleneck_block(inputs, filters, stride, training):
    # ResNet bottleneck: 1x1 reduce, 3x3, 1x1 expand, with a projection
    # shortcut whenever the spatial or channel shape changes.
    shortcut = inputs
    out = conv_bn_relu(inputs, filters, 1, 1, training)
    out = conv_bn_relu(out, filters, 3, stride, training)
    out = conv_bn_relu(out, 4 * filters, 1, 1, training)
    if stride != 1 or inputs.get_shape().as_list()[-1] != 4 * filters:
        shortcut = tf.layers.conv2d(inputs, 4 * filters, 1, strides=stride, padding='SAME', use_bias=False)
        shortcut = tf.layers.batch_normalization(shortcut, training=training)
    out = tf.add(out, shortcut)
    return out

def resnet50(inputs, training):
    out = conv_bn_relu(inputs, 64, 3, 1, training)
    out = bottleneck_block(out, 64, 1, training)
    out = bottleneck_block(out, 128, 2, training)
    out = bottleneck_block(out, 256, 2, training)
    out = bottleneck_block(out, 512, 2, training)
    out = tf.layers.average_pooling2d(out, 4, 1)
    out = tf.layers.flatten(out)
    out = tf.layers.dense(out, 10)
    return out

def run_model():
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    # Normalize images to [0, 1] and cast labels to int32.
    X_train, X_test = X_train.astype(np.float32) / 255.0, X_test.astype(np.float32) / 255.0
    y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

    images = tf.placeholder(tf.float32, shape=(None, 32, 32, 3), name='images')
    labels = tf.placeholder(tf.int32, shape=(None,), name='labels')
    is_training = tf.placeholder(tf.bool, name='is_training')

    logits = resnet50(images, is_training)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(0.001)
    train_op = optimizer.minimize(loss, global_step=global_step)

    batch_size = 128
    n_epochs = 100
    hooks = [tf.train.StopAtStepHook(last_step=n_epochs * len(X_train) // batch_size)]

    def get_batch(data, labels, batch_size):
        # Sample a random batch without replacement.
        idx = np.random.choice(np.arange(len(data)), batch_size, replace=False)
        return data[idx], labels[idx].flatten()

    with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
        while not sess.should_stop():
            batch_images, batch_labels = get_batch(X_train, y_train, batch_size)
            _, train_loss, step = sess.run(
                [train_op, loss, global_step],
                feed_dict={images: batch_images, labels: batch_labels, is_training: True})
            if step % 100 == 0:
                print(f"Step {step}, Loss: {train_loss:.4f}")

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    config_json = {}
    epl.init(epl.Config(config_json))
    print(epl.Env.get().cluster.gpu_num_per_worker)
    if epl.Env.get().cluster.gpu_num_per_worker > 1:
        # Avoid NCCL hang.
        os.environ["NCCL_LAUNCH_MODE"] = "GROUP"
    epl.set_default_strategy(epl.replicate(device_count=1))
    run_model()

However, I am confronted with the following issue:

Traceback (most recent call last):
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
Original stack trace for 'EPL_REPLICA_1/labels':

Could you give me a hand? Thank you very much!
You should replace get_batch with tf.data.Dataset.
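For illustration, a minimal sketch of that replacement in TensorFlow 1.x: the array names mirror the snippet above, but the small shapes and the pipeline parameters here are stand-ins, not a verified EPL example:

import numpy as np
import tensorflow as tf

# Stand-ins for the real X_train / y_train arrays loaded from CIFAR-10.
X_train = np.random.rand(1000, 32, 32, 3).astype(np.float32)
y_train = np.random.randint(0, 10, size=(1000,)).astype(np.int32)
batch_size = 128

# Build the batches inside the graph instead of feeding them via feed_dict.
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.shuffle(buffer_size=len(X_train)).repeat().batch(batch_size)
images, labels = dataset.make_one_shot_iterator().get_next()

# `images` and `labels` are now ordinary graph tensors: the model is built
# directly on top of them, so sess.run(train_op) needs no feed_dict for the data.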
Hi EPL team,
When I use the epl library to train the following code, I am confronted with the following issue:
Traceback (most recent call last):
  File "resnet50_split3.py", line 203, in <module>
    run_model()
  File "resnet50_split3.py", line 164, in run_model
    sess.run(tf.global_variables_initializer())
  File "/users/Master/anaconda3/envs/py37/lib/python3.7/site-packages/epl/parallel/hooks.py", line 453, in run
    assign_ops = _init_local_resources(self, fn)
  File "/users/Master/anaconda3/envs/py37/lib/python3.7/site-packages/epl/parallel/hooks.py", line 416, in _init_local_resources
    assign_ops = broadcast_variables()
  File "/users/Master/anaconda3/envs/py37/lib/python3.7/site-packages/epl/parallel/hooks.py", line 339, in broadcast_variables
    bcast_variables = taskgraph.get_variables(replica_idx)
  File "/users/Master/anaconda3/envs/py37/lib/python3.7/site-packages/epl/ir/taskgraph.py", line 409, in get_variables
    if id(var_tensor.taskgraph) != id(self):
AttributeError: 'NoneType' object has no attribute 'taskgraph'
Could you give me a hand when you are free? Thank you very much!