Model nondeterminism when using `K.layers.DepthwiseConv2D` / `tf.nn.depthwise_conv2d` (known issue) #26

kimzt · 2020-09-24T04:35:10Z

I'm using TF2.3 on Ubuntu 16.04.
To get deterministic results, I followed your instructions. Please check the attached code that is very simple MNIST example.
After running the code twice, I compared the results. Unfortunately, I got some non-deterministic results such as loss, embs, and so on.

Please check my code and give me some advice.

kimzt · 2020-09-24T04:57:51Z

I put my code below.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os
import time
import sys
import numpy as np
import importlib
import argparse
import pickle
import random
import imageio

import tensorflow as tf
from tensorflow import keras as K
from tensorflow.keras import layers

AUTOTUNE = tf.data.experimental.AUTOTUNE

def main(args):
    """Train a small MNIST classifier with a custom loop and dump its final state.

    Builds a Conv2D -> DepthwiseConv2D -> GlobalAveragePooling2D -> Dropout ->
    Dense embedding model topped with a `Logits` head, trains it for five
    epochs with Adam, and pickles the last batch, the embeddings, the gradients
    and the trainable variables to ``debug_train<gpu_no>.pkl`` so that two runs
    can be compared.

    Args:
        args: Parsed command-line namespace providing `gpu_no` (str), `seed`
            (int) and `deterministic` (bool); see `parse_arguments`.
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'  # filter INFO
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_no

    ## tf-deterministic
    if args.deterministic:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting
    # it here only affects child processes, not this one.
    os.environ['PYTHONHASHSEED'] = str(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    ## setting gpu memory
    # Only the GPU selected through CUDA_VISIBLE_DEVICES is expected here.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Let the allocation grow on demand instead of reserving it all.
            tf.config.experimental.set_memory_growth(gpus[0], True)
        except RuntimeError as e:
            print(e)

    nrof_classes = 10
    weight_decay = 1e-4

    ## building a model
    img_inputs = K.Input(shape=(28, 28, 1), name="img_inputs")
    x = K.layers.Conv2D(filters=64, kernel_size=[3, 3], strides=1)(img_inputs)
    x = K.layers.DepthwiseConv2D(kernel_size=[3, 3], strides=1, depth_multiplier=1,
                padding='same', activation='relu', use_bias=False,
                kernel_initializer=K.initializers.HeNormal(seed=2020),
                kernel_regularizer=K.regularizers.L2(weight_decay))(x)
    x = K.layers.GlobalAveragePooling2D()(x)
    x = K.layers.Dropout(0.5, seed=2020)(x)
    embeddings = K.layers.Dense(64, activation=None)(x)
    base_model = K.Model(inputs=img_inputs, outputs=embeddings)  # feature extraction model

    # classification head
    logit_layer = Logits(nrof_classes, weight_decay=weight_decay)
    logits = logit_layer(base_model.output)

    # The training model exposes both the embeddings and the class logits.
    train_model = K.Model(inputs=[base_model.input], outputs=[embeddings, logits])
    # train_model.summary()

    # Instantiate an optimizer.
    # optimizer = keras.optimizers.SGD(learning_rate=1e-3)
    optimizer = K.optimizers.Adam(learning_rate=1e-3, beta_1=0.9, beta_2=0.999, epsilon=0.1)
    train_model.compile(optimizer=optimizer)

    # Instantiate a loss function. It receives probabilities because the
    # training step applies softmax before calling it.
    loss_fn = K.losses.SparseCategoricalCrossentropy(from_logits=False)

    # Prepare the training dataset.
    batch_size = 64
    (x_train, y_train), (x_test, y_test) = K.datasets.mnist.load_data()
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

    epochs = 5
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))

        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            with tf.GradientTape() as tape:
                # The model declares a single image input, so only the image
                # batch is passed; the labels are used by the loss alone.
                embs, logits = train_model(x_batch_train, training=True)
                logits = tf.nn.softmax(logits)

                # Compute the loss value for this minibatch.
                ce_loss = loss_fn(y_batch_train, logits)

                # Add the regularisation losses collected by the model.
                total_loss = tf.add_n([ce_loss] + train_model.losses)

            grads = tape.gradient(total_loss, train_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, train_model.trainable_variables))

            # Log every 200 batches.
            if step % 200 == 0:
                print(
                    "Training loss (for one batch) at step %d: %.4f"
                    % (step, float(total_loss))
                )
                print("Seen so far: %s samples" % ((step + 1) * batch_size))

            ## debug code
            # if step == 200:
            #     with open('debug_train{}.pkl'.format(args.gpu_no), 'wb') as f:
            #         pickle.dump((x_batch_train, y_batch_train, embs, embs, grads, train_model.trainable_variables), f)
            #     exit()

    # Dump the state of the very last training step for run-to-run comparison.
    # NOTE(review): `embs` is stored twice; the second slot was presumably
    # meant to hold `logits`. Left unchanged so existing dumps stay comparable.
    with open('debug_train{}.pkl'.format(args.gpu_no), 'wb') as f:
        pickle.dump((x_batch_train, y_batch_train, embs, embs, grads, train_model.trainable_variables), f)
        

class Logits(K.layers.Layer):
    """Linear classification head computing ``inputs @ W + b``.

    Args:
        nrof_classes: Number of output classes (width of the logits).
        weight_decay: L2 regularisation factor applied to `W`.
        **kwargs: Standard Keras layer arguments (`name`, `trainable`,
            `dtype`, ...). Accepting them lets the layer be rebuilt from the
            dictionary returned by `get_config`, which includes those keys.
    """

    def __init__(self, nrof_classes, weight_decay=0.0, **kwargs):
        super(Logits, self).__init__(**kwargs)
        self.nrof_classes = nrof_classes
        self.weight_decay = weight_decay

    def build(self, input_shape):
        """Create the kernel `W` and bias `b`.

        Args:
            input_shape: Shape of the incoming embeddings; its last dimension
                is used as the number of rows of `W`.
        """
        # Registering the regulariser through add_weight makes Keras
        # re-evaluate the L2 penalty from the current value of W every time
        # the layer's losses are read, instead of storing a tensor computed
        # once at build time.
        self.W = self.add_weight(
            name='W',
            shape=(input_shape[-1], self.nrof_classes),
            dtype=tf.float32,
            initializer=K.initializers.HeNormal(seed=2020),
            regularizer=K.regularizers.L2(self.weight_decay),
            trainable=True)
        self.b = self.add_weight(
            name='b',
            shape=(self.nrof_classes,),
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
            trainable=True)
        super(Logits, self).build(input_shape)

    def call(self, inputs):
        """Return the unnormalised class scores for `inputs`."""
        return tf.matmul(inputs, self.W) + self.b

    def get_config(self):
        """Return the layer configuration, including the constructor arguments."""
        config = super(Logits, self).get_config()
        config.update({"nrof_classes": self.nrof_classes,
                       "weight_decay": self.weight_decay,
                      })
        return config

    def compute_output_shape(self, input_shape):
        """Return ``(None, nrof_classes)``; the batch dimension is left unknown."""
        return (None, self.nrof_classes)

def parse_arguments(argv):
    """Parse the command-line options of the training script.

    Args:
        argv: List of argument strings, without the program name.

    Returns:
        An `argparse.Namespace` with `gpu_no` (str, default ``'0'``), `seed`
        (int, default ``333``) and `deterministic` (bool, default ``False``).
    """
    option_specs = (
        ('--gpu_no', {'type': str, 'help': 'Set visible GPU.', 'default': '0'}),
        ('--seed', {'type': int, 'help': 'Random seed.', 'default': 333}),
        ('--deterministic', {'help': 'Enable deterministic training',
                             'action': 'store_true'}),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args(argv)

# Script entry point: parse the command-line options (program name excluded)
# and run the training.
if __name__ == '__main__':
    main(parse_arguments(sys.argv[1:]))

duncanriach · 2020-09-24T05:15:16Z

Hi Youngsam,

First of all, please will you come to the issue on the GitHub web app and edit your comments. By responding to the emails from GitHub, you've added a lot of junk into the comment thread. There are two comments that are about 50 or more lines long, but only need to be one or two lines long. It's better to respond in the issue, through the web interface.

Secondly, thanks for the code. I've looked through it and there is nothing that stands out immediately as being an issue. I'm going to need to instrument this and isolate the source of nondeterminism. I'll get back to you.

duncanriach · 2020-09-24T05:19:24Z

Actually, K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d is suspect. I intend to isolate and repro.

kimzt · 2020-09-24T05:21:13Z

I didn't notice that my email responses were automatically attached here. I'll respond through this webpage from now on. Thanks.

kimzt · 2020-09-24T05:33:49Z

OK. I deleted my comment written by email.

duncanriach · 2020-09-24T18:40:37Z

Thanks. I've also tidied up as much as I could by removing my responses to your email-sourced comments.

hermosayhl · 2021-03-19T01:58:14Z

Dear @duncanriach,

I'm using TF2.3 on Ubuntu 16.04.
To get deterministic results, I followed your instructions. Please check the attached code that is very simple MNIST example.
After running the code twice, I compared the results. Unfortunately, I got some non-deterministic results such as loss, embs, and so on.

Please check my code and give me some advice.

I am having trouble with this too! I got non-deterministic results on lots of experiments with Tensorflow2.3-GPU. Have you solved the problem?

Listed below are my statements:

def seed_everything(seed=13):
    """Seed the NumPy, Python and TensorFlow RNGs and set determinism env vars.

    Args:
        seed: Integer seed handed to every generator and written (as a string)
            to ``PYTHONHASHSEED``.
    """
    # Seed the three random number generators, in the same order as before.
    for apply_seed in (numpy.random.seed, random.seed, tf.random.set_seed):
        apply_seed(seed)

    # Environment switches, written one by one in their original order.
    env_flags = (
        ('PYTHONHASHSEED', str(seed)),
        ('TF_DETERMINISTIC_OPS', '1'),
        ('TF_CUDNN_DETERMINISTIC', '1'),
        ('TF_KERAS', '1'),
    )
    for key, value in env_flags:
        os.environ[key] = value

# Apply the seeding and determinism settings with a fixed seed.
seed_everything(19980212)

duncanriach · 2021-03-19T04:44:15Z

@hermosayhl: you're using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d in your model too?

hermosayhl · 2021-03-19T11:53:52Z

DepthwiseConv2D

Not yet. Listed below is the code for the network, copied from ResNet_TF2:

# Width of the final fully connected layer (number of output classes).
category_num = 1000

# Residual block variant used by each supported network depth.
block_type = dict.fromkeys((18, 34), 'basic block')
block_type.update(dict.fromkeys((50, 101, 152), 'bottlenect block'))

# Number of residual blocks in each of the four stages, per network depth.
block_num = dict(zip(
    (18, 34, 50, 101, 152),
    ((2, 2, 2, 2), (3, 4, 6, 3), (3, 4, 6, 3), (3, 4, 23, 3), (3, 4, 36, 3)),
))

# Base filter count of each stage; it doubles from one stage to the next.
filter_num = tuple(64 * 2 ** stage for stage in range(4))

from tensorflow.keras.layers import Conv2D, GlobalAvgPool2D, BatchNormalization, Dense

class BasicBlock(tf.keras.layers.Layer):
    """Residual block made of two 3x3 convolutions with batch normalisation.

    Args:
        filters: Number of output filters of both convolutions (and of the
            shortcut projection, when present).
        strides: Strides of the first convolution. Any value other than
            ``(1, 1)`` adds a pooled and projected shortcut branch.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, filters, strides=(1, 1), **kwargs):
        # Initialise the Keras base class before assigning sub-layers so that
        # attribute tracking is set up first, as `ResNet` already does.
        super(BasicBlock, self).__init__(**kwargs)
        self.strides = strides
        if self.strides != (1, 1):
            self.shortcut_projection = Conv2D(filters, (1, 1), name='projection', padding='same', use_bias=False)
            self.shortcut_bn = BatchNormalization(name='shortcut_bn', momentum=0.9, epsilon=1e-5)
        self.conv_0 = Conv2D(filters, (3, 3), name='conv_0', strides=self.strides, padding='same', use_bias=False)
        self.conv_1 = Conv2D(filters, (3, 3), name='conv_1', padding='same', use_bias=False)
        self.bn_0 = BatchNormalization(name='bn_0', momentum=0.9, epsilon=1e-5)
        self.bn_1 = BatchNormalization(name='bn_1', momentum=0.9, epsilon=1e-5)

    def call(self, inputs, training=None):
        """Apply the residual block.

        Args:
            inputs: Input feature map.
            training: Passed to every batch-normalisation layer to select
                training or inference behaviour.

        Returns:
            ``relu(main_branch(inputs) + shortcut(inputs))``.
        """
        net = self.conv_0(inputs)
        net = self.bn_0(net, training=training)
        net = tf.nn.relu(net)
        net = self.conv_1(net)
        net = self.bn_1(net, training=training)
        if self.strides != (1, 1):
            # The shortcut is always pooled with a fixed 2x2 window and stride.
            shortcut = tf.nn.avg_pool2d(inputs, ksize=(2, 2), strides=(2, 2), padding='SAME')
            shortcut = self.shortcut_projection(shortcut)
            # Pass `training` explicitly, like every other BN call in this file.
            shortcut = self.shortcut_bn(shortcut, training=training)
        else:
            shortcut = inputs
        net = net + shortcut
        net = tf.nn.relu(net)
        return net

class BottleneckBlock(tf.keras.layers.Layer):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions with BN.

    The last convolution and the shortcut projection output ``filters * 4``
    channels.

    Args:
        filters: Number of filters of the first two convolutions.
        strides: Strides of the 3x3 convolution. Any value other than
            ``(1, 1)`` adds a pooled and projected shortcut branch.
        projection: When true, the shortcut is projected with a 1x1
            convolution without pooling.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, filters, strides=(1, 1), projection=False, **kwargs):
        # Initialise the Keras base class before assigning sub-layers so that
        # attribute tracking is set up first, as `ResNet` already does.
        super(BottleneckBlock, self).__init__(**kwargs)
        self.strides = strides
        self.projection = projection
        if self.strides != (1, 1) or self.projection:
            self.shortcut_projection = Conv2D(filters * 4, (1, 1), name='projection', padding='same', use_bias=False)
            self.shortcut_bn = BatchNormalization(name='shortcut_bn', momentum=0.9, epsilon=1e-5)

        self.conv_0 = Conv2D(filters, (1, 1), name='conv_0', padding='same', use_bias=False)
        self.conv_1 = Conv2D(filters, (3, 3), name='conv_1', strides=strides, padding='same', use_bias=False)
        self.conv_2 = Conv2D(filters * 4, (1, 1), name='conv_2', padding='same', use_bias=False)
        self.bn_0 = BatchNormalization(name='bn_0', momentum=0.9, epsilon=1e-5)
        self.bn_1 = BatchNormalization(name='bn_1', momentum=0.9, epsilon=1e-5)
        self.bn_2 = BatchNormalization(name='bn_2', momentum=0.9, epsilon=1e-5)

    def call(self, inputs, training=None):
        """Apply the bottleneck block.

        Args:
            inputs: Input feature map.
            training: Passed to every batch-normalisation layer to select
                training or inference behaviour.

        Returns:
            ``relu(main_branch(inputs) + shortcut(inputs))``.
        """
        net = self.conv_0(inputs)
        net = self.bn_0(net, training=training)
        net = tf.nn.relu(net)

        net = self.conv_1(net)
        net = self.bn_1(net, training=training)
        net = tf.nn.relu(net)

        net = self.conv_2(net)
        net = self.bn_2(net, training=training)

        if self.projection:
            # Projection takes precedence: the shortcut is not pooled here.
            shortcut = self.shortcut_projection(inputs)
            shortcut = self.shortcut_bn(shortcut, training=training)
        elif self.strides != (1, 1):
            # The shortcut is always pooled with a fixed 2x2 window and stride.
            shortcut = tf.nn.avg_pool2d(inputs, ksize=(2, 2), strides=(2, 2), padding='SAME')
            shortcut = self.shortcut_projection(shortcut)
            shortcut = self.shortcut_bn(shortcut, training=training)
        else:
            shortcut = inputs

        net = net + shortcut
        net = tf.nn.relu(net)
        return net


class ResNet(tf.keras.models.Model):
    """ResNet classifier assembled from `BasicBlock` or `BottleneckBlock` stages.

    Args:
        layer_num: Network depth; used as the key into the module-level
            `block_type` and `block_num` tables.
        **kwargs: Forwarded to `tf.keras.models.Model`.
    """

    def __init__(self, layer_num, **kwargs):
        super(ResNet, self).__init__(**kwargs)
        uses_basic = block_type[layer_num] == 'basic block'
        self.block = BasicBlock if uses_basic else BottleneckBlock

        # Stem: strided 7x7 convolution followed by batch normalisation.
        self.conv0 = Conv2D(64, (7, 7), strides=(2, 2), name='conv0', padding='same', use_bias=False)
        self.bn = BatchNormalization(name='bn', momentum=0.9, epsilon=1e-5)

        self.block_collector = []
        stage_plan = zip(block_num[layer_num], filter_num)
        for stage_no, (repeats, width) in enumerate(stage_plan, start=1):
            # The first block of a stage is special: later stages downsample,
            # and the first stage of a bottleneck network projects its shortcut.
            if stage_no > 1:
                head_options = {'strides': (2, 2)}
            elif uses_basic:
                head_options = {}
            else:
                head_options = {'projection': True}
            head = self.block(width, name='conv{}_0'.format(stage_no), **head_options)
            self.block_collector.append(head)

            # The remaining blocks of the stage use the default settings.
            for repeat_no in range(1, repeats):
                follower = self.block(width, name='conv{}_{}'.format(stage_no, repeat_no))
                self.block_collector.append(follower)

        self.global_average_pooling = GlobalAvgPool2D()
        self.fc = Dense(category_num, name='fully_connected', activation='softmax', use_bias=False)

    def call(self, inputs, training):
        """Run the network and return the softmax output of the final layer.

        Args:
            inputs: Input batch fed to the stem convolution.
            training: Forwarded to the stem batch normalisation and to every
                residual block.
        """
        stem = tf.nn.relu(self.bn(self.conv0(inputs), training))
        net = tf.nn.max_pool2d(stem, ksize=(3, 3), strides=(2, 2), padding='SAME')

        for residual_block in self.block_collector:
            net = residual_block(net, training)

        pooled = self.global_average_pooling(net)
        return self.fc(pooled)

duncanriach · 2021-03-22T20:06:17Z

@hermosayhl, please open a new issue and remove your comments from this issue.

duncanriach · 2021-03-22T20:08:42Z

I changed the title of this issue and I'm going to close it. @kimzt's model contains an op that does not yet have a deterministic GPU implementation. Recommend using another op or finding another work-around until we implement a GPU-deterministic version of this op.

duncanriach · 2021-03-22T20:09:26Z

closing

duncanriach · 2021-04-16T02:37:34Z

@kimzt, please will you try running the patch I mention here on TensorFlow Issue 47174. This will move the depthwise-conv2d functionality onto the CPU and should result in your model training deterministically, though a little more slowly.

kimzt · 2021-04-16T07:56:31Z

Thank you for the information. I updated my code and applied the patch you suggested. Unfortunately, I still get non-deterministic results with my code. More specifically, the training loss looks deterministic, but the training variables do not. The following log compares the training variables after two runs on the GPU.

===== Summary of trainig variables per layer =====
[conv2d/kernel:0 ] 0.8125687838 0.8125276566
[conv2d/bias:0 ] -0.0547780395 -0.0547753200
[depthwise_conv2d/depthwise_kernel:0] -3.4531931877 -3.4530177116
[dense/kernel:0 ] -4.1456966400 -4.1456689835
[dense/bias:0 ] -0.0831420571 -0.0831417441
[logits/W:0 ] 6.8328371048 6.8328371048
[logits/b:0 ] -0.0069593228 -0.0069591329

I hope this helps you make a patch for a deterministic DepthwiseConv2D.

duncanriach · 2021-04-22T21:48:14Z

Thanks @kimzt. Either the patch is not working for some reason or there is another source of nondeterminism. I have a task to debug the model; I can't promise when I'll get to that.

duncanriach · 2021-04-22T21:48:48Z

I'm also going to reopen this issue ...

duncanriach · 2021-04-29T01:28:16Z

See this study for more information about nondeterminism and depthwise convolution.

duncanriach · 2021-09-17T22:55:53Z

Update: MR 51920 adds determinism-unimplemented exception-throwing to tf.nn.depthwise_conv2d in stock TensorFlow. This will be included in the stock TF 2.7 release. Meanwhile, you can (relatively easily) try out the latest determinism functionality in the top-of-tree by using the tensorflow/tensorflow:nightly-gpu Docker container image.

duncanriach changed the title ~~Cannot get deterministic results in TF2.3~~ Model nondeterminism when using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d Mar 22, 2021

duncanriach changed the title ~~Model nondeterminism when using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d~~ Model nondeterminism when using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d (known issue) Mar 22, 2021

duncanriach closed this as completed Mar 22, 2021

duncanriach added the known issue label Mar 22, 2021

duncanriach reopened this Apr 22, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Model nondeterminism when using `K.layers.DepthwiseConv2D` / `tf.nn.depthwise_conv2d` (known issue) #26

Model nondeterminism when using `K.layers.DepthwiseConv2D` / `tf.nn.depthwise_conv2d` (known issue) #26

kimzt commented Sep 24, 2020 •

edited

Loading

kimzt commented Sep 24, 2020

duncanriach commented Sep 24, 2020 •

edited

Loading

duncanriach commented Sep 24, 2020

kimzt commented Sep 24, 2020

kimzt commented Sep 24, 2020

duncanriach commented Sep 24, 2020

hermosayhl commented Mar 19, 2021

duncanriach commented Mar 19, 2021

hermosayhl commented Mar 19, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Apr 16, 2021

kimzt commented Apr 16, 2021

duncanriach commented Apr 22, 2021

duncanriach commented Apr 22, 2021

duncanriach commented Apr 29, 2021

duncanriach commented Sep 17, 2021

Model nondeterminism when using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d (known issue) #26

Model nondeterminism when using K.layers.DepthwiseConv2D / tf.nn.depthwise_conv2d (known issue) #26

Comments

kimzt commented Sep 24, 2020 • edited Loading

kimzt commented Sep 24, 2020

duncanriach commented Sep 24, 2020 • edited Loading

duncanriach commented Sep 24, 2020

kimzt commented Sep 24, 2020

kimzt commented Sep 24, 2020

duncanriach commented Sep 24, 2020

hermosayhl commented Mar 19, 2021

duncanriach commented Mar 19, 2021

hermosayhl commented Mar 19, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Mar 22, 2021

duncanriach commented Apr 16, 2021

kimzt commented Apr 16, 2021

duncanriach commented Apr 22, 2021

duncanriach commented Apr 22, 2021

duncanriach commented Apr 29, 2021

duncanriach commented Sep 17, 2021

Model nondeterminism when using `K.layers.DepthwiseConv2D` / `tf.nn.depthwise_conv2d` (known issue) #26

Model nondeterminism when using `K.layers.DepthwiseConv2D` / `tf.nn.depthwise_conv2d` (known issue) #26

kimzt commented Sep 24, 2020 •

edited

Loading

duncanriach commented Sep 24, 2020 •

edited

Loading