# text-classification/dynamic_memory_network.py

# -*- coding: utf-8 -*-
"""
Dynamic Memory Network: a.Input Module,b.Question Module,c.Episodic Memory Module,d.Answer Module.
  1.Input Module: encode raw texts into vector representation
  2.Question Module: encode question into vector representation
  3.Episodic Memory Module: with inputs,it chooses which parts of inputs to focus on through the attention mechanism,
                            taking into account of question and previous memory====>it poduce a 'memory' vector.
  4.Answer Module:generate an answer from the final memory vector.
"""
import tensorflow as tf
import numpy as np
import tensorflow.contrib as tf_contrib
from tensorflow.contrib import rnn


class DynamicMemoryNetwork:
    def __init__(
        self,
        num_classes,
        learning_rate,
        decay_steps,
        decay_rate,
        sequence_length,
        story_length,
        vocab_size,
        embed_size,
        hidden_size,
        num_pass = 2,
        use_gated_gru = True,
        decode_with_sequences = False,
        initializer = tf.random_normal_initializer(stddev = 0.1),
        clip_gradients = 5.0,
        l2_lambda = 0.0001,
    ):
        """Store the hyperparameters and build the complete graph.

        Construction creates the placeholders, the weights, the forward
        pass, the accuracy metric, the loss and the training op, in that
        order.

        :param num_classes: number of units of the final dense (logit) layer
        :param learning_rate: initial value of the non-trainable
            ``learning_rate`` variable that feeds the exponential decay
        :param decay_steps: forwarded to ``tf.train.exponential_decay``
        :param decay_rate: forwarded to ``tf.train.exponential_decay``
        :param sequence_length: number of token ids per sentence (last axis
            of ``story``, second axis of ``query``)
        :param story_length: number of sentences per story (second axis of
            ``story``)
        :param vocab_size: number of rows of the embedding matrix
        :param embed_size: number of columns of the embedding matrix
        :param hidden_size: number of units of every GRU in the model
        :param num_pass: number of passes of the episodic memory loop
        :param use_gated_gru: if true, episodes are built with the gated
            GRU; otherwise with a softmax-weighted sum of the facts
        :param decode_with_sequences: if true, the answer module runs
            ``sequence_length`` steps instead of one
        :param initializer: initializer for the hand-made GRU matrices and
            the embedding matrix
        :param clip_gradients: forwarded to ``optimize_loss``
        :param l2_lambda: weight of the L2 penalty added to the loss
        """
        self.num_classes = num_classes
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.learning_rate = tf.Variable(
            learning_rate, trainable = False, name = 'learning_rate'
        )
        # Running this op halves the stored base learning rate in place.
        self.learning_rate_decay_half_op = tf.assign(
            self.learning_rate, self.learning_rate * 0.5
        )
        self.initializer = initializer
        self.hidden_size = hidden_size
        self.clip_gradients = clip_gradients
        self.story_length = story_length
        self.num_pass = num_pass
        self.use_gated_gru = use_gated_gru
        self.decode_with_sequences = decode_with_sequences
        self.l2_lambda = l2_lambda

        # Inputs: token ids of the story sentences and of the question.
        self.story = tf.placeholder(
            tf.int32, [None, self.story_length, self.sequence_length]
        )
        # The batch size is read from the fed story at run time, so it is a
        # tensor rather than a Python int.
        self.batch_size = tf.shape(self.story)[0]
        self.query = tf.placeholder(tf.int32, [None, self.sequence_length])

        # One integer class label per example.
        self.answer_single = tf.placeholder(tf.int32, [None])
        self.dropout_keep_prob = tf.placeholder(tf.float32)

        self.global_step = tf.Variable(
            0, trainable = False, name = 'Global_Step'
        )
        self.epoch_step = tf.Variable(0, trainable = False, name = 'Epoch_Step')
        self.epoch_increment = tf.assign(
            self.epoch_step, tf.add(self.epoch_step, tf.constant(1))
        )
        self.decay_steps, self.decay_rate = decay_steps, decay_rate

        self.instantiate_weights()
        self.logits = self.inference()

        # NOTE(review): predictions, accuracy and loss treat axis 1 of the
        # logits as the class axis. With decode_with_sequences=True the
        # answer module stacks per-step logits on axis 1 instead - confirm
        # that mode is actually supported before relying on it.
        self.predictions = tf.argmax(self.logits, 1, name = 'predictions')
        correct_prediction = tf.equal(
            tf.cast(self.predictions, tf.int32), self.answer_single
        )
        self.accuracy = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32), name = 'Accuracy'
        )
        # Pass the configured penalty weight explicitly; calling loss()
        # without it would silently fall back to that method's own default
        # and ignore the constructor argument.
        self.cost = self.loss(l2_lambda = self.l2_lambda)
        self.optimizer = self.train()

    def inference(self):
        """Build the forward pass and return the answer logits.

        The input, question and episodic memory modules are built in that
        order; each one stores its result on ``self`` for the next one to
        read. The answer module then produces the logits.
        """
        for build_stage in (
            self.input_module,
            self.question_module,
            self.episodic_memory_module,
        ):
            build_stage()
        return self.answer_module()

    def input_module(self):
        """Encode the story sentences with a GRU.

        The token embeddings of each sentence are flattened into a single
        vector, so the GRU steps over sentences, not over tokens. The
        per-sentence GRU outputs are stored in ``self.story_embedding``
        with shape [batch_size, story_length, hidden_size].
        """
        token_vectors = tf.nn.embedding_lookup(self.Embedding, self.story)
        # One row per sentence: all of its token embeddings side by side.
        flat_shape = (
            self.batch_size,
            self.story_length,
            self.sequence_length * self.embed_size,
        )
        sentence_vectors = tf.reshape(token_vectors, flat_shape)
        sentence_gru = rnn.GRUCell(self.hidden_size)
        # Only the per-step outputs are kept; the final state is discarded.
        rnn_outputs, _ = tf.nn.dynamic_rnn(
            sentence_gru,
            sentence_vectors,
            dtype = tf.float32,
            scope = 'input_module',
        )
        self.story_embedding = rnn_outputs

    def question_module(self):
        """Encode the question with a GRU over its tokens.

        input: token ids of the query: [batch_size, sequence_length]
        The final GRU state, [batch_size, hidden_size], is stored in
        ``self.query_embedding``; nothing is returned.
        """
        token_vectors = tf.nn.embedding_lookup(self.Embedding, self.query)
        question_gru = rnn.GRUCell(self.hidden_size)
        rnn_result = tf.nn.dynamic_rnn(
            question_gru,
            token_vectors,
            dtype = tf.float32,
            scope = 'question_module',
        )
        # dynamic_rnn returns (outputs, final_state); keep the final state.
        self.query_embedding = rnn_result[1]

    def episodic_memory_module(self):
        """Refine the memory vector over ``num_pass`` passes of the facts.

        Each pass:
          1. scores every fact against the question and the current memory
             (attention gates),
          2. summarises the facts into one episode vector, either by
             running the gated GRU over them or by a softmax-weighted sum,
          3. updates the memory from the episode with a GRU step.

        input: story (from input module): [batch_size, story_length, hidden_size]
        output: the final memory, [batch_size, hidden_size], is stored in
                ``self.m_T``; nothing is returned.
        """
        # One [batch_size, hidden_size] tensor per fact.
        facts = [
            tf.squeeze(fact_slice, axis = 1)
            for fact_slice in tf.split(
                self.story_embedding, self.story_length, axis = 1
            )
        ]
        # The memory starts out as the question itself.
        memory = self.query_embedding
        # Created once, before the loop: the gated-GRU state is carried
        # over from one pass to the next rather than reset to zero.
        hidden = tf.zeros((self.batch_size, self.hidden_size))
        for pass_index in range(self.num_pass):
            gates = self.attention_mechanism_parallel(
                self.story_embedding,
                memory,
                self.query_embedding,
                pass_index,
            )
            if not self.use_gated_gru:
                # Soft attention: normalise the gates over the facts and
                # take the weighted sum of the fact vectors.
                weights = tf.nn.softmax(gates, dim = 1)
                weights = tf.expand_dims(weights, axis = 2)
                episode = tf.multiply(weights, self.story_embedding)
                episode = tf.reduce_sum(episode, axis = 1)
            else:
                # One [batch_size, 1] gate per fact, in fact order.
                per_fact_gates = tf.split(gates, self.story_length, axis = 1)
                for fact, gate in zip(facts, per_fact_gates):
                    hidden = self.gated_gru(fact, hidden, gate)
                # The state after the last fact is the episode.
                episode = hidden
            memory = self.gru_cell(episode, memory, 'gru_episodic_memory')
        self.m_T = memory

    def answer_module(self):
        """Decode the final memory vector into class logits.

        Input (read from ``self``):
            m_T: final memory from the episodic memory module: [batch_size, hidden_size]
            query_embedding: encoded question: [batch_size, hidden_size]
        Output:
            [batch_size, num_classes] when a single step is decoded, or the
            per-step logits stacked on axis 1 when
            ``decode_with_sequences`` is set.
        """
        num_steps = 1
        if self.decode_with_sequences:
            num_steps = self.sequence_length
        state = self.m_T
        # Stands in for the previous prediction; it is never updated, so
        # every step sees zeros in this slot.
        previous_output = tf.zeros((self.batch_size, self.hidden_size))
        step_logits = []
        for _ in range(num_steps):
            # A new GRU cell and a new dense layer are created on every
            # step, so steps do not share parameters.
            decoder_cell = rnn.GRUCell(self.hidden_size)
            decoder_input = tf.concat(
                [previous_output, self.query_embedding], axis = 1
            )
            state = decoder_cell(decoder_input, state)[1]
            step_logits.append(
                tf.layers.dense(state, units = self.num_classes)
            )
        if not self.decode_with_sequences:
            return step_logits[0]
        return tf.stack(step_logits, axis = 1)

    def gated_gru(self, c_current, h_previous, g_current):
        """
        one step of the attention-gated gru
        :param  c_current: current fact: [batch_size,hidden_size]
        :param  h_previous: previous hidden state: [batch_size,hidden_size]
        :param  g_current: attention gate of the current fact: [batch_size,1]
        :return h_current: [batch_size,hidden_size]
        """
        candidate = self.gru_cell(
            c_current, h_previous, 'gru_candidate_sentence'
        )
        # The gate interpolates between the GRU proposal and the old state:
        # a gate near 0 lets the fact pass without changing the state.
        from_candidate = tf.multiply(g_current, candidate)
        from_previous = tf.multiply(1 - g_current, h_previous)
        return from_candidate + from_previous

    def attention_mechanism_parallel(self, c_full, m, q, i):
        """ Score all candidate facts at once against the question and the previous memory.
        Input:
           c_full: candidate facts. shape:[batch_size,story_length,hidden_size]
           m: previous memory. shape:[batch_size,hidden_size]
           q: question. shape:[batch_size,hidden_size]
           i: pass number; used as a suffix of the variable scopes of the
              two bilinear terms, so each pass has its own weights there
        Output: one sigmoid gate per fact. shape:[batch_size,story_length]
        """
        # Insert a length-1 fact axis so q and m broadcast against c_full.
        question = tf.expand_dims(q, axis = 1)
        memory = tf.expand_dims(m, axis = 1)

        # Interaction features between each fact and the question / memory.
        fact_times_question = tf.multiply(c_full, question)
        fact_times_memory = tf.multiply(c_full, memory)
        fact_question_gap = tf.abs(tf.subtract(c_full, question))
        fact_memory_gap = tf.abs(tf.subtract(c_full, memory))
        bilinear_question = self.x1Wx2_parallel(
            c_full, question, 'c_w_q' + str(i)
        )
        bilinear_memory = self.x1Wx2_parallel(
            c_full, memory, 'c_w_m' + str(i)
        )

        # concat does not broadcast, so q and m are repeated per fact.
        repeat_per_fact = [1, self.story_length, 1]
        question_tiled = tf.tile(question, repeat_per_fact)
        memory_tiled = tf.tile(memory, repeat_per_fact)

        feature_parts = [
            c_full,
            memory_tiled,
            question_tiled,
            fact_times_question,
            fact_times_memory,
            fact_question_gap,
            fact_memory_gap,
            bilinear_question,
            bilinear_memory,
        ]
        features = tf.concat(feature_parts, 2)

        # Two-layer scorer: tanh hidden layer, then one sigmoid unit per fact.
        scorer_hidden = tf.layers.dense(
            features, self.hidden_size * 3, activation = tf.nn.tanh
        )
        gate = tf.layers.dense(scorer_hidden, 1, activation = tf.nn.sigmoid)
        return tf.squeeze(gate, axis = 2)

    def x1Wx2_parallel(self, x1, x2, scope):
        """
        bilinear-style interaction term between every fact and one vector
        :param x1: [batch_size,story_length,hidden_size]
        :param x2: [batch_size,1,hidden_size]
        :param scope: a string; variable scope that owns the projection weights
        :return:  [batch_size,story_length,hidden_size]
        """
        with tf.variable_scope(scope):
            # All facts of a story are flattened into one row. The width is
            # spelled out as a Python int (rather than hidden_size alone,
            # which only matches when story_length == 1, or -1, which would
            # leave the static width unknown) so that the reshape is valid
            # for any story_length and the dense layer below still sees a
            # defined last dimension.
            flat_width = self.story_length * self.hidden_size
            x1 = tf.reshape(x1, shape = (self.batch_size, flat_width))
            # Linear projection (the "W" of x1.W.x2), no bias.
            x1_w = tf.layers.dense(x1, flat_width, use_bias = False)
            # [batch_size, flat_width, 1] x [batch_size, 1, hidden_size]
            # -> outer product: [batch_size, flat_width, hidden_size]
            x1_w_expand = tf.expand_dims(x1_w, axis = 2)
            x1_w_x2 = tf.matmul(x1_w_expand, x2)
            x1_w_x2 = tf.reshape(
                x1_w_x2,
                shape = (
                    self.batch_size,
                    self.story_length,
                    self.hidden_size,
                    self.hidden_size,
                ),
            )
            # Sum out the x2 axis to get one hidden_size vector per fact.
            x1_w_x2 = tf.reduce_sum(x1_w_x2, axis = 3)
            return x1_w_x2

    def gru_cell(self, Xt, h_t_minus_1, variable_scope):
        """
        single gru step using the weights created in instantiate_weights
        :param Xt: input of this step: [batch_size,hidden_size]
        :param h_t_minus_1: previous state: [batch_size,hidden_size]
        :param variable_scope: scope the ops are created in; the weights
            are read from ``self``, so every caller shares the same
            parameters whatever scope name it passes
        :return: new state: [batch_size,hidden_size]
        """
        with tf.variable_scope(variable_scope):
            # Update gate: how much of the candidate replaces the old state.
            update_logits = (
                tf.matmul(Xt, self.W_z)
                + tf.matmul(h_t_minus_1, self.U_z)
                + self.b_z
            )
            update_gate = tf.nn.sigmoid(update_logits)

            # Reset gate: how much of the old state feeds the candidate.
            reset_logits = (
                tf.matmul(Xt, self.W_r)
                + tf.matmul(h_t_minus_1, self.U_r)
                + self.b_r
            )
            reset_gate = tf.nn.sigmoid(reset_logits)

            # The reset gate is applied after the recurrent projection.
            input_part = tf.matmul(Xt, self.W_h)
            recurrent_part = reset_gate * tf.matmul(h_t_minus_1, self.U_h)
            candidate = tf.nn.tanh(input_part + recurrent_part + self.b_h)

            # Interpolate between the old state and the candidate.
            kept = (1 - update_gate) * h_t_minus_1
            new_state = kept + update_gate * candidate
        return new_state

    def loss(self, l2_lambda = 0.0001):
        """Return mean cross-entropy plus an L2 penalty on the weights.

        :param l2_lambda: multiplier of the summed L2 penalty
        :return: scalar loss tensor

        The penalty covers every trainable variable whose name contains
        neither 'bias' nor 'alpha'.
        """
        with tf.name_scope('loss'):
            per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = self.answer_single, logits = self.logits
            )
            data_term = tf.reduce_mean(per_example)

            penalties = []
            for variable in tf.trainable_variables():
                if 'bias' in variable.name or 'alpha' in variable.name:
                    continue
                penalties.append(tf.nn.l2_loss(variable))
            weight_term = tf.add_n(penalties) * l2_lambda

            total = data_term + weight_term
        return total

    def train(self):
        """Create the training op: Adam with a staircase-decayed learning rate.

        The decayed rate is also stored in ``self.learning_rate_``.
        Gradients are clipped with ``self.clip_gradients``, and
        ``self.global_step`` is handed to ``optimize_loss`` together with
        the loss.

        :return: the op returned by ``optimize_loss``
        """
        decayed_rate = tf.train.exponential_decay(
            self.learning_rate,
            self.global_step,
            self.decay_steps,
            self.decay_rate,
            staircase = True,
        )
        self.learning_rate_ = decayed_rate
        return tf_contrib.layers.optimize_loss(
            self.cost,
            global_step = self.global_step,
            learning_rate = decayed_rate,
            optimizer = 'Adam',
            clip_gradients = self.clip_gradients,
        )

    def instantiate_weights(self):
        """Create the hand-made GRU parameters and the embedding matrix.

        The GRU parameters (update gate ``z``, reset gate ``r``, candidate
        ``h``) are the ones used by ``gru_cell``. Both the input matrices
        ``W_*`` and the recurrent matrices ``U_*`` are
        [hidden_size, hidden_size]: every tensor passed to ``gru_cell``
        (facts, episodes, memories, states) is hidden_size wide, so sizing
        the first axis by embed_size would only build when
        embed_size == hidden_size.
        """
        with tf.variable_scope('gru_cell'):
            gate_shape = [self.hidden_size, self.hidden_size]
            bias_shape = [self.hidden_size]

            # GRU parameters: update gate related
            self.W_z = tf.get_variable(
                'W_z', shape = gate_shape, initializer = self.initializer
            )
            self.U_z = tf.get_variable(
                'U_z', shape = gate_shape, initializer = self.initializer
            )
            self.b_z = tf.get_variable('b_z', shape = bias_shape)

            # GRU parameters: reset gate related
            self.W_r = tf.get_variable(
                'W_r', shape = gate_shape, initializer = self.initializer
            )
            self.U_r = tf.get_variable(
                'U_r', shape = gate_shape, initializer = self.initializer
            )
            self.b_r = tf.get_variable('b_r', shape = bias_shape)

            # GRU parameters: candidate state related
            self.W_h = tf.get_variable(
                'W_h', shape = gate_shape, initializer = self.initializer
            )
            self.U_h = tf.get_variable(
                'U_h', shape = gate_shape, initializer = self.initializer
            )
            self.b_h = tf.get_variable('b_h', shape = bias_shape)

        with tf.variable_scope('embedding_projection'):
            # One embed_size row per vocabulary id; shared by story and query.
            self.Embedding = tf.get_variable(
                'Embedding',
                shape = [self.vocab_size, self.embed_size],
                initializer = self.initializer,
            )