From 1f3403dcbe2cd7e57e33882ccb520aaeb72e27fe Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Tue, 18 Aug 2020 15:33:58 +0100 Subject: [PATCH] WIP: Rewrite LSTM in PyTorch Signed-off-by: format 2020.06.15 --- WORKSPACE | 17 -- programl/models/lstm/BUILD | 2 +- programl/models/lstm/lstm.py | 276 ++++++++++--------- programl/models/lstm/lstm_batch.py | 4 +- programl/task/dataflow/BUILD | 1 + programl/task/dataflow/lstm_batch_builder.py | 31 +-- programl/task/dataflow/train_lstm.py | 6 +- programl/task/dataflow/train_lstm_test.py | 4 +- requirements.txt | 2 +- third_party/py/tensorflow/BUILD | 3 +- 10 files changed, 167 insertions(+), 179 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 2d9d8cad4..0a99fbfe3 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -34,8 +34,6 @@ load( pip_repositories() -# ProGraML pip requirements. - pip3_import( name = "programl_requirements", timeout = 3600, @@ -49,21 +47,6 @@ load( programl_pip_install() -# TensorFlow pip requirements. - -pip3_import( - name = "programl_tensorflow_requirements", - timeout = 3600, - requirements = "@programl//third_party/py/tensorflow:requirements.txt", -) - -load( - "@programl_tensorflow_requirements//:requirements.bzl", - programl_pip_install = "pip_install", -) - -programl_pip_install() - # Protobuf. pip3_import( name = "protobuf_py_deps", diff --git a/programl/models/lstm/BUILD b/programl/models/lstm/BUILD index 7c9c47677..b1b643a16 100644 --- a/programl/models/lstm/BUILD +++ b/programl/models/lstm/BUILD @@ -27,7 +27,7 @@ py_library( "//programl/proto:epoch_py", "//third_party/py/labm8", "//third_party/py/numpy", - "//third_party/py/tensorflow", + "//third_party/py/torch", ], ) diff --git a/programl/models/lstm/lstm.py b/programl/models/lstm/lstm.py index 9672db30a..90c5566e7 100644 --- a/programl/models/lstm/lstm.py +++ b/programl/models/lstm/lstm.py @@ -14,26 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. """An LSTM for instruction classification.""" -import pathlib -import tempfile from typing import Any from typing import Dict from typing import List import numpy as np -import tensorflow as tf +import torch +from torch import nn +from torch import optim from labm8.py import app from labm8.py.progress import NullContext from labm8.py.progress import ProgressContext -from tensorflow import keras +from programl.models.ggnn.node_embeddings import NodeEmbeddings +from programl.models.ggnn.loss import Loss from programl.models.batch_data import BatchData from programl.models.batch_results import BatchResults from programl.models.lstm.lstm_batch import LstmBatchData from programl.models.model import Model from programl.proto import epoch_pb2 - FLAGS = app.FLAGS app.DEFINE_integer( @@ -62,19 +62,24 @@ "The value used for the positive class in the 1-hot selector embedding " "vectors. Has no effect when selector embeddings are not used.", ) -app.DEFINE_boolean( - "cudnn_lstm", - True, - "If set, use CuDNNLSTM implementation when a GPU is available. Else use " - "default Keras implementation. Note that the two implementations are " - "incompatible - a model saved using one LSTM type cannot be restored using " - "the other LSTM type.", -) app.DEFINE_float("learning_rate", 0.001, "The mode learning rate.") app.DEFINE_boolean( "trainable_embeddings", True, "Whether the embeddings are trainable." ) +# Embeddings options. +app.DEFINE_string( + "text_embedding_type", + "random", + "The type of node embeddings to use. 
One of " + "{constant_zero, constant_random, random}.", +) +app.DEFINE_integer( + "text_embedding_dimensionality", + 32, + "The dimensionality of node text embeddings.", +) + class Lstm(Model): """An LSTM model for node-level classification.""" @@ -83,103 +88,56 @@ def __init__( self, vocabulary: Dict[str, int], node_y_dimensionality: int, + graph_y_dimensionality: int, + graph_x_dimensionality: int, + use_selector_embeddings: bool, test_only: bool = False, name: str = "lstm", ): """Constructor.""" - super(Lstm, self).__init__( - test_only=test_only, vocabulary=vocabulary, name=name - ) + super().__init__(test_only=test_only, vocabulary=vocabulary, name=name) self.vocabulary = vocabulary self.node_y_dimensionality = node_y_dimensionality + self.graph_y_dimensionality = graph_y_dimensionality + self.graph_x_dimensionality = graph_x_dimensionality + self.node_selector_dimensionality = 2 if use_selector_embeddings else 0 # Flag values. self.batch_size = FLAGS.batch_size self.padded_sequence_length = FLAGS.padded_sequence_length - # Reset any previous Tensorflow session. This is required when running - # consecutive LSTM models in the same process. - keras.backend.clear_session() - - @staticmethod - def MakeLstmLayer(*args, **kwargs): - """Construct an LSTM layer. - - If a GPU is available and --cudnn_lstm, this will use NVIDIA's fast - CuDNNLSTM implementation. Else it will use Keras' builtin LSTM, which is - much slower but works on CPU. - """ - if FLAGS.cudnn_lstm and tf.compat.v1.test.is_gpu_available(): - return keras.layers.CuDNNLSTM(*args, **kwargs) - else: - return keras.layers.LSTM(*args, **kwargs, implementation=1) - - def CreateKerasModel(self): # -> keras.Model: - """Construct the tensorflow computation graph.""" - vocab_ids = keras.layers.Input( - batch_shape=(self.batch_size, self.padded_sequence_length,), - dtype="int32", - name="sequence_in", - ) - embeddings = keras.layers.Embedding( - input_dim=len(self.vocabulary) + 2, - input_length=self.padded_sequence_length, - output_dim=FLAGS.hidden_size, - name="embedding", - trainable=FLAGS.trainable_embeddings, - )(vocab_ids) - - selector_vectors = keras.layers.Input( - batch_shape=(self.batch_size, self.padded_sequence_length, 2), - dtype="float32", - name="selector_vectors", + self.model = LstmModel( + node_embeddings=NodeEmbeddings( + node_embeddings_type=FLAGS.text_embedding_type, + use_selector_embeddings=self.node_selector_dimensionality, + selector_embedding_value=FLAGS.selector_embedding_value, + embedding_shape=( + # Add one to the vocabulary size to account for the out-of-vocab token. + len(vocabulary) + 1, + FLAGS.text_embedding_dimensionality, + ), + ), + loss=Loss( + num_classes=self.node_y_dimensionality, + has_aux_input=self.has_aux_input, + intermediate_loss_weight=None, # NOTE(cec): Intentionally broken. + class_prevalence_weighting=False, + ), + padded_sequence_length=self.padded_sequence_length, + learning_rate=FLAGS.learning_rate, + test_only=test_only, + hidden_size=FLAGS.hidden_size, + hidden_dense_layer_count=FLAGS.hidden_dense_layer_count, ) - lang_model_input = keras.layers.Concatenate( - axis=2, name="embeddings_and_selector_vectorss" - )([embeddings, selector_vectors],) - - # Recurrent layers. - lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, return_sequences=True, name="lstm_1" - )(lang_model_input) - lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, - return_sequences=True, - return_state=False, - name="lstm_2", - )(lang_model) - - # Dense layers. 
-        for i in range(1, FLAGS.hidden_dense_layer_count + 1):
-            lang_model = keras.layers.Dense(
-                FLAGS.hidden_size, activation="relu", name=f"dense_{i}",
-            )(lang_model)
-        node_out = keras.layers.Dense(
-            self.node_y_dimensionality, activation="sigmoid", name="node_out",
-        )(lang_model)
-
-        model = keras.Model(
-            inputs=[vocab_ids, selector_vectors], outputs=[node_out],
-        )
-        model.compile(
-            optimizer=keras.optimizers.Adam(
-                learning_rate=FLAGS.learning_rate
-            ),
-            metrics=["accuracy"],
-            loss=["categorical_crossentropy"],
-            loss_weights=[1.0],
-        )
+    @property
+    def num_classes(self) -> int:
+        return self.node_y_dimensionality or self.graph_y_dimensionality
 
-        return model
-
-    def CreateModelData(self, test_only: bool) -> None:
-        """Initialize an LSTM model. This is called during Initialize()."""
-        # Create the Tensorflow session and graph for the model.
-        tf.get_logger().setLevel("ERROR")
-        SetAllowedGrowthOnKerasSession()
-        self.model = self.CreateKerasModel()
+    @property
+    def has_aux_input(self) -> bool:
+        return self.graph_x_dimensionality > 0
 
     def RunBatch(
         self,
@@ -204,24 +162,32 @@ def RunBatch(
             self.batch_size,
             self.padded_sequence_length,
         ), model_data.encoded_sequences.shape
-        assert model_data.selector_vectors.shape == (
+        assert model_data.selector_ids.shape == (
             self.batch_size,
             self.padded_sequence_length,
-            2,
-        ), model_data.selector_vectors.shape
-
-        x = [model_data.encoded_sequences, model_data.selector_vectors]
-        y = [model_data.node_labels]
+        ), model_data.selector_ids.shape
 
         if epoch_type == epoch_pb2.TRAIN:
-            loss, *_ = self.model.train_on_batch(x, y)
+            if not self.model.training:
+                self.model.train()
+            targets, logits = self.model(
+                model_data.encoded_sequences,
+                model_data.selector_ids,
+                model_data.node_labels,
+            )
         else:
-            loss = None
+            if self.model.training:
+                self.model.eval()
+            # Inference only, don't trace the computation graph.
+            with torch.no_grad():
+                targets, logits = self.model(
+                    model_data.encoded_sequences,
+                    model_data.selector_ids,
+                    model_data.node_labels,
+                )
 
-        padded_predictions = self.model.predict_on_batch(x)
+        loss = self.model.loss((logits, None), targets)
+
+        if epoch_type == epoch_pb2.TRAIN:
+            loss.backward()
+            self.model.opt.step()
+            self.model.opt.zero_grad()
+
+        # Bring the padded model outputs back to the host for reshaping.
+        outputs = logits.detach().cpu().numpy()
 
         # Reshape the outputs.
-        predictions = self.ReshapePaddedModelOutput(batch_data, padded_predictions)
+        predictions = self.ReshapePaddedModelOutput(batch_data, outputs)
 
         # Flatten the targets and predictions lists so that we can compare them.
         # Shape (batch_node_count, node_y_dimensionality).
@@ -229,7 +195,10 @@ def RunBatch(
         predictions = np.concatenate(predictions)
 
         return BatchResults.Create(
-            targets=targets, predictions=predictions, loss=loss,
+            targets=model_data.node_labels,
+            predictions=outputs,
+            learning_rate=self.model.learning_rate,
+            loss=loss.item(),
         )
 
     def ReshapePaddedModelOutput(
@@ -275,36 +244,71 @@ def ReshapePaddedModelOutput(
 
     def GetModelData(self) -> Any:
         """Get the model state."""
-        # According to https://keras.io/getting-started/faq/, it is not recommended
-        # to pickle a Keras model. So as a workaround, I use Keras's saving
-        # mechanism to store the weights, and pickle that.
-        with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d:
-            path = pathlib.Path(d) / "weights.h5"
-            self.model.save(path)
-            with open(path, "rb") as f:
-                model_data = f.read()
-        return model_data
+        # There is no learning rate scheduler in this model, so only the
+        # module and optimizer states need to be serialized.
+        return {
+            "model_state_dict": self.model.state_dict(),
+            "optimizer_state_dict": self.model.opt.state_dict(),
+        }
 
     def LoadModelData(self, data_to_load: Any) -> None:
         """Restore the model state."""
-        # Load the weights from a file generated by ModelDataToSave().
-        with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d:
-            path = pathlib.Path(d) / "weights.h5"
-            with open(path, "wb") as f:
-                f.write(data_to_load)
-
-            # The default TF graph is finalized in Initialize(), so we must
-            # first reset the session and create a new graph.
-            tf.compat.v1.reset_default_graph()
-            SetAllowedGrowthOnKerasSession()
-
-        self.model = keras.models.load_model(path)
-
-
-def SetAllowedGrowthOnKerasSession():
-    """Allow growth on GPU for Keras."""
-    config = tf.compat.v1.ConfigProto()
-    config.gpu_options.allow_growth = True
-    session = tf.compat.v1.Session(config=config)
-    # set_session(session)
-    return session
+        self.model.load_state_dict(data_to_load["model_state_dict"])
+        # Only restore the optimizer state when training; opt is None in
+        # test-only mode.
+        if not self.test_only:
+            self.model.opt.load_state_dict(data_to_load["optimizer_state_dict"])
+
+
+class LstmModel(nn.Module):
+    """A node-level LSTM classifier over serialized instruction sequences."""
+
+    def __init__(
+        self,
+        node_embeddings: NodeEmbeddings,
+        loss: Loss,
+        padded_sequence_length: int,
+        test_only: bool,
+        learning_rate: float,
+        hidden_size: int,
+        hidden_dense_layer_count: int,  # TODO(cec): Implement.
+    ):
+        super().__init__()
+        self.node_embeddings = node_embeddings
+        self.loss = loss
+        self.padded_sequence_length = padded_sequence_length
+        self.learning_rate = learning_rate
+        self.hidden_size = hidden_size
+
+        # Input features are the node text embedding concatenated with the
+        # two-dimensional selector embedding.
+        self.lstm = nn.LSTM(
+            self.node_embeddings.embedding_dimensionality + 2,
+            self.hidden_size,
+        )
+        # Per-node binary classification head.
+        self.hidden2label = nn.Linear(self.hidden_size, 2)
+
+        if test_only:
+            self.opt = None
+            self.eval()
+        else:
+            self.opt = optim.AdamW(self.parameters(), lr=self.learning_rate)
+
+    def forward(
+        self,
+        encoded_sequences,
+        selector_ids,
+        node_labels,
+    ):
+        # Inputs arrive as numpy arrays; convert them to long tensors.
+        encoded_sequences = torch.tensor(encoded_sequences, dtype=torch.long)
+        selector_ids = torch.tensor(selector_ids, dtype=torch.long)
+        node_labels = torch.tensor(node_labels, dtype=torch.long)
+
+        # Embed and concatenate sequences and selector vectors.
+        embeddings = self.node_embeddings(encoded_sequences, selector_ids)
+
+        # nn.LSTM defaults to sequence-major input of shape
+        # (padded_sequence_length, batch_size, features). NodeEmbeddings is
+        # assumed to return batch-major tensors, so transpose rather than
+        # view to reorder the dimensions.
+        lstm_out, _ = self.lstm(embeddings.transpose(0, 1))
+
+        # Project the hidden states to per-node label scores, then restore
+        # batch-major (batch_size, padded_sequence_length, num_classes) order.
+        label_space = self.hidden2label(lstm_out)
+        logits = nn.functional.log_softmax(label_space, dim=2).transpose(0, 1)
+
+        targets = node_labels
+        return targets, logits
diff --git a/programl/models/lstm/lstm_batch.py b/programl/models/lstm/lstm_batch.py
index 5d112a151..fe71c1ec8 100644
--- a/programl/models/lstm/lstm_batch.py
+++ b/programl/models/lstm/lstm_batch.py
@@ -32,8 +32,8 @@ class LstmBatchData(NamedTuple):
 
     # Shape (batch_size, padded_sequence_length, 1), dtype np.int32
     encoded_sequences: np.array
-    # Shape (batch_size, padded_sequence_length, 2), dtype np.int32
-    selector_vectors: np.array
+    # Shape (batch_size, padded_sequence_length), dtype np.int32
+    selector_ids: np.array
     # Shape (batch_size, padded_sequence_length, node_y_dimensionality),
     # dtype np.float32
     node_labels: np.array
diff --git a/programl/task/dataflow/BUILD b/programl/task/dataflow/BUILD
index 3dc0ffbd0..b730ea703 100644
--- a/programl/task/dataflow/BUILD
+++ b/programl/task/dataflow/BUILD
@@ -119,6 +119,7 @@ py_library(
         "//programl/models/lstm:lstm_batch",
+        "//third_party/py/keras_preprocessing",
         "//third_party/py/labm8",
         "//third_party/py/numpy",
     ],
 )
 
diff --git a/programl/task/dataflow/lstm_batch_builder.py b/programl/task/dataflow/lstm_batch_builder.py
index aab2b9ac5..706fe6d32 100644
--- a/programl/task/dataflow/lstm_batch_builder.py
+++ b/programl/task/dataflow/lstm_batch_builder.py
@@ -18,8 +18,8 @@
 from typing import Optional
 
 import numpy as np
-from tensorflow import keras
 from labm8.py import app
+from keras_preprocessing.sequence import pad_sequences
 
 from programl.graph.format.py import graph_serializer
 from programl.models.base_batch_builder import BaseBatchBuilder
@@ -51,12 +51,12 @@ def __init__(
         # Mutable state.
         self.graph_node_sizes = []
         self.vocab_ids = []
-        self.selector_vectors = []
+        self.selector_ids = []
         self.targets = []
 
         # Padding values.
self._vocab_id_pad = len(self.vocabulary) + 1 - self._selector_vector_pad = np.zeros((0, 2), dtype=np.int32) + self._selector_id_pad = 0 self._node_label_pad = np.zeros( (0, self.node_y_dimensionality), dtype=np.int32 ) @@ -77,14 +77,14 @@ def _Build(self) -> BatchData: self.vocab_ids += [ np.array([self._vocab_id_pad], dtype=np.int32) ] * pad_count - self.selector_vectors += [self._selector_vector_pad] * pad_count + self.selector_ids += [np.array([self._selector_id_pad], dtype=np.int32)] * pad_count self.targets += [self._node_label_pad] * pad_count batch = BatchData( graph_count=len(self.graph_node_sizes), model_data=LstmBatchData( graph_node_sizes=np.array(self.graph_node_sizes, dtype=np.int32), - encoded_sequences=keras.preprocessing.sequence.pad_sequences( + encoded_sequences=pad_sequences( self.vocab_ids, maxlen=self.padded_sequence_length, dtype="int32", @@ -92,15 +92,15 @@ def _Build(self) -> BatchData: truncating="post", value=self._vocab_id_pad, ), - selector_vectors=keras.preprocessing.sequence.pad_sequences( - self.selector_vectors, + selector_ids=pad_sequences( + self.selector_ids, maxlen=self.padded_sequence_length, - dtype="float32", + dtype="int32", padding="pre", truncating="post", - value=np.zeros(2, dtype=np.float32), + value=self._selector_id_pad, ), - node_labels=keras.preprocessing.sequence.pad_sequences( + node_labels=pad_sequences( self.targets, maxlen=self.padded_sequence_length, dtype="float32", @@ -116,7 +116,7 @@ def _Build(self) -> BatchData: # Reset mutable state. self.graph_node_sizes = [] self.vocab_ids = [] - self.selector_vectors = [] + self.selector_ids = [] self.targets = [] return batch @@ -142,7 +142,7 @@ def OnItem(self, item) -> Optional[BatchData]: ) for n in node_list ] - selector_values = np.array( + selector_ids = np.array( [ features.node_features.feature_list["data_flow_root_node"] .feature[n] @@ -151,10 +151,7 @@ def OnItem(self, item) -> Optional[BatchData]: ], dtype=np.int32, ) - selector_vectors = np.zeros((selector_values.size, 2), dtype=np.float32) - selector_vectors[ - np.arange(selector_values.size), selector_values - ] = FLAGS.selector_embedding_value + # TODO: FLAGS.selector_embedding_value targets = np.array( [ features.node_features.feature_list["data_flow_value"] @@ -174,7 +171,7 @@ def OnItem(self, item) -> Optional[BatchData]: self.graph_node_sizes.append(len(node_list)) self.vocab_ids.append(vocab_ids) - self.selector_vectors.append(selector_vectors) + self.selector_ids.append(selector_ids) self.targets.append(targets_1hot) if len(self.graph_node_sizes) >= self.batch_size: diff --git a/programl/task/dataflow/train_lstm.py b/programl/task/dataflow/train_lstm.py index 908740a3f..16d2d1400 100644 --- a/programl/task/dataflow/train_lstm.py +++ b/programl/task/dataflow/train_lstm.py @@ -122,7 +122,9 @@ def TrainDataflowLSTM( # # For these data flow experiments, our graphs contain per-node binary # classification targets (e.g. reachable / not-reachable). - model = Lstm(vocabulary=vocab, test_only=False, node_y_dimensionality=2,) + model = Lstm(vocabulary=vocab, test_only=False, node_y_dimensionality=2, + graph_y_dimensionality=0, + graph_x_dimensionality=0, use_selector_embeddings=True) if restore_from: # Pick up training where we left off. @@ -140,8 +142,6 @@ def TrainDataflowLSTM( model.Initialize() start_epoch_step, start_graph_cumsum = 1, 0 - model.model.summary() - # Create training batches and split into epochs. 
     epochs = EpochBatchIterator(
         MakeBatchBuilder(
diff --git a/programl/task/dataflow/train_lstm_test.py b/programl/task/dataflow/train_lstm_test.py
index bd31ca70e..7e5a0773b 100644
--- a/programl/task/dataflow/train_lstm_test.py
+++ b/programl/task/dataflow/train_lstm_test.py
@@ -79,7 +79,9 @@ def main():
       "--max_data_flow_steps", str(10),
       "--val_graph_count", str(10),
       "--val_seed", str(0xCC),
-      "--train_graph_counts", "10,20"
+      "--train_graph_counts", "10,20",
+      "--padded_sequence_length", str(10),
+      "--batch_size", str(8),
   ])
   p.communicate()
   if p.returncode:
diff --git a/requirements.txt b/requirements.txt
index cbfa6f8a3..2690e91d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ absl-py>=0.9.0
 cycler>=0.10.0  # Needed by matplotlib.
 decorator>=4.3.0
 GPUtil>=1.4.0
-Keras>=2.3.1
+keras_preprocessing>=1.1.1,<1.2
 kiwisolver>=1.0.1  # Needed by matplotlib.
 labm8>=2020.06.07
 matplotlib>=2.2.0rc1
diff --git a/third_party/py/tensorflow/BUILD b/third_party/py/tensorflow/BUILD
index e0d27ec94..981f8c2a3 100644
--- a/third_party/py/tensorflow/BUILD
+++ b/third_party/py/tensorflow/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "tensorflow",
     srcs = ["//third_party/py:empty.py"],
     deps = [
+        requirement("tensorflow"),
        # Copied from:
        # https://github.com/tensorflow/tensorflow/blob/f3a015274fadab00ec8cad92af2a968e0ecd434f/tensorflow/tools/pip_package/setup.py#L54-L73
         requirement("absl-py"),
@@ -22,10 +23,10 @@ py_library(
         requirement("protobuf"),
         requirement("six"),
         requirement("tensorboard"),
-        requirement("tensorflow"),
         requirement("tensorflow_estimator"),
         requirement("termcolor"),
         requirement("wheel"),
         requirement("wrapt"),
+        "//third_party/py/keras_preprocessing",
     ],
 )