From 9b66995013b40a5c2d7673e7672fac58beac6085 Mon Sep 17 00:00:00 2001 From: Lateefah Bello <2019cinnamon@gmail.com> Date: Thu, 12 May 2022 15:30:27 +0100 Subject: [PATCH] Lesson 16 & 17 --- .../src/assets/translations/en/index.js | 4 +- .../src/assets/translations/en/lesson-16.json | 119 +++++++ .../src/assets/translations/en/lesson-17.json | 115 +++++++ etc/quiz-src/questions-en.txt | 53 +++ lessons/5-NLP/15-LanguageModeling/README.md | 8 +- lessons/5-NLP/16-RNN/README.md | 15 +- lessons/5-NLP/16-RNN/RNNPyTorch.ipynb | 4 +- lessons/5-NLP/16-RNN/RNNTF.ipynb | 310 +++++++++--------- .../GenerativePyTorch.ipynb | 2 +- .../17-GenerativeNetworks/GenerativeTF.ipynb | 2 +- lessons/5-NLP/17-GenerativeNetworks/README.md | 9 +- 11 files changed, 475 insertions(+), 166 deletions(-) create mode 100644 etc/quiz-app/src/assets/translations/en/lesson-16.json create mode 100644 etc/quiz-app/src/assets/translations/en/lesson-17.json diff --git a/etc/quiz-app/src/assets/translations/en/index.js b/etc/quiz-app/src/assets/translations/en/index.js index a3e563a9..074ad9c5 100644 --- a/etc/quiz-app/src/assets/translations/en/index.js +++ b/etc/quiz-app/src/assets/translations/en/index.js @@ -10,6 +10,8 @@ import x10 from "./lesson-10.json"; import x12 from "./lesson-12.json"; import x13 from "./lesson-13.json"; import x14 from "./lesson-14.json"; +import x16 from "./lesson-16.json"; +import x17 from "./lesson-17.json"; import x23 from "./lesson-23.json"; -const quiz = { 0 : x1[0], 1 : x2[0], 2 : x3[0], 3 : x4[0], 4 : x5[0], 5 : x7[0], 6 : x8[0], 7 : x9[0], 8 : x10[0], 9 : x12[0], 10 : x13[0], 11 : x14[0], 12 : x23[0] }; +const quiz = { 0 : x1[0], 1 : x2[0], 2 : x3[0], 3 : x4[0], 4 : x5[0], 5 : x7[0], 6 : x8[0], 7 : x9[0], 8 : x10[0], 9 : x12[0], 10 : x13[0], 11 : x14[0], 12 : x16[0], 13 : x17[0], 14 : x23[0] }; export default quiz; \ No newline at end of file diff --git a/etc/quiz-app/src/assets/translations/en/lesson-16.json b/etc/quiz-app/src/assets/translations/en/lesson-16.json new file mode 100644 index 00000000..f068dd7d --- /dev/null +++ b/etc/quiz-app/src/assets/translations/en/lesson-16.json @@ -0,0 +1,119 @@ +[ + { + "title": "AI for Beginners: Quizzes", + "complete": "Congratulations, you completed the quiz!", + "error": "Sorry, try again", + "quizzes": [ + { + "id": 116, + "title": "RNN: Pre Quiz", + "quiz": [ + { + "questionText": "RNN is short for?", + "answerOptions": [ + { + "answerText": "regression neural network", + "isCorrect": false + }, + { + "answerText": "recurrent neural network", + "isCorrect": true + }, + { + "answerText": "re-iterative neural network", + "isCorrect": false + } + ] + }, + { + "questionText": "Simple RNN cell has two weight _____", + "answerOptions": [ + { + "answerText": "matrices", + "isCorrect": true + }, + { + "answerText": "cell", + "isCorrect": false + }, + { + "answerText": "neuron", + "isCorrect": false + } + ] + }, + { + "questionText": "vanishing gradients is a problem of _____", + "answerOptions": [ + { + "answerText": "RNN", + "isCorrect": true + }, + { + "answerText": "CNN", + "isCorrect": false + }, + { + "answerText": "KNN", + "isCorrect": false + } + ] + } + ] + }, + { + "id": 216, + "title": "RNN: Post Quiz", + "quiz": [ + { + "questionText": "_____ takes some information from the input and hidden vector, and inserts it into state", + "answerOptions": [ + { + "answerText": "forget gate", + "isCorrect": false + }, + { + "answerText": "output gate", + "isCorrect": false + }, + { + "answerText": "input gate", + "isCorrect": true + } + ] + }, + { + 
"questionText": "Bidirectional RNNs runs recurrent computation in _____", + "answerOptions": [ + { + "answerText": "both directions", + "isCorrect": true + }, + { + "answerText": "nort-west direction", + "isCorrect": false + }, + { + "answerText": "left-right direction", + "isCorrect": false + } + ] + }, + { + "questionText": "All RNN Cells have the same shareable weights", + "answerOptions": [ + { + "answerText": "True", + "isCorrect": true + }, + { + "answerText": "False", + "isCorrect": false + } + ] + } + ] + } + ] + } +] \ No newline at end of file diff --git a/etc/quiz-app/src/assets/translations/en/lesson-17.json b/etc/quiz-app/src/assets/translations/en/lesson-17.json new file mode 100644 index 00000000..08cb8046 --- /dev/null +++ b/etc/quiz-app/src/assets/translations/en/lesson-17.json @@ -0,0 +1,115 @@ +[ + { + "title": "AI for Beginners: Quizzes", + "complete": "Congratulations, you completed the quiz!", + "error": "Sorry, try again", + "quizzes": [ + { + "id": 117, + "title": "Generative networks: Pre Quiz", + "quiz": [ + { + "questionText": "RNNs can be for generative tasks", + "answerOptions": [ + { + "answerText": "yes", + "isCorrect": true + }, + { + "answerText": "no", + "isCorrect": false + } + ] + }, + { + "questionText": "_____ is a traditional neural network with one input and one output", + "answerOptions": [ + { + "answerText": "one-to-one", + "isCorrect": true + }, + { + "answerText": "sequence-to-sequence", + "isCorrect": false + }, + { + "answerText": "one-to-many", + "isCorrect": false + } + ] + }, + { + "questionText": "RNN generate texts by generating next output character for each input character", + "answerOptions": [ + { + "answerText": "true", + "isCorrect": true + }, + { + "answerText": "false", + "isCorrect": false + } + ] + } + ] + }, + { + "id": 217, + "title": "Generative networks: Post Quiz", + "quiz": [ + { + "questionText": "Output encoder converts hidden state into _____ output", + "answerOptions": [ + { + "answerText": "one-hot-encoded", + "isCorrect": true + }, + { + "answerText": "sequence", + "isCorrect": false + }, + { + "answerText": "number", + "isCorrect": false + } + ] + }, + { + "questionText": "Selecting the character with higher probabilities always gives a meaningful text.", + "answerOptions": [ + { + "answerText": "true", + "isCorrect": false + }, + { + "answerText": "false", + "isCorrect": true + }, + { + "answerText": "maybe", + "isCorrect": false + } + ] + }, + { + "questionText": "Many-to-many can also be referred to as _____", + "answerOptions": [ + { + "answerText": "one-to-one", + "isCorrect": false + }, + { + "answerText": "sequence-to-sequence", + "isCorrect": true + }, + { + "answerText": "one-to-many", + "isCorrect": false + } + ] + } + ] + } + ] + } +] \ No newline at end of file diff --git a/etc/quiz-src/questions-en.txt b/etc/quiz-src/questions-en.txt index 38b9d193..7feb272b 100644 --- a/etc/quiz-src/questions-en.txt +++ b/etc/quiz-src/questions-en.txt @@ -317,6 +317,59 @@ Lesson 14E Embeddings: Post Quiz - symbol - number +Lesson 16B RNN: Pre Quiz +* RNN is short for? 
+- regression neural network ++ recurrent neural network +- re-iterative neural network +* Simple RNN cell has two weight _____ ++ matrices +- cell +- neuron +* vanishing gradients is a problem of _____ ++ RNN +- CNN +- KNN + +Lesson 16E RNN: Post Quiz +* _____ takes some information from the input and hidden vector, and inserts it into state +- forget gate +- output gate ++ input gate +* Bidirectional RNNs runs recurrent computation in _____ ++ both directions +- nort-west direction +- left-right direction +* All RNN Cells have the same shareable weights ++ True +- False + +Lesson 17B Generative networks: Pre Quiz +* RNNs can be for generative tasks ++ yes +- no +* _____ is a traditional neural network with one input and one output ++ one-to-one +- sequence-to-sequence +- one-to-many +* RNN generate texts by generating next output character for each input character ++ true +- false + +Lesson 17E Generative networks: Post Quiz +* Output encoder converts hidden state into _____ output ++ one-hot-encoded +- sequence +- number +* Selecting the character with higher probabilities always gives a meaningful text. +- true ++ false +- maybe +* Many-to-many can also be referred to as _____ +- one-to-one ++ sequence-to-sequence +- one-to-many + Lesson 23B Multi-Agent Modeling: Pre Quiz * By modeling the behavior of simple agents, we can understand more complex behaviors of a system. + true diff --git a/lessons/5-NLP/15-LanguageModeling/README.md b/lessons/5-NLP/15-LanguageModeling/README.md index 3d4d01f1..96a2e5cd 100644 --- a/lessons/5-NLP/15-LanguageModeling/README.md +++ b/lessons/5-NLP/15-LanguageModeling/README.md @@ -3,20 +3,20 @@ Semantic embeddings, such as Word2Vec and GloVe, are in fact a first step towards **language modeling** - creating models that somehow *understand* (or *represent*) the nature of the language. -The main idea behind language modeling is training them on unlabeled datesets in unsupervised manner. It is important, because we have huge amounts of unlabeled text available, while the amount of labeled text would always be limited by the amount of effort we can spend on labeling. Most often, we build language models that can **predict missing words** in the text, because it is easy to mask out a random word in text and use it as a training sample. +The main idea behind language modeling is training them on unlabeled datesets in unsupervised manner. It is important, because we have huge amounts of unlabeled text available, while the amount of labeled text would always be limited by the amount of effort we can spend on labeling. Most often, we build language models that can **predict missing words** in the text, because it is easy to mask out a random word in text and use it as a training sample. ## Training embeddings -In our previous examples, we have been using pre-trained semantic embeddings, but it is interesting to see how those embeddings can be trained using either CBoW, or Skip-gram architectures. +In our previous examples, we have been using pre-trained semantic embeddings, but it is interesting to see how those embeddings can be trained using either CBoW, or Skip-gram architectures. ![](../14-Embeddings/images/example-algorithms-for-converting-words-to-vectors.png) > Image from [this paper](https://arxiv.org/pdf/1301.3781.pdf) -The idea of CBoW is exactly predicting a missing word, however, to do this we take a small sliding window of text tokens (we can denote them from W-2 to W2), and train a model to predict the central word W0 from few surrounding words. 
+The idea of CBoW is exactly predicting a missing word, however, to do this we take a small sliding window of text tokens (we can denote them from W-2 to W2), and train a model to predict the central word W0 from few surrounding words. ## More Info -* [Official PyTorch tutorial on Language Modeling](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html). +* [Official PyTorch tutorial on Language Modeling](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html). * [Official TensorFlow tutorial on training Word2Vec model](https://www.TensorFlow.org/tutorials/text/word2vec). * Using **gensim** framework to train most commonly used embeddings in a few lines of code is as described [in this documentation](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html). diff --git a/lessons/5-NLP/16-RNN/README.md b/lessons/5-NLP/16-RNN/README.md index 67303743..33a814c5 100644 --- a/lessons/5-NLP/16-RNN/README.md +++ b/lessons/5-NLP/16-RNN/README.md @@ -1,5 +1,7 @@ # Recurrent Neural Networks +## [Pre-lecture quiz](https://black-ground-0cc93280f.1.azurestaticapps.net/quiz/116) + In the previous sections, we have been using rich semantic representations of text, and a simple linear classifier on top of the embeddings. What this architecture does is to capture aggregated meaning of words in a sentence, but it does not take into account the **order** of words, because aggregation operation on top of embeddings removed this information from the original text. Because these models are unable to model word ordering, they cannot solve more complex or ambiguous tasks such as text generation or question answering. To capture the meaning of text sequence, we need to use another neural network architecture, which is called a **recurrent neural network**, or RNN. In RNN, we pass our sentence through the network one symbol at a time, and the network produces some **state**, which we then pass to the network again with the next symbol. @@ -8,7 +10,7 @@ To capture the meaning of text sequence, we need to use another neural network a > Image by author -Given the input sequence of tokens X0,...,Xn, RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using back propagation. Each network block takes a pair (Xi,Si) as an input, and produces Si+1 as a result. Final state Sn or (output Yn) goes into a linear classifier to produce the result. All network blocks share the same weights, and are trained end-to-end using one backpropagation pass. +Given the input sequence of tokens X0,...,Xn, RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using back propagation. Each network block takes a pair (Xi,Si) as an input, and produces Si+1 as a result. Final state Sn or (output Yn) goes into a linear classifier to produce the result. All network blocks share the same weights, and are trained end-to-end using one back propagation pass. Because state vectors S0,...,Sn are passed through the network, it is able to learn the sequential dependencies between words. For example, when the word *not* appears somewhere in the sequence, it can learn to negate certain elements within the state vector, resulting in negation. @@ -24,7 +26,7 @@ Simple RNN cell has two weight matrices inside: one transforms input symbol (let > Image by author -In many cases, input tokens are passed through the embedding layer before entering the RNN to lower the dimensionality. 
In this case, if the dimension of the input vectors is *emb_size*, and state vector is *hid_size* - the size of W is *emb_size*×*hid_size*, and the size of H is *hid_size*×*hid_size*.
+In many cases, input tokens are passed through the embedding layer before entering the RNN to lower the dimensionality. In this case, if the dimension of the input vectors is *emb_size* and the state vector is *hid_size*, the size of W is *emb_size*×*hid_size*, and the size of H is *hid_size*×*hid_size*.
 
 ## Long Short Term Memory (LSTM)
 
@@ -33,7 +35,8 @@ One of the main problems of classical RNNs is so-called **vanishing gradients**
 
 ![Image showing an example long short term memory cell](./images/long-short-term-memory-cell.svg)
 
 LSTM Network is organized in a manner similar to RNN, but there are two states that are being passed from layer to layer: actual state C, and hidden vector H. At each unit, hidden vector Hi is concatenated with input Xi, and they control what happens to the state C via **gates**. Each gate is a neural network with sigmoid activation (output in the range [0,1]), which can be thought of as bitwise mask when multiplied by the state vector. There are the following gates (from left to right on the picture above):
-* **forget gate** takes hidden vector and determines, which components of the vector C we need to forget, and which to pass through. 
+
+* **forget gate** takes the hidden vector and determines which components of the vector C we need to forget, and which to pass through.
 * **input gate** takes some information from the input and hidden vector, and inserts it into state.
 * **output gate** transforms state via some linear layer with *tanh* activation, then selects some of its components using hidden vector Hi to produce new state Ci+1.
 
@@ -43,7 +46,7 @@ Components of the state C can be thought of as some flags that can be switched o
 
 ## Bidirectional and multilayer RNNs
 
-We have discussed recurrent networks that operate in one direction, from beginning of a sequence to the end. It looks natural, because it resembles the way we read and listen to speech. However, since in many practical cases we have random access to the input sequence, it might make sense to run recurrent computation in both directions. Such networks are call **bidirectional** RNNs. When dealing with bidirectional network, we would need two hidden state vectors, one for each direction. 
+We have discussed recurrent networks that operate in one direction, from the beginning of a sequence to the end. This seems natural, because it resembles the way we read and listen to speech. However, since in many practical cases we have random access to the input sequence, it might make sense to run the recurrent computation in both directions. Such networks are called **bidirectional** RNNs. When dealing with a bidirectional network, we need two hidden state vectors, one for each direction.
 
 Recurrent network, one-directional or bidirectional, captures certain patterns within a sequence, and can store them into state vector or pass into output. As with convolutional networks, we can build another recurrent layer on top of the first one to capture higher level patterns, build from low-level patterns extracted by the first layer. This leads us to the notion of **multi-layer RNN**, which consists of two or more recurrent networks, where output of the previous layer is passed to the next layer as input.
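Below is a minimal sketch of how such a multi-layer bidirectional recurrent classifier could be wired up in PyTorch; the sizes and the random input batch are purely illustrative, and the complete, trained versions of this idea are in the lesson notebooks:

```python
import torch
import torch.nn as nn

# Illustrative sizes only
vocab_size, emb_size, hid_size, num_classes = 20000, 64, 32, 4

embedding = nn.Embedding(vocab_size, emb_size)
# num_layers=2 stacks two recurrent layers; bidirectional=True runs each
# layer over the sequence in both directions
rnn = nn.LSTM(emb_size, hid_size, num_layers=2,
              bidirectional=True, batch_first=True)
# forward and backward outputs are concatenated, hence 2 * hid_size
classifier = nn.Linear(2 * hid_size, num_classes)

x = torch.randint(0, vocab_size, (8, 30))   # batch of 8 sequences, 30 tokens each
out, (h_n, c_n) = rnn(embedding(x))         # out has shape (8, 30, 2 * hid_size)
logits = classifier(out[:, -1, :])          # classify from the last time step
```

In Keras the same idea is expressed by stacking two `keras.layers.Bidirectional(keras.layers.LSTM(...))` layers, as shown in the TensorFlow notebook.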
@@ -59,3 +62,7 @@ Recurrent network, one-directional or bidirectional, captures certain patterns w ## RNNs for other tasks In this unit, we have seen that RNNs can be used for sequence classification, but in fact, they can handle many more tasks, such as text generation, machine translation, and more. We will consider those tasks in the next unit. + +## [Post-lecture quiz](https://black-ground-0cc93280f.1.azurestaticapps.net/quiz/216) + +> ✅ Todo: conclusion, Assignment, challenge, reference. diff --git a/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb b/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb index 2f7f6d07..a621a012 100644 --- a/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb +++ b/lessons/5-NLP/16-RNN/RNNPyTorch.ipynb @@ -12,7 +12,7 @@ "\n", "\"RNN\"\n", "\n", - "Given the input sequence of tokens $X_0,\\dots,X_n$, RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using back propagation. Each network block takes a pair $(X_i,S_i)$ as an input, and produces $S_{i+1}$ as a result. Final state $S_n$ or output $X_n$ goes into a linear classifier to produce the result. All network blocks share the same weights, and are trained end-to-end using one backpropagation pass.\n", + "Given the input sequence of tokens $X_0,\\dots,X_n$, RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using back propagation. Each network block takes a pair $(X_i,S_i)$ as an input, and produces $S_{i+1}$ as a result. Final state $S_n$ or output $X_n$ goes into a linear classifier to produce the result. All network blocks share the same weights, and are trained end-to-end using one back propagation pass.\n", "\n", "Because state vectors $S_0,\\dots,S_n$ are passed through the network, it is able to learn the sequential dependencies between words. For example, when the word *not* appears somewhere in the sequence, it can learn to negate certain elements within the state vector, resulting in negation. \n", "\n", @@ -206,7 +206,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's train our network. Note that training LSTM is also quite slow, and you may not seem much raise in accuracy in the beginning of training. Also, you may need to play with `lr` learning rate parameter to find the learning rate that results in reasonable training speed, and yet does not cause " + "Now let's train our network. Note that training LSTM is also quite slow, and you may not seem much raise in accuracy in the beginning of training. Also, you may need to play with `lr` learning rate parameter to find the learning rate that results in reasonable training speed, and yet does not cause memory waste." ] }, { diff --git a/lessons/5-NLP/16-RNN/RNNTF.ipynb b/lessons/5-NLP/16-RNN/RNNTF.ipynb index c461cedc..03a18139 100644 --- a/lessons/5-NLP/16-RNN/RNNTF.ipynb +++ b/lessons/5-NLP/16-RNN/RNNTF.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Recurrent neural networks\n", "\n", @@ -11,7 +12,7 @@ "\n", "![Image showing an example recurrent neural network generation.](images/rnn.png)\n", "\n", - "Given the input sequence of tokens $X_0,\\dots,X_n$, the RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using backpropagation. Each network block takes a pair $(X_i,S_i)$ as an input, and produces $S_{i+1}$ as a result. The final state $S_n$ or output $Y_n$ goes into a linear classifier to produce the result. 
All network blocks share the same weights, and are trained end-to-end using one backpropagation pass.\n", + "Given the input sequence of tokens $X_0,\\dots,X_n$, the RNN creates a sequence of neural network blocks, and trains this sequence end-to-end using backpropagation. Each network block takes a pair $(X_i,S_i)$ as an input, and produces $S_{i+1}$ as a result. The final state $S_n$ or output $Y_n$ goes into a linear classifier to produce the result. All network blocks share the same weights, and are trained end-to-end using one back propagation pass.\n", "\n", "> The figure above shows recurrent neural network in the unrolled form (on the left), and in more compact recurrent representation (on the right). It is important to realize that all RNN Cells have the same **shareable weights**.\n", "\n", @@ -24,22 +25,24 @@ "Let's see how recurrent neural networks can help us classify our news dataset.\n", "\n", "> For the sandbox environment, we need to run the following cell to make sure the required library is installed, and data is prefetched. If you are running locally, you can skip the following cell." - ], - "metadata": {} + ] }, { "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], "source": [ "import sys\n", "!{sys.executable} -m pip install --quiet tensorflow_datasets==4.4.0\n", "!cd ~ && wget -q -O - https://mslearntensorflowlp.blob.core.windows.net/data/tfds-ag-news.tgz | tar xz" - ], - "outputs": [], - "execution_count": 1, - "metadata": {} + ] }, { "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], "source": [ "import tensorflow as tf\n", "from tensorflow import keras\n", @@ -53,49 +56,47 @@ " tf.config.experimental.set_memory_growth(physical_devices[0], True)\n", "\n", "ds_train, ds_test = tfds.load('ag_news_subset').values()" - ], - "outputs": [], - "execution_count": 2, - "metadata": {} + ] }, { "cell_type": "markdown", - "source": [ - "When training large models, GPU memory allocation may become a problem. We also may need to experiment with different minibatch sizes, so that the data fits into our GPU memory, yet the training is fast enough. If you are running this code on your own GPU machine, you may experiment with adjusting minibatch size to speed up training.\r\n", - "\r\n", - "> **Note**: Certain versions of NVidia drivers are known not to release the memory after training the model. We are running several examples in this notebooks, and it might cause memory to be exhausted in certain setups, especially if you are doing your own experiments as part of the same notebook. If you encounter some weird errors when starting to train the model, you may want to restart notebook kernel." - ], "metadata": { "nteract": { "transient": { "deleting": false } } - } + }, + "source": [ + "When training large models, GPU memory allocation may become a problem. We also may need to experiment with different minibatch sizes, so that the data fits into our GPU memory, yet the training is fast enough. If you are running this code on your own GPU machine, you may experiment with adjusting minibatch size to speed up training.\n", + "\n", + "> **Note**: Certain versions of NVidia drivers are known not to release the memory after training the model. We are running several examples in this notebooks, and it might cause memory to be exhausted in certain setups, especially if you are doing your own experiments as part of the same notebook. 
If you encounter some weird errors when starting to train the model, you may want to restart notebook kernel." + ] }, { "cell_type": "code", - "source": [ - "batch_size = 16\r\n", - "embed_size = 64" - ], - "outputs": [], "execution_count": 3, "metadata": { "collapsed": true, "jupyter": { - "source_hidden": false, - "outputs_hidden": false + "outputs_hidden": false, + "source_hidden": false }, "nteract": { "transient": { "deleting": false } } - } + }, + "outputs": [], + "source": [ + "batch_size = 16\n", + "embed_size = 64" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Simple RNN classifier\n", "\n", @@ -104,31 +105,16 @@ "While we can pass one-hot encoded tokens to the RNN layer directly, this is not a good idea because of their high dimensionality. Therefore, we will use an embedding layer to lower the dimensionality of word vectors, followed by an RNN layer, and finally a `Dense` classifier.\n", "\n", "> **Note**: In cases where the dimensionality isn't so high, for example when using character-level tokenization, it might make sense to pass one-hot encoded tokens directly into the RNN cell." - ], - "metadata": {} + ] }, { "cell_type": "code", - "source": [ - "vocab_size = 20000\n", - "\n", - "vectorizer = keras.layers.experimental.preprocessing.TextVectorization(\n", - " max_tokens=vocab_size,\n", - " input_shape=(1,))\n", - "\n", - "model = keras.models.Sequential([\n", - " vectorizer,\n", - " keras.layers.Embedding(vocab_size, embed_size),\n", - " keras.layers.SimpleRNN(16),\n", - " keras.layers.Dense(4,activation='softmax')\n", - "])\n", - "\n", - "model.summary()" - ], + "execution_count": 4, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", @@ -149,22 +135,49 @@ ] } ], - "execution_count": 4, - "metadata": {} + "source": [ + "vocab_size = 20000\n", + "\n", + "vectorizer = keras.layers.experimental.preprocessing.TextVectorization(\n", + " max_tokens=vocab_size,\n", + " input_shape=(1,))\n", + "\n", + "model = keras.models.Sequential([\n", + " vectorizer,\n", + " keras.layers.Embedding(vocab_size, embed_size),\n", + " keras.layers.SimpleRNN(16),\n", + " keras.layers.Dense(4,activation='softmax')\n", + "])\n", + "\n", + "model.summary()" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> **Note:** We use an untrained embedding layer here for simplicity, but for better results we can use a pretrained embedding layer using Word2Vec, as described in the previous unit. It would be a good exercise for you to adapt this code to work with pretrained embeddings.\n", "\n", "Now let's train our RNN. RNNs in general are quite difficult to train, because once the RNN cells are unrolled along the sequence length, the resulting number of layers involved in backpropagation is quite large. Thus we need to select a smaller learning rate, and train the network on a larger dataset to produce good results. This can take quite a long time, so using a GPU is preferred.\n", "\n", "To speed things up, we will only train the RNN model on news titles, omitting the description. You can try training with description and see if you can get the model to train." 
- ], - "metadata": {} + ] }, { "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training vectorizer\n" + ] + } + ], "source": [ "def extract_title(x):\n", " return x['title']\n", @@ -174,62 +187,52 @@ "\n", "print('Training vectorizer')\n", "vectorizer.adapt(ds_train.take(2000).map(extract_title))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Training vectorizer\n" - ] - } - ], - "execution_count": 5, - "metadata": { - "scrolled": true - } + ] }, { "cell_type": "code", - "source": [ - "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", - "model.fit(ds_train.map(tupelize_title).batch(batch_size),validation_data=ds_test.map(tupelize_title).batch(batch_size))" - ], + "execution_count": 6, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "7500/7500 [==============================] - 82s 11ms/step - loss: 0.6629 - acc: 0.7623 - val_loss: 0.5559 - val_acc: 0.7995\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\n" ] }, { - "output_type": "execute_result", - "execution_count": 6, "data": { - "text/plain": "" + "text/plain": [ + "" + ] }, - "metadata": {} + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], - "execution_count": 6, - "metadata": {} + "source": [ + "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", + "model.fit(ds_train.map(tupelize_title).batch(batch_size),validation_data=ds_test.map(tupelize_title).batch(batch_size))" + ] }, { "cell_type": "markdown", - "source": [ - "> **Note** that accuracy is likely to be lower here, because we are training only on news titles." - ], "metadata": { "nteract": { "transient": { "deleting": false } } - } + }, + "source": [ + "> **Note** that accuracy is likely to be lower here, because we are training only on news titles." + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Revisiting variable sequences \n", "\n", @@ -240,11 +243,31 @@ "Another approach is to use **masking**. In Keras, some layers support additional input that shows which tokens should be taken into account when training. To incorporate masking into our model, we can either include a separate `Masking` layer ([docs](https://keras.io/api/layers/core_layers/masking/)), or we can specify the `mask_zero=True` parameter of our `Embedding` layer.\n", "\n", "> **Note**: This training will take around 5 minutes to complete one epoch on the whole dataset. Feel free to interrupt training at any time if you run out of patience. What you can also do is limit the amount of data used for training, by adding `.take(...)` clause after `ds_train` and `ds_test` datasets." 
- ], - "metadata": {} + ] }, { "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7500/7500 [==============================] - 371s 49ms/step - loss: 0.5401 - acc: 0.8079 - val_loss: 0.3780 - val_acc: 0.8822\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def extract_text(x):\n", " return x['title']+' '+x['description']\n", @@ -261,38 +284,20 @@ "\n", "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", "model.fit(ds_train.map(tupelize).batch(batch_size),validation_data=ds_test.map(tupelize).batch(batch_size))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "7500/7500 [==============================] - 371s 49ms/step - loss: 0.5401 - acc: 0.8079 - val_loss: 0.3780 - val_acc: 0.8822\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\n" - ] - }, - { - "output_type": "execute_result", - "execution_count": 7, - "data": { - "text/plain": "" - }, - "metadata": {} - } - ], - "execution_count": 7, - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ - "Now that we're using masking, we can train the model on the whole dataset of titles and descriptions.\r\n", - "\r\n", + "Now that we're using masking, we can train the model on the whole dataset of titles and descriptions.\n", + "\n", "> **Note**: Have you noticed that we have been using vectorizer trained on the news titles, and not the whole body of the article? Potentially, this can cause some of the the tokens to be ignored, so it is better to re-train the vectorizer. However, it might only have very small effect, so we will stick to the previous pre-trained vectorizer for the sake of simplicity." 
- ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## LSTM: Long short-term memory\n", "\n", @@ -310,51 +315,53 @@ "> **Note**: Here's a great resource for understanding the internals of LSTMs: [Understanding LSTM Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) by Christopher Olah.\n", "\n", "While the internal structure of an LSTM cell may look complex, Keras hides this implementation inside the `LSTM` layer, so the only thing we need to do in the example above is to replace the recurrent layer:" - ], - "metadata": {} + ] }, { "cell_type": "code", - "source": [ - "model = keras.models.Sequential([\n", - " vectorizer,\n", - " keras.layers.Embedding(vocab_size, embed_size),\n", - " keras.layers.LSTM(8),\n", - " keras.layers.Dense(4,activation='softmax')\n", - "])\n", - "\n", - "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", - "model.fit(ds_train.map(tupelize).batch(8),validation_data=ds_test.map(tupelize).batch(8))" - ], + "execution_count": 8, + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "15000/15000 [==============================] - 188s 13ms/step - loss: 0.5692 - acc: 0.7916 - val_loss: 0.3441 - val_acc: 0.8870\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\n" ] }, { - "output_type": "execute_result", - "execution_count": 8, "data": { - "text/plain": "" + "text/plain": [ + "" + ] }, - "metadata": {} + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], - "execution_count": 8, - "metadata": {} + "source": [ + "model = keras.models.Sequential([\n", + " vectorizer,\n", + " keras.layers.Embedding(vocab_size, embed_size),\n", + " keras.layers.LSTM(8),\n", + " keras.layers.Dense(4,activation='softmax')\n", + "])\n", + "\n", + "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", + "model.fit(ds_train.map(tupelize).batch(8),validation_data=ds_test.map(tupelize).batch(8))" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "> **Note** that training LSTMs is also quite slow, and you may not seem much increase in accuracy in the beginning of training. You may need to continue training for some time to achieve good accuracy." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Bidirectional and multilayer RNNs\n", "\n", @@ -373,11 +380,21 @@ "Let's build a two-layer bidirectional LSTM for our classification problem.\n", "\n", "> **Note** this code again takes quite a long time to complete, but it gives us highest accuracy we have seen so far. So maybe it is worth waiting and seeing the result." - ], - "metadata": {} + ] }, { "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5044/7500 [===================>..........] - ETA: 2:33 - loss: 0.3709 - acc: 0.8706\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r5045/7500 [===================>..........] 
- ETA: 2:33 - loss: 0.3709 - acc: 0.8706" + ] + } + ], "source": [ "model = keras.models.Sequential([\n", " vectorizer,\n", @@ -390,49 +407,38 @@ "model.compile(loss='sparse_categorical_crossentropy',metrics=['acc'], optimizer='adam')\n", "model.fit(ds_train.map(tupelize).batch(batch_size),\n", " validation_data=ds_test.map(tupelize).batch(batch_size))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "5044/7500 [===================>..........] - ETA: 2:33 - loss: 0.3709 - acc: 0.8706\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\r5045/7500 [===================>..........] - ETA: 2:33 - loss: 0.3709 - acc: 0.8706" - ] - } - ], - "execution_count": 9, - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## RNNs for other tasks\n", "\n", "Up until now, we've focused on using RNNs to classify sequences of text. But they can handle many more tasks, such as text generation and machine translation — we'll consider those tasks in the next unit." - ], - "metadata": {} + ] } ], "metadata": { + "kernel_info": { + "name": "conda-env-py37_tensorflow-py" + }, "kernelspec": { - "name": "conda-env-py37_tensorflow-py", + "display_name": "py37_tensorflow", "language": "python", - "display_name": "py37_tensorflow" + "name": "conda-env-py37_tensorflow-py" }, "language_info": { - "name": "python", - "version": "3.7.9", - "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "pygments_lexer": "ipython3", + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "kernel_info": { - "name": "conda-env-py37_tensorflow-py" + "pygments_lexer": "ipython3", + "version": "3.7.9" }, "nteract": { "version": "nteract-front-end@1.0.0" @@ -440,4 +446,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb index d75f0266..04a069d8 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb @@ -8,7 +8,7 @@ "\n", "Recurrent Neural Networks (RNNs) and their gated cell variants such as Long Short Term Memory Cells (LSTMs) and Gated Recurrent Units (GRUs) provided a mechanism for language modeling, i.e. they can learn word ordering and provide predictions for next word in a sequence. This allows us to use RNNs for **generative tasks**, such as ordinary text generation, machine translation, and even image captioning.\n", "\n", - "In RNN architecture we discussed in the previous unit, each RNN unit produced next next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs.\n", + "In RNN architecture we discussed in the previous unit, each RNN unit produced next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). 
Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs.\n", "\n", "In this notebook, we will focus on simple generative models that help us generate text. For simplicity, let's build **character-level network**, which generates text letter by letter. During training, we need to take some text corpus, and split it into letter sequences. " ] diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb index be81f195..2b2e95ba 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb @@ -8,7 +8,7 @@ "\n", "Recurrent Neural Networks (RNNs) and their gated cell variants such as Long Short Term Memory Cells (LSTMs) and Gated Recurrent Units (GRUs) provided a mechanism for language modeling, i.e. they can learn word ordering and provide predictions for next word in a sequence. This allows us to use RNNs for **generative tasks**, such as ordinary text generation, machine translation, and even image captioning.\n", "\n", - "In RNN architecture we discussed in the previous unit, each RNN unit produced next next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs.\n", + "In RNN architecture we discussed in the previous unit, each RNN unit produced next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs.\n", "\n", "In this notebook, we will focus on simple generative models that help us generate text. For simplicity, let's build **character-level network**, which generates text letter by letter. During training, we need to take some text corpus, and split it into letter sequences. " ] diff --git a/lessons/5-NLP/17-GenerativeNetworks/README.md b/lessons/5-NLP/17-GenerativeNetworks/README.md index 2c253131..08bc8256 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/README.md +++ b/lessons/5-NLP/17-GenerativeNetworks/README.md @@ -1,8 +1,10 @@ # Generative networks +## [Pre-lecture quiz](https://black-ground-0cc93280f.1.azurestaticapps.net/quiz/117) + Recurrent Neural Networks (RNNs) and their gated cell variants such as Long Short Term Memory Cells (LSTMs) and Gated Recurrent Units (GRUs) provided a mechanism for language modeling, i.e. they can learn word ordering and provide predictions for next word in a sequence. This allows us to use RNNs for **generative tasks**, such as ordinary text generation, machine translation, and even image captioning. -In RNN architecture we discussed in the previous unit, each RNN unit produced next next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs. 
+In the RNN architecture we discussed in the previous unit, each RNN unit produced the next hidden state as an output. However, we can also add another output to each recurrent unit, which would allow us to output a **sequence** (which is equal in length to the original sequence). Moreover, we can use RNN units that do not accept an input at each step, and just take some initial state vector, and then produce a sequence of outputs.
 
 This allows for different neural architectures that are shown in the picture below:
 
@@ -26,6 +28,7 @@ When generating text (during inference), we start with some **prompt**, which is
 
 > Image by author
 
+
 ## Continue to Notebooks
 
 * [Generative Networks with PyTorch](GenerativePyTorch.ipynb)
@@ -44,3 +47,7 @@ However, if we look at the probability distribution for the next character, it c
 This leads us to the conclusion that it is not always "fair" to select the character with higher probability, because choosing the second highest might still lead us to meaningful text. It is more wise to **sample** characters from the probability distribution given by the network output. We can also use a parameter, **temperature**, that will flatten out the probability distribution, in case we want to add more randomness, or make it more steep, if we want to stick more to the highest-probability characters.
 
 Have a look at how this soft text generation is implemented in the notebooks.
+
+## [Post-lecture quiz](https://black-ground-0cc93280f.1.azurestaticapps.net/quiz/217)
+
+> ✅ Todo: conclusion, Assignment, challenge, reference.
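A minimal sketch of this soft, temperature-based sampling, assuming the network returns a vector of raw per-character scores (the helper name and the toy scores below are made up for illustration):

```python
import numpy as np

def sample_char(logits, temperature=1.0):
    """Sample a character index from raw network scores (logits)."""
    scaled = np.asarray(logits, dtype=np.float64) / temperature
    probs = np.exp(scaled - scaled.max())   # numerically stable softmax
    probs /= probs.sum()
    return np.random.choice(len(probs), p=probs)

# Toy example: three candidate characters with made-up scores
next_char_index = sample_char([2.0, 1.5, 0.1], temperature=0.8)
```

Temperatures above 1 flatten the distribution and add randomness, while temperatures below 1 make it steeper, approaching the always-pick-the-most-probable-character behaviour discussed above.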