diff --git a/.gitignore b/.gitignore index f48746098..2a693d3be 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ __pycache__/ # Distribution / packaging .Python env/ +data/ +cache/ build/ develop-eggs/ dist/ @@ -105,4 +107,4 @@ ENV/ # Notebook files Sentiment Analysis/aclImdb_v1.tar.gz -Sentiment Analysis/aclImdb/ \ No newline at end of file +Sentiment Analysis/aclImdb/ diff --git a/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb b/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb index 49f38937b..1f77e8fa2 100644 --- a/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb +++ b/Mini-Projects/IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -433,9 +447,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'os' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprefix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'sentiment-update'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mtest_location\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'test.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mval_location\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validation.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mtrain_location\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_prefix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprefix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined" + ] + } + ], "source": 
[ "import sagemaker\n", "\n", @@ -470,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -483,9 +509,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='0.90-1'. For example:\n", + "\tget_image_uri(region, 'xgboost', '0.90-1').\n" + ] + } + ], "source": [ "# We need to retrieve the location of the container which is provided by Amazon for using XGBoost.\n", "# As a matter of convenience, the training and inference code both use the same container.\n", @@ -496,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -531,9 +566,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'train_location' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ms3_input_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms3_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0ms3_input_validation\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msagemaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ms3_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_location\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcontent_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'train_location' is not defined" + ] + } + ], "source": [ "s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')\n", "s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')" @@ -561,9 +608,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'xgb' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mxgb_transformer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minstance_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minstance_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'ml.m4.xlarge'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'xgb' is not 
defined" + ] + } + ], "source": [ "xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')" ] diff --git a/Project/SageMaker Project.ipynb b/Project/SageMaker Project.ipynb index af1816cf2..b0fe3880e 100644 --- a/Project/SageMaker Project.ipynb +++ b/Project/SageMaker Project.ipynb @@ -53,9 +53,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory ‘../data’: File exists\n", + "--2020-05-07 14:11:39-- http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + "Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10\n", + "Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 84125825 (80M) [application/x-gzip]\n", + "Saving to: ‘../data/aclImdb_v1.tar.gz’\n", + "\n", + "../data/aclImdb_v1. 100%[===================>] 80.23M 19.7MB/s in 6.9s \n", + "\n", + "2020-05-07 14:11:47 (11.6 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]\n", + "\n" + ] + } + ], "source": [ "%mkdir ../data\n", "!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", @@ -73,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -109,9 +128,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg\n" + ] + } + ], "source": [ "data, labels = read_imdb_data()\n", "print(\"IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg\".format(\n", @@ -128,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -153,9 +180,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDb reviews (combined): train = 25000, test = 25000\n" + ] + } + ], "source": [ "train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)\n", "print(\"IMDb reviews (combined): train = {}, test = {}\".format(len(train_X), len(test_X)))" @@ -170,12 +205,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I can not believe such slanted, jingoistic material is getting passed off to Americans as art house material. Early on, from such telling lines like \"we want to make sure they are playing for the right team\" and manipulative framing and lighting, A Love Divided shows it's true face. The crass manner in which the Irish Catholics are shown as hegemonic, the Protestants as peaceful and downtrodden, is as poor a representation of history as early US westerns that depict the struggle between cowboys and American Indians. The truth of the story is distorted with the stereotypes and outright vilification of the Irish Catholics in the story; a corruption admitted by the filmmakers themselves! 
It is sad that people today still think that they can win moral sway by making a film so easily recognized for it's obvious intent, so far from attempting art. This film has no business being anywhere in any legitimate cinema or library.\n", + "0\n" + ] + } + ], "source": [ - "print(train_X[100])\n", - "print(train_y[100])" + "print(train_X[3])\n", + "print(train_y[3])" ] }, { @@ -187,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -220,11 +264,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['mani', 'sourc', 'routin', 'lump', 'thought', 'provok', 'period', 'drama', 'part', 'base', 'histor', 'fact', 'togeth', 'superfici', 'similar', 'nunsploit', 'mainstay', '70', 'euro', 'trash', 'cinema', 'overlook', 'righteou', 'anger', 'drive', 'whole', 'endeavor', 'perhap', 'coincident', 'also', 'director', 'gianfranco', 'mingozzi', 'singular', 'attempt', 'narr', 'film', 'make', 'outsid', 'mani', 'well', 'receiv', 'documentari', 'safe', 'set', 'within', 'histor', 'context', 'flavia', 'chart', 'grow', 'rebellion', 'earli', '15th', 'centuri', 'italian', 'nun', 'florinda', 'bolkan', 'career', 'perform', 'even', 'surpass', 'sterl', 'work', 'lucio', 'fulci', 'devast', 'tortur', 'duckl', 'lock', 'away', 'convent', 'nobleman', 'father', 'desper', 'attempt', 'curb', 'girl', 'bud', 'sensuou', 'natur', 'wonder', 'women', 'releg', 'secondari', 'role', 'best', 'life', 'holi', 'scriptur', 'confront', 'way', 'male', 'domin', 'ruptur', 'femal', 'live', 'inspir', 'revolt', 'fuel', 'rant', 'semi', 'craze', 'older', 'sister', 'agatha', 'indel', 'portray', 'veteran', 'actress', 'maria', 'casar', 'marcel', 'carn', 'le', 'enfant', 'du', 'paradi', 'construct', 'muslim', 'invas', 'join', 'oppressor', 'perhap', 'unwittingli', 'manipul', 'bid', 'flavia', 'truli', 'becom', 'outcast', 'alreadi', 'felt', 'expect', 'tragic', 'result', 'breathtak', 'widescreen', 'composit', 'alfio', 'contini', 'shot', 'michelangelo', 'antonioni', 'zabriski', 'point', 'uncompromis', 'auster', 'account', 'one', 'woman', 'fierc', 'yet', 'ultim', 'futil', 'fight', 'patriarch', 'societi', 'allot', 'right', 'beyond', 'childbear', 'whore', 'sister', 'agatha', 'wryli', 'remark', 'lengthi', 'drug', 'induc', 'fantasi', 'sequenc', 'clearli', 'model', 'ken', 'russel', 'otherwis', 'far', 'flamboy', 'devil', 'notwithstand', 'movi', 'turn', 'rel', 'stingi', 'skin', 'depart', 'make', 'someth', 'mockeri', 'semi', 'porn', 'reput', 'seriou', 'work', 'deserv', 'rediscoveri', 'restor', 'unjustli', 'tarnish', 'reput']\n" + ] + } + ], "source": [ - "# TODO: Apply review_to_words to a review (train_X[100] or any other review)\n" + "# TODO: Apply review_to_words to a review (train_X[100] or any other review)\n", + "print(review_to_words(train_X[100]))\n" ] }, { @@ -238,7 +291,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:** \n", + "\n", + "* It removes html tags from the review.\n", + "* It converts all characters to lower case.\n", + "* It splits the review into seperate words.\n", + "* It removes stop words." 
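For illustration, here is a minimal sketch of the kind of cleaning function the bullet points above describe (strip HTML, lower-case, split into separate words, drop stop words, and stem, which is why tokens such as 'sourc' and 'provok' appear in the printed output). It assumes NLTK and BeautifulSoup are installed and is not necessarily identical to the notebook's review_to_words:

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords            # requires nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

def clean_review(review):
    # Strip any HTML tags left over from the raw IMDB pages.
    text = BeautifulSoup(review, "html.parser").get_text()
    # Lower-case and keep only alphanumeric characters.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Split into separate words and drop English stop words.
    stop_words = set(stopwords.words("english"))
    words = [w for w in text.split() if w not in stop_words]
    # Stem each remaining word, e.g. 'provoking' becomes 'provok'.
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in words]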
] }, { @@ -250,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -298,9 +356,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read preprocessed data from cache file: preprocessed_data.pkl\n" + ] + } + ], "source": [ "# Preprocess data\n", "train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)" @@ -330,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -341,13 +407,16 @@ " \n", " # TODO: Determine how often each word appears in `data`. Note that `data` is a list of sentences and that a\n", " # sentence is a list of words.\n", - " \n", - " word_count = {} # A dict storing the words that appear in the reviews along with how often they occur\n", + " word_count = {}\n", + " for sentence in data:\n", + " for word in sentence:\n", + " word_count[word] = word_count[word] + 1 if (word in word_count) else 1 \n", " \n", " # TODO: Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and\n", " # sorted_words[-1] is the least frequently appearing word.\n", - " \n", - " sorted_words = None\n", + "\n", + " sorted_words = [key for key, value in sorted(word_count.items(), key=lambda item: item[1])]\n", + " sorted_words.reverse() \n", " \n", " word_dict = {} # This is what we are building, a dictionary that translates words into integers\n", " for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'\n", @@ -358,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -381,11 +450,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['movi', 'film', 'one', 'like', 'time']\n" + ] + } + ], "source": [ - "# TODO: Use this space to determine the five most frequently appearing words in the training set." + "# TODO: Use this space to determine the five most frequently appearing words in the training set.\n", + "frequent = list(word_dict.keys())[:5]\n", + "# ['movi', 'film', 'one', 'like', 'time']\n", + "# Yes it does as most of these words are common when discussing movies (either negative or positive)" ] }, { @@ -399,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -410,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -429,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -461,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -478,11 +558,55 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use this cell to examine one of the processed reviews to make sure everything is working as intended." 
+ "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 28 135 3 1693 3018 812 4 395 2 116 123 508 3841 1273\n", + " 497 3780 2 989 1441 269 1120 2408 1516 1516 333 1 72 1\n", + " 1520 65 1975 3928 1 8 195 84 3699 57 98 1530 31 1120\n", + " 64 278 1 75 433 15 1 1 2627 537 264 1516 611 52\n", + " 237 45 40 1975 991 2408 234 1854 1341 1975 247 42 1975 1341\n", + " 1452 3192 1 893 508 3841 1273 1415 893 1 53 2 71 1084\n", + " 508 3841 1273 3192 1 26 60 17 1 2 1690 1 115 197\n", + " 218 200 1 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0]\n" + ] + } + ], + "source": [ + "# Use this cell to examine one of the processed reviews to make sure everything is working as intended.\n", + "print(train_X[100])" ] }, { @@ -496,7 +620,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:** This could be a problem as the results are stored as variables as oposed to in a cached file. This means the results are not being stored." 
] }, { @@ -514,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -536,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -552,10 +676,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ + "data_dir = '../data/pytorch'\n", "input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)" ] }, @@ -585,9 +710,48 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.nn\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnn\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mclass\u001b[39;49;00m \u001b[04m\u001b[32mLSTMClassifier\u001b[39;49;00m(nn.Module):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m This is the simple RNN model we will be using to perform Sentiment Analysis.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[34mdef\u001b[39;49;00m \u001b[32m__init__\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, embedding_dim, hidden_dim, vocab_size):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m Initialize the model by settingg up the various layers.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " \u001b[36msuper\u001b[39;49;00m(LSTMClassifier, \u001b[36mself\u001b[39;49;00m).\u001b[32m__init__\u001b[39;49;00m()\r\n", + "\r\n", + " \u001b[36mself\u001b[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=\u001b[34m0\u001b[39;49;00m)\r\n", + " \u001b[36mself\u001b[39;49;00m.lstm = nn.LSTM(embedding_dim, hidden_dim)\r\n", + " \u001b[36mself\u001b[39;49;00m.dense = nn.Linear(in_features=hidden_dim, out_features=\u001b[34m1\u001b[39;49;00m)\r\n", + " \u001b[36mself\u001b[39;49;00m.sig = nn.Sigmoid()\r\n", + " \r\n", + " \u001b[36mself\u001b[39;49;00m.word_dict = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[34mdef\u001b[39;49;00m \u001b[32mforward\u001b[39;49;00m(\u001b[36mself\u001b[39;49;00m, x):\r\n", + " \u001b[33m\"\"\"\u001b[39;49;00m\r\n", + "\u001b[33m Perform a forward pass of our model on some input.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " x = x.t()\r\n", + " lengths = x[\u001b[34m0\u001b[39;49;00m,:]\r\n", + " reviews = x[\u001b[34m1\u001b[39;49;00m:,:]\r\n", + " embeds = \u001b[36mself\u001b[39;49;00m.embedding(reviews)\r\n", + " lstm_out, _ = \u001b[36mself\u001b[39;49;00m.lstm(embeds)\r\n", + " out = \u001b[36mself\u001b[39;49;00m.dense(lstm_out)\r\n", + " out = out[lengths - \u001b[34m1\u001b[39;49;00m, \u001b[36mrange\u001b[39;49;00m(\u001b[36mlen\u001b[39;49;00m(lengths))]\r\n", + " \u001b[34mreturn\u001b[39;49;00m \u001b[36mself\u001b[39;49;00m.sig(out.squeeze())\r\n" + ] + } + ], "source": [ "!pygmentize train/model.py" ] @@ -603,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -614,6 +778,7 @@ "train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)\n", "\n", "# Turn the input pandas dataframe into tensors\n", + "\n", "train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()\n", 
"train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()\n", "\n", @@ -634,7 +799,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -649,8 +814,13 @@ " batch_y = batch_y.to(device)\n", " \n", " # TODO: Complete this train method to train the model provided.\n", - " \n", + " optimizer.zero_grad()\n", + " output = model(batch_X)\n", + " loss = loss_fn(output, batch_y)\n", + " loss.backward()\n", " total_loss += loss.data.item()\n", + " optimizer.step()\n", + " \n", " print(\"Epoch: {}, BCELoss: {}\".format(epoch, total_loss / len(train_loader)))" ] }, @@ -663,9 +833,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch: 1, BCELoss: 0.691946291923523\n", + "Epoch: 2, BCELoss: 0.6817818284034729\n", + "Epoch: 3, BCELoss: 0.6726435661315918\n", + "Epoch: 4, BCELoss: 0.6623459339141846\n", + "Epoch: 5, BCELoss: 0.6493294477462769\n" + ] + } + ], "source": [ "import torch.optim as optim\n", "from train.model import LSTMClassifier\n", @@ -700,18 +882,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from sagemaker.pytorch import PyTorch\n", "\n", + "\n", "estimator = PyTorch(entry_point=\"train.py\",\n", " source_dir=\"train\",\n", " role=role,\n", " framework_version='0.4.0',\n", " train_instance_count=1,\n", - " train_instance_type='ml.p2.xlarge',\n", + " train_instance_type='ml.m4.xlarge',\n", " hyperparameters={\n", " 'epochs': 10,\n", " 'hidden_dim': 200,\n", @@ -722,7 +905,180 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-05-07 10:30:45 Starting - Starting the training job......\n", + "2020-05-07 10:31:26 Starting - Launching requested ML instances......\n", + "2020-05-07 10:32:24 Starting - Preparing the instances for training......\n", + "2020-05-07 10:33:43 Downloading - Downloading input data\n", + "2020-05-07 10:33:43 Training - Downloading the training image...\n", + "2020-05-07 10:34:02 Training - Training image download completed. Training in progress.\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", + "\u001b[34mbash: no job control in this shell\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,146 sagemaker-containers INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,149 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:03,162 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,578 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Module train does not provide a setup.py. 
\u001b[0m\n", + "\u001b[34mGenerating setup.py\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:04,861 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", + "\u001b[34m/usr/bin/python -m pip install -U . -r requirements.txt\u001b[0m\n", + "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", + "\u001b[34mCollecting pandas (from -r requirements.txt (line 1))\n", + " Downloading https://files.pythonhosted.org/packages/74/24/0cdbf8907e1e3bc5a8da03345c23cbed7044330bb8f73bb12e711a640a00/pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl (10.0MB)\u001b[0m\n", + "\u001b[34mCollecting numpy (from -r requirements.txt (line 2))\n", + " Downloading https://files.pythonhosted.org/packages/38/92/fa5295d9755c7876cb8490eab866e1780154033fa45978d9cf74ffbd4c68/numpy-1.18.4-cp35-cp35m-manylinux1_x86_64.whl (20.0MB)\u001b[0m\n", + "\u001b[34mCollecting nltk (from -r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)\u001b[0m\n", + "\u001b[34mCollecting beautifulsoup4 (from -r requirements.txt (line 4))\n", + " Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)\u001b[0m\n", + "\u001b[34mCollecting html5lib (from -r requirements.txt (line 5))\n", + " Downloading https://files.pythonhosted.org/packages/a5/62/bbd2be0e7943ec8504b517e62bab011b4946e1258842bc159e5dfde15b96/html5lib-1.0.1-py2.py3-none-any.whl (117kB)\u001b[0m\n", + "\u001b[34mCollecting pytz>=2011k (from pandas->-r requirements.txt (line 1))\n", + " Downloading https://files.pythonhosted.org/packages/4f/a4/879454d49688e2fad93e59d7d4efda580b783c745fd2ec2a3adf87b0808d/pytz-2020.1-py2.py3-none-any.whl (510kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas->-r requirements.txt (line 1)) (2.7.5)\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: click in /usr/local/lib/python3.5/dist-packages (from nltk->-r requirements.txt (line 3)) (7.0)\u001b[0m\n", + "\u001b[34mCollecting joblib (from nltk->-r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)\u001b[0m\n", + "\u001b[34mCollecting regex (from nltk->-r requirements.txt (line 3))\u001b[0m\n", + "\u001b[34m Downloading https://files.pythonhosted.org/packages/4c/e7/eee73c42c1193fecc0e91361a163cbb8dfbea62c3db7618ad986e5b43a14/regex-2020.4.4.tar.gz (695kB)\u001b[0m\n", + "\u001b[34mCollecting tqdm (from nltk->-r requirements.txt (line 3))\n", + " Downloading https://files.pythonhosted.org/packages/c9/40/058b12e8ba10e35f89c9b1fdfc2d4c7f8c05947df2d5eb3c7b258019fda0/tqdm-4.46.0-py2.py3-none-any.whl (63kB)\u001b[0m\n", + "\u001b[34mCollecting soupsieve>1.2 (from beautifulsoup4->-r requirements.txt (line 4))\n", + " Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl\u001b[0m\n", + "\u001b[34mRequirement already satisfied, skipping upgrade: six>=1.9 in /usr/local/lib/python3.5/dist-packages (from 
html5lib->-r requirements.txt (line 5)) (1.11.0)\u001b[0m\n", + "\u001b[34mCollecting webencodings (from html5lib->-r requirements.txt (line 5))\n", + " Downloading https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl\u001b[0m\n", + "\u001b[34mBuilding wheels for collected packages: nltk, train, regex\n", + " Running setup.py bdist_wheel for nltk: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for nltk: finished with status 'done'\n", + " Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306\n", + " Running setup.py bdist_wheel for train: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for train: finished with status 'done'\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-4dxw0rzu/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\n", + " Running setup.py bdist_wheel for regex: started\u001b[0m\n", + "\u001b[34m Running setup.py bdist_wheel for regex: finished with status 'done'\n", + " Stored in directory: /root/.cache/pip/wheels/e6/9b/ae/2972da29cc7759b71dee015813b7c6931917d6a51e64ed5e79\u001b[0m\n", + "\u001b[34mSuccessfully built nltk train regex\u001b[0m\n", + "\u001b[34mInstalling collected packages: numpy, pytz, pandas, joblib, regex, tqdm, nltk, soupsieve, beautifulsoup4, webencodings, html5lib, train\n", + " Found existing installation: numpy 1.15.4\u001b[0m\n", + "\u001b[34m Uninstalling numpy-1.15.4:\u001b[0m\n", + "\u001b[34m Successfully uninstalled numpy-1.15.4\u001b[0m\n", + "\u001b[34mSuccessfully installed beautifulsoup4-4.9.0 html5lib-1.0.1 joblib-0.14.1 nltk-3.5 numpy-1.18.4 pandas-0.24.2 pytz-2020.1 regex-2020.4.4 soupsieve-2.0 tqdm-4.46.0 train-1.0.0 webencodings-0.5.1\u001b[0m\n", + "\u001b[34mYou are using pip version 18.1, however version 20.1 is available.\u001b[0m\n", + "\u001b[34mYou should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:27,264 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-05-07 10:34:27,278 sagemaker-containers INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"module_name\": \"train\",\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"hyperparameters\": {\n", + " \"hidden_dim\": 200,\n", + " \"epochs\": 10\n", + " },\n", + " \"log_level\": 20,\n", + " \"module_dir\": \"s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\",\n", + " \"num_gpus\": 0,\n", + " \"input_data_config\": {\n", + " \"training\": {\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"network_interface_name\": \"eth0\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"channel_input_dirs\": {\n", + " \"training\": \"/opt/ml/input/data/training\"\n", + " },\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"num_cpus\": 4,\n", + " \"job_name\": \"sagemaker-pytorch-2020-05-07-10-30-44-825\",\n", + " \"additional_framework_parameters\": {},\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", + " \"user_entry_point\": \"train.py\",\n", + " \"hosts\": [\n", + " 
\"algo-1\"\n", + " ],\n", + " \"current_host\": \"algo-1\",\n", + " \"resource_config\": {\n", + " \"network_interface_name\": \"eth0\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"current_host\": \"algo-1\"\n", + " },\n", + " \"model_dir\": \"/opt/ml/model\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HP_HIDDEN_DIM=200\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_HPS={\"epochs\":10,\"hidden_dim\":200}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"training\"]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mPYTHONPATH=/usr/local/bin:/usr/lib/python35.zip:/usr/lib/python3.5:/usr/lib/python3.5/plat-x86_64-linux-gnu:/usr/lib/python3.5/lib-dynload:/usr/local/lib/python3.5/dist-packages:/usr/lib/python3/dist-packages\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[\"--epochs\",\"10\",\"--hidden_dim\",\"200\"]\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAINING=/opt/ml/input/data/training\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", + "\u001b[34mSM_HP_EPOCHS=10\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"epochs\":10,\"hidden_dim\":200},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"job_name\":\"sagemaker-pytorch-2020-05-07-10-30-44-825\",\"log_level\":20,\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-west-2-705833918113/sagemaker-pytorch-2020-05-07-10-30-44-825/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + 
"\u001b[0m\n", + "\u001b[34m/usr/bin/python -m train --epochs 10 --hidden_dim 200\n", + "\n", + "\u001b[0m\n", + "\u001b[34mUsing device cpu.\u001b[0m\n", + "\u001b[34mGet train data loader.\u001b[0m\n", + "\u001b[34mModel loaded with embedding_dim 32, hidden_dim 200, vocab_size 5000.\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mEpoch: 1, BCELoss: 0.6686128937468236\u001b[0m\n", + "\u001b[34mEpoch: 5, BCELoss: 0.36999733411535923\u001b[0m\n", + "\u001b[34mEpoch: 7, BCELoss: 0.3187107614108494\u001b[0m\n", + "\u001b[34mEpoch: 8, BCELoss: 0.31308953798547084\u001b[0m\n", + "\u001b[34mEpoch: 9, BCELoss: 0.2818753390896077\u001b[0m\n", + "\u001b[34mEpoch: 10, BCELoss: 0.2627122727583866\u001b[0m\n", + "\u001b[34m2020-05-07 12:24:00,607 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2020-05-07 12:24:10 Uploading - Uploading generated training model\n", + "2020-05-07 12:24:10 Completed - Training job completed\n", + "Training seconds: 6643\n", + "Billable seconds: 6643\n" + ] + } + ], "source": [ "estimator.fit({'training': input_data})" ] @@ -754,11 +1110,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ - "# TODO: Deploy the trained model" + "# TODO: Deploy the trained model\n", + "predictor = estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')" ] }, { @@ -772,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -781,7 +1138,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -798,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -808,9 +1165,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.84836" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(test_y, predictions)" @@ -827,7 +1195,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:**\n", + "This models accuracy score is very close to the XGBoost models accuracy score.\n", + "\n", + "These two models might perform differently as they work better with different sizes of data sets whilst neural networks work better with very large data sets they are often out performed by alternatives with smaller data sets. So even though neural network might be better designed for natural language processing the size of the data set meant it was unable to outperform the XGBoost model. 
Also XGBoost tends to perform better on more structured data like the one used in this project.\n", + "\n", + "In this instance an XGBoost might be preferable as it needs less computing power to be trained however if the data sample was much larger the neural network model would likly out perform the XGBoost model.\n" ] }, { @@ -841,7 +1214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -865,12 +1238,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1374, 50, 53, 3, 4, 878, 173, 392, 682, 29, 723, 2, 4412, 275, 2081, 1059, 760, 1, 581, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" + ] + } + ], "source": [ "# TODO: Convert test_review into a form usable by the model and save the results in test_data\n", - "test_data = None" + "test_review_to_words = review_to_words(test_review)\n", + "test_data, _ = convert_and_pad(word_dict, test_review_to_words)\n", + "print(test_data)" ] }, { @@ -882,11 +1265,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(0.7069678, dtype=float32)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "predictor.predict(test_data)" + "predictor.predict([test_data])" ] }, { @@ -941,9 +1335,108 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36margparse\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mjson\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mos\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpickle\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m 
\u001b[04m\u001b[36msys\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36msagemaker_containers\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpandas\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mpd\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mnumpy\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnp\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.nn\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnn\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.optim\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36moptim\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtorch.utils.data\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mmodel\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m LSTMClassifier\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mutils\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m review_to_words, convert_and_pad\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mmodel_fn\u001b[39;49;00m(model_dir):\r\n", + " \u001b[33m\"\"\"Load the PyTorch model from the `model_dir` directory.\"\"\"\u001b[39;49;00m\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mLoading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + "\r\n", + " \u001b[37m# First, load the parameters used to create the model.\u001b[39;49;00m\r\n", + " model_info = {}\r\n", + " model_info_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mmodel_info.pth\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(model_info_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model_info = torch.load(f)\r\n", + "\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mmodel_info: {}\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m.format(model_info))\r\n", + "\r\n", + " \u001b[37m# Determine the device and construct the model.\u001b[39;49;00m\r\n", + " device = torch.device(\u001b[33m\"\u001b[39;49;00m\u001b[33mcuda\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m \u001b[34mif\u001b[39;49;00m torch.cuda.is_available() \u001b[34melse\u001b[39;49;00m \u001b[33m\"\u001b[39;49;00m\u001b[33mcpu\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " model = LSTMClassifier(model_info[\u001b[33m'\u001b[39;49;00m\u001b[33membedding_dim\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m], model_info[\u001b[33m'\u001b[39;49;00m\u001b[33mhidden_dim\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m], model_info[\u001b[33m'\u001b[39;49;00m\u001b[33mvocab_size\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + "\r\n", + " \u001b[37m# Load the store model parameters.\u001b[39;49;00m\r\n", + " model_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mmodel.pth\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(model_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model.load_state_dict(torch.load(f))\r\n", + "\r\n", + " \u001b[37m# Load the saved word_dict.\u001b[39;49;00m\r\n", + " 
word_dict_path = os.path.join(model_dir, \u001b[33m'\u001b[39;49;00m\u001b[33mword_dict.pkl\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mwith\u001b[39;49;00m \u001b[36mopen\u001b[39;49;00m(word_dict_path, \u001b[33m'\u001b[39;49;00m\u001b[33mrb\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m) \u001b[34mas\u001b[39;49;00m f:\r\n", + " model.word_dict = pickle.load(f)\r\n", + "\r\n", + " model.to(device).eval()\r\n", + "\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mDone loading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m model\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32minput_fn\u001b[39;49;00m(serialized_input_data, content_type):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mDeserializing the input data.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mif\u001b[39;49;00m content_type == \u001b[33m'\u001b[39;49;00m\u001b[33mtext/plain\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m:\r\n", + " data = serialized_input_data.decode(\u001b[33m'\u001b[39;49;00m\u001b[33mutf-8\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m data\r\n", + " \u001b[34mraise\u001b[39;49;00m \u001b[36mException\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mRequested unsupported ContentType in content_type: \u001b[39;49;00m\u001b[33m'\u001b[39;49;00m + content_type)\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32moutput_fn\u001b[39;49;00m(prediction_output, accept):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mSerializing the generated output.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \u001b[34mreturn\u001b[39;49;00m \u001b[36mstr\u001b[39;49;00m(prediction_output)\r\n", + "\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mpredict_fn\u001b[39;49;00m(input_data, model):\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mInferring sentiment of input data.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + "\r\n", + " device = torch.device(\u001b[33m\"\u001b[39;49;00m\u001b[33mcuda\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m \u001b[34mif\u001b[39;49;00m torch.cuda.is_available() \u001b[34melse\u001b[39;49;00m \u001b[33m\"\u001b[39;49;00m\u001b[33mcpu\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[34mif\u001b[39;49;00m model.word_dict \u001b[35mis\u001b[39;49;00m \u001b[36mNone\u001b[39;49;00m:\r\n", + " \u001b[34mraise\u001b[39;49;00m \u001b[36mException\u001b[39;49;00m(\u001b[33m'\u001b[39;49;00m\u001b[33mModel has not been loaded properly, no word_dict.\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[37m# TODO: Process input_data so that it is ready to be sent to our model.\u001b[39;49;00m\r\n", + " \u001b[37m# You should produce two variables:\u001b[39;49;00m\r\n", + " \u001b[37m# data_X - A sequence of length 500 which represents the converted review\u001b[39;49;00m\r\n", + " \u001b[37m# data_len - The length of the review\u001b[39;49;00m\r\n", + "\r\n", + " data_X = \u001b[36mNone\u001b[39;49;00m\r\n", + " data_len = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[37m# Using data_X and data_len we construct an appropriate input tensor. 
Remember\u001b[39;49;00m\r\n", + " \u001b[37m# that our model expects input data of the form 'len, review[500]'.\u001b[39;49;00m\r\n", + " data_pack = np.hstack((data_len, data_X))\r\n", + " data_pack = data_pack.reshape(\u001b[34m1\u001b[39;49;00m, -\u001b[34m1\u001b[39;49;00m)\r\n", + " \r\n", + " data = torch.from_numpy(data_pack)\r\n", + " data = data.to(device)\r\n", + "\r\n", + " \u001b[37m# Make sure to put the model into evaluation mode\u001b[39;49;00m\r\n", + " model.eval()\r\n", + "\r\n", + " \u001b[37m# TODO: Compute the result of applying the model to the input data. The variable `result` should\u001b[39;49;00m\r\n", + " \u001b[37m# be a numpy array which contains a single integer which is either 1 or 0\u001b[39;49;00m\r\n", + "\r\n", + " result = \u001b[36mNone\u001b[39;49;00m\r\n", + "\r\n", + " \u001b[34mreturn\u001b[39;49;00m result\r\n" + ] + } + ], "source": [ "!pygmentize serve/predict.py" ] @@ -970,9 +1463,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------!" + ] + } + ], "source": [ "from sagemaker.predictor import RealTimePredictor\n", "from sagemaker.pytorch import PyTorchModel\n", @@ -1001,11 +1502,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import glob\n", + "import os\n", "\n", "def test_reviews(data_dir='../data/aclImdb', stop=250):\n", " \n", @@ -1033,7 +1535,10 @@ " # Read in the review and convert to 'utf-8' for transmission via HTTP\n", " review_input = review.read().encode('utf-8')\n", " # Send the review to the predictor and store the results\n", - " results.append(int(predictor.predict(review_input)))\n", + " result = predictor.predict(review_input)\n", + " result = result.decode('UTF-8')\n", + " result = eval(result)\n", + " results.append(int(result[0]))\n", " \n", " # Sending reviews to our endpoint one at a time takes a while so we\n", " # only send a small number of reviews\n", @@ -1046,18 +1551,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting pos files\n", + "Starting neg files\n" + ] + } + ], "source": [ - "ground, results = test_reviews()" + "ground, results= test_reviews()\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.848" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(ground, results)" @@ -1072,9 +1597,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 226, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b'[1.]'" + ] + }, + "execution_count": 226, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "predictor.predict(test_review)" ] @@ -1222,7 +1758,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Answer:**" + "**Answer:**\n", + "\"You might have hoped that any commentary around a new drama about Nazis trying to establish a fourth Reich in modern-day America would not need include the word “timely”. 
But we are where we are.\"\n", + "\n", + "It predicted that the review was bad, which it was." ] }, { diff --git a/Project/serve/predict.py b/Project/serve/predict.py index 00c9149e6..450ebf307 100644 --- a/Project/serve/predict.py +++ b/Project/serve/predict.py @@ -69,9 +69,9 @@ def predict_fn(input_data, model): # You should produce two variables: # data_X - A sequence of length 500 which represents the converted review # data_len - The length of the review + review_words = review_to_words(input_data) - data_X = None - data_len = None + data_X, data_len = convert_and_pad(model.word_dict, review_words) # Using data_X and data_len we construct an appropriate input tensor. Remember # that our model expects input data of the form 'len, review[500]'. @@ -86,7 +86,6 @@ def predict_fn(input_data, model): # TODO: Compute the result of applying the model to the input data. The variable `result` should # be a numpy array which contains a single integer which is either 1 or 0 - - result = None - + result = model(data) + result = np.array([round(result.item())]) return result diff --git a/Project/train/train.py b/Project/train/train.py index 9cf9915b8..ac0a6e474 100644 --- a/Project/train/train.py +++ b/Project/train/train.py @@ -54,22 +54,25 @@ def _get_train_data_loader(batch_size, training_dir): return torch.utils.data.DataLoader(train_ds, batch_size=batch_size) - def train(model, train_loader, epochs, optimizer, loss_fn, device): - """ - This is the training method that is called by the PyTorch training script. The parameters - passed are as follows: - model - The PyTorch model that we wish to train. - train_loader - The PyTorch DataLoader that should be used during training. - epochs - The total number of epochs to train for. - optimizer - The optimizer to use during training. - loss_fn - The loss function used for training. - device - Where the model and data should be loaded (gpu or cpu). - """ - - # TODO: Paste the train() method developed in the notebook here. - - pass + for epoch in range(1, epochs + 1): + model.train() + total_loss = 0 + for batch in train_loader: + batch_X, batch_y = batch + + batch_X = batch_X.to(device) + batch_y = batch_y.to(device) + + # TODO: Complete this train method to train the model provided. + optimizer.zero_grad() + output = model(batch_X) + loss = loss_fn(output, batch_y) + loss.backward() + total_loss += loss.data.item() + optimizer.step() + + print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader))) if __name__ == '__main__': diff --git a/Project/website/index.html b/Project/website/index.html index 6ae4feffb..fae8276e2 100644 --- a/Project/website/index.html +++ b/Project/website/index.html @@ -37,7 +37,7 @@

Is your review positive, or negative?

Enter your review below and click submit to find out...
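Taken together, the serve/predict.py changes above mean that a review sent to the deployed endpoint is handled roughly as follows. This is an illustrative consolidation of the completed predict_fn with a hypothetical helper name; the torch.no_grad() guard is an assumption and not part of the PR:

import numpy as np
import torch

from utils import review_to_words, convert_and_pad   # mirrors the imports in serve/predict.py

def predict_sentiment(model, review_text, device):
    # Convert the raw review into the padded integer sequence the LSTM expects.
    words = review_to_words(review_text)
    data_X, data_len = convert_and_pad(model.word_dict, words)
    # The model expects input of the form 'len, review[500]'.
    data_pack = np.hstack((data_len, data_X)).reshape(1, -1)
    data = torch.from_numpy(data_pack).to(device)
    # Run the model in evaluation mode and round the sigmoid output to 0 (negative) or 1 (positive).
    model.eval()
    with torch.no_grad():
        output = model(data)
    return int(round(output.item()))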