From 7a6972bcd43c4f2ff9809f7760b3f774ab5bd612 Mon Sep 17 00:00:00 2001 From: dimazhylko Date: Sat, 12 Dec 2020 05:30:26 +0100 Subject: [PATCH] first nlp notebook --- NLP/nlp-v0.ipynb | 1546 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1546 insertions(+) create mode 100644 NLP/nlp-v0.ipynb diff --git a/NLP/nlp-v0.ipynb b/NLP/nlp-v0.ipynb new file mode 100644 index 0000000..f7423e6 --- /dev/null +++ b/NLP/nlp-v0.ipynb @@ -0,0 +1,1546 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install tensorflow==1.15" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import files\n", + "files.upload()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install kaggle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir ~/.kaggle\n", + "!mv kaggle.json ~/.kaggle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!kaggle datasets download -d bittlingmayer/amazonreviews" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!unzip amazonreviews.zip " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm amazonreviews.zip\n", + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import SnowballStemmer\n", + "from collections import defaultdict\n", + "from nltk.corpus import wordnet as wn\n", + "from sklearn.feature_extraction.text 
def prepare_text(text):
    """Normalise one raw review into a space-separated string of tokens.

    The text is lower-cased, everything up to and including the first ':'
    is dropped (presumably the review title -- confirm against the dataset
    format), every character that is neither a word character nor
    whitespace is removed, and single-character tokens are discarded.

    Args:
        text: the raw review string.

    Returns:
        The cleaned tokens joined by single spaces; '' when no token
        survives the cleaning.
    """
    text = text.lower()
    # str.find returns -1 when there is no ':', so the slice then starts at
    # index 0 and the whole text is kept.
    text = text[text.find(':') + 1:]
    text = re.sub(r"[^\w\s]+", '', text)
    # split() without arguments already collapses runs of whitespace and
    # ignores leading/trailing whitespace, so no extra normalisation is needed.
    return ' '.join(word for word in text.split() if len(word) > 1)


def read_and_preprocess(file, total=1, sub_size=-1):
    """Read a bz2-compressed file of labelled lines and clean every text.

    Each line is decoded as UTF-8. The character at index 9 is parsed as a
    1-based integer label and everything from index 10 onwards is cleaned
    with prepare_text (assumes fastText-style "__label__N ..." lines --
    confirm against the dataset).

    Args:
        file: path of the .bz2 file to read.
        total: expected number of lines; only used to size the progress bar.
        sub_size: stop after this many examples have been kept; -1 reads
            the whole file.

    Returns:
        A tuple (labels, texts): a numpy array of zero-based labels and the
        list of cleaned texts, aligned index by index.
    """
    labels = []
    texts = []
    if sub_size != -1:
        total = min(total, sub_size)

    # The context manager guarantees the compressed file is closed, also when
    # the loop stops early or a line fails to parse.
    with bz2.BZ2File(file) as reviews:
        for raw_line in tqdm(reviews, total=total):
            line = raw_line.decode('utf-8')
            text = prepare_text(line[10:].strip())
            # prepare_text yields '' (never ' ') when nothing usable is left,
            # so test for emptiness; such examples carry no features.
            if not text:
                continue
            labels.append(int(line[9]) - 1)
            texts.append(text)
            if len(texts) == sub_size:
                break

    return np.array(labels), texts
from functools import lru_cache

stemmer = SnowballStemmer("english")


@lru_cache(maxsize=2 ** 20)
def _stem_word(word):
    """Return the Snowball stem of a single token, memoised across calls."""
    return stemmer.stem(word)


def stem_text(text):
    """Tokenise a text and replace every token by its English Snowball stem.

    Stems are cached per distinct token because the same words recur across
    the reviews; the cache is bounded so memory stays limited.

    Args:
        text: the text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return ' '.join(_stem_word(word) for word in word_tokenize(text))
"source": [ + "test_texts_stemmed = [stem_text(text) for text in tqdm(test_texts)]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /Users/dimazhylko/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "['i',\n", + " 'me',\n", + " 'my',\n", + " 'myself',\n", + " 'we',\n", + " 'our',\n", + " 'ours',\n", + " 'ourselves',\n", + " 'you',\n", + " \"you're\",\n", + " \"you've\",\n", + " \"you'll\",\n", + " \"you'd\",\n", + " 'your',\n", + " 'yours',\n", + " 'yourself',\n", + " 'yourselves',\n", + " 'he',\n", + " 'him',\n", + " 'his',\n", + " 'himself',\n", + " 'she',\n", + " \"she's\",\n", + " 'her',\n", + " 'hers',\n", + " 'herself',\n", + " 'it',\n", + " \"it's\",\n", + " 'its',\n", + " 'itself',\n", + " 'they',\n", + " 'them',\n", + " 'their',\n", + " 'theirs',\n", + " 'themselves',\n", + " 'what',\n", + " 'which',\n", + " 'who',\n", + " 'whom',\n", + " 'this',\n", + " 'that',\n", + " \"that'll\",\n", + " 'these',\n", + " 'those',\n", + " 'am',\n", + " 'is',\n", + " 'are',\n", + " 'was',\n", + " 'were',\n", + " 'be',\n", + " 'been',\n", + " 'being',\n", + " 'have',\n", + " 'has',\n", + " 'had',\n", + " 'having',\n", + " 'do',\n", + " 'does',\n", + " 'did',\n", + " 'doing',\n", + " 'a',\n", + " 'an',\n", + " 'the',\n", + " 'and',\n", + " 'but',\n", + " 'if',\n", + " 'or',\n", + " 'because',\n", + " 'as',\n", + " 'until',\n", + " 'while',\n", + " 'of',\n", + " 'at',\n", + " 'by',\n", + " 'for',\n", + " 'with',\n", + " 'about',\n", + " 'against',\n", + " 'between',\n", + " 'into',\n", + " 'through',\n", + " 'during',\n", + " 'before',\n", + " 'after',\n", + " 'above',\n", + " 'below',\n", + " 'to',\n", + " 'from',\n", + " 'up',\n", + " 'down',\n", + " 'in',\n", + " 'out',\n", + " 'on',\n", + " 'off',\n", + " 'over',\n", + " 
'under',\n", + " 'again',\n", + " 'further',\n", + " 'then',\n", + " 'once',\n", + " 'here',\n", + " 'there',\n", + " 'when',\n", + " 'where',\n", + " 'why',\n", + " 'how',\n", + " 'all',\n", + " 'any',\n", + " 'both',\n", + " 'each',\n", + " 'few',\n", + " 'more',\n", + " 'most',\n", + " 'other',\n", + " 'some',\n", + " 'such',\n", + " 'no',\n", + " 'nor',\n", + " 'not',\n", + " 'only',\n", + " 'own',\n", + " 'same',\n", + " 'so',\n", + " 'than',\n", + " 'too',\n", + " 'very',\n", + " 's',\n", + " 't',\n", + " 'can',\n", + " 'will',\n", + " 'just',\n", + " 'don',\n", + " \"don't\",\n", + " 'should',\n", + " \"should've\",\n", + " 'now',\n", + " 'd',\n", + " 'll',\n", + " 'm',\n", + " 'o',\n", + " 're',\n", + " 've',\n", + " 'y',\n", + " 'ain',\n", + " 'aren',\n", + " \"aren't\",\n", + " 'couldn',\n", + " \"couldn't\",\n", + " 'didn',\n", + " \"didn't\",\n", + " 'doesn',\n", + " \"doesn't\",\n", + " 'hadn',\n", + " \"hadn't\",\n", + " 'hasn',\n", + " \"hasn't\",\n", + " 'haven',\n", + " \"haven't\",\n", + " 'isn',\n", + " \"isn't\",\n", + " 'ma',\n", + " 'mightn',\n", + " \"mightn't\",\n", + " 'mustn',\n", + " \"mustn't\",\n", + " 'needn',\n", + " \"needn't\",\n", + " 'shan',\n", + " \"shan't\",\n", + " 'shouldn',\n", + " \"shouldn't\",\n", + " 'wasn',\n", + " \"wasn't\",\n", + " 'weren',\n", + " \"weren't\",\n", + " 'won',\n", + " \"won't\",\n", + " 'wouldn',\n", + " \"wouldn't\"]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.download('stopwords')\n", + "stopwords.words('english')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2db651fa6f654be8b5262ea5d1e5abd4", + "version_major": 2, + 
"version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',\n", + " 'ourselves', 'you', \"you're\", \"you've\", \"you'll\",\n", + " \"you'd\", 'your', 'yours', 'yourself', 'yourselves',\n", + " 'he', 'him', 'his', 'himself', 'she', \"she's\",\n", + " 'her', 'hers', 'herself', 'it', \"it's\", 'its',\n", + " 'itself', ...])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vectorizer.fit(tqdm(train_texts_stammed))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "464542" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(vectorizer.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (0, 459946)\t0.07996947534081235\n", + " (0, 459736)\t0.31538065409307176\n", + " (0, 457003)\t0.09742771221067453\n", + " (0, 456371)\t0.06520515383876084\n", + " (0, 456168)\t0.09713377583791682\n", + " (0, 427397)\t0.1522013109363848\n", + " (0, 414328)\t0.17111347538516142\n", + " (0, 400409)\t0.18128965197057867\n", + " (0, 385844)\t0.22411317316711604\n", + " (0, 381526)\t0.1483145901715332\n", + " (0, 371333)\t0.118999542170106\n", + " (0, 355789)\t0.1682200085112425\n", + " (0, 344880)\t0.1754696697183511\n", + " (0, 339375)\t0.16337151657839263\n", + " (0, 334608)\t0.06474427379750541\n", + " (0, 321766)\t0.10144302733177181\n", + " (0, 321490)\t0.10598393853075028\n", + " (0, 305987)\t0.17359537914160617\n", + " (0, 
294015)\t0.3251993177406255\n", + " (0, 292838)\t0.0706608532769978\n", + " (0, 291694)\t0.05459682227165906\n", + " (0, 274154)\t0.09936518099451207\n", + " (0, 273387)\t0.08933815725521534\n", + " (0, 271816)\t0.07582024442678638\n", + " (0, 266313)\t0.09095363420801905\n", + " (0, 264547)\t0.2985955377215965\n", + " (0, 253314)\t0.14794290219685868\n", + " (0, 244473)\t0.09280938288968092\n", + " (0, 240953)\t0.09820270026942181\n", + " (0, 206669)\t0.17025931308755213\n", + " (0, 205099)\t0.11738729685773906\n", + " (0, 177644)\t0.07882700410235738\n", + " (0, 171721)\t0.1143947303147365\n", + " (0, 158388)\t0.1267594336413101\n", + " (0, 156602)\t0.09383720788048346\n", + " (0, 152480)\t0.3251993177406255\n", + " (0, 146695)\t0.0934623823077469\n", + " (0, 122173)\t0.17446662989786169\n", + " (0, 80727)\t0.08981696660906542\n", + " (0, 73620)\t0.0773700997164049\n", + " (0, 59883)\t0.10539847299062545\n", + " (0, 56872)\t0.08339506749009365\n", + " (0, 53599)\t0.11035509264729593\n", + " (0, 35041)\t0.08101940355208788\n" + ] + } + ], + "source": [ + "print(vectorizer.transform([train_texts_stammed[1]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ed38ab126e354cf3ab6f67ac07bff0a9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=500000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "473544d2953e45d9a5d640c65d97c8c8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + 
# Vectorise both splits with the TF-IDF vectorizer fitted on the stemmed
# training texts.
train_texts_tfidf = vectorizer.transform(tqdm(train_texts_stammed))
# The test split has to go through the same stemming as the training split;
# feeding the unstemmed texts makes many test tokens miss the fitted
# vocabulary and lowers the measured accuracy.
test_texts_tfidf = vectorizer.transform(tqdm(test_texts_stemmed))

bayes_model = naive_bayes.MultinomialNB()
bayes_model.fit(train_texts_tfidf, train_labels)
predictions_bayes = bayes_model.predict(test_texts_tfidf)

# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order keeps the call consistent with other metrics.
print("Naive Bayes Accuracy Score -> ", accuracy_score(test_labels, predictions_bayes)*100)
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.bar([0, 1], [len(train_labels[train_labels==0]), len(train_labels[train_labels==1])])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
def plot_history(history):
    """Plot accuracy and loss curves for training and validation side by side.

    Args:
        history: an object whose `history` attribute is a dict of per-epoch
            metric lists (such as the value returned by `model.fit`). It
            must contain 'loss' and 'val_loss', plus the accuracy stored
            either as 'acc'/'val_acc' or as 'accuracy'/'val_accuracy'.

    Raises:
        KeyError: if a required metric is missing from `history.history`.
    """
    metrics = history.history
    # The accuracy metric is logged as 'acc' by older Keras/TensorFlow
    # releases and as 'accuracy' by newer ones; accept both spellings.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    epochs = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, 'b', label='Training acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, 'b', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
{ + "application/vnd.jupyter.widget-view+json": { + "model_id": "53ac9841498f4e7fbaaf012990bd8911", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "464670\n", + "WARNING:tensorflow:From /usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "WARNING:tensorflow:From /usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "dense (Dense) (None, 64) 29738944 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 29,739,009\n", + "Trainable params: 29,739,009\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "\n", + "vectorizer = CountVectorizer()\n", + "vectorizer.fit(tqdm(train_texts_stammed))\n", + "train_X = 
vectorizer.transform(tqdm(train_texts_stammed))\n", + "test_X = vectorizer.transform(tqdm(test_texts_stemmed))\n", + "\n", + "input_dim = train_X.shape[1]\n", + "print(input_dim)\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Dense(64, input_dim=input_dim, activation='relu'),\n", + " keras.layers.Dense(1, activation='sigmoid')\n", + "])\n", + "\n", + "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_X\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m batch_size=32)\n\u001b[0m", + "\u001b[0;32m/usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m 
\u001b[0mmax_queue_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmax_queue_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0mworkers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mworkers\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m use_multiprocessing=use_multiprocessing)\n\u001b[0m\u001b[1;32m 728\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 729\u001b[0m def evaluate(self,\n", + "\u001b[0;32m/usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_arrays.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)\u001b[0m\n\u001b[1;32m 641\u001b[0m \u001b[0msteps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 642\u001b[0m \u001b[0mvalidation_split\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalidation_split\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 643\u001b[0;31m shuffle=shuffle)\n\u001b[0m\u001b[1;32m 644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 645\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[0;34m(self, x, y, sample_weight, class_weight, batch_size, check_steps, steps_name, steps, validation_split, shuffle, extract_tensors_from_dataset)\u001b[0m\n\u001b[1;32m 2487\u001b[0m \u001b[0mconverted_x\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2488\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflat_inputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflat_expected_inputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2489\u001b[0;31m \u001b[0mconverted_x\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_convert_scipy_sparse_tensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2490\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpack_sequence_as\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconverted_x\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexpand_composites\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2491\u001b[0m \u001b[0mx_shapes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_structure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_spec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype_spec_from_value\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py\u001b[0m in \u001b[0;36m_convert_scipy_sparse_tensor\u001b[0;34m(value, expected_input)\u001b[0m\n\u001b[1;32m 3233\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0missparse\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0missparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3234\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_dense_tensor_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpected_input\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3235\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3236\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3237\u001b[0m \u001b[0msparse_coo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/scipy/sparse/compressed.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m 1034\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1035\u001b[0m \u001b[0mM\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mN\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_swap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1036\u001b[0;31m 
\u001b[0mcsr_todense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mM\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mN\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindptr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1037\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1038\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "history = model.fit(train_X, train_labels, \n", + " validation_data=(test_X, test_labels),\n", + " epochs=3,\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_history(history)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model with Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "555f313ae4bc45b88d702ad2c6061ece", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "tokenizer = Tokenizer(num_words=50000)\n", + "tokenizer.fit_on_texts(tqdm(train_texts))\n", + "\n", + "vocab_size = len(tokenizer.word_index) + 1" + ] + }, + { + "cell_type": "code", + 
"execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = tokenizer.texts_to_sequences(tqdm(train_texts))\n", + "X_test = tokenizer.texts_to_sequences(tqdm(test_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "max_len = 512\n", + "\n", + "X_train = pad_sequences(X_train, padding='post', maxlen=max_len)\n", + "X_test = pad_sequences(X_test, padding='post', maxlen=max_len)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /usr/local/Caskroom/miniconda/base/envs/bertsum/lib/python3.7/site-packages/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "embedding (Embedding) (None, 512, 50) 20122500 \n", + "_________________________________________________________________\n", + "flatten (Flatten) (None, 25600) 0 \n", + "_________________________________________________________________\n", + "dense_6 (Dense) (None, 128) 3276928 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 128) 16512 \n", + "_________________________________________________________________\n", + "dense_8 (Dense) (None, 1) 129 \n", + "=================================================================\n", + "Total params: 23,416,069\n", + "Trainable params: 23,416,069\n", + "Non-trainable params: 0\n", + 
"_________________________________________________________________\n" + ] + } + ], + "source": [ + "embedding_dim = 50\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),\n", + " keras.layers.Flatten(),\n", + " keras.layers.Dense(64, input_dim=input_dim, activation='relu'),\n", + " keras.layers.Dense(1, activation='sigmoid')\n", + "])\n", + "\n", + "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "history = model.fit(X_train, train_labels, \n", + " validation_data=(X_test, test_labels),\n", + " epochs=30,\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_history(history)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_dim = 50\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),\n", + " keras.layers.GlobalAveragePooling1D(),\n", + " keras.layers.Dense(64, activation='relu'),\n", + " keras.layers.Dense(1, activation='sigmoid')\n", + "])\n", + "\n", + "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "history = model.fit(X_train, train_labels, \n", + " validation_data=(X_test, test_labels),\n", + " epochs=3,\n", + " batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_history(history)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model 
with pre-trained Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dyld: Library not loaded: /usr/local/opt/openssl/lib/libssl.1.0.0.dylib\r\n", + " Referenced from: /usr/local/bin/wget\r\n", + " Reason: image not found\r\n" + ] + } + ], + "source": [ + "!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: glove.twitter.27B.zip\n", + " inflating: glove.twitter.27B.25d.txt \n", + " inflating: glove.twitter.27B.50d.txt \n", + " inflating: glove.twitter.27B.100d.txt \n", + " inflating: glove.twitter.27B.200d.txt \n", + "Untitled.ipynb glove.twitter.27B.200d.txt\n", + "\u001b[34marchive-2\u001b[m\u001b[m glove.twitter.27B.25d.txt\n", + "glove.twitter.27B.100d.txt glove.twitter.27B.50d.txt\n" + ] + } + ], + "source": [ + "!unzip glove.twitter.27B.zip\n", + "!rm glove.twitter.27B.zip\n", + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2a992cc357ce4e71978f98466a85dc70", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "713947" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer = Tokenizer(num_words=5000)\n", + "tokenizer.fit_on_texts(tqdm(train_texts))\n", + "\n", + "vocab_size = len(tokenizer.word_index) + 1\n", + "vocab_size" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": {}, + 
"outputs": [], + "source": [ + "vocabulary = set()\n", + "\n", + "def load_embeddings(file_name, total=1):\n", + " weights = np.zeros((vocab_size, embedding_dim))\n", + " \n", + " with open(file_name) as f:\n", + " for l in tqdm(f, total=total):\n", + " word, *vector = l.split()\n", + " word = word.lower()\n", + " vocabulary.add(word)\n", + " \n", + " if word in tokenizer.word_index:\n", + " weights[tokenizer.word_index[word]] = np.array(vector, dtype=np.float32)\n", + " return weights" + ] + }, + { + "cell_type": "code", + "execution_count": 231, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a5dd687b4c7d4a8d8c9c5322cfaf85ad", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=400000.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "0.1925829228220022" + ] + }, + "execution_count": 231, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w = load_embeddings('glove.6B.50d.txt', total=400000)\n", + "\n", + "nonzero_elements = np.count_nonzero(np.count_nonzero(w, axis=1))\n", + "nonzero_elements / vocab_size" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"sequential_11\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "embedding_8 (Embedding) (None, 512, 50) 35697350 \n", + "_________________________________________________________________\n", + "flatten_8 (Flatten) (None, 25600) 0 \n", + "_________________________________________________________________\n", + "dense_33 (Dense) 
(None, 64) 1638464 \n", + "_________________________________________________________________\n", + "dense_34 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 37,335,879\n", + "Trainable params: 1,638,529\n", + "Non-trainable params: 35,697,350\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "embedding_dim = 50\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, \n", + " input_length=max_len, weights=[w], trainable=False),\n", + " keras.layers.Flatten(),\n", + " keras.layers.Dense(64, activation='relu'),\n", + " keras.layers.Dense(1, activation='sigmoid')\n", + "])\n", + "\n", + "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformer (spoiler alert!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install tqdm==4.47.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install transformers==3.5.0 simpletransformers==0.49.3" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "from simpletransformers.classification import ClassificationModel\n", + "import logging\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "transformers_logger = logging.getLogger(\"transformers\")\n", + "transformers_logger.setLevel(logging.WARNING)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_df = pd.DataFrame({'text': train_texts[:200000], 'labels': train_labels[:200000]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "eval_data_df = pd.DataFrame({'text': train_texts[200000:250000], 'labels': train_labels[200000:250000]})" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filelock:Lock 6910369040 acquired on /Users/dimazhylko/.cache/torch/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ec33c6e53af7411aadc0fa2dc7964ce5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filelock:Lock 6910369040 released on /Users/dimazhylko/.cache/torch/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']\n", + "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "INFO:filelock:Lock 5249639248 acquired on /Users/dimazhylko/.cache/torch/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a18b1229ff9c4637ae0ea7f94d24e8f5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filelock:Lock 5249639248 released on /Users/dimazhylko/.cache/torch/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filelock:Lock 5249686544 acquired on /Users/dimazhylko/.cache/torch/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock\n" + ] + }, + { + 
"data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "36435f2eb8d84fad9779daab0af6d0e0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:filelock:Lock 5249686544 released on /Users/dimazhylko/.cache/torch/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "model = ClassificationModel('roberta', 'roberta-base', use_cuda=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del train_texts\n", + "del train_labels\n", + "del test_texts\n", + "del test_labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.train_model(train_data_df, eval_df=eval_data_df, \n", + " args={\"num_train_epochs\": 1, 'evaluate_during_training': True,\n", + " 'learning_rate': 5e-5, 'train_batch_size': 32, 'eval_batch_size': 32, 'gradient_accumulation_steps': 1, \n", + " 'use_multipprocessing': False, 'fp16': True, 'lazy_loading': False, 'reprocess_input_data': False\n", + " 'save_steps': 7000}, acc=accuracy_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.args.reprocess_input_data = True\n", + "model.eval_model(test_data_df, acc=accuracy_score)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}