-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
36,999 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"GRUUsingTestData.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOJcCyO8ePKhtjiSZ1UphTl"},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"code","metadata":{"id":"30t45_ETJUvN","colab_type":"code","colab":{}},"source":["import pandas as pd\n","import tensorflow as tf\n","import os\n","import re\n","import numpy as np\n","from string import punctuation\n","from zipfile import ZipFile\n","from sklearn.model_selection import train_test_split\n","import matplotlib.pyplot as plt"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"mEWkgvChQGdy","colab_type":"code","outputId":"110f63f9-ec28-4dc4-b0c0-ca8a319a00a1","executionInfo":{"status":"ok","timestamp":1591163498202,"user_tz":-540,"elapsed":28564,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":131}},"source":["from google.colab import drive\n","drive.mount('/content/gdrive')"],"execution_count":2,"outputs":[{"output_type":"stream","text":["Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n","\n","Enter your authorization code:\n","··········\n","Mounted at /content/gdrive\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"QF-E6FUTQOcA","colab_type":"code","colab":{}},"source":["PATH='/content/gdrive/My Drive/ML Project/test.csv'"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ML0Md42iQcej","colab_type":"code","outputId":"0d89e270-e812-4812-f4f1-f2abdc2e3982","executionInfo":{"status":"ok","timestamp":1591163521953,"user_tz":-540,"elapsed":3940,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":55}},"source":["data = pd.read_csv(PATH)\n","print('총 샘플의 수 :',len(data))\n","train_data=data.copy()\n","train_data['title'].nunique(), data['text'].nunique() #기사제목, 기사내용 중복 내용 확인\n","# 윈도우 바탕화면에서 작업한 저자의 경우에는\n","# data = pd.read_csv(r'C:\\Users\\USER\\Desktop\\spam.csv',encoding='latin1')\n","train_data.drop_duplicates(subset=['text'], inplace=True) # v2 열에서 중복인 내용이 있다면 중복 제거\n","print('중복 제거 후 샘플의 수 :',len(train_data))"],"execution_count":4,"outputs":[{"output_type":"stream","text":["총 샘플의 수 : 5200\n","중복 제거 후 샘플의 수 : 5127\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Sqsv-JRcRYDl","colab_type":"code","colab":{}},"source":["# importing neural network libraries\n","import tensorflow as tf\n","from tensorflow.keras.preprocessing.text import Tokenizer\n","from tensorflow.keras.preprocessing.sequence import pad_sequences\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D\n","from tensorflow.keras.models import load_model"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"RS-DGWwTQigJ","colab_type":"code","colab":{}},"source":["train_data = train_data.set_index('id', drop = True)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"1EBUOEixQ0_-","colab_type":"code","outputId":"879a62f2-bd36-435d-e5ab-9ee68e642059","executionInfo":{"status":"ok","timestamp":1591163533526,"user_tz":-540,"elapsed":888,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":251}},"source":["print(train_data.shape)\n","train_data.head()"],"execution_count":7,"outputs":[{"output_type":"stream","text":["(5127, 3)\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>title</th>\n"," <th>author</th>\n"," <th>text</th>\n"," </tr>\n"," <tr>\n"," <th>id</th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>20800</th>\n"," <td>Specter of Trump Loosens Tongues, if Not Purse...</td>\n"," <td>David Streitfeld</td>\n"," <td>PALO ALTO, Calif. — After years of scorning...</td>\n"," </tr>\n"," <tr>\n"," <th>20801</th>\n"," <td>Russian warships ready to strike terrorists ne...</td>\n"," <td>NaN</td>\n"," <td>Russian warships ready to strike terrorists ne...</td>\n"," </tr>\n"," <tr>\n"," <th>20802</th>\n"," <td>#NoDAPL: Native American Leaders Vow to Stay A...</td>\n"," <td>Common Dreams</td>\n"," <td>Videos #NoDAPL: Native American Leaders Vow to...</td>\n"," </tr>\n"," <tr>\n"," <th>20803</th>\n"," <td>Tim Tebow Will Attempt Another Comeback, This ...</td>\n"," <td>Daniel Victor</td>\n"," <td>If at first you don’t succeed, try a different...</td>\n"," </tr>\n"," <tr>\n"," <th>20804</th>\n"," <td>Keiser Report: Meme Wars (E995)</td>\n"," <td>Truth Broadcast Network</td>\n"," <td>42 mins ago 1 Views 0 Comments 0 Likes 'For th...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" title ... text\n","id ... \n","20800 Specter of Trump Loosens Tongues, if Not Purse... ... PALO ALTO, Calif. — After years of scorning...\n","20801 Russian warships ready to strike terrorists ne... ... Russian warships ready to strike terrorists ne...\n","20802 #NoDAPL: Native American Leaders Vow to Stay A... ... Videos #NoDAPL: Native American Leaders Vow to...\n","20803 Tim Tebow Will Attempt Another Comeback, This ... ... If at first you don’t succeed, try a different...\n","20804 Keiser Report: Meme Wars (E995) ... 42 mins ago 1 Views 0 Comments 0 Likes 'For th...\n","\n","[5 rows x 3 columns]"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"moL7ODX0Q2wg","colab_type":"code","outputId":"f355be6a-c415-420c-87b4-8ea67d772884","executionInfo":{"status":"ok","timestamp":1591163538327,"user_tz":-540,"elapsed":1128,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":92}},"source":["train_data.isnull().sum()"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["title 120\n","author 490\n","text 1\n","dtype: int64"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"X6Oi4VnqQ5_p","colab_type":"code","outputId":"92529f3f-7fcf-4919-ca42-252e92fa2f5a","executionInfo":{"status":"ok","timestamp":1591163557169,"user_tz":-540,"elapsed":1392,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":92}},"source":["train_data[['title', 'author']] = train_data[['title', 'author']].fillna(value = 'Missing')\n","train_data = train_data.dropna()\n","train_data.isnull().sum()"],"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["title 0\n","author 0\n","text 0\n","dtype: int64"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"code","metadata":{"id":"KawGg6tBQ8Te","colab_type":"code","outputId":"dbcedea6-ce4a-4446-e71c-edd5ac96f25e","executionInfo":{"status":"ok","timestamp":1591163613037,"user_tz":-540,"elapsed":1052,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":233}},"source":["length = []\n","[length.append(len(str(text))) for text in train_data['text']]\n","train_data['length'] = length\n","train_data.head()"],"execution_count":10,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>title</th>\n"," <th>author</th>\n"," <th>text</th>\n"," <th>length</th>\n"," </tr>\n"," <tr>\n"," <th>id</th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," <th></th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>20800</th>\n"," <td>Specter of Trump Loosens Tongues, if Not Purse...</td>\n"," <td>David Streitfeld</td>\n"," <td>PALO ALTO, Calif. — After years of scorning...</td>\n"," <td>8015</td>\n"," </tr>\n"," <tr>\n"," <th>20801</th>\n"," <td>Russian warships ready to strike terrorists ne...</td>\n"," <td>Missing</td>\n"," <td>Russian warships ready to strike terrorists ne...</td>\n"," <td>1559</td>\n"," </tr>\n"," <tr>\n"," <th>20802</th>\n"," <td>#NoDAPL: Native American Leaders Vow to Stay A...</td>\n"," <td>Common Dreams</td>\n"," <td>Videos #NoDAPL: Native American Leaders Vow to...</td>\n"," <td>4547</td>\n"," </tr>\n"," <tr>\n"," <th>20803</th>\n"," <td>Tim Tebow Will Attempt Another Comeback, This ...</td>\n"," <td>Daniel Victor</td>\n"," <td>If at first you don’t succeed, try a different...</td>\n"," <td>3578</td>\n"," </tr>\n"," <tr>\n"," <th>20804</th>\n"," <td>Keiser Report: Meme Wars (E995)</td>\n"," <td>Truth Broadcast Network</td>\n"," <td>42 mins ago 1 Views 0 Comments 0 Likes 'For th...</td>\n"," <td>542</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" title ... length\n","id ... \n","20800 Specter of Trump Loosens Tongues, if Not Purse... ... 8015\n","20801 Russian warships ready to strike terrorists ne... ... 1559\n","20802 #NoDAPL: Native American Leaders Vow to Stay A... ... 4547\n","20803 Tim Tebow Will Attempt Another Comeback, This ... ... 3578\n","20804 Keiser Report: Meme Wars (E995) ... 542\n","\n","[5 rows x 4 columns]"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"code","metadata":{"id":"hRVB4wjNQ-n7","colab_type":"code","outputId":"9218e18c-92d4-4f04-ba8f-b3071519eba1","executionInfo":{"status":"ok","timestamp":1591163638877,"user_tz":-540,"elapsed":1148,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":36}},"source":["train_data = train_data.drop(train_data['text'][train_data['length'] < 50].index, axis = 0)\n","print('기사의 최대길이 : %d' % max(j for j in train_data['length']))"],"execution_count":11,"outputs":[{"output_type":"stream","text":["기사의 최대길이 : 107014\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"s1DfblyvRBRk","colab_type":"code","colab":{}},"source":["max_features = 4500"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Vjl3wJt-RDpB","colab_type":"code","colab":{}},"source":["tokenizer = Tokenizer(num_words = max_features, filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n', lower = True, split = ' ')\n","tokenizer.fit_on_texts(texts = train_data['text'])\n","X = tokenizer.texts_to_sequences(texts = train_data['text'])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"7kYDI_Y0RF3H","colab_type":"code","outputId":"e777694c-f5a8-4562-e0d7-a9131e1a9f30","executionInfo":{"status":"ok","timestamp":1591163655218,"user_tz":-540,"elapsed":1237,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":56}},"source":["print(X[2000])"],"execution_count":14,"outputs":[{"output_type":"stream","text":["[2493, 3970, 3, 5, 668, 3, 908, 2406, 173, 177, 68, 300, 77, 4, 25, 20, 464, 6, 49, 3, 1, 24, 4414, 77, 112, 1239, 3, 8, 827, 2, 3330, 104, 764, 7, 16, 35, 48, 6, 571, 5, 87, 415, 714, 31, 368, 256, 56, 2313, 2, 125, 40, 14, 38, 65, 136, 67, 11, 16, 264, 958, 1100, 300, 77, 2354, 1318, 6, 3, 884, 300, 77, 2628, 24, 2287, 2235, 33, 4326, 1221, 3656, 37, 19, 733, 1, 3569, 12, 530, 507, 10, 390, 19, 1, 76, 3, 1, 3569, 1, 144, 35, 43, 1457, 83, 611, 40, 458, 4115, 13, 4, 46, 12, 1892, 7, 1, 3, 611, 82, 568, 2, 72, 90, 46, 48, 415, 15, 29, 1668, 9, 1, 2167, 5, 3599, 12, 113, 1442, 18, 29, 3060, 511, 33, 965, 7, 910, 13, 77, 1219, 48, 24, 571, 1, 415, 53, 1, 511, 1021, 46, 12, 47, 6, 1, 3970, 77, 8, 827, 10, 90, 341, 2, 392, 13, 1, 511, 4, 611, 7, 117, 1, 3, 1, 3970, 2813, 13, 1, 87, 415, 44, 388, 2, 1, 103, 144, 6, 4182, 28, 276, 74, 49, 145, 63, 77, 35, 748, 10, 1518, 1012, 13, 25, 860, 120, 34, 1, 546, 258, 233, 887, 34, 104, 910, 13, 306, 6, 25, 9, 5, 976, 3561, 1787, 1442, 105, 81, 18, 1762, 77, 45, 1198, 3161, 2, 396, 5, 976, 13, 6, 4473, 1100, 4220, 13, 144, 2964, 37, 121, 1019, 58, 220, 991, 2, 310, 13, 213, 10, 367, 452, 1019, 1065, 1127, 2, 2477, 9, 2077, 59, 8, 67, 144, 2964, 5, 9, 118, 6, 1, 722, 3, 5, 37, 836, 2493, 2406, 13, 77, 24, 4435, 2, 4414, 114, 16, 12, 1239, 2155, 2, 25, 121, 9, 68, 30, 14, 999, 353, 80, 7, 46, 648, 73, 1, 461, 1360, 2, 5, 390, 1291, 1, 3894, 3, 1, 4324, 580, 19, 5, 76, 53, 34, 2283, 274, 976, 3561, 1858, 105, 199, 887, 159, 37, 34, 77, 27, 35, 5, 298, 3, 51, 3837, 25, 1451, 3, 202, 55, 79, 38, 129]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"rxz56oudRn6V","colab_type":"code","outputId":"d19322b8-69ae-49f4-c852-c5f126a88b7d","executionInfo":{"status":"ok","timestamp":1591163657986,"user_tz":-540,"elapsed":1474,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":36}},"source":["X = pad_sequences(sequences = X, maxlen = max_features, padding = 'pre')\n","print(X.shape)"],"execution_count":15,"outputs":[{"output_type":"stream","text":["(5093, 4500)\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"x2IHzM4tRqUy","colab_type":"code","outputId":"c8e33dae-449d-4c10-af18-1eee4f4c2005","executionInfo":{"status":"ok","timestamp":1591166777715,"user_tz":-540,"elapsed":1533,"user":{"displayName":"김민철","photoUrl":"","userId":"01557895546280688331"}},"colab":{"base_uri":"https://localhost:8080/","height":111}},"source":["import numpy as np\n","loaded_model = load_model('/content/gdrive/My Drive/ML Project/GRUbest_model.h5')\n","##xhat = X[1]\n","##yhat = loaded_model.predict(xhat)\n","##print('Predict : ' + str(yhat))\n","\n","xhat_idx = np.random.choice(X.shape[0], 5)\n","xhat = X[xhat_idx]\n","yhat = loaded_model.predict_classes(xhat)\n","\n","for i in range(5):\n"," if(yhat[i]==0):\n"," print('Predict : True '+ str(yhat[i]))\n"," else:\n"," print('Predict : False '+ str(yhat[i]))\n"," ##print('Predict : ' + str(yhat[i]))"],"execution_count":28,"outputs":[{"output_type":"stream","text":["Predict : False [1]\n","Predict : False [1]\n","Predict : False [1]\n","Predict : True [0]\n","Predict : False [1]\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"bRvwXbpARswa","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.