subwordlstm.py
# Import statements
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
# Load the train/dev splits from CSV files in the same directory
train_set = pd.read_csv("trainset.csv")
dev_set = pd.read_csv("devset.csv")
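# Assumed CSV schema (inferred from the column accesses below, not
# documented in the original): Text holds the raw tweet string and
# Sentiment holds a binary 0/1 label.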
'''
print(train_set.shape)
print(train_set.head())
print(dev_set.shape)
print(dev_set.head())
'''
# Split the datasets
X_train = train_set.Text
y_train = train_set.Sentiment
X_dev = dev_set.Text
y_dev = dev_set.Sentiment
# Print out basic information about the dataset
print(X_train.head())
print(y_train.head())
print(X_dev.head())
print(y_dev.head())
# Build a subword tokenizer from the training corpus
vocab_size = 1000
# Note: newer tensorflow_datasets releases expose this class as
# tfds.deprecated.text.SubwordTextEncoder
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(X_train, vocab_size, max_subword_length=5)
print("vocab size: ", tokenizer.vocab_size)
# Encode each tweet into subword IDs. SubwordTextEncoder has no
# texts_to_sequences method (that belongs to the Keras Tokenizer),
# so each example is encoded individually.
train_seqs = [tokenizer.encode(tweet) for tweet in X_train]
dev_seqs = [tokenizer.encode(tweet) for tweet in X_dev]
print(train_seqs[:4])
# Pad/truncate both splits the same way so train and dev inputs match
max_len = 280
train_padded = pad_sequences(train_seqs, maxlen=max_len, truncating="post", padding="post")
dev_padded = pad_sequences(dev_seqs, maxlen=max_len, truncating="post", padding="post")
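# Shape check: both splits should now be (num_examples, max_len)
print(train_padded.shape, dev_padded.shape)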
model = tf.keras.Sequential([
    # Use the tokenizer's actual vocab size here: build_from_corpus
    # treats vocab_size as a target, so encoded IDs can exceed 1000
    tf.keras.layers.Embedding(tokenizer.vocab_size, 16, input_length=max_len),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()
epochs = 10
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_padded, y_train, epochs=epochs, validation_data=(dev_padded, y_dev))
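# Optional one-line evaluation on the dev set after training
dev_loss, dev_acc = model.evaluate(dev_padded, y_dev, verbose=0)
print("dev loss: %.4f, dev accuracy: %.4f" % (dev_loss, dev_acc))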
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(["accuracy", "val_accuracy"])
plt.show()
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(["loss", "val_loss"])
plt.show()
def predict_sentence(model, sentences):
    # Encode and pad the raw sentences the same way as the training data
    # (encode() per example; SubwordTextEncoder has no texts_to_sequences)
    sample_seqs = [tokenizer.encode(s) for s in sentences]
    padded = pad_sequences(sample_seqs, padding='post', maxlen=max_len)
    classes = model.predict(padded)
    for x in range(len(padded)):
        print(sentences[x])
        print(classes[x])
        print("----------")
sentences = ["dap me up bro", "i hate you and ur mum", "this is why she left you", "thanks for the gold kind stranger"]
predict_sentence(model, sentences)
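# Illustrative follow-up: map the sigmoid probabilities to hard labels
# with an assumed 0.5 threshold (the threshold is not in the original)
sample_seqs = [tokenizer.encode(s) for s in sentences]
probs = model.predict(pad_sequences(sample_seqs, maxlen=max_len, padding='post'))
for s, p in zip(sentences, probs.ravel()):
    print(s, "->", "positive" if p >= 0.5 else "negative")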