# BET EMOTIONS [Fear, Anger, Joy, Sadness, Disgust, Surprise]
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from keras.utils import to_categorical  # np_utils was removed from recent Keras releases
from nltk.corpus import stopwords  # requires a one-time nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Save the model and weights
def saveModel(modelName, model):
    save_dir = os.path.join(os.getcwd(), 'my_models')
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    model_path = os.path.join(save_dir, modelName + ".sav")
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print('Saved trained model at %s' % model_path)
# Load a pickled model from disk
def loadModel(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)
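# Example round trip for the two helpers above (a minimal sketch: the model
# name "demo_model" and the reloaded object are illustrative, and nothing
# later in this script depends on them):
#   saveModel("demo_model", classifier)
#   restored = loadModel(os.path.join('my_models', 'demo_model.sav'))
#   restored.predict(X_test)  # behaves identically to the original classifier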
# Change the plot styling
plt.style.use('ggplot')
# Plot the accuracy vs val accuracy and loss vs val loss
def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()
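# Note: plot_history expects the History object returned by a Keras model's
# fit() call, so it is unused by the scikit-learn models in this script. A
# sketch of the intended call, assuming a hypothetical compiled keras_model:
#   history = keras_model.fit(X_train, y_train, epochs=10, validation_split=0.1)
#   plot_history(history)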
# Read in the ISEAR emotional text dataset
# Emotion range: [joy, fear, anger, sadness, disgust, shame, guilt]
data = pd.read_csv("..\\Datasets\\ISEARs.csv", sep='|', usecols=["SIT", "Field1"])
data = data.rename(columns={"SIT": "text", "Field1": "label"})
# Remove \n chars
data["text"] = data["text"].str.replace("\n", "")
# Drop the two emotions outside the target range; copy() avoids pandas
# SettingWithCopyWarning on the in-place column edits below
testableData = data[(data["label"] != "shame") & (data["label"] != "guilt")].copy()
testableData['text'] = testableData['text'].str.lower()
print("Class Distribution:")
print(testableData["label"].value_counts())
print("Total rows (before dropping shame/guilt):", len(data))
# Convert the labels to numbers: handled below with LabelEncoder instead of a
# manual replace() per emotion
sentences = testableData["text"].values
y = testableData['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
# Keep the original string labels for the scikit-learn models; the one-hot
# versions are only needed if a Keras model is trained on this split
lb = LabelEncoder()
y_train_og = y_train
y_test_og = y_test
y_train = to_categorical(lb.fit_transform(y_train))
y_test = to_categorical(lb.transform(y_test))  # transform, not fit_transform, so test reuses the train encoding
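# The fitted encoder can also decode model output back to emotion names, e.g.
# (a sketch; `probs` stands for any array of per-class scores, one row per sample):
#   pred_indices = probs.argmax(axis=1)
#   print(lb.inverse_transform(pred_indices))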
# Vectorizer Definition: bag-of-words counts over the training sentences
vectorizer = CountVectorizer(min_df=0, lowercase=True)
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
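# Quick sanity check on the encoding: X_train is a sparse document-term
# matrix, one row per sentence and one column per vocabulary word
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Train matrix shape:", X_train.shape)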
'''Our MODEL'''
# Logistic Regression - Baseline 65%
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train_og)
preds = classifier.predict(X_test)
print("Number of test predictions:", len(preds))
print("LogReg Accuracy:", accuracy_score(y_test_og, preds))
print("Confusion Matrix:")
print(multilabel_confusion_matrix(y_test_og, preds))
print("Report:")
print(classification_report(y_test_og, preds))
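# Illustrative single-sentence prediction with the trained model (the example
# sentence is arbitrary, not taken from the dataset)
sample = ["i was so happy when my best friend came to visit"]
print("Sample prediction:", classifier.predict(vectorizer.transform(sample)))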
'''Their BASELINE MODEL'''
# Read in the RAVDESS utterance transcripts
data = pd.read_csv("..\\Datasets\\RAVDESS\\utterancesFull.csv")
data["text"] = data["text"].str.replace("\n", "")
# No emotions are dropped for this dataset; copy() again guards the in-place
# column edits below
testableData = data.copy()
# Print class distribution
print("BASELINE Class Distribution:")
print(testableData["label"].value_counts())
# Remove stop words from the sentences
stop_words = set(stopwords.words('english'))
testableData['text'] = testableData['text'].str.lower().str.split()
testableData['text'] = testableData['text'].apply(lambda x: [item for item in x if item not in stop_words])
testableData['text'] = testableData['text'].str.join(" ")
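# Illustrative effect of the filtering above on an arbitrary sentence
# (not a dataset row): "i was at the park when it started" -> "park started"
demo = pd.Series(["i was at the park when it started"])
print("Stop-word demo:", demo.str.lower().str.split()
      .apply(lambda x: [w for w in x if w not in stop_words]).str.join(" ").iloc[0])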
sentences = testableData["text"].values
y = testableData['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
# The string labels are used directly here; no label encoding or one-hot
# vectors are needed for the scikit-learn pipeline
y_train_og = y_train
y_test_og = y_test
# Bag-of-words counts -> TF-IDF weighting -> logistic regression, as one pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(max_iter=1000))])
classifier = text_clf
classifier.fit(sentences_train, y_train_og)
preds = classifier.predict(sentences_test)
print("LogReg Accuracy:", accuracy_score(y_test_og, preds))
print("Confusion Matrix:")
print(multilabel_confusion_matrix(y_test_og, preds))
print("Report:")
print(classification_report(y_test_og, preds))
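# Because the Pipeline bundles vectorization with the classifier, it can be
# pickled with the helper above and applied directly to raw strings (the
# model name "baseline_pipeline" and the sample sentence are illustrative)
saveModel("baseline_pipeline", classifier)
print("Pipeline sample prediction:", classifier.predict(["that sudden noise made me jump out of my chair"]))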