sentiment_analysis.py
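# Compare several scikit-learn classifiers on a tweet sentiment dataset,
# save each model's confusion matrix and classification report, and persist
# the best-performing pipeline with joblib.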
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk
import joblib
nltk.download('stopwords')
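# Load the dataset and drop English stopwords from each tweet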
df = pd.read_csv('sentiment_analysis.csv')
stop_words = set(stopwords.words('english'))
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
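# Hold out 20% of the tweets for evaluation (fixed seed for reproducibility)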
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
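# Candidate classifiers, all trained on the same bag-of-words features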
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(probability=True),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": MultinomialNB(),
    "Gradient Boosting": GradientBoostingClassifier()
}
results = {}
for model_name, model in models.items():
    # Vectorize with bag-of-words counts and fit the classifier
    pipeline = make_pipeline(CountVectorizer(), model)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    # Record and report held-out accuracy
    accuracy = accuracy_score(y_test, predictions)
    results[model_name] = accuracy
    print(f'{model_name} Accuracy: {accuracy:.4f}')

    # Save a confusion-matrix heatmap for this model
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{model_name} Confusion Matrix')
    plt.savefig(os.path.join(output_dir, f'{model_name}_confusion_matrix.png'))
    plt.close()

    # Save the per-class precision/recall/F1 report
    report = classification_report(y_test, predictions)
    with open(os.path.join(output_dir, f'{model_name}_classification_report.txt'), 'w') as f:
        f.write(f'{model_name} Classification Report:\n{report}\n')
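# Retrain the best-performing pipeline on the full dataset and save it for reuse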
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
best_pipeline = make_pipeline(CountVectorizer(), best_model)
best_pipeline.fit(df['tweet'], df['label'])
joblib.dump(best_pipeline, 'best_model.pkl')
print(f'The best model is {best_model_name} with accuracy {results[best_model_name]:.4f}')
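# Example usage (assumed, not part of the original script): reload the saved
# pipeline and classify new text.
#   loaded = joblib.load('best_model.pkl')
#   print(loaded.predict(['what a great day']))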