Skip to content

Latest commit



120 lines (88 loc) · 3.87 KB

File metadata and controls

120 lines (88 loc) · 3.87 KB

Twitter Sentiment Analysis

This project uses machine learning to categorize tweets as positive, negative, or neutral, offering insights into public opinion on various topics. Three classifiers (Logistic Regression, Decision Tree, and XGBoost) are trained on Bag-of-Words and TF-IDF representations of the tweet text.


  • Sentiment Analysis: Classifies tweets into positive, negative, or neutral categories.
  • Hashtag Analysis: Extracts and analyzes the impact of hashtags on tweet sentiments.
  • Word Frequency Analysis: Identifies and visualizes the most frequent words in positive and negative tweets.
  • Text Feature Extraction: Utilizes Bag-of-Words and TF-IDF for feature extraction from tweets.
  • Machine Learning Models: Implements Logistic Regression, Decision Tree, and XGBoost for classification tasks.

Word Cloud Visualization

# Load the mask image into an array; the word cloud below takes both its shape
# and its colors from this image.
# NOTE(review): this line was truncated in the snippet (only the
# `'', stream=True).raw))` tail survived). `Image.open(requests.get(...))`
# is reconstructed from that tail -- confirm it, and supply the real
# mask-image URL in place of the empty string.
Mask = np.array(Image.open(requests.get('', stream=True).raw))
image_colors = ImageColorGenerator(Mask)

# Word cloud of the negative-tweet text, drawn with the mask image's colors.
wc = WordCloud(background_color='black', height=1500, width=4000, mask=Mask).generate(all_words_negative)
plt.figure(figsize=(10, 20))
plt.imshow(wc.recolor(color_func=image_colors), interpolation="gaussian")

Hashtag Analysis

Extracting Hashtags

def Hashtags_Extract(x):
    """Collect the hashtags found in each text of *x*.

    Args:
        x: Iterable of strings, one per tweet.

    Returns:
        A list with one entry per input text. Each entry is the list of
        hashtag words (without the leading ``#``) found in that text, in
        order of appearance; a text without hashtags yields an empty list.
    """
    # Keep the matches for every text: the previous loop computed them but
    # never stored them, so the function always returned an empty list.
    return [re.findall(r'#(\w+)', text) for text in x]

# Hashtags per tweet, split by class: rows with label 0 feed the "positive"
# lists and rows with label 1 feed the "negative" lists.
ht_positive = Hashtags_Extract(combine['Tidy_Tweets'][combine['label'] == 0])
# Flatten the per-tweet lists in a single pass. sum(lists, []) re-copies the
# growing accumulator on every addition, which is quadratic in the total
# number of hashtags.
ht_positive_unnest = [tag for tags in ht_positive for tag in tags]
ht_negative = Hashtags_Extract(combine['Tidy_Tweets'][combine['label'] == 1])
ht_negative_unnest = [tag for tags in ht_negative for tag in tags]

Word Frequency of Positive Hashtags

# Count how often each positive-class hashtag occurs, tabulate the counts,
# and plot the 20 most frequent hashtags as horizontal bars.
word_freq_positive = nltk.FreqDist(ht_positive_unnest)
df_positive = pd.DataFrame(
    {
        'Hashtags': list(word_freq_positive.keys()),
        'Count': list(word_freq_positive.values()),
    }
)
sns.barplot(
    x='Count',
    y='Hashtags',
    data=df_positive.nlargest(20, columns='Count'),
)

Word Frequency of Negative Hashtags

# Count how often each negative-class hashtag occurs, tabulate the counts,
# and plot the 20 most frequent hashtags as horizontal bars.
word_freq_negative = nltk.FreqDist(ht_negative_unnest)
df_negative = pd.DataFrame(
    {
        'Hashtags': list(word_freq_negative.keys()),
        'Count': list(word_freq_negative.values()),
    }
)
sns.barplot(
    x='Count',
    y='Hashtags',
    data=df_negative.nlargest(20, columns='Count'),
)

Feature Extraction

Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words term counts over the cleaned tweets. The vocabulary is capped
# at 1000 terms, drops English stop words, and uses document-frequency
# thresholds of min_df=2 and max_df=0.90.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vectorizer.fit_transform(combine['Tidy_Tweets'])
# Densify the sparse count matrix so it can be wrapped in a DataFrame.
df_bow = pd.DataFrame(bow.todense())


from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the cleaned tweets, using the same vocabulary settings
# as the Bag-of-Words vectorizer (1000-term cap, English stop words removed,
# min_df=2, max_df=0.90).
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf_matrix = tfidf.fit_transform(combine['Tidy_Tweets'])
# Densify the sparse TF-IDF matrix so it can be wrapped in a DataFrame.
df_tfidf = pd.DataFrame(tfidf_matrix.todense())

Model Implementation

Logistic Regression

from sklearn.linear_model import LogisticRegression

Log_Reg = LogisticRegression(random_state=0, solver='lbfgs')

# Using Bag-of-Words Features
# The fit call was truncated in this snippet (only ", y_train_bow)" survived);
# without it predict_proba is called on an unfitted model and raises.
# NOTE(review): `x_train_bow` is inferred from the sibling names
# `y_train_bow` / `x_valid_bow` -- confirm against the train/validation split.
Log_Reg.fit(x_train_bow, y_train_bow)
prediction_bow = Log_Reg.predict_proba(x_valid_bow)

# Calculating F1 Score
from sklearn.metrics import f1_score
# Threshold the second probability column at 0.3 (instead of the usual 0.5)
# and convert the boolean mask to 0/1 integer labels for f1_score.
prediction_int = prediction_bow[:, 1] >= 0.3
prediction_int = prediction_int.astype(np.int64)
log_bow = f1_score(y_valid_bow, prediction_int)

# Using TF-IDF Features
# The fit call was truncated in this snippet (only ", y_train_tfidf)"
# survived); without refitting, the model is never trained on the TF-IDF
# features it is asked to score.
# NOTE(review): `x_train_tfidf` is inferred from the sibling names
# `y_train_tfidf` / `x_valid_tfidf` -- confirm against the train/validation split.
Log_Reg.fit(x_train_tfidf, y_train_tfidf)
prediction_tfidf = Log_Reg.predict_proba(x_valid_tfidf)

# Calculating F1 Score
# Threshold the second probability column at 0.3 and convert the boolean
# mask to 0/1 integer labels for f1_score.
prediction_int = prediction_tfidf[:, 1] >= 0.3
prediction_int = prediction_int.astype(np.int64)
log_tfidf = f1_score(y_valid_tfidf, prediction_int)


from xgboost import XGBClassifier

# XGBoost classifier on the Bag-of-Words features.
# The original line fused the constructor with the tail of a truncated fit
# call ("..., y_train_bow)"), which is a syntax error; the two statements are
# separated again here.
# NOTE(review): `x_train_bow` is inferred from `y_train_bow` -- confirm
# against the train/validation split.
model_bow = XGBClassifier(random_state=22, learning_rate=0.9)
model_bow.fit(x_train_bow, y_train_bow)