'''*****************************************************************************
Purpose: To analyze the sentiment of Reddit comments about stock tickers.
This program uses NLTK's VADER SentimentIntensityAnalyzer to calculate a compound
sentiment value per ticker. You can change several parameters to suit your needs;
see below under "set the program parameters."
Implementation:
Sets are used for 'x in s' membership tests, since their average time complexity
is O(1) compared to O(n) for lists.
Limitations:
Results depend heavily on the parameters defined below. Heavily downvoted
comments are ignored entirely, so the most mentioned ticker can be missed if
its mentions are downvoted; adjust the `upvotes` variable to change that.
Author: github:asad70
-------------------------------------------------------------------
****************************************************************************'''
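# Third-party dependencies used below: praw, pandas, matplotlib, squarify,
# nltk, emoji, spacy (en_core_web_sm). The local module data.py is assumed to
# define `us` (a set of US ticker symbols), `blacklist`, and `new_words`,
# since those names are referenced after `from data import *`.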
import praw
from data import *  # provides us, blacklist, new_words
import time
import pandas as pd
import matplotlib.pyplot as plt
import squarify
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import emoji    # used to strip emojis from comments
import re       # used to strip digits/links
import en_core_web_sm
import string
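# First-time setup, if the NLTK data files are not already installed
# (an assumption about your environment):
#   import nltk
#   nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer
#   nltk.download('wordnet')        # corpus used by WordNetLemmatizer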
def data_extractor(reddit):
    '''Extracts ticker mentions and comments from Reddit.
    Parameter:   reddit: praw Reddit instance
    Return:      posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz
                 posts: int: # of posts analyzed
                 c_analyzed: int: # of comments analyzed
                 tickers: dict: mention count for every ticker found
                 titles: list: titles of the posts analyzed
                 a_comments: dict: comments to analyze, keyed by ticker
                 picks: int: # of top mentioned tickers to report
                 subs: list: subreddits analyzed
                 picks_ayz: int: # of top tickers to run sentiment analysis on
    '''
    '''############################################################################'''
    # set the program parameters
    subs = ['wallstreetbets']     # subreddits to search
    post_flairs = {'Daily Discussion', 'Weekend Discussion', 'Discussion'}    # post flairs to search || a None flair is automatically considered
    goodAuth = {'AutoModerator'}  # authors whose comments are allowed more than once
    uniqueCmt = True              # allow one comment per author per symbol
    ignoreAuthP = {'example'}     # authors to ignore for posts
    ignoreAuthC = {'example'}     # authors to ignore for comments
    upvoteRatio = 0.70            # upvote ratio for a post to be considered, 0.70 = 70%
    ups = 20                      # a post is considered if its upvotes exceed this #
    limit = 1                     # praw replace_more limit (expands 'load more comments' links)
    upvotes = 2                   # a comment is considered if its upvotes exceed this #
    picks = 10                    # # of top mentioned tickers to report
    picks_ayz = 5                 # # of top tickers to run sentiment analysis on
    '''############################################################################'''

    posts, count, c_analyzed, tickers, titles, a_comments = 0, 0, 0, {}, [], {}
    cmt_auth = {}

    for sub in subs:
        subreddit = reddit.subreddit(sub)
        hot_python = subreddit.hot()    # sorting posts by hot
        # Extracting comments and symbols from the subreddit
        for submission in hot_python:
            flair = submission.link_flair_text
            author = submission.author.name if submission.author else ''  # guard against deleted accounts
            # checking: post upvote ratio, # of upvotes, post flair, and author
            if submission.upvote_ratio >= upvoteRatio and submission.ups > ups and (flair in post_flairs or flair is None) and author not in ignoreAuthP:
                submission.comment_sort = 'new'
                comments = submission.comments
                titles.append(submission.title)
                posts += 1
                try:
                    submission.comments.replace_more(limit=limit)
                    for comment in comments:
                        try:
                            auth = comment.author.name
                        except AttributeError:
                            continue    # skip comments from deleted accounts
                        c_analyzed += 1
                        # checking: comment upvotes and author
                        if comment.score > upvotes and auth not in ignoreAuthC:
                            split = comment.body.split(" ")
                            for word in split:
                                word = word.replace("$", "")
                                # ticker candidates: upper-case, length <= 5, not blacklisted, listed in us
                                if word.isupper() and len(word) <= 5 and word not in blacklist and word in us:
                                    # unique comments; KeyError means the ticker is new
                                    if uniqueCmt and auth not in goodAuth:
                                        try:
                                            if auth in cmt_auth[word]: break
                                        except KeyError:
                                            pass
                                    # counting tickers
                                    if word in tickers:
                                        tickers[word] += 1
                                        a_comments[word].append(comment.body)
                                        cmt_auth[word].append(auth)
                                        count += 1
                                    else:
                                        tickers[word] = 1
                                        cmt_auth[word] = [auth]
                                        a_comments[word] = [comment.body]
                                        count += 1
                except Exception as e:
                    print(e)

    return posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz
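# Illustrative shape of the returned collections (the values are made up):
#   tickers    -> {'GME': 42, 'AMC': 17, ...}           # mention counts
#   a_comments -> {'GME': ['comment body', ...], ...}   # raw comment text per ticker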
def print_helper(tickers, picks, c_analyzed, posts, subs, titles, time, start_time):
    '''Prints the most mentioned tickers and the run statistics.
    Parameter:   tickers: dict: mention count for every ticker found
                 picks: int: # of top mentioned tickers to report
                 c_analyzed: int: # of comments analyzed
                 posts: int: # of posts analyzed
                 subs: list: subreddits analyzed
                 titles: list: titles of the posts analyzed
                 time: the time module (used to compute the elapsed time)
                 start_time: float: program start time
    Return:      symbols: dict: tickers sorted by # of mentions
                 times: list: # of times each top ticker is mentioned
                 top: list: list of top tickers
    '''
    # sort the tickers by mention count, descending
    symbols = dict(sorted(tickers.items(), key=lambda item: item[1], reverse=True))
    top_picks = list(symbols.keys())[0:picks]
    elapsed = time.time() - start_time

    # print top picks
    print("It took {t:.2f} seconds to analyze {c} comments in {p} posts in {s} subreddits.\n".format(t=elapsed, c=c_analyzed, p=posts, s=len(subs)))
    print("Posts analyzed saved in titles")
    #for i in titles: print(i)  # prints the titles of the posts analyzed

    print(f"\n{picks} most mentioned tickers: ")
    times = []
    top = []
    for i in top_picks:
        print(f"{i}: {symbols[i]}")
        times.append(symbols[i])
        top.append(f"{i}: {symbols[i]}")
    return symbols, times, top
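# Illustrative: sorting a dict by value, descending, as done above:
#   dict(sorted({'GME': 3, 'BB': 9}.items(), key=lambda item: item[1], reverse=True))
#   -> {'BB': 9, 'GME': 3}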
def sentiment_analysis(picks_ayz, a_comments, symbols):
    '''Runs sentiment analysis on the top tickers.
    Parameter:   picks_ayz: int: # of top tickers to run sentiment analysis on
                 a_comments: dict: comments to analyze, keyed by ticker
                 symbols: dict: tickers sorted by # of mentions
    Return:      scores: dict: sentiment scores per ticker
    '''
    scores = {}

    vader = SentimentIntensityAnalyzer()
    vader.lexicon.update(new_words)    # adding custom words from data.py
    # build the NLP helpers once, outside the loops, instead of per comment
    nlp = en_core_web_sm.load()
    stop_words = nlp.Defaults.stop_words
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+')
    lemmatizer = WordNetLemmatizer()

    picks_sentiment = list(symbols.keys())[0:picks_ayz]
    for symbol in picks_sentiment:
        stock_comments = a_comments[symbol]
        for cmnt in stock_comments:
            # remove emojis (assumes emoji>=2.0; older versions used emoji.get_emoji_regexp().sub)
            emojiless = emoji.replace_emoji(cmnt, replace='')
            # remove punctuation and digits
            text_punc = "".join([char for char in emojiless if char not in string.punctuation])
            text_punc = re.sub('[0-9]+', '', text_punc)
            # tokenizing and cleaning
            tokenized_string = tokenizer.tokenize(text_punc)
            lower_tokenized = [word.lower() for word in tokenized_string]  # convert to lower case
            # remove stop words
            sw_removed = [word for word in lower_tokenized if word not in stop_words]
            # normalize the words using lemmatization
            lemmatized_tokens = [lemmatizer.lemmatize(w) for w in sw_removed]

            # calculating the sentiment of every word in the comment and combining them
            score_cmnt = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
            word_count = 0
            for word in lemmatized_tokens:
                if word.upper() not in us:
                    score = vader.polarity_scores(word)
                    word_count += 1
                    for key, _ in score.items():
                        score_cmnt[key] += score[key]
                else:
                    score_cmnt['pos'] = 2.0    # heuristic: a ticker mention itself counts as strongly positive

            # calculating avg.
            try:    # handles ZeroDivisionError when no scorable words remain
                for key in score_cmnt:
                    score_cmnt[key] = score_cmnt[key] / word_count
            except ZeroDivisionError:
                pass

            # adding the comment's score to the specific symbol
            if symbol in scores:
                for key, _ in score_cmnt.items():
                    scores[symbol][key] += score_cmnt[key]
            else:
                scores[symbol] = score_cmnt

        # averaging over the symbol's mention count
        for key in score_cmnt:
            scores[symbol][key] = scores[symbol][key] / symbols[symbol]
            scores[symbol][key] = "{pol:.3f}".format(pol=scores[symbol][key])
    return scores
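# For reference, VADER maps a string to four fields; e.g. (approximate values):
#   SentimentIntensityAnalyzer().polarity_scores('great')
#   -> {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.6249}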
def visualization(picks_ayz, scores, picks, times, top):
    '''Prints the sentiment analysis results,
    makes a chart of the most mentioned picks,
    and makes a chart of the sentiment analysis of the top picks.
    Parameter:   picks_ayz: int: # of top tickers analyzed for sentiment
                 scores: dict: sentiment scores per ticker
                 picks: int: # of most mentioned picks
                 times: list: # of times each top ticker is mentioned
                 top: list: list of top tickers
    Return:      None
    '''
    # printing sentiment analysis
    print(f"\nSentiment analysis of top {picks_ayz} picks:")
    df = pd.DataFrame(scores)
    df.index = ['Bearish', 'Neutral', 'Bullish', 'Total/Compound']
    df = df.T
    print(df)

    # Data Visualization
    # most mentioned picks
    squarify.plot(sizes=times, label=top, alpha=.7)
    plt.axis('off')
    plt.title(f"{picks} most mentioned picks")
    #plt.show()    # uncomment to display the chart

    # Sentiment analysis
    df = df.astype(float)
    colors = ['red', 'springgreen', 'forestgreen', 'coral']
    df.plot(kind='bar', color=colors, title=f"Sentiment analysis of top {picks_ayz} picks:")
    #plt.show()    # uncomment to display the chart
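# To save the charts instead of displaying them interactively, matplotlib's
# plt.savefig() can be called in place of plt.show(), e.g. plt.savefig('picks.png').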
def main():
    '''main function
    Parameter:   None
    Return:      None
    '''
    start_time = time.time()
    # Reddit client; fill in the credentials of a Reddit "script" app
    # (created at https://www.reddit.com/prefs/apps)
    reddit = praw.Reddit(user_agent="Comment Extraction",
                         client_id="",
                         client_secret="",
                         username="",
                         password="")

    posts, c_analyzed, tickers, titles, a_comments, picks, subs, picks_ayz = data_extractor(reddit)
    symbols, times, top = print_helper(tickers, picks, c_analyzed, posts, subs, titles, time, start_time)
    scores = sentiment_analysis(picks_ayz, a_comments, symbols)
    visualization(picks_ayz, scores, picks, times, top)


if __name__ == '__main__':
    main()