#!/usr/bin/env python
# coding: utf-8
# Honours Project 2019
# Krithika Saravanan, 100970975
# Run with Python 3.6
# Uncomment all method call lines
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from decimal import Decimal
import pprint
from botometer import Botometer
import tweepy
import json
import re
import preprocessor as p
import itertools
import pickle
from string import punctuation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
pp = pprint.PrettyPrinter(indent=4)
C_KEY = 'rdFeZ2ePyDf9qFhNGzkKGEC22'
C_SECRET = 'csvkpCZziAgbngIPjH9Gi8P2R3H1fNyQr3FpvsE3fh0jmXKI74'
A_TOKEN = '878335163121287169-sHzBMdW9I1HVlcWFWdvaj7wIMsKheZm'
A_TOKEN_SECRET = 'fgy1huoiJgA8fDSfpdz5rnNPvPbGtXvtiTL3RGPv5PtHm'
mashape_key = "e7b7a5ab38msh5ddc881afcc3bdcp101997jsn03924e682235"
twitter_app_auth = {
    'consumer_key': C_KEY,
    'consumer_secret': C_SECRET,
    'access_token': A_TOKEN,
    'access_token_secret': A_TOKEN_SECRET,
}
auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN, A_TOKEN_SECRET)
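# wait_on_rate_limit (and Botometer's wait_on_ratelimit below) make the
# clients sleep through Twitter rate-limit windows instead of raising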
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)
botometer = Botometer(wait_on_ratelimit=True, mashape_key=mashape_key, **twitter_app_auth)
# 'user' is exactly the user object from the request.
# 'scores' contains the overall classification results: the 'english' score
# uses all six categories of features, while the 'universal' score omits
# sentiment and content features, both of which are English-specific.
# 'categories' gives subscores for each of the six feature classes:
# content and sentiment are English-specific; friend, network, temporal,
# and user are language-independent.
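# For reference, a single result from check_accounts_in is shaped roughly
# like this (illustrative values, not real output):
#   {'scores': {'english': 0.42, 'universal': 0.38},
#    'categories': {'content': 0.3, 'friend': 0.5, 'network': 0.4,
#                   'sentiment': 0.3, 'temporal': 0.6, 'user': 0.2},
#    'user': {...}}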
def get_bom_scores():
    accounts = []
    scores = {}
    # Read account handles from file into a list
    with open('accounts.txt', 'r') as fileobj:
        for row in fileobj:
            accounts.append(row.rstrip('\n'))
    # Get Botometer scores for the list of accounts
    for handle, result in botometer.check_accounts_in(accounts):
        try:
            tweet = api.user_timeline(screen_name=handle, count=1, include_rts=True)[0]
            # If the latest tweet is in English (or undetermined) and not
            # flagged sensitive, record the English score
            if (tweet.lang == 'en' or tweet.lang == 'und') and not getattr(tweet, 'possibly_sensitive', False):
                english_score = result['scores']['english']
                scores[handle] = english_score
                print(handle + ' ' + str(english_score))
            else:
                print('account ' + handle + ' skipped')
        # No tweets to analyze (or the lookup failed); skip to the next handle
        except (IndexError, tweepy.TweepError):
            print('Error: account ' + handle + ' skipped')
            continue
    with open('scores.txt', 'w') as file:
        file.write(json.dumps(scores))
    return scores
scores = get_bom_scores()
def get_bot_handles():
    # Read scores back from file (written as JSON by get_bom_scores)
    bots = []
    with open('scores.txt', 'r') as file_obj:
        scores = json.load(file_obj)
    for handle, score in scores.items():
        # Accounts at or above the 0.43 English-score threshold are treated as bots
        if Decimal(str(score)) >= Decimal('0.43'):
            bots.append(handle)
    # Write one bot handle per line to the output file
    with open('bots.txt', 'w') as outfile:
        for b in bots:
            outfile.write(b + '\n')
    return bots
bots = get_bot_handles()
def get_bot_all_scores():
    accounts = []
    scores = {}
    # Read bot handles from file into a list
    with open('bots.txt', 'r') as fileobj:
        for row in fileobj:
            accounts.append(row.rstrip('\n'))
    # Get full Botometer results for the list of bot accounts
    for handle, result in botometer.check_accounts_in(accounts):
        try:
            tweet = api.user_timeline(screen_name=handle, count=1, include_rts=True)[0]
            # If the latest tweet is in English (or undetermined) and not
            # flagged sensitive, record the full result
            if (tweet.lang == 'en' or tweet.lang == 'und') and not getattr(tweet, 'possibly_sensitive', False):
                scores[handle] = result
                print(handle + ' ' + str(result))
            else:
                print('account ' + handle + ' skipped')
        # No tweets to analyze (or the lookup failed); skip to the next handle
        except (IndexError, tweepy.TweepError):
            print('Error: account ' + handle + ' skipped')
            continue
    # Note: this overwrites scores.txt with the full result objects
    with open('scores.txt', 'w') as file:
        file.write(json.dumps(scores))
    return scores
scores = get_bot_all_scores()
# Fetches and cleans up to 100 recent tweets for each bot handle
def get_tweets():
    # Remove URLs, mentions, and reserved words (e.g. RT) from tweets
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    # Maps each handle to its list of cleaned tweets
    tweets_map = {}
    # Strip punctuation except the sentence-ending marks !, ., and ?
    bad_chars = list(punctuation)
    bad_chars.remove('!')
    bad_chars.remove('.')
    bad_chars.remove('?')
    bad_chars.append('–')
    bad_chars.append('...')
    bad_chars.append('..')
    with open('bots.txt', 'r') as file_obj:
        for row in file_obj:
            handle = row.strip()
            try:
                tweets = api.user_timeline(screen_name=handle, count=100, tweet_mode='extended', include_rts=False)
                for tweet in tweets:
                    try:
                        # Keep English (or undetermined-language) tweets that are not flagged sensitive
                        if (tweet.lang == 'en' or tweet.lang == 'und') and not getattr(tweet, 'possibly_sensitive', False):
                            words = tweet.full_text.split(' ')
                            new_words = []
                            for wrd in words:
                                # Strip digits, e.g. from mixed tokens like 'LX23'
                                # or monetary values like '$325'
                                wrd = re.sub(r'\d+', '', wrd)
                                # Collapse characters repeated 3+ times, e.g. 'THIIIIIIIIS' -> 'THIS'
                                wrd = re.sub(r'(.)\1{2,}', r'\1', wrd)
                                new_words.append(wrd)
                            cleaned_tweet = ' '.join(new_words)
                            cleaned_tweet = p.clean(cleaned_tweet)
                            # Use replace() to remove bad_chars
                            for char in bad_chars:
                                cleaned_tweet = cleaned_tweet.replace(char, '')
                            if cleaned_tweet != '':
                                # Append the cleaned tweet to this handle's list
                                tweets_map.setdefault(handle, []).append(cleaned_tweet)
                                # Remove duplicate tweets, then convert back to a list
                                tweets_map[handle] = list(set(tweets_map[handle]))
                                print(handle + ' ' + cleaned_tweet)
                    except AttributeError:
                        # Tweet missing an expected attribute; skip it
                        continue
            except tweepy.TweepError:
                # Timeline fetch failed (e.g. protected or suspended account); skip the handle
                continue
    return tweets_map
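# For reference, with the preprocessor options set in get_tweets,
# p.clean('RT @user check this https://t.co/xyz') should yield 'check this'
# (the handle and URL here are made up for illustration)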
tweets_map = get_tweets()
# Removes bots that have fewer than 50 tweets in the dictionary
def clean_tweets_dict(this_map):
    for key in list(this_map):
        if len(this_map[key]) < 50:
            this_map.pop(key)
    # Persist the cleaned map for the sentiment-analysis stage
    with open('tweets.pickle', 'wb') as f:
        pickle.dump(this_map, f)
    return this_map
this_map = clean_tweets_dict(tweets_map)
def sentiment_analysis(sentiment_values, num_tweets):
    tweet_count = 0
    print('SENTIMENT ANALYSIS: \t' + str(num_tweets) + ' tweets')
    # Load the tweets map from the pickle file
    with open('tweets.pickle', 'rb') as f:
        result = pickle.load(f)
    print('num bots ' + str(len(result)))
    analyzer = SentimentIntensityAnalyzer()
    for key in result:
        print('num tweets ' + str(len(result[key])))
        tweet_count = tweet_count + len(result[key])
        user_sentiment = 0.0
        for tweet in itertools.islice(result[key], 0, num_tweets):
            vs = analyzer.polarity_scores(tweet)
            print('{:-<65} {}'.format(tweet, str(vs['compound'])))
            # Sum compound scores; averaged below for the user's final score
            user_sentiment += vs['compound']
        avg_sentiment = round(user_sentiment / num_tweets, 4)
        print('AVERAGE SENTIMENT FOR USER: \t' + key + '\t\t' + str(avg_sentiment))
        # Add the per-user average to the shared map
        sentiment_values.setdefault(key, []).append(avg_sentiment)
    print('total tweets ' + str(tweet_count))
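# For reference, analyzer.polarity_scores returns a dict like
# {'neg': 0.0, 'neu': 0.51, 'pos': 0.49, 'compound': 0.64} (illustrative
# values); 'compound' is VADER's normalized summary score in [-1, 1]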
sentiment_values = {}
# Compute average sentiment over 20 tweets
sentiment_analysis(sentiment_values, 20)
# Compute average sentiment over 30 tweets
sentiment_analysis(sentiment_values, 30)
# Compute average sentiment over 40 tweets
sentiment_analysis(sentiment_values, 40)
# Compute average sentiment over 50 tweets
sentiment_analysis(sentiment_values, 50)
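# sentiment_values now maps each handle to [avg over 20, avg over 30,
# avg over 40, avg over 50 tweets]; classify_bots indexes it via 'pos'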
positive_bots = []
negative_bots = []
neutral_bots = []
def classify_bots(sentiment_values, pos):
    # Map the position index to the number of tweets used for that run
    value = (20, 30, 40, 50)[pos]
    print('\nCLASSIFY: \t' + str(value) + ' tweets')
    # Reset the global lists so each run's counts are independent
    positive_bots.clear()
    negative_bots.clear()
    neutral_bots.clear()
    for handle, sent_arr in sentiment_values.items():
        # Standard VADER thresholds: compound >= 0.05 is positive, <= -0.05 is negative
        if Decimal(sent_arr[pos]) >= Decimal('0.05'):
            positive_bots.append(handle)
            print('positive ' + handle)
        elif Decimal(sent_arr[pos]) <= Decimal('-0.05'):
            negative_bots.append(handle)
            print('negative ' + handle)
        else:
            neutral_bots.append(handle)
            print('neutral ' + handle)
    print('POSITIVE BOTS: \t' + str(len(positive_bots)))
    print('NEGATIVE BOTS: \t' + str(len(negative_bots)))
    print('NEUTRAL BOTS: \t' + str(len(neutral_bots)))
# Classifies bots using the 1st value
classify_bots(sentiment_values, 0)
# Classifies bots using the 2nd value
classify_bots(sentiment_values, 1)
# Classifies bots using the 3rd value
classify_bots(sentiment_values, 2)
# Classifies bots using the 4th value
classify_bots(sentiment_values, 3)