#!/usr/bin/env python
# coding: utf-8
# Honours Project 2019
# Krithika Saravanan, 100970975
# Run with Python 3.6
# ML model implementation adapted from
# https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed
import csv
import itertools
import os
import pickle
import re

import nltk
import tweepy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

# NLTK data required on first run:
#   nltk.download('punkt'); nltk.download('stopwords')
# Twitter API credentials (better practice: load these from environment variables)
C_KEY = 'rdFeZ2ePyDf9qFhNGzkKGEC22'
C_SECRET = 'csvkpCZziAgbngIPjH9Gi8P2R3H1fNyQr3FpvsE3fh0jmXKI74'
A_TOKEN = '878335163121287169-sHzBMdW9I1HVlcWFWdvaj7wIMsKheZm'
A_TOKEN_SECRET = 'fgy1huoiJgA8fDSfpdz5rnNPvPbGtXvtiTL3RGPv5PtHm'

auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN, A_TOKEN_SECRET)
# wait_on_rate_limit makes tweepy sleep through Twitter rate-limit windows
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)
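
# --- Illustrative sketch (not part of the original pipeline) ---------------
# build_test_set() below loads 'tweets.pickle', which this script assumes
# maps a Twitter handle to a list of tweet texts. The collection step is not
# in this file; something like the hypothetical helper below could have
# produced it. The handle list and function name are placeholders.
def collect_tweets_sketch(handles, per_user=50, out_path='tweets.pickle'):
    """Fetch recent tweets per handle and pickle a {handle: [text, ...]} map."""
    collected = {}
    for handle in handles:
        statuses = api.user_timeline(screen_name=handle, count=per_user)
        collected[handle] = [status.text for status in statuses]
    with open(out_path, 'wb') as f:
        pickle.dump(collected, f)
# ----------------------------------------------------------------------------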
def build_train_set(corpus_file):
    """Load the labelled training corpus from a CSV of
    (tweet_id, label, topic, text) rows."""
    training_dataset = []
    with open(corpus_file, encoding='utf-8') as csvfile:
        line_reader = csv.reader(csvfile, delimiter=',')
        for row in line_reader:
            training_dataset.append({'tweet_id': row[0], 'label': row[1],
                                     'topic': row[2], 'text': row[3]})
    return training_dataset
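
# Expected CSV layout (illustrative values, not from the real corpus):
#   tweet_id,label,topic,text
#   1234567890,positive,politics,"great debate performance tonight"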
def build_test_set(num_tweets):
    """Build an unlabelled test set from the pickled {handle: [tweets]} map,
    taking the first num_tweets tweets per user."""
    test_set = []
    with open('tweets.pickle', 'rb') as f:
        result = pickle.load(f)
    for key in result:
        for tweet in itertools.islice(result[key], 0, num_tweets):
            if 'tweet_id' in tweet:
                # skip raw API entries that leaked into the pickle
                print('tweet id')
                continue
            test_set.append({'text': tweet, 'label': None, 'handle': key})
    return test_set
test_data_set = build_test_set(50)
class PreProcessTweets:
    def __init__(self):
        # stop-list: English stopwords, punctuation, and our placeholder tokens
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def process_tweets(self, list_of_tweets):
        processed_tweets = []
        for tweet in list_of_tweets:
            processed_tweets.append((self._process_tweet(tweet["text"]), tweet["label"]))
        return processed_tweets

    def _process_tweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r' \d+', '', tweet)  # drop standalone numbers
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with a placeholder
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace @usernames with a placeholder
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # strip the # from #hashtag
        tweet = word_tokenize(tweet)  # split into tokens
        return [word for word in tweet if word not in self._stopwords]
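
# Illustrative behaviour (example input, not from the corpus):
#   PreProcessTweets()._process_tweet('Loving #Python https://t.co/x @bob 2019')
#   -> ['loving', 'python']
# The URL/AT_USER placeholders and English stopwords are removed by the stop-list.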
def build_vocabulary(preprocessed_training_data):
    """Collect every distinct token seen in the training data."""
    print('build vocab')
    all_words = []
    for (words, _label) in preprocessed_training_data:
        all_words.extend(words)
    wordlist = nltk.FreqDist(all_words)
    word_feat = wordlist.keys()
    return word_feat
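
# Illustrative FreqDist behaviour (standard NLTK API):
#   nltk.FreqDist(['a', 'b', 'a']).most_common(1)  ->  [('a', 2)]
# Only the distinct keys are used here; the counts themselves are ignored.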
def extract_features(tweet):
    """Map a token list to NLTK's bag-of-words feature dict.
    Relies on the module-level word_features built below."""
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
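
# Illustrative output (assuming the vocabulary contains 'good' and 'bad'):
#   extract_features(['good', 'movie'])
#   -> {'contains(good)': True, 'contains(bad)': False, ...}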
def save_classifier(classifier):
    print('save classifier')
    with open('sentiment_classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f, -1)

def load_classifier():
    print('load classifier')
    with open('sentiment_classifier.pickle', 'rb') as f:
        return pickle.load(f)
tweetProcessor = PreProcessTweets()
preprocessedTestSet = tweetProcessor.process_tweets(test_data_set)
training_data = build_train_set('training_data.csv')
preprocessedTrainingSet = tweetProcessor.process_tweets(training_data)
word_features = build_vocabulary(preprocessedTrainingSet)
myTestSet = ['Suicide Bomber Kills Officials in Mayor’s Office in Somalia’s Capital',
'Trump Says Mueller Was ‘Horrible’ and Republicans ‘Had a Good Day’',
'DO This gives a very good view of the water. Small excursions to close by islands might be possible.',
'please eat something healthy',
'You have a great sense of humor.',
'Germany In Political Turmoil As Coalition Talks Fail lmao',
'Confidence goes a long way. Even if it doesn’t come naturally fake it ’til you make it!',
'MIDRANGE Try it good place hangout in eveinings and night. Chinese and Malaysian food.',
'As Trump Accuses Iran He Has One Problem His Own Credibility',
'please dont forget to listen to a song you enjoy']
if os.path.isfile('sentiment_classifier.pickle'):
    print('pickle file exists')
    cl = load_classifier()
    # tokenize the demo sentences the same way the training data was processed
    my_test_tokens = [tweetProcessor._process_tweet(t) for t in myTestSet]
    NBResultLabels = [cl.classify(extract_features(tokens)) for tokens in my_test_tokens]
    for tweet, label in zip(myTestSet, NBResultLabels):
        print(label, '-', tweet)
    # note: the scraped test set carries no gold labels (label=None), so this
    # accuracy figure is only meaningful once real labels are filled in
    testFeatures = nltk.classify.apply_features(extract_features, preprocessedTestSet[0:100])
    print(nltk.classify.accuracy(cl, testFeatures))
    cl.show_most_informative_features(5)
    print('classification done')
else:
    print("pickle file doesn't exist")
    trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)
    NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
    save_classifier(NBayesClassifier)
    my_test_tokens = [tweetProcessor._process_tweet(t) for t in myTestSet]
    NBResultLabels = [NBayesClassifier.classify(extract_features(tokens)) for tokens in my_test_tokens]
    for tweet, label in zip(myTestSet, NBResultLabels):
        print(label, '-', tweet)
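
# --- Illustrative sketch (not part of the original script) -----------------
# The accuracy call above runs against unlabelled scraped tweets, so a more
# meaningful check is a hold-out split of the labelled training corpus.
# A minimal sketch, assuming an 80/20 split is acceptable (note: word_features
# is built from the full corpus, so there is mild vocabulary leakage here):
def holdout_accuracy_sketch(preprocessed, split=0.8):
    cut = int(len(preprocessed) * split)
    train_feats = nltk.classify.apply_features(extract_features, preprocessed[:cut])
    test_feats = nltk.classify.apply_features(extract_features, preprocessed[cut:])
    clf = nltk.NaiveBayesClassifier.train(train_feats)
    return nltk.classify.accuracy(clf, test_feats)
# e.g. print(holdout_accuracy_sketch(preprocessedTrainingSet))
# ----------------------------------------------------------------------------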