# -*- coding: utf-8 -*-
"""Fake_News_Classifier.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1MPRy6rqmuLZBACpcmoIQg697z_ETHwk9
## Importing Libraries and Data Set
"""
# ----------------------------importing all necessary libraries--------------------------------- #
import nlp_utils  # third-party helper APIs for working with strings (imported here but not used below)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
df = pd.read_csv('train.csv')
df
"""* The label column in our data set is the target. **Label 0** -> represent that the news is not fake, **Label 1** -> represent that the news is fake."""
df.shape
"""## Data Preprocessing"""
#------------------------------------------Data Preprocessing-----------------------------------------------#
pd.set_option('display.max_colwidth', None)  # widen the column display so the full text is visible
df['title']  # title column containing the headline of each news item
df['text']  # text column containing the body of each article
df['label'].value_counts()  # total counts of label 0 and label 1
df.isnull().sum()  # check how many null values are in each column
df = df.dropna()  # drop the rows with null values
df.isnull().sum()  # confirm the null values are gone
df.reset_index(drop=True, inplace=True)  # reset the index after dropping rows (drop=True avoids keeping the old index as a column)
df
"""## Text Cleaning"""
#--------------------------------------------Text Cleaning-------------------------------------------#
import re # importing regular expressions
import string
alphanumeric = lambda x: re.sub(r'\w*\d\w*', ' ', x)  # remove any token that contains a digit
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())  # lowercase the text and replace punctuation with spaces
remove_n = lambda x: re.sub(r'\n', ' ', x)  # replace newlines with spaces
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]', r' ', x)  # remove all non-ASCII characters
df['text'] = df['text'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)
df['text']
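# A quick sanity check of the cleaning pipeline on a made-up sentence
# (the sample string below is illustrative, not taken from the data set):
sample = "Breaking News!\nCOVID19 vaccine rollout – 100% effective?"
for step in (alphanumeric, punc_lower, remove_n, remove_non_ascii):
    sample = step(sample)
print(sample)  # roughly: 'breaking news   vaccine rollout     effective '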
"""## Removing stop words and stemming the text
In natural language processing, useless words are called stop words which on removing from the sentence does not affect the measning of sentence.
Stop words like "a", "an", "the", "in", "on" etc.
There is something called Porter Stemming Algorithm that is used to remove common morphological words. For more detail about the algorithm you can refer to the [link](http://snowball.tartarus.org/algorithms/porter/stemmer.html)
"""
#---------------------------------Porter Stemming Algorithm----------------------------------------------#
# importing the libraries needed for stop-word removal and Porter stemming
import nltk
nltk.download('stopwords')  # the NLTK stop-word corpus must be downloaded once
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
stop_words = set(stopwords.words('english'))  # a set makes the membership test below much faster
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])  # keep letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
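# A quick illustration of what stemming and stop-word removal do, using a
# hypothetical sentence (not taken from the data set):
demo = "the reporters were writing several misleading stories"
print([ps.stem(w) for w in demo.split() if w not in stop_words])
# roughly: ['report', 'write', 'sever', 'mislead', 'stori']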
"""## Splitting DataFrame
"""
Y = df['label']
Y.head(8)
# Splitting data into 30% test data and 70% train data
X_train, X_test, Y_train, Y_test = train_test_split(df['text'], Y, test_size=0.3, random_state=40)
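# The split above is purely random; train_test_split also accepts a `stratify`
# argument that preserves the 0/1 label ratio in both halves. An optional
# variant, not what the rest of this notebook uses:
# X_train, X_test, Y_train, Y_test = train_test_split(
#     df['text'], Y, test_size=0.3, random_state=40, stratify=Y)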
"""### Tfidf vectorizer
**TfidfVectorizer** - Transforms text to feature vectors that can be used as input to estimator.
**vocabulary_** Is a dictionary that converts each token (word) to feature index in the matrix, each unique token gets a feature index.
"""
#---------------------------------Tfidf Vectorizer-------------------------------------------#
# Apply the TF-IDF vectorizer to the data set
# max_df=0.7 ignores terms that appear in more than 70% of the documents
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)
print(tfidf_train, tfidf_test)
# Get a slice of the feature names learned by `tfidf_vect`
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
print(tfidf_vect.get_feature_names_out()[30:40])
"""### Count Vectorizer
**CountVectorizer** is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
"""
#------------------------------------Count Vectorizer-------------------------------------------#
# Apply count vectorizer to the data set
count_vect = CountVectorizer(stop_words = 'english')
count_train = count_vect.fit_transform(X_train.values)
count_test = count_vect.transform(X_test.values)
print(count_train, count_test)
# Get the last few feature names of `count_vect`
print(count_vect.get_feature_names_out()[-10:])
#-------------------------------------MACHINE LEARNING---------------------------------------------#
# Testing three different models
# 1. Naive Bayes Model
# 2. Random Forest Model
# 3. K-NN
"""## Machine Learning
## 1. Naive Bayes Model
### TF-Idf Vectorized
"""
#------------------------------------Naive Bayes Model-------------------------------------------#
# importing sklearn libraries for Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Applying Naive Bayes and fitting the model
print('#'+'-'*40+"Naive Bayes Model"+'-'*40+"#")
print("1. TF-Idf Vectorized")
nb = MultinomialNB()
nb.fit(tfidf_train, Y_train)
# predict on the test data
pred = nb.predict(tfidf_test)
# accuracy of test
score = metrics.accuracy_score(Y_test, pred)
print('-'*30)
print("Accuracy: " + str(score))
# Confusion Matrix
cm1 = metrics.confusion_matrix(Y_test, pred)
print("Confusion Matrix: ")
print(cm1)
print('Wrong predictions out of total: ',end="")
print((Y_test !=pred).sum(),'/',((Y_test == pred).sum()+(Y_test != pred).sum()))
print('Percentage accuracy: '+str(100*score)+" %")
# plotting Confusion Matrix
sns.heatmap(cm1, annot=True)
"""### Count Vectorized"""
# Applying Naive Bayes and fitting the model
print("2. Count Vectorized")
nb = MultinomialNB()
nb.fit(count_train, Y_train)
# prediction on test data
pred1 = nb.predict(count_test)
# accuracy score on test data
score = metrics.accuracy_score(Y_test, pred1)
print('-'*30)
print("Accuracy: " + str(score))
# confusion matrix
cm2 = metrics.confusion_matrix(Y_test, pred1)
print("Confusion Matrix: ")
print(cm2)
print('Wrong predictions out of total: ',end="")
print((Y_test !=pred1).sum(),'/',((Y_test == pred1).sum()+(Y_test != pred1).sum()))
print('Percentage accuracy: '+str(100*score)+" %")
# plotting Confusion Matrix
sns.heatmap(cm2, annot=True)
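# MultinomialNB also exposes per-class probabilities via predict_proba, which
# is handy for inspecting how confident the classifier is on individual
# articles. A small optional check, not part of the original pipeline:
proba = nb.predict_proba(count_test[:5])  # columns: P(label 0), P(label 1)
print(proba.round(3))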
"""## 2. Random Forest Model
Random Forest is a robust machine learning algorithm that can be used for both regression and classification task. When come to regression we use a random forest regressor and we can use random forest on classification model as our classification uses.
### TF-Idf Vectorized
"""
#------------------------------------Random Forest Model-------------------------------------------#
# importing RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Applying the Random Forest classifier (see the description above)
print('#'+'-'*40+"Random Forest Model"+'-'*40+"#")
print("1. TF-Idf Vectorized")
print('-'*30)
RF=RandomForestClassifier().fit(tfidf_train,Y_train)
# prediction on train data
train_preds2 = RF.predict(tfidf_train)
# accuracy on train data
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds2))
#predict on test
test_preds2 = RF.predict(tfidf_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds2))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds2))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds2))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds2).sum(),'/',((Y_test == test_preds2).sum()+(Y_test != test_preds2).sum()))
print('-'*30)
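# Random forests expose `feature_importances_`; an optional peek at which
# TF-IDF tokens the model above relies on most (assumes RF and tfidf_vect
# from the cells above are still in scope):
import numpy as np
top_idx = np.argsort(RF.feature_importances_)[-10:][::-1]
feature_names = tfidf_vect.get_feature_names_out()
for idx in top_idx:
    print(feature_names[idx], round(float(RF.feature_importances_[idx]), 4))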
"""### Count Vectorized"""
# Applying the Random Forest classifier
print("2. Count Vectorized")
print('-'*30)
RF=RandomForestClassifier().fit(count_train,Y_train)
# prediction on train data
train_preds3 = RF.predict(count_train)
# accuracy on train data
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds3))
# prediction on test data
test_preds3 = RF.predict(count_test)
# accuracy on test data
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds3))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds3))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds3))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds3).sum(),'/',((Y_test == test_preds3).sum()+(Y_test != test_preds3).sum()))
print('-'*30)
"""##3. K-NN
### TF-Idf Vectorized
"""
#------------------------------------K-Nearest Neighbour-------------------------------------------#
# importing KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
print('#'+'-'*40+"K-Nearest Neighbour"+'-'*40+"#")
print("1. Tf-idf Vectorized")
print('-'*30)
# fit the model on train data
KNN = KNeighborsClassifier().fit(tfidf_train,Y_train)
#predict on train
train_preds4 = KNN.predict(tfidf_train)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds4))
#predict on test
test_preds4 = KNN.predict(tfidf_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds4))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds4))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds4))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds4).sum(),'/',((Y_test == test_preds4).sum()+(Y_test != test_preds4).sum()))
print('-'*30)
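# KNeighborsClassifier defaults to n_neighbors=5; an optional sweep over a few
# values of k can show whether a different neighbourhood size helps here
# (illustrative only, reusing the TF-IDF split from above):
for k in (3, 5, 7, 9):
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(tfidf_train, Y_train)
    print(k, accuracy_score(Y_test, knn_k.predict(tfidf_test)))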
"""### Count Vectorizer"""
print("1. Count Vectorized")
print('-'*30)
#fit the model on train data
KNN = KNeighborsClassifier().fit(count_train,Y_train)
#predict on train
train_preds5 = KNN.predict(count_train)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds5))
#predict on test
test_preds5 = KNN.predict(count_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds5))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds5))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds5))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds5).sum(),'/',((Y_test == test_preds5).sum()+(Y_test != test_preds5).sum()))
print('-'*30)
"""**Result**<br>
We have cretaed three models
- Naive Based Model
- Random Forest Model
- K-Nearest Neighbour
We can see that the accuracy is high in Random Forest Model with aproximately 90% accuracy both for Tfidf vectorizer and count vectorizer. Hence, Random Forest Model is best fit for the Fake News Classifier.
"""