# -*- coding: utf-8 -*-
"""Fake_News_Classifier.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1MPRy6rqmuLZBACpcmoIQg697z_ETHwk9
## Importing Libraries and Data Set
"""
# ----------------------------importing all necessary libraries--------------------------------- #
import nlp_utils  # third-party helper APIs for working with strings (imported here but not used below)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
df = pd.read_csv('train.csv')
df
"""* The label column in our data set is the target. **Label 0** -> represent that the news is not fake, **Label 1** -> represent that the news is fake."""
df.shape
"""## Data Preprocessing"""
#------------------------------------------Data Preprocessing-----------------------------------------------#
pd.set_option('display.max_colwidth', None)  # widen the column display so the full text is visible
df['title']  # title column containing the headline of each news item
df['text']  # text column containing the body of each article
df['label'].value_counts()  # total counts of label 0 and label 1
df.isnull().sum()  # check how many null values are in each column
df = df.dropna()  # drop the rows with null values
df.isnull().sum()  # confirm the null values are gone
df.reset_index(drop=True, inplace=True)  # reset the index after dropping rows (drop=True avoids keeping the old index as a column)
df
"""## Text Cleaning"""
#--------------------------------------------Text Cleaning-------------------------------------------#
import re # importing regular expressions
import string
alphanumeric = lambda x: re.sub(r'\w*\d\w*', ' ', x)  # remove any token that contains a digit
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())  # lowercase the text and replace punctuation with spaces
remove_n = lambda x: re.sub(r'\n', ' ', x)  # replace newlines with spaces
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]', r' ', x)  # remove all non-ASCII characters
df['text'] = df['text'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)
df['text']
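# A quick sanity check of the cleaning pipeline on a made-up sentence
# (the sample string below is illustrative, not taken from the data set):
sample = "Breaking News!\nCOVID19 vaccine rollout – 100% effective?"
for step in (alphanumeric, punc_lower, remove_n, remove_non_ascii):
    sample = step(sample)
print(sample)  # roughly: 'breaking news   vaccine rollout     effective '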
"""## Removing stop words and stemming the text
In natural language processing, useless words are called stop words which on removing from the sentence does not affect the measning of sentence.
Stop words like "a", "an", "the", "in", "on" etc.
There is something called Porter Stemming Algorithm that is used to remove common morphological words. For more detail about the algorithm you can refer to the [link](http://snowball.tartarus.org/algorithms/porter/stemmer.html)
"""
#---------------------------------Porter Stemming Algorithm----------------------------------------------#
# importing the libraries needed for stop-word removal and Porter stemming
import nltk
nltk.download('stopwords')  # the NLTK stop-word corpus must be downloaded once
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
stop_words = set(stopwords.words('english'))  # a set makes the membership test below much faster
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])  # keep letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
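# A quick illustration of what stemming and stop-word removal do, using a
# hypothetical sentence (not taken from the data set):
demo = "the reporters were writing several misleading stories"
print([ps.stem(w) for w in demo.split() if w not in stop_words])
# roughly: ['report', 'write', 'sever', 'mislead', 'stori']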
"""## Splitting DataFrame
"""
Y = df['label']
Y.head(8)
# Splitting data into 30% test data and 70% train data
X_train, X_test, Y_train, Y_test = train_test_split(df['text'], Y, test_size=0.3, random_state=40)
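# The split above is purely random; train_test_split also accepts a `stratify`
# argument that preserves the 0/1 label ratio in both halves. An optional
# variant, not what the rest of this notebook uses:
# X_train, X_test, Y_train, Y_test = train_test_split(
#     df['text'], Y, test_size=0.3, random_state=40, stratify=Y)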
"""### Tfidf vectorizer
**TfidfVectorizer** - Transforms text to feature vectors that can be used as input to estimator.
**vocabulary_** Is a dictionary that converts each token (word) to feature index in the matrix, each unique token gets a feature index.
"""
#---------------------------------Tfidf Vectorizer-------------------------------------------#
# Apply the TF-IDF vectorizer to the data set
# max_df=0.7 ignores terms that appear in more than 70% of the documents
tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vect.fit_transform(X_train)
tfidf_test = tfidf_vect.transform(X_test)
print(tfidf_train, tfidf_test)
# Get a slice of the feature names learned by `tfidf_vect`
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
print(tfidf_vect.get_feature_names_out()[30:40])
"""### Count Vectorizer
**CountVectorizer** is a great tool provided by the scikit-learn library in Python. It is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text.
"""
#------------------------------------Count Vectorizer-------------------------------------------#
# Apply count vectorizer to the data set
count_vect = CountVectorizer(stop_words = 'english')
count_train = count_vect.fit_transform(X_train.values)
count_test = count_vect.transform(X_test.values)
print(count_train, count_test)
# Get the last few feature names of `count_vect`
print(count_vect.get_feature_names_out()[-10:])
#-------------------------------------MACHINE LEARNING---------------------------------------------#
# Testing three different models
# 1. Naive Bayes Model
# 2. Random Forest Model
# 3. K-NN
"""## Machine Learning
## 1. Naive Bayes Model
### TF-Idf Vectorized
"""
#------------------------------------Naive Bayes Model-------------------------------------------#
# importing sklearn libraries for Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Applying Naive Bayes and fitting the model
print('#'+'-'*40+"Naive Bayes Model"+'-'*40+"#")
print("1. TF-Idf Vectorized")
nb = MultinomialNB()
nb.fit(tfidf_train, Y_train)
# predict on the test data
pred = nb.predict(tfidf_test)
# accuracy of test
score = metrics.accuracy_score(Y_test, pred)
print('-'*30)
print("Accuracy: " + str(score))
# Confusion Matrix
cm1 = metrics.confusion_matrix(Y_test, pred)
print("Confusion Matrix: ")
print(cm1)
print('Wrong predictions out of total: ',end="")
print((Y_test !=pred).sum(),'/',((Y_test == pred).sum()+(Y_test != pred).sum()))
print('Percentage accuracy: '+str(100*score)+" %")
# plotting Confusion Matrix
sns.heatmap(cm1, annot=True)
"""### Count Vectorized"""
# Applying Naive Bayes and fitting the model
print("2. Count Vectorized")
nb = MultinomialNB()
nb.fit(count_train, Y_train)
# prediction on test data
pred1 = nb.predict(count_test)
# accuracy score on test data
score = metrics.accuracy_score(Y_test, pred1)
print('-'*30)
print("Accuracy: " + str(score))
# confusion matrix
cm2 = metrics.confusion_matrix(Y_test, pred1)
print("Confusion Matrix: ")
print(cm2)
print('Wrong predictions out of total: ',end="")
print((Y_test !=pred1).sum(),'/',((Y_test == pred1).sum()+(Y_test != pred1).sum()))
print('Percentage accuracy: '+str(100*score)+" %")
# plotting Confusion Matrix
sns.heatmap(cm2, annot=True)
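# MultinomialNB also exposes per-class probabilities via predict_proba, which
# is handy for inspecting how confident the classifier is on individual
# articles. A small optional check, not part of the original pipeline:
proba = nb.predict_proba(count_test[:5])  # columns: P(label 0), P(label 1)
print(proba.round(3))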
"""## 2. Random Forest Model
Random Forest is a robust machine learning algorithm that can be used for both regression and classification task. When come to regression we use a random forest regressor and we can use random forest on classification model as our classification uses.
### TF-Idf Vectorized
"""
#------------------------------------Random Forest Model-------------------------------------------#
# importing RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Applying the Random Forest classifier (see the description above)
print('#'+'-'*40+"Random Forest Model"+'-'*40+"#")
print("1. TF-Idf Vectorized")
print('-'*30)
RF=RandomForestClassifier().fit(tfidf_train,Y_train)
# prediction on train data
train_preds2 = RF.predict(tfidf_train)
# accuracy on train data
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds2))
#predict on test
test_preds2 = RF.predict(tfidf_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds2))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds2))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds2))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds2).sum(),'/',((Y_test == test_preds2).sum()+(Y_test != test_preds2).sum()))
print('-'*30)
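# Random forests expose `feature_importances_`; an optional peek at which
# TF-IDF tokens the model above relies on most (assumes RF and tfidf_vect
# from the cells above are still in scope):
import numpy as np
top_idx = np.argsort(RF.feature_importances_)[-10:][::-1]
feature_names = tfidf_vect.get_feature_names_out()
for idx in top_idx:
    print(feature_names[idx], round(float(RF.feature_importances_[idx]), 4))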
"""### Count Vectorized"""
# Applying the Random Forest classifier
print("2. Count Vectorized")
print('-'*30)
RF=RandomForestClassifier().fit(count_train,Y_train)
# prediction on train data
train_preds3 = RF.predict(count_train)
# accuracy on train data
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds3))
# prediction on test data
test_preds3 = RF.predict(count_test)
# accuracy on test data
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds3))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds3))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds3))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds3).sum(),'/',((Y_test == test_preds3).sum()+(Y_test != test_preds3).sum()))
print('-'*30)
"""##3. K-NN
### TF-Idf Vectorized
"""
#------------------------------------K-Nearest Neighbour-------------------------------------------#
# importing KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
print('#'+'-'*40+"K-Nearest Neighbour"+'-'*40+"#")
print("1. Tf-idf Vectorized")
print('-'*30)
# fit the model on train data
KNN = KNeighborsClassifier().fit(tfidf_train,Y_train)
#predict on train
train_preds4 = KNN.predict(tfidf_train)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds4))
#predict on test
test_preds4 = KNN.predict(tfidf_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds4))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds4))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds4))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds4).sum(),'/',((Y_test == test_preds4).sum()+(Y_test != test_preds4).sum()))
print('-'*30)
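# KNeighborsClassifier defaults to n_neighbors=5; an optional sweep over a few
# values of k can show whether a different neighbourhood size helps here
# (illustrative only, reusing the TF-IDF split from above):
for k in (3, 5, 7, 9):
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(tfidf_train, Y_train)
    print(k, accuracy_score(Y_test, knn_k.predict(tfidf_test)))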
"""### Count Vectorizer"""
print("1. Count Vectorized")
print('-'*30)
#fit the model on train data
KNN = KNeighborsClassifier().fit(count_train,Y_train)
#predict on train
train_preds5 = KNN.predict(count_train)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train, train_preds5))
#predict on test
test_preds5 = KNN.predict(count_test)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test, test_preds5))
print('-'*30)
#Confusion matrix
print("Confusion_matrix train is: ")
print(metrics.confusion_matrix(Y_train, train_preds5))
print("Confusion_matrix test is: ")
print(metrics.confusion_matrix(Y_test, test_preds5))
print('-'*30)
# Wrong Predictions made.
print('Wrong predictions out of total: ',end="")
print((Y_test !=test_preds5).sum(),'/',((Y_test == test_preds5).sum()+(Y_test != test_preds5).sum()))
print('-'*30)
"""**Result**<br>
We have cretaed three models
- Naive Based Model
- Random Forest Model
- K-Nearest Neighbour
We can see that the accuracy is high in Random Forest Model with aproximately 90% accuracy both for Tfidf vectorizer and count vectorizer. Hence, Random Forest Model is best fit for the Fake News Classifier.
"""