-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcategory_classification.py
105 lines (84 loc) · 3.8 KB
/
category_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
@author: Sahil Chopra,Jorge Vasquez,Colm Rooney
"""
import argparse
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
class PerceptronClassifier:
    """Perceptron text classifier trained on TF-IDF features.

    ``model`` is the fitted ``Perceptron`` (``None`` until ``train`` is called).
    ``vectorizer`` is the fitted ``TfidfVectorizer`` (``None`` until
    ``split_vectorise_data`` is called).
    """

    def __init__(self):
        self.model = None
        # Kept so texts seen after training can be transformed with the same
        # vocabulary that the training split was fitted on.
        self.vectorizer = None

    def split_vectorise_data(self, X, y):
        """
        Split the data into train/test parts and vectorise the texts with Tf-Idf
        :param X: raw texts, one entry per sample
        :param y: expected values for each sample (also used to stratify the split)
        :return: vectorised train texts, vectorised test texts, train labels, test labels
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=150, stratify=y)
        tfidf_vectorizer = TfidfVectorizer(max_features=20000,
                                           use_idf=True,
                                           stop_words='english',
                                           tokenizer=nltk.word_tokenize,
                                           ngram_range=(1, 3))
        # Fit on the training split only; the test split is merely transformed.
        X_train_vec = tfidf_vectorizer.fit_transform(X_train)
        X_test_vec = tfidf_vectorizer.transform(X_test)
        self.vectorizer = tfidf_vectorizer
        return X_train_vec, X_test_vec, y_train, y_test

    def train(self, X, y):
        """
        Function used for training the Perceptron algorithm on the data vectorized using Tf-Idf vectorization
        :param X: sparse matrix with the texts preprocessed by Tf-Idf Vectorizer
        :param y: expected values for each sample
        """
        self.model = Perceptron(penalty='l1', alpha=0.001, random_state=0)
        self.model.fit(X, y)

    def predict(self, X):
        """
        Predict the class of every sample with the trained model
        :param X: sparse matrix with the texts preprocessed by Tf-Idf Vectorizer
        :return: predicted values
        """
        return self.model.predict(X)

    def compute_scores(self, expected: list, predicted: list, num_classes: int,
                       output_dir: str = 'data'):
        """
        Function used for computing the confusion matrix, Recall, Precision and F1 scores.
        Saves the confusion matrix heatmap (png) and the scores table (csv) in output_dir.
        :param expected: list of expected values
        :param predicted: list of predicted values
        :param num_classes: number of classes (currently not used by the computation)
        :param output_dir: directory the png and csv files are written to; created if missing
        :return: numpy array with the confusion matrix, string with the recall,
                 precision and f1 scores, class labels known to the trained model
        """
        classes = self.model.classes_
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # get confusion matrix; labels=classes pins the row/column order to the
        # tick labels drawn on the heatmap
        conf_matrix = confusion_matrix(expected, predicted, labels=classes)
        plt.figure(figsize=(10, 10))
        sns.heatmap(conf_matrix, annot=True, cmap='YlOrBr', xticklabels=classes, yticklabels=classes)
        plt.savefig(os.path.join(output_dir, 'Confusion matrix 16 classes.png'))
        plt.clf()

        # get precision, recall, F1; classification_report takes (y_true, y_pred),
        # passing them the other way round swaps precision and recall
        report = classification_report(expected, predicted)
        report_dict = classification_report(expected, predicted, output_dict=True)
        scores = pd.DataFrame(report_dict).transpose().round(2)
        scores.to_csv(os.path.join(output_dir, 'Scores 16 classes.csv'))
        return conf_matrix, report, classes
def main(path: str):
    """
    Train the Perceptron classifier on the csv file and compute its scores on the held-out part
    :param path: path to the csv file; it must contain the columns
                 'Preprocessed Wikipage' (texts) and 'Person' (labels)
    """
    df = pd.read_csv(path)
    df = shuffle(df).reset_index(drop=True)

    classifier = PerceptronClassifier()
    X_train, X_test, y_train, y_test = classifier.split_vectorise_data(
        df['Preprocessed Wikipage'], df['Person'])
    classifier.train(X_train, y_train)
    predicted = classifier.predict(X_test)

    # compute_scores declares num_classes as an int, so pass the count of
    # distinct labels rather than the array of labels itself
    num_classes = len(np.unique(y_train))
    classifier.compute_scores(y_test, predicted, num_classes)
if __name__ == "__main__":
    # Command-line entry point: read the csv path and run the whole pipeline.
    parser = argparse.ArgumentParser(description="Classifier")
    # required=True: without a path main() would pass None to pd.read_csv,
    # so fail early with a usage message instead.
    parser.add_argument("--inputpath", type=str, required=True,
                        help="path to the csv file in required format")
    args = parser.parse_args()
    main(args.inputpath)