-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
31 lines (28 loc) · 1.21 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
class NaiveBayes:
    """Multinomial naive Bayes classifier for raw text documents.

    Documents are converted to bag-of-words count vectors with sklearn's
    ``CountVectorizer``; per-class token likelihoods use Laplace (add-one)
    smoothing.  ``fit`` learns priors and likelihoods; ``predict_proba``
    returns posterior class probabilities.
    """

    def __init__(self):
        self.prior = None       # P(class), shape (n_classes,)
        self.likelihood = None  # P(token | class), shape (n_classes, vocab_size)
        self.classes = None     # sorted unique class labels, as returned by np.unique
        self.vocabulary = None  # token -> column index mapping from the fitted vectorizer

    def fit(self, X, y):
        """Learn class priors and smoothed token likelihoods.

        Parameters
        ----------
        X : iterable of str
            Raw training documents.
        y : array-like
            Class label for each document.
        """
        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(X)
        self.vocabulary = vectorizer.vocabulary_
        y = np.asarray(y)  # ensure boolean row masking below works on sparse `counts`
        self.classes, class_counts = np.unique(y, return_counts=True)
        self.prior = class_counts / class_counts.sum()
        n_features = counts.shape[1]
        self.likelihood = np.zeros((len(self.classes), n_features))
        for i, c in enumerate(self.classes):
            X_c = counts[y == c]
            # Sparse .sum(axis=0) yields an np.matrix; flatten to a 1-D array.
            token_totals = np.asarray(X_c.sum(axis=0)).ravel()
            # Multinomial Laplace smoothing: denominator is the total token
            # count in the class plus the vocabulary size, so each row of
            # `likelihood` sums to 1.  (The previous denominator,
            # n_documents + 2, is Bernoulli-style and does not normalize
            # count-based likelihoods.)
            self.likelihood[i, :] = (token_totals + 1) / (token_totals.sum() + n_features)

    def predict_proba(self, X):
        """Return posterior class probabilities.

        Parameters
        ----------
        X : iterable of str
            Raw documents to classify.

        Returns
        -------
        ndarray of shape (n_docs, n_classes)
            Rows sum to 1; column order matches ``self.classes``.
        """
        vectorizer = CountVectorizer(vocabulary=self.vocabulary)
        # transform (not fit_transform): the vocabulary is fixed from fit(),
        # so no refitting is needed or wanted.
        counts = vectorizer.transform(X)
        log_prior = np.log(self.prior)
        log_likelihood = np.log(self.likelihood)
        log_posterior = log_prior + counts.dot(log_likelihood.T)
        # Shift each row by its max so exp() cannot underflow to all-zeros.
        log_posterior -= log_posterior.max(axis=1)[:, np.newaxis]
        posterior = np.exp(log_posterior)
        return posterior / posterior.sum(axis=1)[:, np.newaxis]