imbhn.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 30 17:35:15 2017
@author: thiago
"""
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
#from scipy.sparse import csr_matrix
class IMBHN(BaseEstimator, ClassifierMixin):
    """IMBHN classifier: learns a term-to-class weight matrix F by
    iterative error correction over a sparse document-term matrix."""

    def __init__(self, eta=0.1, max_itr=1000, min_sqr_error=0.005):
        self.eta = eta                      # learning rate
        self.max_itr = max_itr              # iteration cap checked by stop_analysis
        self.min_sqr_error = min_sqr_error  # squared-error threshold for stopping
    def stop_analysis(self, mean_error, num_iterations):
        # Stop when the mean squared error falls below the threshold
        # or the iteration budget is exhausted.
        if mean_error < self.min_sqr_error:
            return True
        if num_iterations > self.max_itr:
            return True
        self.mean_error = mean_error
        self.niterations = num_iterations
        return False
    def init_dataset(self, X, y):
        self.ndocs, self.nterms = X.shape
        self.nclass = len(set(y))
        self.W = X                  # sparse (CSR) document-term matrix
        self.D = range(self.ndocs)  # set of documents
        self.C = range(self.nclass) # set of classes
        self.T = range(self.nterms) # set of terms
        self.Y = y                  # class labels
        self.current_doc_index = -1
        self.terms_by_doc = []
        self.small_float = 0.000001
        self.mean_error = float("-inf")
    def get_terms_by_doc(self, d):
        # Return (and cache) the non-zero term indices of document d,
        # taken from the CSR row slice of the document-term matrix.
        if self.current_doc_index == d:
            return self.terms_by_doc
        self.current_doc_index = d
        self.terms_by_doc = self.W.indices[self.W.indptr[d]:self.W.indptr[d+1]]
        return self.terms_by_doc
    def classify(self, d):
        # Soft output: weighted sum of term-class weights for each class.
        out = np.zeros(self.nclass)
        for c in self.C:
            cw = 0  # class weight
            for t in self.get_terms_by_doc(d):
                cw += self.F[t, c] * self.W[d, t]
            out[c] = cw
        return out
    def classify_hard(self, d):
        # Hard output: one-hot vector for the highest-scoring class.
        _max, _max_c = float("-inf"), -1
        for c in self.C:
            cw = 0  # class weight
            for t in self.get_terms_by_doc(d):
                cw += self.F[t, c] * self.W[d, t]
            if _max < cw:
                _max, _max_c = cw, c
        out = np.zeros(self.nclass)
        if _max > self.small_float:
            out[_max_c] = 1
        return out
    def fit(self, X, y):
        self.init_dataset(X, y)
        _exit = False
        # F[t, c] holds the weight of term t for class c.
        self.F = np.zeros((self.nterms, self.nclass))
        num_it = 0
        while not _exit:
            mean_error = 0.0
            for d in self.D:
                estimated_classes = self.classify_hard(d)
                for c in self.C:
                    # Desired output is 1 for the true class, 0 otherwise.
                    error = (1 if self.Y[d] == c else 0) - estimated_classes[c]
                    mean_error += (error * error) / 2.0
                    # Update the weights of the terms occurring in document d.
                    for t in self.get_terms_by_doc(d):
                        current_weight = self.F[t, c]
                        new_weight = current_weight + (self.eta * self.W[d, t] * error)
                        self.F[t, c] = new_weight
            num_it += 1
            mean_error = mean_error / self.ndocs
            _exit = self.stop_analysis(mean_error, num_it)
        return self
    def transform(self, X):
        return None
    def predict(self, X):
        # X is expected to be a sparse (CSR) document-term matrix with the
        # same vocabulary as the training data.
        ndocs = X.shape[0]
        result = np.zeros(ndocs)
        for d in range(ndocs):
            _max, _max_c = float("-inf"), -1
            for c in self.C:
                cw = 0  # class weight
                for t in X.indices[X.indptr[d]:X.indptr[d+1]]:
                    cw += self.F[t, c] * X[d, t]
                if _max < cw:
                    _max, _max_c = cw, c
            result[d] = _max_c
        return result
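

# A minimal usage sketch (not part of the original file): IMBHN expects a
# scipy CSR document-term matrix and integer class labels, since fit/predict
# index into X.indices and X.indptr. The tiny corpus and labels below are
# hypothetical placeholders, used only to illustrate the fit/predict API.
if __name__ == "__main__":
    from scipy.sparse import csr_matrix

    # Hypothetical document-term matrix: 4 documents, 5 terms, two classes
    # with disjoint vocabularies.
    X = csr_matrix(np.array([
        [1, 2, 0, 0, 0],
        [2, 1, 0, 0, 0],
        [0, 0, 1, 3, 1],
        [0, 0, 2, 1, 1],
    ], dtype=float))
    y = np.array([0, 0, 1, 1])

    clf = IMBHN(eta=0.1, max_itr=100, min_sqr_error=0.005)
    clf.fit(X, y)
    print(clf.predict(X))  # expected: [0. 0. 1. 1.]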