-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathembedding_vectorizer.py
87 lines (75 loc) · 3.14 KB
/
embedding_vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# BaseEstimator's public home is sklearn.base; importing it from
# sklearn.feature_extraction.text relied on an incidental re-export.
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import abc
from scipy.sparse import csr_matrix, hstack, csc_matrix
class EmbeddingVectorizer(BaseEstimator, metaclass=abc.ABCMeta):
    """Abstract vectorizer mapping documents to the mean of their word
    embeddings, optionally horizontally stacked with a bag-of-words matrix.

    NOTE: the original declared ``__metaclass__ = abc.ABCMeta``, which is the
    Python 2 idiom and is silently ignored by Python 3, so the abstract
    methods below were never enforced; ``metaclass=`` fixes that.

    Parameters
    ----------
    word2vec : gensim-style keyed-vector model; must support ``model[word]``,
        ``word in model`` (via subclasses) and expose ``vector_size``.
    bow_vectorizer : optional sklearn text vectorizer whose sparse output is
        concatenated with the embedding features.
    """

    def __init__(self, word2vec, bow_vectorizer=None):
        self.word2vec = word2vec
        self.dim = word2vec.vector_size  # embedding dimensionality
        self.bow_vectorizer = bow_vectorizer

    def fit(self, X, y=None):
        """Fit the optional BOW vectorizer; the embeddings are pre-trained.

        The original called ``fit_transform`` and discarded the transformed
        matrix, doing the transform work for nothing; ``fit`` is sufficient.
        """
        if self.bow_vectorizer is not None:
            self.bow_vectorizer.fit(X, y)
        return self

    def transform(self, X):
        """Return per-document mean word-embedding vectors.

        If a BOW vectorizer is configured, returns a CSR matrix of the BOW
        features stacked with the embedding features; otherwise a dense
        ``(len(X), dim)`` ndarray.
        """
        docs_matrix = []
        for document in X:
            document = document.lower()
            # re-case the 'ne_' marker after lowercasing — presumably a
            # negation/named-entity prefix the embedding model stores
            # uppercased; TODO confirm against the model's vocabulary
            document = document.replace('ne_', 'NE_')
            word_vectors = [
                np.array(self.word2vec[word] * self.word_weight(word))
                for word in document.split()
                if self.word_known(word)
            ]
            if word_vectors:
                doc_vector = np.mean(np.array(word_vectors), axis=0)
            else:
                # no known words: fall back to the zero vector
                doc_vector = np.zeros(self.dim)
            docs_matrix.append(doc_vector)
        X_word2vec = np.array(docs_matrix)
        if self.bow_vectorizer is None:
            # return the ndarray; the original returned the raw Python list
            # here, inconsistent with the sparse matrix of the other branch
            return X_word2vec
        X_transformed = self.bow_vectorizer.transform(X).astype(np.float32)
        return hstack((X_transformed, csr_matrix(X_word2vec)), format='csr')

    def fit_transform(self, X, y=None):
        """Convenience: fit then transform on the same documents."""
        self.fit(X, y)
        return self.transform(X)

    @abc.abstractmethod
    def word_known(self, word):
        """Return True when the embedding model has a vector for *word*."""

    @abc.abstractmethod
    def word_weight(self, word):
        """Return a scalar weight applied to *word*'s vector."""
class MeanEmbeddingVectorizer(EmbeddingVectorizer):
    """Concrete embedding vectorizer: an unweighted (plain) mean of the
    vectors of all words present in the embedding model."""

    def word_known(self, word):
        """A word is known exactly when the model contains it."""
        is_in_vocabulary = word in self.word2vec
        return is_in_vocabulary

    def word_weight(self, word):
        """Uniform weighting: every known word contributes equally."""
        return 1
class MeanEmbeddingVectorizerInNegatedText(MeanEmbeddingVectorizer):
    """Mean embedding vectorizer that strips the 'ne_' prefix entirely before
    vocabulary lookup, so marked tokens are embedded as their base words
    (the parent class instead re-cases the marker to 'NE_')."""

    def transform(self, X):
        """Return per-document mean embeddings of the de-prefixed text.

        Without a BOW vectorizer the result is a list of dense vectors;
        with one it is a CSR matrix of BOW features (computed on the
        unmodified documents) stacked with the embedding features.
        """
        doc_vectors = []
        for raw_doc in X:
            # lowercase first, then drop every 'ne_' marker
            text = raw_doc.lower().replace('ne_', '')
            vectors = [
                np.array(self.word2vec[token] * self.word_weight(token))
                for token in text.split()
                if self.word_known(token)
            ]
            # zero vector when no token is in the embedding vocabulary
            mean_vec = np.mean(np.array(vectors), axis=0) if vectors else np.zeros(self.dim)
            doc_vectors.append(mean_vec)
        if self.bow_vectorizer is None:
            return doc_vectors
        bow_features = self.bow_vectorizer.transform(X).astype(np.float32)
        embedding_features = csr_matrix(np.array(doc_vectors))
        return hstack((bow_features, embedding_features), format='csr')