-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvec_operations.py
139 lines (102 loc) · 4.54 KB
/
vec_operations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@author: Cem Rıfkı Aydın
"""
import numpy as np
import svd_U
from supervised_features import SupervisedFeatures
class VecOperations:
"""
This class is used to perform basic vector operations, such as concatenation of embeddings
and computing the average word vector of the words occurring in a review.
"""
def __init__(self, revs, labels):
self.revs = revs
self.labels = labels
self.delta_idf_scores = SupervisedFeatures(revs, labels).extract_delta_idf_scores()
def get_vecs(self):
"""
This helper method returns embeddings of the ensemble form.
In this ensemble form, corpus, lexical, and supervised features are combined.
This and other corresponding "get_vecs" methods creates embeddings on a word-basis.
:return: Combined embeddings for corpus words.
:rtype: dict
"""
return self.get_3_concatenated_vecs()
def get_combination_of_3_approaches(self, diff_approach_vecs): # corpus_svd_U + tdk_svd_U + 4_superv
"""
This helper method combines three embeddings of different genres.
:param diff_approach_vecs: The list of embeddings generated with different approaches.
:type diff_approach_vecs: list
:return: The combination of these embeddings.
:rtype: dict
"""
concatenated_vecs = {}
all_words = self.get_all_words(diff_approach_vecs)
for one_approach_vecs in diff_approach_vecs:
one_approach_vecs_list = one_approach_vecs.keys()
vec_dim_size = len(list(one_approach_vecs.values())[0])
for word in all_words:
if word not in concatenated_vecs:
concatenated_vecs[word] = []
one_vec_component = one_approach_vecs[word] if word in one_approach_vecs_list \
else np.random.uniform(-0.1, 0.1, vec_dim_size)
concatenated_vecs[word].extend(one_vec_component)
return concatenated_vecs
def get_3_concatenated_vecs(self):
"""
A helper method that returns the concatenated vectors with respect to different types.
:return: The combination of vectors.
:rtype: dict
"""
corpus_svd_u_vecs = svd_U.CorpusSvdU(self.revs, self.labels).get_corpus_svd_u_vecs()
lexical_svd_u_vecs = svd_U.LexicalSvdU(self.revs, self.labels).get_lexical_svd_u_vecs()
sf = SupervisedFeatures(self.revs, self.labels)
self.delta_idf_scores = sf.delta_idf_scores
four_delta_idf_vecs = sf.get_4_delta_idf_vecs()
all_vecs = [corpus_svd_u_vecs, lexical_svd_u_vecs, four_delta_idf_vecs]
return self.get_combination_of_3_approaches(all_vecs)
def get_all_words(self, diff_approach_vecs):
"""
This helper method obtains all words from embeddings dictionary.
:param diff_approach_vecs: A list containing embeddings of different types.
:type diff_approach_vecs: list
:return: All words that are taken into account in this stage.
:rtype: set
"""
all_words = set([word for one_approach_vecs in diff_approach_vecs for word, vec in one_approach_vecs.items()])
return all_words
def rev2avg_vec(rev, word_vecs):
"""
This function takes the average of the words appearing in a review.
This will later be used as an input feature in the classification stage.
:param rev: A review.
:type rev: list
:param word_vecs: All word vectors generated based on corpora or lexicons or both.
:type word_vecs: dict
:return: Averaged vector representing a review.
:rtype: list
"""
vec_dim_size = len(list(word_vecs.values())[0])
res_vec = np.random.uniform(-1.0, 1.0, vec_dim_size)
for word in rev:
vec = np.random.uniform(-1.0, 1.0, vec_dim_size) \
if word not in word_vecs else np.array(word_vecs[word])
res_vec = np.add(vec, res_vec)
res_vec[(res_vec == np.inf) | (res_vec == -np.inf)] = 0
res_vec = res_vec / len(rev)
return res_vec.tolist()
def revs2avg_vecs(revs, word_vecs):
"""
The list containing all the averaged embeddings of all the reviews in the corpus.
:param revs: Corpus reviews.
:type revs: list or ndarray
:param word_vecs: All word vectors generated based on corpora or lexicons or both.
:type word_vecs: dict
:return: The averaged embeddings each representing a review.
:rtype: list
"""
return [rev2avg_vec(rev, word_vecs) for rev in revs]
if __name__ == "__main__":
pass