-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathword_vec_extractor.py
130 lines (89 loc) · 2.76 KB
/
word_vec_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def extract_word_vecs_list(list_toktextdatas, embeddingfile, dim):
    """Load embedding vectors for every token that occurs in several datasets.

    The vocabulary is the union of all tokens in all documents of all
    datasets.  The embedding file is then streamed once, line by line, and
    only the vectors of words in that vocabulary are kept.

    Args:
        list_toktextdatas: Sequence of datasets; each dataset is an iterable
            of documents, and each document is an iterable of (hashable)
            tokens.
        embeddingfile: Path to a text file with one entry per line in the
            form ``word v1 v2 ... vN`` (whitespace separated).
        dim: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A dict mapping each vocabulary word found in the file to its vector
        as a list of floats.  If a word appears on several lines, the last
        occurrence wins.

    Raises:
        OSError: If the embedding file cannot be opened.
        ValueError: If a kept line contains a non-numeric component.
    """
    print("list_toktextdatas", len(list_toktextdatas))

    terms = set()
    for toktextdatas in list_toktextdatas:
        for word_tokens in toktextdatas:
            terms.update(word_tokens)
    print("terms length", len(terms))

    termsVectorsDic = {}
    # Stream the file instead of readlines(): embedding files are usually far
    # larger than the subset we need, and `with` guarantees the handle is
    # closed even if parsing fails.  The platform default encoding is kept.
    with open(embeddingfile, "r") as embedding_fh:
        for line_no, vecline in enumerate(embedding_fh, start=1):
            if line_no % 100000 == 0:
                print(line_no)  # coarse progress indicator
            fields = vecline.split()
            # Lines with fewer than 20 fields (blank lines, a "count dim"
            # header, truncated rows) are ignored.
            if len(fields) < 20:
                continue
            word = fields[0]
            if word in terms:
                termsVectorsDic[word] = [float(x) for x in fields[1:]]

    print("termsVectorsDic length", len(termsVectorsDic))
    return termsVectorsDic
def extract_word_vecs(toktextdatas, embeddingfile, dim):
    """Load embedding vectors for every token that occurs in one dataset.

    The vocabulary is the set of all tokens in all documents.  The embedding
    file is then streamed once, line by line, and only the vectors of words
    in that vocabulary are kept.

    Args:
        toktextdatas: Sequence of documents; each document is an iterable of
            (hashable) tokens.
        embeddingfile: Path to a text file with one entry per line in the
            form ``word v1 v2 ... vN`` (whitespace separated).
        dim: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A dict mapping each vocabulary word found in the file to its vector
        as a list of floats.  If a word appears on several lines, the last
        occurrence wins.

    Raises:
        OSError: If the embedding file cannot be opened.
        ValueError: If a kept line contains a non-numeric component.
    """
    print("toktextdatas", len(toktextdatas))

    terms = set()
    for word_tokens in toktextdatas:
        terms.update(word_tokens)
    print("terms length", len(terms))

    termsVectorsDic = {}
    # Stream the file instead of readlines(): embedding files are usually far
    # larger than the subset we need, and `with` guarantees the handle is
    # closed even if parsing fails.  The platform default encoding is kept.
    with open(embeddingfile, "r") as embedding_fh:
        for line_no, vecline in enumerate(embedding_fh, start=1):
            if line_no % 100000 == 0:
                print(line_no)  # coarse progress indicator
            fields = vecline.split()
            # Lines with fewer than 20 fields (blank lines, a "count dim"
            # header, truncated rows) are ignored.
            if len(fields) < 20:
                continue
            word = fields[0]
            if word in terms:
                termsVectorsDic[word] = [float(x) for x in fields[1:]]

    print("termsVectorsDic length", len(termsVectorsDic))
    return termsVectorsDic
def populateTermVecs(terms, embeddingfile, dim):
    """Load embedding vectors for the given words from a text embedding file.

    The embedding file is streamed once, line by line, and only the vectors
    of words contained in ``terms`` are kept.

    Args:
        terms: Collection of words to look up; anything supporting ``in``.
            A list or tuple is converted to a set once so that the per-line
            membership test is not linear in the vocabulary size.
        embeddingfile: Path to a text file with one entry per line in the
            form ``word v1 v2 ... vN`` (whitespace separated).
        dim: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A dict mapping each requested word found in the file to its vector
        as a list of floats.  If a word appears on several lines, the last
        occurrence wins.

    Raises:
        OSError: If the embedding file cannot be opened.
        ValueError: If a kept line contains a non-numeric component.
    """
    wanted = terms
    if isinstance(terms, (list, tuple)):
        try:
            wanted = set(terms)
        except TypeError:
            # Unhashable entries: keep the original sequence so membership
            # behaves exactly as before, just without the speed-up.
            wanted = terms

    termsVectorsDic = {}
    # Stream the file instead of readlines(): embedding files are usually far
    # larger than the subset we need, and `with` guarantees the handle is
    # closed even if parsing fails.  The platform default encoding is kept.
    with open(embeddingfile, "r") as embedding_fh:
        for line_no, vecline in enumerate(embedding_fh, start=1):
            if line_no % 100000 == 0:
                print(line_no)  # coarse progress indicator
            fields = vecline.split()
            # Lines with fewer than 20 fields (blank lines, a "count dim"
            # header, truncated rows) are ignored.
            if len(fields) < 20:
                continue
            word = fields[0]
            if word in wanted:
                termsVectorsDic[word] = [float(x) for x in fields[1:]]

    print("termsVectorsDic length", len(termsVectorsDic))
    return termsVectorsDic