# coding: utf-8
import logging
import re
from collections import namedtuple
from random import shuffle

import numpy as np
import pandas as pd

import gensim.models.doc2vec
from gensim.models import Doc2Vec

assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

from utility import *

logging.basicConfig(format='%(asctime)s %(filename)s [%(levelname)s] %(message)s',
                    level=logging.INFO)
########################################
# FEATURE: keyword frequency
#
# Calculate the frequency of keywords.
#
########################################
def get_keyword_list():
    """
    Return the keyword list, containing keywords extracted with the
    TF-IDF method, heuristic keywords, and the negations of both.
    """
    key_adj = pd.read_csv('data/tfidf_by_class_adj.csv', error_bad_lines=False)
    key_adj = key_adj[['class', 'tfidf', 'word']]
    key_adv = pd.read_csv('data/tfidf_by_class_adv.csv', error_bad_lines=False)
    key_adv = key_adv[['class', 'tfidf', 'word']]
    # keep only words whose TF-IDF score reaches the threshold
    threshold = 0.18
    keywords_adj = key_adj[key_adj['tfidf'] >= threshold]
    keywords_adv = key_adv[key_adv['tfidf'] >= threshold]
    keywords = pd.concat([keywords_adj, keywords_adv], axis=0).reset_index(drop=True)
    keywords_list = list(keywords['word'].unique())
    # add heuristic keywords
    keywords_heuristic = [
        'gain', 'loss', 'switch', 'homozygous', 'neutral', 'exon',
        'functional', 'oncogenic', 'resistant', 'recurrent', 'vus',
        'deleterious', 'nick', 'variant', 'primary', 'sensitive',
        'neratinib', 'transgenic', 'truncated', 'aberrant', 'hematopoietic',
        'alternative', 'very', 'still', 'predominantly', 'probably', 'transiently',
        'alone', 'potentially', 'functionally', 'fully', 'effectively', 'possibly',
        'likely', 'largely', 'importantly', 'stably', 'constitutively', 'poorly',
        'differentially', 'directly'
    ]
    keywords_list.extend(keywords_heuristic)
    keywords_list = list(np.unique(keywords_list))
    # add the negation ('not <word>') of every keyword
    keywords_neg = ['not {word}'.format(word=word) for word in keywords_list]
    keywords_list.extend(keywords_neg)
    return np.array(keywords_list)
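# Illustrative sketch (toy mini-list, not the real keyword data) of the
# negation expansion performed above: every keyword also contributes a
# 'not <word>' entry, so negated mentions can be counted separately.
_demo_keys = ['gain', 'loss']
_demo_keys = _demo_keys + ['not {word}'.format(word=w) for w in _demo_keys]
# _demo_keys == ['gain', 'loss', 'not gain', 'not loss']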
def count_key_freq_paragraph(sen, keyword_list):
    '''
    For a given block of text around where the variation name appears,
    return the frequency of each keyword on keyword_list (occurrence
    count normalized by text length) as a numpy array.
    '''
    vec = np.zeros(len(keyword_list))
    if len(sen) == 0:
        return vec
    for j in range(vec.shape[0]):
        pattern = keyword_list[j]  # keywords are plain words, used directly as regex patterns
        vec[j] = len(re.findall(pattern, sen))
    vec /= len(sen)  # normalize by the character length of the text
    return vec
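# A quick illustrative check (toy data, not part of the pipeline): each entry
# of the returned vector is the match count of a keyword divided by the text
# length in characters.
_toy = count_key_freq_paragraph('gain of function, not gain', ['gain', 'not gain'])
# 'gain' matches twice and 'not gain' once in this 26-character string,
# so _toy is [2/26, 1/26].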
def get_feat_keyfrq_paragraph(table, keyword_list, pre=100, post=300):
    '''
    For a given table of the training/testing dataset and a keyword list,
    return the keyword-frequency feature as a numpy array. For each data
    point, the text windows around every occurrence of the variation and
    gene names are concatenated and the keyword frequencies are computed
    over that concatenation.
    '''
    feat_keyfrq = np.zeros([table.shape[0], len(keyword_list)])
    for i in range(feat_keyfrq.shape[0]):
        text = table['Text'].iloc[i]
        var = table['Variation'].iloc[i][:-1]  # drop the last char so near-matches of the variation are found
        gene = table['Gene'].iloc[i]
        paragraph = []
        for match_V in re.finditer(var, text):
            s = match_V.start()
            e = match_V.end()
            # clamp the window start at 0 so a match near the beginning of
            # the text does not produce a negative slice index
            paragraph.append(text[max(s - pre, 0):e + post])
        for match_G in re.finditer(gene, text):
            s = match_G.start()
            e = match_G.end()
            paragraph.append(text[max(s - pre, 0):e + post])
        gv_sen = ''.join(paragraph)
        feat_keyfrq[i] = count_key_freq_paragraph(gv_sen, keyword_list)
    return feat_keyfrq
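# A toy illustration (hypothetical strings) of the character windowing used
# above: the slice keeps `pre` characters before the match and `post` after
# it, clamped at the start of the text.
_m = next(re.finditer('V600E', 'the V600E mutant'))
_window = 'the V600E mutant'[max(_m.start() - 5, 0):_m.end() + 6]
# with pre=5 and post=6 this yields 'the V600E mutan'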
# get the keyword frequency feature for the stage-2 training and testing datasets
train2, y = get_stage2_train_data()
test_all = get_stage2_test_data()
test2 = test_all[(test_all['real'] == True) & (test_all['in_stage1'] == False)]
keywords_list = get_keyword_list()
feat_keyfreq_train2 = get_feat_keyfrq_paragraph(train2, keywords_list)
feat_keyfreq_test2 = get_feat_keyfrq_paragraph(test2, keywords_list)
np.save('features/feat_keyfreq_train2.npy', feat_keyfreq_train2)
np.save('features/feat_keyfreq_test2.npy', feat_keyfreq_test2)
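# Downstream consumers can restore the saved features with np.load
# (hypothetical usage sketch):
# feat = np.load('features/feat_keyfreq_train2.npy')  # shape (n_rows, n_keywords)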
############################################
# FEATURE: d2v
#
# Train a Doc2Vec model using gensim,
# and obtain document vectors as features.
############################################
def table2lbldoc(table, pre_char, post_char):
    '''Transform a table into labelled docs.
    For each data point in the given table of the training/testing dataset,
    take a window of pre_char characters before and post_char characters
    after every occurrence of the variation name in the text, and collect
    the words in those windows as a LabelDoc. Returns the list of LabelDocs
    for the successfully parsed data points, and a list of indices of the
    dropouts whose variation name could not be found in the text.
    '''
    # heuristic stopwords ('wa' covers 'was' after the de-pluralization below)
    stopwords = ['wa', 'table', 'fig', 'figure', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'at',
                 'and', 'were', 'to', 'or', 'with', '', 'a', 'am', 'an', 'are', 'be', 'been', 'being',
                 'by', 'for', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'i',
                 'in', 'is', 'me', 'my', 'mine', 'of', 'our', 'ours', 'own', 'she', 'the', 'their',
                 'them', 'themselves', 'there', 'us', 'was', 'we', 'who', 'whoever', 'whom',
                 'whose', 'you', 'your', 'yours', 'yourself', 'yourselves']
    LabelDoc = namedtuple('LabelDoc', 'words tags')
    all_docs = []
    count = 0
    count_drop_V = 0
    drop_list = []
    # variation only; no stemming; only manual de-pluralization
    for i in range(table.shape[0]):
        paragraph_V = []
        tag = []
        text = table['Text'].iloc[i]
        pattern_V = table['Variation'].iloc[i][1:-1]  # ignore the first and last char of the variation name
        for match_V in re.finditer(pattern_V, text):
            s = match_V.start()
            e = match_V.end()
            # clamp the window start at 0 to avoid a negative slice index
            sentence = text[max(s - pre_char, 0):e + post_char]
            word_list = sentence.split(' ')[1:-1]  # drop the (likely partial) words at the window edges
            # de-pluralize: strip a trailing 's'
            for j, item in enumerate(word_list):
                if len(item) > 0 and item[-1] == 's':
                    word_list[j] = item[:-1]
            paragraph_V.append(word_list)
        # flatten and remove stopwords
        paragraph_V = [item for items in paragraph_V for item in items if item not in stopwords]
        # drop data points whose variation yields fewer than 3 words
        if len(paragraph_V) < 3:
            count_drop_V += 1
            drop_list.append(i)
            continue
        else:
            tag.append(count)
            count += 1
            all_docs.append(LabelDoc(paragraph_V, tag))
    print('total number of drops by variation: ', count_drop_V)
    return all_docs, drop_list
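# A minimal sketch (hypothetical tokens) of the structure table2lbldoc
# returns: each element is a (words, tags) namedtuple, and gensim learns one
# document vector per integer tag.
_DemoDoc = namedtuple('DemoDoc', 'words tags')
_demo_doc = _DemoDoc(words=['mutation', 'confer', 'resistance'], tags=[0])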
# build the combined dataframe for training & true testing: df_all_doc2vec
col_list = ['Gene', 'Variation', 'Text']
df_all_doc2vec = pd.concat([train2[col_list], test2[col_list]], axis=0).reset_index()
# transform all data points into labelled docs
pre_char = 100
post_char = 300
lbldoc_all, drop_ind_all = table2lbldoc(df_all_doc2vec, pre_char, post_char)
# copy for reshuffling during training
doc_list = lbldoc_all[:]
# split drop_ind_all into the training and testing drop lists
drop_ind_train2 = [e for e in drop_ind_all if e < train2.shape[0]]
drop_ind_test2 = [e for e in drop_ind_all if e >= train2.shape[0]]
# build the model and its vocabulary
# (gensim < 4.0 API: `size`, `model.iter` and `docvecs.doctag_syn0` below
# were renamed in gensim 4.x)
model = Doc2Vec(dm=1, size=100, window=5, negative=5, hs=0, min_count=2)
model.build_vocab(lbldoc_all)
# train the model with manual learning-rate decay, reshuffling each epoch
loop = 30
for epoch in range(loop):
    shuffle(doc_list)  # shuffling gets better results
    model.train(doc_list, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay within the epoch
model.save('models/model.doc2vec')
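# A minimal reuse sketch (gensim < 4.0 API, matching the calls above; the
# token list is hypothetical; kept commented out so the pipeline run is
# unchanged): reload the saved model and infer a vector for an unseen,
# pre-tokenized paragraph.
# loaded = Doc2Vec.load('models/model.doc2vec')
# vec = loaded.infer_vector(['egfr', 'mutation', 'resistance'])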
# generate the d2v feature vectors for the training and true testing datasets;
# dropped data points keep an all-zero vector
doc2vec_raw = model.docvecs.doctag_syn0
feat_doc2vec_train2 = np.zeros([train2.shape[0], doc2vec_raw.shape[1]])
count_take_train2 = 0
for i in range(train2.shape[0]):
    if i not in drop_ind_train2:
        feat_doc2vec_train2[i] = doc2vec_raw[count_take_train2]
        count_take_train2 += 1
feat_doc2vec_test2 = np.zeros([test2.shape[0], doc2vec_raw.shape[1]])
count_take_test2 = count_take_train2
for i in range(test2.shape[0]):
    if i + train2.shape[0] not in drop_ind_test2:
        feat_doc2vec_test2[i] = doc2vec_raw[count_take_test2]
        count_take_test2 += 1
np.save('features/feat_doc2vec_train2.npy', feat_doc2vec_train2)
np.save('features/feat_doc2vec_test2.npy', feat_doc2vec_test2)