## get_term_maps.py
import os
import re
attribute_value_from_fact = re.compile(r'([A-Z0-9_]+) *[=] *((["]+[^"]*["])|([0-9]+))',re.I)
## from term_utilities in Termolator
def read_in_term_line(line):
    ## parse one line of ATTRIBUTE=value pairs into a dict;
    ## quoted values become strings and bare digit strings become ints
    out_dict = {}
    for att_value_pattern in attribute_value_from_fact.finditer(line):
        kv = att_value_pattern.group(0)
        equal_position = kv.index('=')
        ## strip so that spaces around '=' (which the regex tolerates)
        ## cannot leak into keys or defeat the quote check below
        key = kv[:equal_position].strip()
        value = kv[equal_position+1:].strip()
        if re.search('^".*"$',value):
            value = value.strip('"')
        elif re.search('^[0-9]+$',value):
            value = int(value)
        out_dict[key] = value
    return(out_dict)
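## A hedged example of what read_in_term_line yields; the exact attribute
## inventory of a .terms line is an assumption based on the keys consumed
## later in this file (string, lemma, head_term, start, end):
##   read_in_term_line('string="cell line" start=10 end=19')
##   => {'string': 'cell line', 'start': 10, 'end': 19}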
def short_file(file_string,input_path_prefix=False):
    ## reduce a file path to a short name by stripping either the given
    ## input_path_prefix or everything up to the last os.sep, plus the
    ## final extension; re.escape keeps path characters from being read
    ## as regex metacharacters
    if input_path_prefix:
        if not input_path_prefix.endswith(os.sep):
            input_path_prefix += os.sep
        first_slash_pattern = re.compile(re.escape(input_path_prefix))
        last_slash_pattern = False
    else:
        last_slash_pattern = re.compile(re.escape(os.sep)+'[^'+re.escape(os.sep)+']*$')
        first_slash_pattern = False
    last_dot_pattern = re.compile(r'\.[^.]*$')
    if last_slash_pattern:
        last_slash_match = last_slash_pattern.search(file_string)
        first_slash_match = False
    elif first_slash_pattern:
        first_slash_match = first_slash_pattern.search(file_string)
        last_slash_match = False
    last_dot_match = last_dot_pattern.search(file_string)
    if first_slash_match and last_dot_match:
        return(file_string[first_slash_match.end():last_dot_match.start()])
    elif first_slash_match:
        return(file_string[first_slash_match.end():])
    elif last_slash_match and last_dot_match and (last_slash_match.start()<last_dot_match.start()):
        return(file_string[last_slash_match.start()+1:last_dot_match.start()])
    elif last_slash_match:
        return(file_string[last_slash_match.start()+1:])
    elif (not last_slash_match) and last_dot_match:
        ## last_slash_match is only used if it precedes last_dot_match
        return(file_string[:last_dot_match.start()])
    else:
        return(file_string)
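## Sketch of the intended behavior (paths are hypothetical):
##   short_file('/data/docs/a01.terms')                            => 'a01'
##   short_file('/data/docs/a01.terms', input_path_prefix='/data') => 'docs/a01'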
def combine_path_and_file(path,file):
    if path.endswith(os.sep):
        return(path+file)
    else:
        return(path+os.sep+file)
def get_subsequence_strings(string_list,lemma_list,length):
    ## return every length-word subsequence as a [string_ngram,lemma_ngram]
    ## pair; lemma_ngram is False when no aligned lemma list is available
    output = []
    for index in range(len(string_list)-length+1):
        if lemma_list == False:
            output.append([" ".join(string_list[index:index+length]),False])
        else:
            output.append([" ".join(string_list[index:index+length])," ".join(lemma_list[index:index+length])])
    return(output)
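## For example:
##   get_subsequence_strings(['lung','cancer','cells'], False, 2)
##   => [['lung cancer', False], ['cancer cells', False]]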
def preposition_member(string_list):
    if ('of' in string_list) or ('for' in string_list):
        return(True)
    else:
        return(False)
closed_class_stop_words = ['a','the','an','and','or','but','about','above','after','along','amid','among',
                           'as','at','by','for','from','in','into','like','minus','near','of','off','on',
                           'onto','out','over','past','per','plus','since','till','to','under','until','up',
                           'via','vs','with','that','can','cannot','could','may','might','must',
                           'need','ought','shall','should','will','would','have','had','has','having','be',
                           'is','am','are','was','were','being','been','get','gets','got','gotten',
                           'getting','seem','seeming','seems','seemed',
                           'enough','both','all','your','those','this','these',
                           'their','the','that','some','our','no','neither','my',
                           'its','his','her','every','either','each','any','another',
                           'an','a','just','mere','such','merely','right','no','not',
                           'only','sheer','even','especially','namely','as','more',
                           'most','less','least','so','enough','too','pretty','quite',
                           'rather','somewhat','sufficiently','same','different','such',
                           'when','why','where','how','what','who','whom','which',
                           'whether','why','whose','if','anybody','anyone','anyplace',
                           'anything','anytime','anywhere','everybody','everyday',
                           'everyone','everyplace','everything','everywhere','whatever',
                           'whenever','wherever','whichever','whoever','whomever','he',
                           'him','his','her','she','it','they','them','its','their','theirs',
                           'you','your','yours','me','my','mine','I','we','us','much','and/or'
                           ]
def OK_substring(substring):
    ## filter out substrings that are bare closed-class stop words
    if not substring in closed_class_stop_words:
        return(True)
    else:
        return(False)
def get_term_substrings(string,lemma):
    global lemma_dict
    string_list = string.split()
    if (string in lemma_dict) or (preposition_member(string_list)):
        ## these are cases where substrings of lemmas will not align
        ## with substrings of string
        lemma_list = False
    elif (lemma == string):
        lemma_list = False
    elif lemma == False:
        lemma_list = False
    else:
        lemma_list = lemma.split()
        if (len(lemma_list) == len(string_list)):
            lemma_list = False
    output = []
    for length in range(1,len(string_list)):
        ## longest substring is one word shorter than the full term;
        ## shortest is a single word
        for substring_pair in get_subsequence_strings(string_list,lemma_list,length):
            ## test the string member of the [string,lemma] pair against
            ## the stop list (testing the pair itself never matches)
            if OK_substring(substring_pair[0]):
                output.append(substring_pair)
    return(output)
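## Sketch, assuming 'lung cancer cells' is not itself in lemma_dict and no
## lemma is supplied:
##   get_term_substrings('lung cancer cells', False)
##   => [['lung', False], ['cancer', False], ['cells', False],
##       ['lung cancer', False], ['cancer cells', False]]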
def update_term_dict(short_file_name,term_dict,string,lemma,start,end,head_term,merge_super_string=False,all_keys=False,substring_of=False):
    ## record one instance of a term under its lemma key; head_term,
    ## merge_super_string and all_keys are currently unused, and the
    ## function always returns None
    global lemma_dict
    if lemma and (lemma in term_dict):
        entry = term_dict[lemma]
        if 'instances' in entry:
            entry['instances'].append([short_file_name,start,end,False])
        else:
            entry['instances'] = [[short_file_name,start,end,False]]
        if (not 'variants' in entry):
            entry['variants'] = [lemma]
        if (not lemma == string) and (not string in entry['variants']):
            entry['variants'].append(string)
    elif lemma:
        entry = {}
        entry['instances'] = [[short_file_name,start,end,False]]
        entry['variants'] = [lemma]
        if not string in entry['variants']:
            entry['variants'].append(string)
        term_dict[lemma] = entry
    else:
        entry = False
    if entry and substring_of:
        if not 'substring_of' in entry:
            entry['substring_of'] = [substring_of]
        elif not substring_of in entry['substring_of']:
            entry['substring_of'].append(substring_of)
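## Shape of a term_dict entry built up by update_term_dict (field layout
## read off the code above; the term itself is illustrative):
##   term_dict['cell line'] = {
##       'instances': [['a01', 10, 19, False], ...],  ## [file, start, end, is_substring]
##       'variants': ['cell line', 'cell lines'],
##       'substring_of': ['cell line clone'],  ## only present for substring hits
##   }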
def count_files(instances):
    files = []
    for tfile,start,end,is_substring in instances:
        if not tfile in files:
            files.append(tfile)
    return(len(files))
def get_term_maps(term_list,file_list,outfile,input_path_prefix,remove_mismatches=True,minimum=3,json=True):
    ## don't use abbr dict for now, but could in the future
    ## term_list consists of input terms (tab-separated variants, base form first)
    ## file_list consists of .terms files
    ## json is currently unused; output is written in an XML-like format
    global lemma_dict
    term_dict = {}
    lemma_dict = {}
    keys = []
    with open(term_list) as instream:
        for line in instream:
            line = line.strip(os.linesep).lower()
            forms = line.split('\t')
            base = forms[0]
            ## if any form on this line is already mapped, merge the whole
            ## line into that existing base
            reset_entry = False
            for form in forms:
                if (not reset_entry) and (form in lemma_dict):
                    reset_entry = True
                    base = lemma_dict[form]
            for form in forms:
                lemma_dict[form] = base
            if not base in keys:
                keys.append(base) ## maintain same order for print out
                ## does not repeat bases
    with open(file_list) as liststream:
        ## first pass does whole terms;
        ## second pass (below) does substrings
        infile_list = []
        for infile in liststream:
            infile = infile.strip(os.linesep)
            infile = combine_path_and_file(input_path_prefix,infile)
            infile_list.append(infile)
            with open(infile) as instream:
                short_file_name = short_file(infile,input_path_prefix=input_path_prefix)
                for line in instream:
                    line = line.strip(os.linesep).lower()
                    entry = read_in_term_line(line)
                    if not 'string' in entry:
                        continue ## skip blank or malformed lines
                    string = entry['string']
                    if string in lemma_dict:
                        lemma = lemma_dict[string]
                    elif ('lemma' in entry) and (entry['lemma'] in lemma_dict):
                        lemma = lemma_dict[entry['lemma']]
                    else:
                        ## AM debug 8/3/2020: fall back to the string itself
                        lemma = string
                    if 'head_term' in entry:
                        head_term = entry['head_term']
                    else:
                        head_term = False
                    start = entry['start']
                    end = entry['end']
                    update_term_dict(short_file_name,term_dict,string,lemma,start,end,head_term,merge_super_string=remove_mismatches)
    ## second pass: record substrings of each term occurrence
    for infile in infile_list:
        with open(infile) as instream:
            short_file_name = short_file(infile,input_path_prefix=input_path_prefix)
            for line in instream:
                line = line.strip(os.linesep).lower()
                entry = read_in_term_line(line)
                if not 'string' in entry:
                    continue ## skip blank or malformed lines
                string = entry['string']
                if ('lemma' in entry) and (entry['lemma'] in lemma_dict):
                    lemma = lemma_dict[entry['lemma']]
                elif string in lemma_dict:
                    lemma = lemma_dict[string]
                elif 'lemma' in entry:
                    lemma = entry['lemma']
                else:
                    lemma = False
                if 'head_term' in entry:
                    head_term = entry['head_term']
                else:
                    head_term = False
                start = entry['start']
                end = entry['end']
                for substring,lemma_substring in get_term_substrings(string,lemma):
                    ## update_term_dict currently returns None, so the
                    ## trade_key branches below never fire
                    trade_key = update_term_dict(short_file_name,term_dict,substring,lemma_substring,start,end,head_term,substring_of=lemma)
                    if trade_key and (substring in keys):
                        position = keys.index(substring)
                        keys[position] = trade_key
                    elif trade_key and (lemma_substring in keys):
                        position = keys.index(lemma_substring)
                        keys[position] = trade_key
    with open(outfile,'w') as outstream:
        rank = 0
        ## do not repeat (repeats can arise due to substring terms)
        done = set()
        for key in keys[:]:
            rank = rank+1
            if key in term_dict:
                entry = term_dict[key]
            elif (key in lemma_dict) and (key != lemma_dict[key]):
                ## never happens ??
                entry = {}
                new_key = lemma_dict[key]
                if new_key in keys:
                    pass
                else:
                    print('key not found:',key)
            else:
                done.add(key)
                entry = {}
                ## falls through and is skipped below
            if not key in done:
                start_string = '<term string="'+key+'" rank='+str(rank)
                if (not 'variants' in entry) and (not 'substring_of' in entry):
                    if remove_mismatches:
                        pass
                    else:
                        entry['hypothetical'] = True
                        start_string += ' hypothetical_term="'+key+'"'
                    skip = True
                else:
                    skip = False
            else:
                ## key was already written out as a variant of an earlier term
                start_string = ''
                skip = True
            if 'instances' in entry:
                number_of_files = count_files(entry['instances'])
            else:
                number_of_files = 0
                skip = True
            if not skip:
                start_string += ' number_of_files_containing_term='+str(number_of_files)
                length = len(entry['instances'])
                start_string += ' total_frequency='+str(length)
                if 'variants' in entry:
                    start_string += ' variants="'
                    for variant in entry['variants']:
                        start_string += variant+'|'
                        if not variant in done:
                            done.add(variant)
                    ## remove final pipe and close the quoted attribute
                    start_string = start_string[:-1]+'"'
                if 'substring_of' in entry:
                    start_string += ' substring_of="'
                    for variant in entry['substring_of']:
                        start_string += variant+'|'
                    start_string = start_string[:-1]+'"'
            if minimum and (number_of_files<minimum):
                start_string = ''
                skip = True
            elif skip and remove_mismatches:
                start_string = ''
            else:
                start_string += '>'
                done.add(key)
            ## skipped terms still produce a blank line in the output
            outstream.write(start_string+'\n')
            if not skip:
                for tfile,start,end,is_substring in entry['instances']:
                    if is_substring:
                        instance_string = '<instance file="'+tfile+'" start='+str(start)+' end='+str(end)+' is_substring="True"/>\n'
                    else:
                        instance_string = '<instance file="'+tfile+'" start='+str(start)+' end='+str(end)+'/>\n'
                    outstream.write(instance_string)
            if skip and remove_mismatches:
                pass
            else:
                outstream.write('</term>\n')
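
if __name__ == '__main__':
    ## Hedged usage sketch: the file names below are hypothetical, not part
    ## of this module. term_list is a file of tab-separated term variants
    ## (base form first, one term per line); file_list names the .terms
    ## files to read, relative to input_path_prefix.
    get_term_maps('input_terms.txt', 'terms_file_list.txt',
                  'term_maps.out', '/path/to/terms/files')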