-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatch_hmdb.py
330 lines (282 loc) · 14.2 KB
/
match_hmdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# -*- coding: utf-8 -*-
"""
Created on Mon May 02 10:31:09 2016
@author: Claire
"""
from lxml import etree
import pandas as pd
import argparse
## This searches all of a downloaded HMDB xml file for each of your input masses
## things this code needs to do:
##1. Read the entire HMDB into a dictionary
#1.1 Fields of interest: HMDB ID, mass, name, synonyms, biofluid, biofunction
#2. Convert mz's to neutral masses, using golden dictionary
#2.1 probably should do both -H nd +Cl
#3. Search all masses against the masses in HMDB
#4. Write results to a file
def parse_HMDB(xml_file):
""" Parses a cleaned up HMDB xml file into a dictionary.
Dictionary contains: {HMDB_ID: {'neutral_mass': m, 'name': name,
'synonyms': [list of names], 'biofluid': [list of fluids],
'biofunction': [list of functions]}, ...}
Input: xml_file: this file should be cleaned up using
remove_excess_xml_declarations.py
Why? Because downloading all HMDB metabolites actually downloads
45,000 separate xml files. Will concatenate into one file, but need to
remove new XML declarations.
Output: dictionary
"""
outdict = {}
tree = etree.iterparse(xml_file, tag='metabolite')
counter = 0
for event, elem in tree:
counter += 1
if counter % 2000 == 0:
print('Reading {}th metabolite from HMDB'.format(counter))
tmpdict = {}
tmpdict['name'] = elem.findtext('name')
if elem.findtext('monisotopic_moleculate_weight'):
tmpdict['neutral_mass'] = elem.findtext('monisotopic_moleculate_weight')
else:
tmpdict['neutral_mass'] = '0.0'
tmpdict['synonyms'] = [
i.text for i in elem.find('synonyms').findall('synonym')]
tmpdict['biofluids'] = [
i.text for i in elem.find('biofluid_locations').findall('biofluid')]
# ''.join(i.text.split('\t')) fixes bug related to Clavulanate,
# which seems to have tab characters in its biofunctions
tmpdict['biofunctions'] = [
''.join(i.text.split('\t')) for i in
elem.find('ontology').find('biofunctions').findall('biofunction')]
tmpdict['description'] = elem.findtext('description')
# Get taxonomy info, substituents, and alternative_parents.
# Not all HMDB metabolites have this info
try:
tmpdict['kingdom'] = elem.find('taxonomy').findtext('kingdom')
except:
tmpdict['kingdom'] = ''
try:
tmpdict['super_class'] = elem.find('taxonomy').findtext('super_class')
except:
tmpdict['super_class'] = ''
try:
tmpdict['class'] = elem.find('taxonomy').findtext('class')
except:
tmpdict['class'] = ''
try:
tmpdict['sub_class'] = elem.find('taxonomy').findtext('sub_class')
except:
tmpdict['sub_class'] = ''
try:
tmpdict['molecular_framework'] = elem.find('taxonomy').findtext('molecular_framework')
except:
tmpdict['molecular_framework'] = ''
try:
tmpdict['alternative_parents'] = [i.text for i in elem.find('taxonomy').find('alternative_parents').findall('alternative_parent')]
except:
tmpdict['alternative_parents'] = ['']
try:
tmpdict['substituents'] = [i.text for i in elem.find('taxonomy').find('substituents').findall('substituent')]
except:
tmpdict['substituents'] = ['']
# Find diseases and pathways. Note that pathways also can have different IDs like KEGG
try:
tmpdict['diseases'] = [i.findtext('name') for i in elem.find('diseases').findall('disease')]
except:
tmpdict['diseases'] = ['']
try:
tmpdict['pathways'] = [i.findtext('name') for i in elem.find('pathways').findall('pathway')]
except:
tmpdict['pathways'] = ['']
outdict[elem.findtext('accession')] = tmpdict
elem.clear()
return outdict
def extract_mzs(fname, colname=None, indexname=None, sep='\t', header=0):
""" Extracts the mz's from a file containing mz's in a column
named colname. Each mz is labeled with values in column labeled indexname.
sep indicates the separator for columns in file fname. Default is tab-delimited.
If header=None, file assumes no header row. Else it reads in the first row as the header.
If file only contains neutral masses (i.e. it's not a dataframe) and
colname = None, just reads in the file with each mz on a newline, and labels
each mz with a counter.
"""
## Read in mz's from file
if not colname:
with open(fname, 'r') as f:
mzs = f.readlines()
mzs = [m.strip() for m in mzs]
else:
feattable = pd.read_csv(fname, sep=sep)
mzs = list(feattable[colname])
# Read in mz labels from file
if not colname and not indexname:
mznames = [str(i) for i in range(0, len(mzs))]
else:
feattable = pd.read_csv(fname, sep=sep, index_col=0)
if not indexname:
mznames = list(feattable.index)
else:
mznames = list(feattable(indexname))
return mzs, mznames
def calculate_neutral_masses(mzs):
""" Calculates the neutral masses from a list of mz's provided.
Currently returns two lists of neutral masses: one corresponding to the
[M-H]- adduct and one to the [M+Cl]- adduct
"""
## Calculate neutral masses for both types of common neg ions.
# Numbers include electron masses
# [M-H]- adducts
minush = [float(i) + 1.007276 for i in mzs]
# [M+Cl]- adducts
pluscl = [float(i) - 34.969402 for i in mzs]
return minush, pluscl
def get_hmdb_hits(masses, hmdb_dict, ppm_tol):
""" For each mass in masses, scan the given hmdb dictionary for any hits within
ppm_tol tolerance.
INPUTS
masses = list of neutral masses
hmdb_dict = dictionary created by running parse_HMDB() on an HMDB xml file.
ppm_tol = maximum error tolerated to consider mass a "hit", in ppm's
OUTPUT
allhits = dictionary containing all the hits for each mass
dictionary contains: {mass: {hmdb_id: hmdb_dict}}
each hmdb_dict is as returned by parse_HMDB, and has as top key
the HMDB_ID. It also has one additional subkey, which is ppm.
"""
allhits = {}
count = 0
print('Getting hits for {} unique neutral masses'.format(len(set(masses))))
for m in set(masses):
count += 1
if count % 100 == 0:
print('Getting hits for the {}th neutral mass'.format(count))
mhits = [i for i in hmdb_dict if abs(float(hmdb_dict[i]['neutral_mass']) - float(m)) <= ppm_tol*float(m)/1e6]
allhits[m] = {i: hmdb_dict[i] for i in mhits}
# Add in ppm for each hit
for i in allhits[m]:
allhits[m][i]['ppm'] = str(abs(float(hmdb_dict[i]['neutral_mass']) - float(m))/float(m) * 1e6)
return allhits
def write_hits_to_file(adduct_type, neutral_masses, mzs, mznames, allhits, fout, overwrite=False, describe=False):
""" Write all of the hits for each mass for each adduct to fout.
INPUTS
adduct_type = list of adduct types of len(neutral_masses)
neutral_masses = list of neutral masses
mzs = list of mz's corresponding of len(neutral_masses). mzs[i] = neutral_masses[i] with adduct[i]
allhits = dictionary of HMDB hits. Keys are all of the unique neutral masses
allhits[neutral_masses[i]] = {hmdb_id: hmdb_dict[hmdb_id]}
describe = whether to include the HMDB description in the output file. This feature is buggy
and has weird formatting at times. Default is False
overwrite = whether to overwrite the fout or append to existing file fout
OUTPUTS
None.
Writes tab-delimited file to fout with each hit for each mz on its own line.
"""
if overwrite:
readtype = 'w'
else:
readtype = 'a'
with open(fout, readtype) as f:
if overwrite:
if describe:
f.write('\t'.join(['featname', 'mz', 'adduct', 'neutral_mass',
'hmdb_id', 'hmdb_name', 'monoisotopic_mass',
'ppm', 'synonyms', 'biofluids', 'biofunctions',
'description', 'kingdom', 'super_class',
'class', 'sub_class', 'molecular_framework',
'alternative_parents', 'substituents',
'pathways', 'diseases']) + '\n')
else:
f.write('\t'.join(['featname', 'mz', 'adduct', 'neutral_mass',
'hmdb_id', 'hmdb_name', 'monoisotopic_mass',
'ppm', 'synonyms', 'biofluids', 'biofunctions',
'kingdom', 'super_class',
'class', 'sub_class', 'molecular_framework',
'alternative_parents', 'substituents',
'pahways', 'diseases']) + '\n')
for a, nm, mz, mzname in zip(adduct_type, neutral_masses, mzs, mznames):
# If the neutral mass has an HMDB hit
if allhits[nm]:
for h_id in allhits[nm]:
# Assemble data from the dictionary
h_name = allhits[nm][h_id]['name'].encode('utf-8')
ppm = allhits[nm][h_id]['ppm']
mono_mass = allhits[nm][h_id]['neutral_mass']
syns = '; '.join(allhits[nm][h_id]['synonyms']).encode('utf-8')
fluids = '; '.join(allhits[nm][h_id]['biofluids']).encode('utf-8')
functions = '; '.join(allhits[nm][h_id]['biofunctions']).encode('utf-8')
kingdom = allhits[nm][h_id]['kingdom']
super_class = allhits[nm][h_id]['super_class']
cls = allhits[nm][h_id]['class']
sub_class = allhits[nm][h_id]['sub_class']
molec_framework = allhits[nm][h_id]['molecular_framework']
alt_parents = '; '.join(allhits[nm][h_id]['alternative_parents']).encode('utf-8')
substituents = '; '.join(allhits[nm][h_id]['substituents']).encode('utf-8')
pathways = '; '.join(allhits[nm][h_id]['pathways']).encode('utf-8')
diseases = '; '.join(allhits[nm][h_id]['diseases']).encode('utf-8')
if describe:
description = allhits[nm][h_id]['description'].encode('utf-8')
f.write('\t'.join([str(i) for i in [mzname, mz, a, nm,
h_id, h_name, mono_mass,
ppm, syns, fluids, functions,
description, kingdom,
super_class, cls,
sub_class, molec_framework,
alt_parents,
substituents,
pathways, diseases]]) + '\n')
else:
f.write('\t'.join([str(i) for i in [mzname, mz, a, nm,
h_id, h_name, mono_mass,
ppm, syns, fluids, functions,
kingdom,
super_class, cls,
sub_class, molec_framework,
alt_parents,
substituents,
pathways, diseases]]) + '\n')
else:
f.write('\t'.join([str(i) for i in [mzname, mz, a, nm]]) + '\n')
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-x', help='clean hmdb xml file [required, default: %(default)s]', default='data/hmdb_metabolites_clean.xml')
parser.add_argument('-f', help='input feature table. has column labeled mz and feature names in the first column [required, no default]', required=True)
parser.add_argument('-s', help='feature table separator. [default is tab-delmited]', default='\t')
parser.add_argument('-p', help='ppm tolerance. [default: %(default)s]', default=5, type=int)
parser.add_argument('-o', help='output file to write results to [required, no default]', required=True)
parser.add_argument('-d', help='whether to include description in output (may mess up formatting) [default: %(default)s]', default=False)
args = parser.parse_args()
hmdb_xml = args.x
feattable = args.f
sep = args.s
ppm_tolerance = args.p
outfile = args.o
describe = args.d
## 1. Parse HMDB xml
hmdb_dict = parse_HMDB(hmdb_xml)
# try:
# hmdb_dict = parse_HMDB(hmdb_xml)
# except:
# print('failed to parse hmdb')
## 2. Extract mz's from feature table file
try:
mzs, mznames = extract_mzs(feattable, colname='mz', indexname=None, sep=sep, header=0)
except:
print('failed to extract mzs from feature table')
## 3. Calculate neutral masses for each mz
print('Calculating neutral masses')
minush_nm, pluscl_nm = calculate_neutral_masses(mzs)
## 4. Get hits to HMDB, within a tolerance
print('Getting hits to HMDB metabolites, [M-H]-')
hits_h = get_hmdb_hits(minush_nm, hmdb_dict, ppm_tolerance)
print('Getting hits to HMDB metabolites, [M+Cl]-')
hits_cl = get_hmdb_hits(pluscl_nm, hmdb_dict, ppm_tolerance)
## 5. Write to file
print('Writing file ' + outfile)
# 5.1 [M-H]- adducts first
adducts = ['[M-H]-']*len(minush_nm)
newfile = True
write_hits_to_file(adducts, minush_nm, mzs, mznames, hits_h, outfile, overwrite=newfile, describe=describe)
# [M+Cl]- next
adducts = ['[M+Cl]-']*len(pluscl_nm)
newfile = False
write_hits_to_file(adducts, pluscl_nm, mzs, mznames, hits_cl, outfile, overwrite=newfile, describe=describe)