-
Notifications
You must be signed in to change notification settings - Fork 0
/
modules.py
373 lines (280 loc) · 12.9 KB
/
modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
#!/usr/bin/env python
import urllib
from difflib import SequenceMatcher
import numpy as np
from numpy import genfromtxt
import re
import scipy.stats as st
import statsmodels.sandbox.stats.multicomp as mc
# Metabolite_Parser
def downloadParser():
"""
Download compound information (cpd_id, cpd_name) from KEGG database.
Returns:
:return met2keggs: Output table with KEGG information about compound name and identifier
:rtype met2keggs: file
"""
met2keggs = urllib.urlopen("http://rest.kegg.jp/list/compound")
met2keggs = met2keggs.read()
met2keggs = met2keggs.split("\n")
return(met2keggs)
def add2Dictionary(noPrefixMetName, newMetName, metabolite, kegg_cpd, keggMetNames, metDict):
"""
Function to compare input metabolite name with kegg name.
If newMetName (metabolite without prefix) == keggMetName:
Calculate similarity between metabolite (oirginal name) and keggMetName
It creates different dictionaries for each metabolite (one per match)
Arguments:
:param noPrefixMetName: Input metabolite name without prefix (metabolite_parser.mainPrefixes)
:type noPrefixMetName: string
:param newMetName: Input metabolite name without prefix and chemical words.
:type newMetName: string
:param metabolite: Input metabolite name
:type metabolite: string
:param kegg_cpd: Kegg identifier of the metabolite
:type kegg_cpd: string
:param keggMetNames: "List" of Kegg metabolite names associated to a Kegg identifier
:type keggMetNames: string
Returns:
:return metDict: Output dictionary with this structure: {Metabolite_Name_in_Kegg: [similarity, Kegg_compound_identifier]
:rtype metDict: dictionary
"""
keggMetNames = keggMetNames.split(";")
for keggMetName in keggMetNames:
if re.search(".*" + newMetName + ".*", keggMetName.strip(), re.IGNORECASE):
similarity = calculateSimilarity(metabolite.strip(), keggMetName.strip())
metDict[keggMetName.strip()] = [similarity, kegg_cpd]
return(metDict)
def calculateSimilarity(metabolite, keggMetName):
"""
Function used by add2dictionary() to compare two metabolite names and return the similarity between them.
If the difference between them is only one of these mainPrefixes, a similarity of 90% is returned.
Arguments:
:param metabolite: Input metabolite name
:type metabolite: string
:param keggMetName: One of the metabolites contained in keggMetNames
:type keggMetName: string
Returns:
:return similarity: Percentage of similarity between 2 input names.
:rtype similarity: float
"""
if (metabolite == '' or keggMetName == ''):
raise TypeError('Empty string as input')
mainPrefixes = {"", "cis-","trans-","d-","l-","(s)-","alpha-","beta-","alpha ","beta ","alpha-d-","beta-d-","alpha-l-","beta-l-", "l-beta-", "l-alpha-", "d-beta-", "d-alpha-"}
if metabolite.lower() == keggMetName.lower():
similarity = 1.0
elif keggMetName.lower().replace(metabolite.lower(), "") in mainPrefixes:
similarity = 0.9
elif metabolite.lower().replace(keggMetName.lower(), "") in mainPrefixes:
similarity = 0.9
else:
similarity = SequenceMatcher(a=metabolite.lower(), b=keggMetName.lower()).ratio()
return similarity
# Kegg_downloader
def downloadKegg(args):
"""
Download necessary information from Kegg Database for parsing.
Returns:
:return gen2kegg: kegg_gene_identifier "\t" Gene_Symbol ";" Gene_name
:rtype gen2kegg: file
:return kgen2pathway: kegg_gene_identifier "\t" pathway_identifier_for_gene
:rtype kgen2pathway: file
:return met2kegg: kegg_metabolite_identifier "\t" Metabolite_names_list_sep_;
:rtype met2kegg: file
:return kmet2pathway: kegg_metabolite_identifier "\t" pathway_identifier_for_metabolite
:rtype kmet2pathway:similarity file
:return pathways: pathway_identifier_for_gene "\t" Pathway_name "-" Specified_organism
:rtype pathways: file
:return pathways_common: pathway_identifier_for_metabolite "\t" Pathway_name
:rtype pathways_common: file
"""
urllib.urlretrieve("http://rest.kegg.jp/list/" + args.specie, args.gen2keggs)
urllib.urlretrieve("http://rest.kegg.jp/link/pathway/" + args.specie, args.kgen2pathways)
urllib.urlretrieve("http://rest.kegg.jp/list/compound", args.met2keggs)
urllib.urlretrieve("http://rest.kegg.jp/link/pathway/compound", args.kmet2pathways)
urllib.urlretrieve("http://rest.kegg.jp/list/pathway/" + args.specie, args.pathways)
urllib.urlretrieve("http://rest.kegg.jp/list/pathway", args.pathways_commons)
return(args)
def createGeneList(args):
"""
Function that takes the gene expression matrix and extracts the column with Gene_Symbols
Arguments:
:param geneDataset: Gene Expression Dataset File
:type geneDataset: file
:param genId: Name of the column with Gene_symbol identifier
:type genId: string
Returns:
:return geneList: "List" that contains only the gene symbols
:rtype geneList: numpy_table
"""
with open(args.geneDataset, "r") as data:
header = data.readline()
header = header.strip().split('\t')
ncol = header.index(args.genId)
geneList = genfromtxt(args.geneDataset, delimiter='\t', usecols=ncol, dtype=None)
geneList = np.delete(geneList, 0)
return(geneList)
def useMetaboliteParsed(args):
"""
This function is used when the input metabolite data set has been parsed with metabolite_parser tool.
Arguments:
:param metDataset: Metabolomic Dataset File
:type metDataset: file
Returns:
:return metList: list with selected Kegg names column of the metabolomic dataset
:rtype metList: list
"""
metList = []
for line in open(args.metDataset, "r"):
selected = line.split("\t")[5].strip()
if selected == "Yes":
metList.append(line.split("\t")[1])
return(metList)
def useMetaboliteNotParsed(args):
"""
This function is used when the input metabolite data set has NOT been parsed with metabolite_parser tool.
Arguments:
:param metDataset: Metabolomic Dataset File
:type metDataset: file
:param metId: Name of the column with metabolite names
:type metId: string
Returns:
:return metList: "List" that contains only the metabolite names
:rtype metList: numpy_table
"""
# If metabolite dataset is not parsed, create a dataset with metabolite name column
with open(args.metDataset, "r") as data:
header = data.readline()
header = header.strip().split('\t')
ncol = header.index(args.metId)
metList = genfromtxt(args.MET_dataset, delimiter='\t', usecols=ncol, dtype=None)
metList = np.delete(metList, 0)
return(metList)
def findWholeWord(word):
"""
Function to find one word inside another word.
Arguments:
:param word: word to check if is contained inside another word
:type word: string
Returns:
:return is_contained: Returns "yes" if it's contained, "no" if not
:rtype is_contained: boolean
"""
return re.compile(r'\b(?![-])({0})(?![-])\b'.format(word), flags=re.IGNORECASE).search
# Pathway Enrichment
def fisherExactTest(args, path_feat):
"""
This function performs a complete Pathway enrichment analysis:
1) Calcule all the values of the contingency table
2) Perform Fisher Exact test
3) FDR correction of p-values
*********************
* Fisher exact test *
*********************
| Molecules associated to pathway | Molecules not associated to pathway | Total
-----------------------------------------------------------------------------------------
DEG/M | a | b | a+b
-----------------------------------------------------------------------------------------
No DEG/M | c | d | c+d
-----------------------------------------------------------------------------------------
Total | a+c | b+d | N
-----------------------------------------------------------------------------------------
N = Total of molecules
a+b = Total 1 flags
c+d = Total 0 flags
Arguments:
:params deaGeneDataset deaMetDataset: Tables with Differential Expression Analysis information for gene expression
and metabolomics, respectively
:types deaGeneDataset met_dataset: files
:params gene_id_col, met_id_col, gene_flag_col, met_flag_col: Column names of unique identifiers and desired
flag column of gene expression and metabolomics datasets, respectively
:type gene_id_col, met_id_col, gene_flag_col, met_flag_col: strings
:params alpha, method: alpha-value and method desired for the FDR correction
:type alpha, method: strings
Returns:
:return output: Table with this structure: Pathway Name Odds_Ratio P_value FDR_Correction Flag_#
:rtype output: file
"""
# N)
N_gene = sum(1 for line in open(args.deaGeneDataset)) - 1
N_met = sum(1 for line in open(args.deaMetDataset)) - 1
N = N_gene + N_met
# a+b, c+d)
metDeCounter = 0
metNonDeCounter = 0
with open(args.deaGeneDataset, "r") as geneDataset:
header_gene = geneDataset.readline().split("\t")
indice_gene_flag = header_gene.index(args.gene_flag_col)
for line in geneDataset:
flag = int(line.split("\t")[indice_gene_flag])
if flag == 1:
metDeCounter += 1
elif flag == 0:
metNonDeCounter += 1
geneDeCounter = 0
geneNonDeCounter = 0
with open(args.deaMetDataset, "r") as metDataset:
header_met = metDataset.readline().split("\t")
indice_met_flag = header_met.index(args.met_flag_col)
for line in metDataset:
flag = int(line.split("\t")[indice_met_flag])
if flag == 1:
geneDeCounter += 1
elif flag == 0:
geneNonDeCounter += 1
a_b = metDeCounter + geneDeCounter
c_d = metNonDeCounter + geneNonDeCounter
# a)
geneDataset = open(args.deaGeneDataset, "r")
metDataset = open(args.deaMetDataset, "r")
tmpList_v1 = []
tmpList_v2 = []
pvalues = []
indice_gene_id = header_gene.index(args.gene_id_col)
indice_met_id = header_met.index(args.met_id_col)
for pathway in path_feat.keys():
a_gen = 0
a_met = 0
a_c = 0
for value in path_feat[pathway]:
geneDataset.seek(0)
geneDataset.readline()
for line in geneDataset:
gene = line.split("\t")[indice_gene_id].replace('"', '')
if value == gene:
a_gen += int(line.split("\t")[indice_gene_flag])
a_c += 1
break
metDataset.seek(0)
metDataset.readline()
for line2 in metDataset:
metabolite = line2.split("\t")[indice_met_id].replace('"', '')
if value == metabolite:
a_met += int(line2.split("\t")[indice_met_flag])
a_c += 1
break
a = a_gen + a_met
# b), c), d)
b = a_b - (a_gen + a_met)
c = a_c - (a_gen + a_met)
d = c_d - (a_c - (a_gen + a_met))
oddsratio, pvalue = st.fisher_exact([[a, b], [c, d]])
pvalues.append(pvalue)
tmpList_v1.append(pathway + "\t" + str(oddsratio) + "\t" + str(pvalue) + "\t")
# FDR correction
pvals_corrected = mc.multipletests(pvalues, alpha = float(args.alpha), method = args.method, is_sorted = False, returnsorted = False)[1]
# Write results, Sort by P value
i = 0
for line in tmpList_v1:
tmpList_v2.append(line + str(pvals_corrected[i]))
i+=1
with open(args.output, "w") as output:
output.write("Pathway Name\tOdds Ratio\tP value\tFDR Correction\tFlag_0.05\n")
for line in sorted(tmpList_v2, key=lambda line: float(line.split("\t")[2])):
if float(line.split("\t")[3]) <= 0.05:
output.write(line + "\t" + str(1) + "\n")
else:
output.write(line + "\t" + str(0) + "\n")
geneDataset.close()
metDataset.close()
return(args)