-
Notifications
You must be signed in to change notification settings - Fork 2
/
go_enrichment_script.py
287 lines (256 loc) · 11.1 KB
/
go_enrichment_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
@author: Pieter Moris
Main script to perform a gene enrichment analysis.
# Method
Performs one-sided hypergeometric tests, starting from the most specific (child) GO terms associated with the
genes in the set of interest. If the p-value of the test does not fall below the specified significance level alpha,
the test wll be carried out for all of the term's parent terms, otherwise the process will terminate. This method
attempts to limit the total number of tests that need to be carried out, since a term that is enriched will likely
also have enriched parent terms. Furthermore, GO terms associated with a small number of genes are skipped.
Next, the Benjamini-Hochberg FDR or Bonferroni multiple testing correction are applied to the test results.
Finally, a csv file containing all the GO terms that were evaluated and their p-values are returned.
# Special cases
If a GO namespace is provided, any genes that lack an annotation of this type are ignored. I.e. this can lower
the total counts for the background and interest gene sets.
If any of the provided genes are absent from the gene association file / gene ontology file, they are ignored.
If the subset of interest is disjoint from the background, the extra genes will be ignored.
# Requires the following input files:
- gene association file in .gaf format
- gene ontology file in .obo format
- text file with genes of interest separated by new lines
expects Uniprot AC's
- optional text file with background set
if not provided, the entire gene annotation found in the association file will be used
# The following options can be specified:
- The output file in which results will be stored
- The minimum number of genes required to be associated with a term before it will be tested.
- The FDR cut-off to utilise.
- The p-value threshold to limit the recursive testing of the GO tree.
- The namespace to test: all, biological_process, molecular_function or cellular_component
Uses the naming convention found in the .obo file.
# Dependencies:
numpy, pandas, scipy.stats, statsmodels.sandbox.stats.multicomp
'''
import argparse
from goscripts import enrichment_stats
from goscripts import gaf_parser
from goscripts import genelist_importer
from goscripts import obo_tools
# Run as stand alone script
if __name__ == '__main__':
# Check provided arguments
parser = argparse.ArgumentParser(
description='Script to perform GO enrichment analysis',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# https://stackoverflow.com/questions/18862836/how-to-open-file-using-argparse
parser.add_argument(
'-b',
'--background',
type=str,
default=set(), # use full annotation set instead
dest='background',
help=
'File containing a list of Uniprot accession numbers for the background set of genes. If omitted, the full list of genes in the .gaf file will be used.'
)
parser.add_argument(
'-s',
'--subset',
type=str,
required=True,
dest='subset',
help=
'File containing a list of Uniprot accession numbers for the subset of genes of interest'
)
parser.add_argument(
'-o',
'--obo',
type=str,
required=True,
dest='obo',
help='.obo file containing gene ontology')
parser.add_argument(
'-g',
'--gaf',
type=str,
required=True,
dest='gaf',
help='.gaf file containing GO associations')
# parser.add_argument('-O', '--output', type=argparse.FileType('w'),
# default='enrichment-results.csv', dest='output',
# help='Specifies the name or path of the output csv file')
parser.add_argument(
'-O',
'--output',
type=str,
default='enrichment_results.csv',
dest='outputFile',
help='Output file or path')
parser.add_argument(
'-n',
'--namespace',
type=str,
default='all',
choices=[
'all', 'biological_process', 'molecular_function',
'cellular_component'
],
help='Select the GO namespace to limit the enrichment test to')
parser.add_argument(
'-m',
'--min',
type=int,
default='3',
dest='minGenes',
help='Minimum number of genes before considering a GO category')
parser.add_argument(
'-l',
'--limit-tests',
type=float,
default='0.05',
dest='testing_limit',
help=
'P-value cut-off to use to stop GO tree propagation during enchrichment tests'
)
parser.add_argument(
'-p',
'--pval-thresh',
type=float,
default='0.1',
dest='threshold',
help='Significant p-value threshold to use for significance testing')
parser.add_argument(
'--mult-test',
type=str,
default='fdr_bh',
dest='mult_test',
help=
'The type of multiple testing correction to use. Either "fdr_bh" (default), "bonferroni" or any other method offered by statsmodels.stats.multitest.multipletests().'
)
parser.add_argument(
'-v', '--verbose', action="store_true", help='Verbose output.')
parser.add_argument(
'--no-propagation',
action="store_false",
dest="propagation", # will be true by default
help=
'Disables propagation during testing. Use if only strictly associated terms should be tested.'
)
parser.add_argument(
'--no-part-of',
action="store_true",
dest="no_part_of",
help='Ignore part_of relations between GO terms during traversal.')
args = parser.parse_args()
# Import the gene set of interest (Uniprot AC format)
interest = genelist_importer.importGeneList(args.subset)
# If no background is provided, import the gene association file and
# retrieve the full background set from it
if not args.background:
print(
'\nNo background gene set provided, retrieving all genes from the gene association file...\n'
)
gafDict = gaf_parser.importGAF(args.gaf, args.background)
background = set(gafDict.keys())
# otherwise, import the background set first and
# use this to limit gene association import
else:
background = genelist_importer.importGeneList(args.background)
gafDict = gaf_parser.importGAF(args.gaf, background)
# Check if the set of interest contains Uniprot AC's not present
# in the background set and report and remove them
interest = genelist_importer.isValidSubset(interest, background)
# Generate a gene association file for the (pruned) subset of interest too.
gafSubset = gaf_parser.createSubsetGafDict(interest, gafDict)
# Check if the background/interest gene sets contain genes not present
# in the gene association file remove them
# and report that they won't be considered during the analysis
background = genelist_importer.reportMissingGenes(background, gafDict,
'background')
interest = genelist_importer.reportMissingGenes(interest, gafSubset,
'interest')
# Import gene ontology file
GOterms = obo_tools.importOBO(args.obo, ignore_part_of=args.no_part_of)
# Reduce gene ontology file to selected namespace
if not args.namespace == 'all':
GOterms = obo_tools.filterOnNamespace(GOterms, args.namespace)
# and reduce the gene association files to these GO terms
gafDict = gaf_parser.cleanGafTerms(gafDict, GOterms)
gafSubset = gaf_parser.cleanGafTerms(gafSubset, GOterms)
# NOTE: terms belonging to unselected namespaces could still show up in
# the child/parent attributes of existing terms (via part_of relations)
# This can be prevented by either going through these attributes
# or by always checking if a term is present in the dictionary before using it.
# The second option is used in propagateParents() and a warning is
# issued in the buildGOtree() function.
# This means that the parent attribute might contain removed terms,
# but the recursive_parent/child attributes do not.
print(
len(gafDict), 'out of', len(background),
'background genes found annotated for', args.namespace)
print(
len(gafSubset), 'out of', len(interest),
'genes of interest found annotated for', args.namespace)
# If for some reason the gene association dictionaries' length can't
# be used for determining the size of the background and interest sets
# after namespace reduction
# the set objects themselves need to be filtered once more as well.
# NOTE that this will output a long list of removed (unannotated) genes
# to the screen.
background = genelist_importer.reportMissingGenes(
background, gafDict, 'background')
interest = genelist_importer.reportMissingGenes(
interest, gafSubset, 'interest')
# Build full GO hierarchy
root_nodes = obo_tools.set_namespace_root(args.namespace)
print(
'Propagating through ontology to find all children and parents for each term...\n'
)
obo_tools.buildGOtree(GOterms, root_nodes)
# Filter GO terms from gaf dictionaries if they are missing from the GO
print(
'Looking for inconsistencies between the gene association and GO files...\n'
)
to_remove = set()
for gene in gafDict:
for term in gafDict[gene]:
if term not in GOterms and term not in to_remove:
to_remove.add(term)
print(
f'{term} was missing from GO file.\nRemoving term from gene assocation file...\n'
)
if to_remove:
gafDict = {
gene: terms
for gene, terms in gafDict.items() if gene not in to_remove
}
gafSubset = {
gene: terms
for gene, terms in gafSubset.items() if gene not in to_remove
}
# Perform enrichment test
print(
'Finished completing ontology...proceeding with enrichment tests...\n')
enrichmentResults = enrichment_stats.enrichmentAnalysis(
GOterms,
gafDict,
gafSubset,
minGenes=args.minGenes,
threshold=args.testing_limit,
propagation=args.propagation)
# Update results with multiple testing correction
enrichment_stats.multipleTestingCorrection(
enrichmentResults, testType=args.mult_test, threshold=args.threshold)
# Create dataframe with tested GO terms and results
output = enrichment_stats.annotateOutput(enrichmentResults, GOterms,
gafDict, gafSubset)
# Print intermediate output
if args.verbose:
print(
'\nGO term, uncorrected and FDR-corrected p-values and description of GO term:\n'
)
print(output)
# Save results
print('\nSaving output to', args.outputFile)
output.to_csv(args.outputFile)