-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGSEA_run_script.R
181 lines (146 loc) · 9.52 KB
/
GSEA_run_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#####
# This is a script to take in two files that have been pre processed
# 1) you need an annotation file that has a list of all genes tab then
# comma separated GO terms
# 2) you need a list of the candidate genes from that list you want to assess
# for being enriched compared to the genome.
#
# NOTE: these two files must have the following headers added to them.
# annotation=S_gifuensis_aa.fasta.emapper.annotations
# candidate_file=japonicus_run1above95
# cut -f1,6 $annotation > $annotation.tsv
# echo 'Parent GO_term' | cat - "$annotation".tsv > "$annotation".header.tsv
# echo 'geneid' | cat - "$candidate_file" > "$candidate_file".header.tsv
#
# the resulting *header.tsv files are then used in the script below
# 1st argument is the annotation file
# 2nd argument is the candidate set list
#
# note that the current setting for GO node size is 5, which
# is hard coded below for more robust results.
#
# example run
# Rscript GSEA_run_script.R gifuensis_aa.fasta.emapper.annotations.header.tsv itoensis_run1above95.header.tsv
# descriptions
# http://avrilomics.blogspot.com/2015/07/using-topgo-to-test-for-go-term.html
# new general run
rm(list=ls()) #clears all variables
objects() # clear all objects
graphics.off() #close all figures
library("topGO")
library("Rgraphviz")
library(openxlsx)
##########
# set variables
# making general script
# Rscript myscript.R batch.csv
# and invoke these in the myscript.R
args <- commandArgs(TRUE)
# dataset <- read.table(args[1],header=FALSE,sep=",",skip=1)
annotations=args[1]
candidate_list=args[2]
# here I will be only analyzing GO terms with at least 5 members,
# as this yield more stable results.
node_size=5
#### CORE BP #####
GO_category="BP"
geneID2GO <- readMappings(file = annotations)
geneUniverse <- names(geneID2GO)
genesOfInterest.bv <- read.table(candidate_list,header=TRUE)
genesOfInterest.bv <- as.character(genesOfInterest.bv$geneid)
geneList.bv <- factor(as.integer(geneUniverse %in% genesOfInterest.bv))
names(geneList.bv) <- geneUniverse
myGOdata.bv <- new("topGOdata", description="Candidate genes", ontology=GO_category, allGenes=geneList.bv, annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize = node_size)
# STATS#
# each GO term is tested independently, not taking the GO hierarchy into account
resultClassic <- runTest(myGOdata.bv, algorithm="classic", statistic="fisher")
# elim method processes the GO terms by traversing the GO hierarchy from bottom to top,
# ie. it first assesses the most specific (bottom-most) GO terms, and proceeds later
# to more general (higher) GO terms. When it assesses a higher (more general) GO term,
# it discards any genes that are annotated with significantly enriched descendant
# GO terms (considered significant using a pre-defined P-value threshold).
# This method does tend to miss some true positives at higher (more general)
# levels of the GO hierarchy.
resultElim <- runTest(myGOdata.bv, algorithm="elim", statistic="fisher")
# weight01 this is the default method used by TopGO, and is a mixture of the 'elim' and 'weight' methods
resultTopgo <- runTest(myGOdata.bv, algorithm="weight01", statistic="fisher")
# when assessing a GO term, it takes into accoount the annotation of terms to the current term's parents,
# and so reduces false positives due to the inheritance problem
resultParentchild <- runTest(myGOdata.bv, algorithm="parentchild", statistic="fisher")
# see how many results we get where weight01 gives a P-value <= 0.001:
mysummary <- summary(attributes(resultTopgo)$score <= 0.1)
numsignif <- as.integer(mysummary[[3]]) # how many terms is it true that P <= 0.001
allRes <- GenTable(myGOdata.bv,
classicFisher = resultClassic,
elimFisher = resultElim,
topgoFisher = resultTopgo,
parentchildFisher = resultParentchild,
orderBy = "parentchildFisher", ranksOf = "classicFisher", topNodes = numsignif)
# write output
printGraph(myGOdata.bv, resultClassic, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultClassic", sep=""), useInfo = "all", pdfSW = TRUE)
printGraph(myGOdata.bv, resultTopgo, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultTopGo", sep=""), useInfo = "all", pdfSW = TRUE)
printGraph(myGOdata.bv, resultParentchild, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultParentchild", sep=""), useInfo = "all", pdfSW = TRUE)
write.table(allRes[,c(1,8)], file=paste(candidate_list, ".",GO_category,".GSEA_result_elimFisher.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes[,c(1,7)], file=paste(candidate_list, ".",GO_category,".GSEA_result_classicFisher.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes[,c(1,10)], file=paste(candidate_list, ".",GO_category,".GSEA_result_parentchild.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes, file=paste(candidate_list, ".",GO_category,".GSEA_result.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.xlsx(allRes, file = paste(candidate_list, ".",GO_category,".GSEA_result.xlsx", sep=""), borders = "rows")
#### CORE MF #####
# new general run
objects() # clear all objects definded the previous run
##########
# set variables
# making general script
# Rscript myscript.R batch.csv
# and invoke these in the myscript.R
rm(myGOdata.bv,allRes,resultClassic,resultElim,resultTopgo,resultParentchild,GO_category) #clears all variables
args <- commandArgs(TRUE)
# dataset <- read.table(args[1],header=FALSE,sep=",",skip=1)
annotations=args[1]
candidate_list=args[2]
# here I will be only analyzing GO terms with at least 5 members,
# as this yield more stable results.
node_size=5
#### CORE MF #####
GO_category="MF"
geneID2GO <- readMappings(file = annotations)
geneUniverse <- names(geneID2GO)
genesOfInterest.bv <- read.table(candidate_list,header=TRUE)
genesOfInterest.bv <- as.character(genesOfInterest.bv$geneid)
geneList.bv <- factor(as.integer(geneUniverse %in% genesOfInterest.bv))
names(geneList.bv) <- geneUniverse
myGOdata.bv <- new("topGOdata", description="Candidate genes", ontology=GO_category, allGenes=geneList.bv, annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize = node_size)
# STATS#
# each GO term is tested independently, not taking the GO hierarchy into account
resultClassic <- runTest(myGOdata.bv, algorithm="classic", statistic="fisher")
# elim method processes the GO terms by traversing the GO hierarchy from bottom to top,
# ie. it first assesses the most specific (bottom-most) GO terms, and proceeds later
# to more general (higher) GO terms. When it assesses a higher (more general) GO term,
# it discards any genes that are annotated with significantly enriched descendant
# GO terms (considered significant using a pre-defined P-value threshold).
# This method does tend to miss some true positives at higher (more general)
# levels of the GO hierarchy.
resultElim <- runTest(myGOdata.bv, algorithm="elim", statistic="fisher")
# weight01 this is the default method used by TopGO, and is a mixture of the 'elim' and 'weight' methods
resultTopgo <- runTest(myGOdata.bv, algorithm="weight01", statistic="fisher")
# when assessing a GO term, it takes into accoount the annotation of terms to the current term's parents,
# and so reduces false positives due to the inheritance problem
resultParentchild <- runTest(myGOdata.bv, algorithm="parentchild", statistic="fisher")
# see how many results we get where weight01 gives a P-value <= 0.001:
mysummary <- summary(attributes(resultTopgo)$score <= 0.1)
numsignif <- as.integer(mysummary[[3]]) # how many terms is it true that P <= 0.001
allRes <- GenTable(myGOdata.bv,
classicFisher = resultClassic,
elimFisher = resultElim,
topgoFisher = resultTopgo,
parentchildFisher = resultParentchild,
orderBy = "parentchildFisher", ranksOf = "classicFisher", topNodes = numsignif)
# write output
printGraph(myGOdata.bv, resultClassic, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultClassic", sep=""), useInfo = "all", pdfSW = TRUE)
printGraph(myGOdata.bv, resultTopgo, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultTopGo", sep=""), useInfo = "all", pdfSW = TRUE)
printGraph(myGOdata.bv, resultParentchild, firstSigNodes = 5, fn.prefix = paste(candidate_list, ".",GO_category,".GSEA_graph_resultParentchild", sep=""), useInfo = "all", pdfSW = TRUE)
write.table(allRes[,c(1,8)], file=paste(candidate_list, ".",GO_category,".GSEA_result.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes[,c(1,7)], file=paste(candidate_list, ".",GO_category,".GSEA_result_classicFisher.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes[,c(1,10)], file=paste(candidate_list, ".",GO_category,".GSEA_result_parentchild.REVIGO.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(allRes, file=paste(candidate_list, ".",GO_category,".GSEA_result.tsv", sep=""), sep = "\t", qmethod = "double", quote = FALSE, row.names = FALSE, col.names = TRUE)
write.xlsx(allRes, file = paste(candidate_list, ".",GO_category,".GSEA_result.xlsx", sep=""), borders = "rows")