-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmetabolomics_pathway_analysis.Rmd
252 lines (214 loc) · 10.3 KB
/
metabolomics_pathway_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
---
title: "metabolomics_analysis"
author:
- "DeniseSl22"
- "ddedesener"
date: "25/04/2022"
output:
md_document:
variant: markdown_github
always_allow_html: true
---
## Introduction
In this workflow, we link the metabolites of interest to pathway data from WikiPathways, based on their HMDB identifier.
```{r data_import,warning=FALSE, message=FALSE}
# Obtain Working Directory for step 8 to find processed data
setwd('..')
#Obtain data from step 8
mSet_CD <- read.csv("8-significantly_changed_metabolites_analysis/output/mbxData_CD.csv", na.strings=c("", "NA"))
mSet_UC <- read.csv("8-significantly_changed_metabolites_analysis/output/mbxData_UC.csv", na.strings=c("", "NA"))
## Select a disorder to analyse (options; CD or UC)
disorder <- "CD"
if (disorder == "CD") {
mSet = mSet_CD
print("Selected disorder is Crohn's disease")}else if(disorder == "UC"){
mSet = mSet_UC
print("Selected disorder is Ulcerative Colitis")}else{print("Disorder not Recognised")}
```
## Find pathways based on relevant IDs column
```{r pathway_retrieval,warning=FALSE, message=FALSE}
if(!"SPARQL" %in% installed.packages()){
install.packages("SPARQL")
}
library(SPARQL)
##Connect to Endpoint WikiPathways
endpointwp <- "https://sparql.wikipathways.org/sparql"
## 1. Query metadata:
queryMetadata <-
"SELECT DISTINCT ?dataset (str(?titleLit) as ?title) ?date ?license
WHERE {
?dataset a void:Dataset ;
dcterms:title ?titleLit ;
dcterms:license ?license ;
pav:createdOn ?date .
}"
#below code should be performed first to handle the ssl certificate error
options(RCurlOptions = list(cainfo = paste0( tempdir() , "/cacert.pem" ), ssl.verifypeer = FALSE))
resultsMetadata <- SPARQL(endpointwp,queryMetadata,curl_args=list(useragent=R.version.string))
showresultsMetadata <- resultsMetadata$results
remove(queryMetadata, resultsMetadata)
## Create a list of HMDB IDs according to filtering criteria from step 8.
list_Relevant_HMDB_IDs <- list(mSet$relevant_ids)
vector_HMDB <- unlist(list_Relevant_HMDB_IDs) #convert list to array, for traversing the data to a SPARQL query later on
vector_HMDB <- vector_HMDB[!is.na(vector_HMDB)]
##Add the HMDb prefix IRI in front of all IDs.
query_HMDBs <- paste("ch:", vector_HMDB, sep="")
##Merge the individual entries in the vector into one string, separated by a space
string_HMDB <- paste(c(query_HMDBs), collapse=' ' )
##TODO: add column with nr. of Metabolites in PW (to calculate PW impact)
#For now, filter out Reactome PWs due to visualization issues in Cytoscape.
item1 = "PREFIX ch: <https://identifiers.org/hmdb/>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?hmdbMetabolite) AS ?HMDBsInPWs)
(GROUP_CONCAT(DISTINCT fn:substring(?hmdbMetabolite,30);separator=' ') AS ?includedHMDBs)
where {
VALUES ?hmdbMetabolite {"
item2 = "}
?datanode a wp:Metabolite ;
wp:bdbHmdb ?hmdbMetabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' ;
dcterms:identifier ?wpid ;
dc:title ?title .
#?pathwayRes wp:ontologyTag cur:Reactome_Approved .
?pathwayRes wp:ontologyTag cur:AnalysisCollection .
}
ORDER BY DESC(?HMDBsInPWs)"
query_CombinePWs <- paste(item1,string_HMDB,item2)
remove(item1, item2)
results_CombinePWs <- SPARQL(endpointwp,query_CombinePWs,curl_args=list(useragent=R.version.string))
showresults_CombinePWs <- results_CombinePWs$results
remove(query_CombinePWs,results_CombinePWs)
##Top pathway cuttoff threshold (can be defined by users)
pathway_cutoff_metabolites = 10
#Keep and print table within threshold (if less than threshold are found, print only those)
if(nrow(showresults_CombinePWs) < pathway_cutoff_metabolites){
print(showresults_CombinePWs[1:nrow(showresults_CombinePWs),c(2:4)])
}else{
#delete rows below threshold
showresults_CombinePWs <- showresults_CombinePWs[-c((pathway_cutoff_metabolites+1):nrow(showresults_CombinePWs)),]
print(showresults_CombinePWs[1:5,c(2:4)])}
remove(cleaned_string_HMDB, list_Relevant_HMDB_IDs, query_HMDBs)
```
Retrieve additional relevant pathway data, for example protein and gene data
```{r}
string_WP_IDs_list <- paste0("'", showresults_CombinePWs$pathway ,"'")
string_WP_IDs <- paste(c(string_WP_IDs_list), collapse=' ' )
item1 = "
PREFIX ch: <https://identifiers.org/hmdb/>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (count(distinct ?metaboliteDatanode) AS ?TotalMetabolitesinPW) (GROUP_CONCAT(DISTINCT fn:substring(?hgnc,37);separator=' ') AS ?Proteins) (count(distinct ?hgnc) AS ?ProteinsInPWs)
where {
VALUES ?wpid {
"
item2 = "
}
?metaboliteDatanode a wp:Metabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' ;
dcterms:identifier ?wpid ;
dc:title ?title ;
wp:ontologyTag cur:AnalysisCollection .
OPTIONAL{
?datanode2 wp:bdbHgncSymbol ?hgnc ;
dcterms:isPartOf ?pathwayRes .
}
}
"
query_CombinePWs_gene <- paste(item1,string_WP_IDs,item2)
remove(item1, item2)
results_CombinePWs_gene <- SPARQL(endpointwp,query_CombinePWs_gene,curl_args=list(useragent=R.version.string))
showresults_CombinePWs_gene <- results_CombinePWs_gene$results
remove(query_CombinePWs_gene,results_CombinePWs_gene)
##Merge the two dataframes together:
showresults_CombinePW_data <- merge(x = showresults_CombinePWs, y = showresults_CombinePWs_gene, by = "pathwayRes", all.x = TRUE)
##Reorder data to fit previous format:
showresults_CombinePW_data <- showresults_CombinePW_data[, c(1:4, 6:8, 5)]
```
## Calculate the ORA score for each pathway, using the Fishers exact test.
```{r pathway_analysis, warning=FALSE, message=FALSE}
##Based on: https://www.pathwaycommons.org/guide/primers/statistics/fishers_exact_test/
#Create a dataframe to store the required numbers in.
Contingency_table <- data.frame(matrix(ncol=5,nrow=0, dimnames=list(NULL, c("WP.ID", "x", "m", "n", "k"))))
counter = 1
for (i in 1:nrow(showresults_CombinePW_data)) {
Contingency_table[counter,1] <- (showresults_CombinePW_data[i,2]) #WP.ID
Contingency_table[counter,2] <- (showresults_CombinePW_data[i,4]) ##x <- (number4) #Total differentially changed metabolites, also in a PW. (HMDBsInPWs)
Contingency_table[counter,3] <- (showresults_CombinePW_data[i,5]) ##m <- (number) #Total Metabolites in PW (TotalMetabolitesinPW)
Contingency_table[counter,4] <- (length(unique(mSet[,1])) - showresults_CombinePW_data[i,4]) ##n <- (number2) #Total Metabolites measured not in PW (DISTINCT all_HMDB - HMDBsInPWs)
Contingency_table[counter,5] <- length(unique(vector_HMDB)) ##k <- (number3) #Total differentially changed metabolites. (DISTINCT vector_HMDB)
counter <- counter + 1
}
# Calculate hypergeometric density p-value for all pathways.
i <- 1:nrow(Contingency_table)
probabilities <- dhyper(Contingency_table[i,2], Contingency_table[i,3], Contingency_table[i,4], Contingency_table[i,5], log = FALSE)
pathwayAnalysis_results <- cbind(showresults_CombinePW_data[, c(2:4)], probabilities, showresults_CombinePW_data[, c(6,7)])
colnames(pathwayAnalysis_results)[5] <- "HGNCs"
colnames(pathwayAnalysis_results)[6] <- "ProteinsInPWs"
##Sort PW results based on 1. highest amount of #HMDBs in PW, 2. lowest p-values, 3. highest amouny of proteins in PW (which might be relevant for transcriptomics analysis later)
pathwayAnalysis_results_sorted <- pathwayAnalysis_results[ with(pathwayAnalysis_results, order(-HMDBsInPWs, probabilities, -ProteinsInPWs)),]
print(pathwayAnalysis_results_sorted[1:5,])
```
## Export the pathway data:
```{r data_export, warning=FALSE, message=FALSE}
##Save the data file
nameDataFile <- paste0("output/mbxPWdata_", disorder ,".csv")
write.table(pathwayAnalysis_results_sorted, nameDataFile, sep =",", row.names = FALSE)
```
## Print significantly changed metabolites which were not in a pathway, by ID and name:
```{r missing_data,warning=FALSE, message=FALSE }
##Find Missing Biomarkers (not part of any Human pathway model)
item1 = "PREFIX ch: <https://identifiers.org/hmdb/>
SELECT DISTINCT ?HMDBMetabolite WHERE {
VALUES ?HMDBMetabolite {"
item2 = "}
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' .
?metabolite a wp:Metabolite ;
dcterms:identifier ?id ;
dcterms:isPartOf ?pathwayRes .
?metabolite wp:bdbHmdb ?HMDBMetabolite.
}"
queryMissingBiomarkers <- paste(item1,string_HMDB,item2)
remove(item1,item2)
resultsMissingBiomarkers <- SPARQL(endpointwp,queryMissingBiomarkers,curl_args=list(useragent=R.version.string))
listMissingBiomarkers <- c(resultsMissingBiomarkers$results) #safe results as list for comparison.
remove(queryMissingBiomarkers,resultsMissingBiomarkers)
HMDBs_inPWs <- gsub("[<https://identifiers.org/hmdb/>]", "", listMissingBiomarkers) #HMDB IDs IRI cleanup
intersectingHMDB <- setdiff(vector_HMDB, HMDBs_inPWs)
string_intersectingHMDB <- paste(c(intersectingHMDB), collapse=', ' )
#Find names for missing Biomarkers based on HMDB ID (to help with data understanding and curation)
missingNames <- list()
for (j in 1:length(intersectingHMDB)){
for (i in 1:nrow(mSet)){
if(!is.na(mSet[i,5]) & mSet[i,5] == intersectingHMDB[j]){
missingNames[j] <- mSet[i,6]
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames <- do.call(paste, c(as.list(missingNames), sep = ", "))
#Print relevant information:
if(length(intersectingHMDB) == 0 ){print("All relevant biomarkers are in a pathway!")} else{
print(paste0("For the disorder ", disorder, ", ", length(intersectingHMDB), " biomarkers are not in a pathway; with the following HMDB IDs: " , string_intersectingHMDB, "; with the following Database names: ", string_missingNames))}
```
## Print session info:
```{r print_session_info}
##Print session info:
sessionInfo()
```
## Last, we create a Jupyter notebook file from this script:
```{r writing_to_notebooks,warning=FALSE, message=FALSE }
#Jupyter Notebook file
if(!"devtools" %in% installed.packages()) BiocManager::install("devtools")
devtools::install_github("mkearney/rmd2jupyter", force=TRUE)
library(devtools)
library(rmd2jupyter)
rmd2jupyter("metabolomics_pathway_analysis.Rmd")
##Clean up data
remove(counter, i, probabilities, string_HMDB, vector_HMDB, Contingency_table, pathwayAnalysis_results, showresults_CombinePWs, showresultsMetadata)
```