Retrieving a semantic similarity matrix #37

uyedaj · 2019-02-15T20:44:42Z

title: "Pipeline for retrieving semantic similarity matrix"
output: html_notebook

This is our pipeline for retrieving a semantic similarity matrix, which we would like to see streamlined and made robust in rphenoscape. Our first step is to obtain a treedata object for anatomical entities in catfish.

library(rphenoscape)
library(treeplyr)
library(RCurl)
library(rjson)
library(readr)
library(urltools)
library(pracma)
library(httr)


# Request the Ontotrace matrix (as NeXML) for all anatomical entities of the
# catfishes (Siluriformes).
nex <- pk_get_ontotrace_xml(taxon = "Siluriformes",
                            entity = "anatomical entity")

# Convert the NeXML into the table `m`, then replace spaces in its `taxa`
# column with underscores (presumably so the names match the tree's tip
# labels -- confirm against the tree file).
m <- pk_get_ontotrace(nex)
m$taxa <- gsub(" ", "_", m$taxa, fixed = TRUE)
#write.csv(m, file="~/repos/ontologyPCM/data/Ontotrace_Siluriformes_AnatomicalEntity.csv")

# Read the phylogeny straight from its Dryad URL and combine it with the
# trait table into a treedata object.
tree <- read.tree(file = "https://datadryad.org/bitstream/handle/10255/dryad.199127/actinopt_12k_treePL.tre?sequence=1")
td <- make.treedata(tree, m)

td

We remove the otu data and look at the traits.

# Trait names are the column names of the treedata table; the first two
# columns hold otu data rather than traits, so they are dropped.
traits <- colnames(td$dat)[-(1:2)]
traits

Get IRI ids for each trait.

# Look up the Phenoscape record for every trait label.
traitDetails <- lapply(traits, function(x) pk_anatomical_detail(x, verbose = TRUE))

# Preview the first few records (head() does not pad with NULL entries when
# there are fewer than five traits).
head(traitDetails, 5)

# Collect the "@id" column of every record into one flat, unnamed character
# vector. lapply() + unlist() is used instead of sapply() + do.call(c, ...):
# sapply() may simplify its result to an atomic vector, and do.call() then
# fails because its second argument must be a list. `[[` extracts the column
# as a vector whether the record is a data.frame or a tibble.
traitIDs <- unlist(
  lapply(traitDetails, function(x) as.character(x[["@id"]])),
  use.names = FALSE
)

# URL-encode each IRI. vapply() guarantees a character vector (named by the
# raw IRI, as sapply() did) even when `traitIDs` is empty.
irisPhenotypes <- vapply(traitIDs, url_encode, character(1))

This is the ugliest part: making sure the URL encoding works. I think it's also the hardest part (for me) to make robust.

# Write the URL-encoded form body for the similarity request to `filename`.
# Decoded, the body reads:  iris=[<newline>  "<iri>","<iri>",...]<newline>
filename <- "../output/siluriformesFormData.txt"

# An empty IRI set cannot produce a meaningful request; fail early.
if (length(irisPhenotypes) == 0L) {
  stop("No trait IRIs to send: 'irisPhenotypes' is empty.", call. = FALSE)
}

# Make sure the output directory exists before writing into it.
dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE)

# Percent-encode the characters left in the IRIs ("/" and ":"; "=" is also
# followed by an encoded newline, as in the original pipeline).
irisPhenotypes <- lapply(irisPhenotypes, function(x) gsub("/", "%2F", x, fixed = TRUE))
irisPhenotypes <- lapply(irisPhenotypes, function(x) gsub(":", "%3A", x, fixed = TRUE))
irisPhenotypes <- lapply(irisPhenotypes, function(x) gsub("=", "%3D%0A", x, fixed = TRUE))

# Wrap every IRI in encoded double quotes (%22) and join with encoded commas
# (%2C). Joining with `collapse` avoids special-casing the last element, which
# previously wrote a lone IRI twice because 1:(n - 1) is c(1, 0) when n == 1.
quotedIris <- paste0("%22", unlist(irisPhenotypes, use.names = FALSE), "%22")
formData <- paste0(
  "iris=%5B%0A%20%20",
  paste(quotedIris, collapse = "%2C"),
  "%5D%0A"
)

# Single write; overwrites any previous file at this path.
cat(formData, file = filename)

Submit the api request.

# Build the curl command that POSTs the form-data file to the Phenoscape KB
# jaccard similarity endpoint. The path comes from `filename` (set when the
# form data was written) so the two cannot drift apart, and shQuote() applies
# the quoting appropriate for the platform's shell.
api.semanticSimilarity_query <- paste(
  "curl -X POST -d",
  shQuote(paste0("@", filename)),
  shQuote("http://kb.phenoscape.org/api/similarity/jaccard")
)

# intern = TRUE captures curl's standard output as a character vector
# (one element per line of the response).
semanticSimilarityAPIResults <- system(api.semanticSimilarity_query, intern = TRUE)

# With intern = TRUE a non-zero exit code is only reported through the
# "status" attribute (plus a warning); stop here rather than handing an
# incomplete response to the JSON parser.
curlStatus <- attr(semanticSimilarityAPIResults, "status")
if (!is.null(curlStatus) && curlStatus != 0) {
  stop("curl exited with status ", curlStatus,
       " while querying the similarity API.", call. = FALSE)
}

Process the results from the api request.

# Parse the API response. system(intern = TRUE) returns one element per output
# line, so the lines are joined back into a single JSON string first.
results <- fromJSON(paste(semanticSimilarityAPIResults, collapse = "\n"))

# One score per returned pair; a missing score becomes NA. vapply() keeps the
# result numeric even when no pairs are returned.
scores <- vapply(
  results$results,
  function(x) if (is.null(x$score)) NA_real_ else as.numeric(x$score),
  numeric(1)
)

# One row per returned pair, holding the two (URL-decoded) term IRIs.
result_terms <- do.call(
  rbind,
  lapply(results$results, function(x) do.call(cbind, lapply(x$terms, curlUnescape)))
)

# Square matrix over all requested IRIs: NA where no score was returned,
# 1 on the diagonal. It is keyed by decoded IRI while being filled.
semanticSimilarityMatrix <- matrix(NA, nrow = length(irisPhenotypes), ncol = length(irisPhenotypes))
diag(semanticSimilarityMatrix) <- 1
rownames(semanticSimilarityMatrix) <- colnames(semanticSimilarityMatrix) <- curlUnescape(irisPhenotypes)

# Fill both triangles so the matrix is symmetric. NROW() is 0 for NULL, so an
# empty result set skips the loop instead of failing on 1:nrow(NULL).
for (i in seq_len(NROW(result_terms))) {
  termA <- result_terms[i, 1]
  termB <- result_terms[i, 2]
  semanticSimilarityMatrix[termB, termA] <- scores[i]
  semanticSimilarityMatrix[termA, termB] <- scores[i]
}

# Relabel with the human-readable trait names; this relies on `traits` and
# `irisPhenotypes` being in the same order and of the same length.
rownames(semanticSimilarityMatrix) <- colnames(semanticSimilarityMatrix) <- traits

write.csv(semanticSimilarityMatrix, file = "../output/siluriformesSemanticSimMatrix.csv")

Check to see if semantic similarity matrix makes sense by visualizing the highest and lowest matching semantic similarity values.

# For every trait (column), find the other trait with the most extreme
# similarity to it.
#   simMatrix: square similarity matrix with row and column names.
#   pick:      which.max (best match) or which.min (worst match); both ignore
#              NA entries and return the first position on ties.
# Returns a character matrix with one row per trait and three columns:
# trait, matched trait, similarity rounded to 4 digits. Entries are NA when a
# trait has no scored partner. Returns NULL for a matrix with no columns.
extremeSimilarity <- function(simMatrix, pick) {
  perTrait <- lapply(seq_len(ncol(simMatrix)), function(i) {
    # Exclude the trait's own row; drop = FALSE keeps a matrix even when only
    # one other trait remains.
    ss <- simMatrix[-i, , drop = FALSE]
    j <- unname(pick(ss[, i]))
    if (length(j) == 0L) {
      j <- NA_integer_ # every candidate was NA (or there were none)
    }
    cbind(colnames(simMatrix)[i], rownames(ss)[j], round(ss[j, i], 4))
  })
  do.call(rbind, perTrait)
}

maxSS <- extremeSimilarity(semanticSimilarityMatrix, which.max)
as.data.frame(maxSS)


minSS <- extremeSimilarity(semanticSimilarityMatrix, which.min)
as.data.frame(minSS)

Create a neighbor-joining tree from the semantic similarity (SS) matrix to see trait clusters.

# Turn similarity into a distance (1 - similarity) and build a
# neighbor-joining tree from it.
njt <- nj(1 - semanticSimilarityMatrix)

# Draw the unrooted tree to a large PDF page with small labels, then close
# the graphics device so the file is written out.
pdf(file = "../output/njTreeSiluriformesSemanticMatrix.pdf", width = 30, height = 30)
plot(njt, cex = 0.35, type = "unrooted")
dev.off()

hlapp · 2022-10-14T18:10:31Z

@uyedaj and @diegosasso I think this can be considered complete, right?

uyedaj added enhancement New feature or request help wanted Extra attention is needed labels Feb 15, 2019

hlapp mentioned this issue Jun 14, 2019

Adds subsumer matrix query and some similarity metrics #92

Merged

hlapp mentioned this issue Jun 17, 2019

Adds a function for retrieving phenotypes from the KB #95

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Retrieving a semantic similarity matrix #37

Retrieving a semantic similarity matrix #37

uyedaj commented Feb 15, 2019 •

edited by hlapp

Loading

hlapp commented Oct 14, 2022

Retrieving a semantic similarity matrix #37

Retrieving a semantic similarity matrix #37

Comments

uyedaj commented Feb 15, 2019 • edited by hlapp Loading

title: "Pipeline for retrieving semantic similarity matrix" output: html_notebook

hlapp commented Oct 14, 2022

uyedaj commented Feb 15, 2019 •

edited by hlapp

Loading

title: "Pipeline for retrieving semantic similarity matrix"
output: html_notebook