Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retrieving a semantic similarity matrix #37

Open
uyedaj opened this issue Feb 15, 2019 · 1 comment
Open

Retrieving a semantic similarity matrix #37

uyedaj opened this issue Feb 15, 2019 · 1 comment
Labels
enhancement New feature or request help wanted Extra attention is needed

Comments

@uyedaj
Copy link

uyedaj commented Feb 15, 2019


title: "Pipeline for retrieving semantic similarity matrix"
output: html_notebook

This is our pipeline for retrieving a semantic similarity matrix, which we would like to see streamlined and made robust in rphenoscape. Our first step is to obtain a treedata object for anatomical entities in catfish.

library(rphenoscape)
library(treeplyr)
library(RCurl)
library(rjson)
library(readr)
library(urltools)
library(pracma)
library(httr)


# Download the Ontotrace matrix (NeXML) for Siluriformes across all
# anatomical entities, then convert it to a taxa-by-trait table.
nex <- pk_get_ontotrace_xml(
  taxon = "Siluriformes",
  entity = "anatomical entity"
)
m <- pk_get_ontotrace(nex)

# Replace spaces in the taxon names with underscores
# (presumably to match the tip labels of the tree read below).
m$taxa <- gsub(" ", "_", m$taxa, fixed = TRUE)
# Optional: keep a local copy of the raw matrix.
#write.csv(m, file="~/repos/ontologyPCM/data/Ontotrace_Siluriformes_AnatomicalEntity.csv")

# Read the phylogeny from Dryad and match it against the trait table.
tree <- read.tree(
  "https://datadryad.org/bitstream/handle/10255/dryad.199127/actinopt_12k_treePL.tre?sequence=1"
)
td <- make.treedata(tree, m)
td

We drop the OTU columns and look at the remaining trait names.

# Trait names are the column names of the matched data, minus the first
# two columns (OTU data).
traits <- colnames(td$dat)[-c(1, 2)]
traits

Get IRI ids for each trait.

# Look up the Phenoscape detail record for every trait label.
traitDetails <- lapply(
  traits,
  function(x) pk_anatomical_detail(x, verbose = TRUE)
)
head(traitDetails, 5)

# Each record must be a table with an '@id' column (the term IRI). Fail
# early, naming the offending labels, instead of hitting an obscure
# indexing error further down.
hasId <- vapply(
  traitDetails,
  function(x) "@id" %in% colnames(x),
  logical(1)
)
if (!all(hasId)) {
  stop(
    "No '@id' returned by pk_anatomical_detail() for: ",
    paste(traits[!hasId], collapse = ", "),
    call. = FALSE
  )
}

# Collect the IRIs into one plain, unnamed character vector. Going through
# lapply() + unlist() keeps the result type stable whether the '@id' column
# comes back as a vector, a factor or a one-column table.
# NOTE(review): a label that matches several terms contributes several
# IRIs, so length(traitIDs) may exceed length(traits) - confirm upstream.
traitIDs <- unlist(
  lapply(traitDetails, function(x) {
    as.character(unlist(x[, "@id"], use.names = FALSE))
  }),
  use.names = FALSE
)

# Percent-encode each IRI; the result is named by the raw IRI.
irisPhenotypes <- vapply(traitIDs, url_encode, character(1))

This is the ugliest part: making sure the URL encoding works. I think it is also the hardest part (for me) to make robust.

# Build the URL-encoded form body `iris=[ "<iri>","<iri>",... ]` and write
# it to `filename` in a single call.
filename <- "../output/siluriformesFormData.txt"

if (length(irisPhenotypes) == 0) {
  stop("No phenotype IRIs available to encode.", call. = FALSE)
}

# Escape the three characters this pipeline handles by hand on top of
# url_encode(): "/" -> %2F, ":" -> %3A and "=" -> %3D%0A.
encodeReserved <- function(x) {
  x <- gsub("/", "%2F", x, fixed = TRUE)
  x <- gsub(":", "%3A", x, fixed = TRUE)
  gsub("=", "%3D%0A", x, fixed = TRUE)
}
irisPhenotypes <- lapply(irisPhenotypes, encodeReserved)

# %22 is a double quote, %2C a comma, %5B / %5D the brackets, %0A a line
# feed and %20 a space. Joining with `collapse` puts a comma between the
# entries only, so a single IRI is written exactly once.
quotedIris <- paste0(
  "%22", unlist(irisPhenotypes, use.names = FALSE), "%22"
)
formData <- paste0(
  "iris=%5B%0A%20%20",
  paste(quotedIris, collapse = "%2C"),
  "%5D%0A"
)
cat(formData, file = filename)

Submit the api request.

# POST the form data to the Phenoscape KB Jaccard-similarity endpoint.
# The request is made with httr (already loaded by this notebook) rather
# than by shelling out to `curl`, so it does not depend on a curl binary or
# on shell quoting rules, and HTTP failures are reported as R errors.
api.semanticSimilarity_url <- "http://kb.phenoscape.org/api/similarity/jaccard"

# `curl -d @file` drops line breaks from the file before sending it;
# joining the lines with "" reproduces that.
formBody <- paste(
  readLines("../output/siluriformesFormData.txt", warn = FALSE),
  collapse = ""
)

semanticSimilarityResponse <- httr::POST(
  api.semanticSimilarity_url,
  body = formBody,
  httr::content_type("application/x-www-form-urlencoded")
)
# Stop here on a 4xx/5xx status instead of handing an error page to the
# JSON parser in the next step.
httr::stop_for_status(semanticSimilarityResponse)

# Response body as a single JSON string.
semanticSimilarityAPIResults <- httr::content(
  semanticSimilarityResponse,
  as = "text",
  encoding = "UTF-8"
)

Process the results from the api request.

# Parse the API response and assemble a symmetric trait-by-trait
# similarity matrix.

# The response may arrive as several lines; join them into one JSON string.
results <- fromJSON(paste(semanticSimilarityAPIResults, collapse = "\n"))
pairResults <- results$results

# One score per returned pair; a missing score becomes NA.
scores <- vapply(
  pairResults,
  function(x) if (is.null(x$score)) NA_real_ else as.numeric(x$score),
  numeric(1)
)

# One row per returned pair, holding the (unescaped) IRIs of its terms.
# This is NULL when the API returned no pairs.
result_terms <- do.call(rbind, lapply(pairResults, function(x) {
  vapply(x$terms, curlUnescape, character(1), USE.NAMES = FALSE)
}))

# Square matrix indexed by IRI: NA = pair not scored, 1 on the diagonal.
termIRIs <- curlUnescape(irisPhenotypes)
nTerms <- length(irisPhenotypes)
semanticSimilarityMatrix <- matrix(
  NA_real_,
  nrow = nTerms, ncol = nTerms,
  dimnames = list(termIRIs, termIRIs)
)
diag(semanticSimilarityMatrix) <- 1

# Every term returned by the API must be one of the submitted IRIs.
unknownTerms <- setdiff(as.vector(result_terms), termIRIs)
if (length(unknownTerms) > 0) {
  stop(
    "API returned terms that were not submitted: ",
    paste(unknownTerms, collapse = ", "),
    call. = FALSE
  )
}

# Fill both triangles. seq_len(NROW(.)) makes this a no-op when there are
# no pairs.
for (i in seq_len(NROW(result_terms))) {
  termA <- result_terms[i, 1]
  termB <- result_terms[i, 2]
  semanticSimilarityMatrix[termB, termA] <- scores[i]
  semanticSimilarityMatrix[termA, termB] <- scores[i]
}

# Relabel from IRIs to trait names. This relies on the IRIs being in the
# same order as `traits`, one IRI per trait.
if (length(traits) != nTerms) {
  stop(
    "Number of traits (", length(traits),
    ") differs from number of IRIs (", nTerms, ").",
    call. = FALSE
  )
}
rownames(semanticSimilarityMatrix) <- colnames(semanticSimilarityMatrix) <- traits

write.csv(semanticSimilarityMatrix, file="../output/siluriformesSemanticSimMatrix.csv")

Check to see if semantic similarity matrix makes sense by visualizing the highest and lowest matching semantic similarity values.

# For every trait, report the other trait chosen by `pick` (which.max or
# which.min) together with the score rounded to 4 decimals.
#
# simMatrix: square similarity matrix with row and column names.
# pick:      function returning the index of the wanted element.
# Returns a character matrix with one row per trait and three columns:
# trait, best-matching other trait, score.
#
# which.max()/which.min() skip NA, so pairs the API never scored do not
# turn a whole row into NA. A trait with no scored partner gets NA in the
# last two columns.
extremeMatches <- function(simMatrix, pick) {
  rows <- lapply(seq_len(ncol(simMatrix)), function(i) {
    # Column i without the trait's comparison against itself.
    otherNames <- rownames(simMatrix)[-i]
    otherScores <- unname(simMatrix[-i, i])
    j <- pick(otherScores)
    if (length(j) == 0) {
      return(cbind(colnames(simMatrix)[i], NA_character_, NA_character_))
    }
    cbind(colnames(simMatrix)[i], otherNames[j], round(otherScores[j], 4))
  })
  do.call(rbind, rows)
}

maxSS <- extremeMatches(semanticSimilarityMatrix, which.max)
as.data.frame(maxSS)


minSS <- extremeMatches(semanticSimilarityMatrix, which.min)
as.data.frame(minSS)

Create a neighbor-joining tree of SS matrix to see trait clusters.

# Neighbor-joining tree built from the distances 1 - similarity.
# NOTE(review): ape's nj() is understood to reject missing distances; pairs
# the API did not score are NA in the matrix - confirm it is complete.
njt <- nj(1 - semanticSimilarityMatrix)

# Draw the unrooted tree to a large PDF. `finally` closes the device even
# if plotting fails, so a failed plot cannot leave the PDF device open and
# capture later graphics.
pdf("../output/njTreeSiluriformesSemanticMatrix.pdf", height = 30, width = 30)
invisible(tryCatch(
  plot(njt, type = "unrooted", cex = 0.35),
  finally = dev.off()
))
@hlapp
Copy link
Member

hlapp commented Oct 14, 2022

@uyedaj and @diegosasso I think this can be considered complete, right?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request help wanted Extra attention is needed
Projects
None yet
Development

No branches or pull requests

2 participants