Skip to content

Commit

Permalink
Adds ability to obtain or calculate term frequencies
Browse files Browse the repository at this point in the history
Note that not all corpora are supported in the KB API yet. Specifically,
support for the "gene_annotations" corpus is pending in the API
(see phenoscape/phenoscape-kb-services#146 and phenoscape/phenoscape-kb-services#189),
and at least presently there are some issues in the KB API for the "genes"
corpus (see phenoscape/phenoscape-kb-services#191 and
phenoscape/phenoscape-kb-services#192).
  • Loading branch information
hlapp committed Feb 18, 2020
1 parent 35eae42 commit 330fc7d
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 0 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(anatomy_ontology_iris)
export(as.phenotype)
export(chars)
export(charstates)
export(corpus_size)
export(cosine_similarity)
export(find_term)
export(get_KBinfo)
Expand Down Expand Up @@ -50,6 +51,7 @@ export(subsumer_matrix)
export(tanimoto_similarity)
export(taxon_ontology_iris)
export(term_category)
export(term_freqs)
importClassesFrom(RNeXML,nexml)
importFrom(RNeXML,add_meta)
importFrom(RNeXML,expand_prefix)
Expand Down
110 changes: 110 additions & 0 deletions R/term-weights.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#' Obtains term frequencies for the Phenoscape KB
#'
#' Determines the frequencies for the given input list of terms, based on
#' the selected corpus.
#'
#' Depending on the corpus selected, the frequencies are queried directly
#' from the Phenoscape API, or calculated based on query results. Currently,
#' the Phenoscape KB has precomputed frequencies for corpora "taxa" and
#' "genes"; these corpora require all terms to be phenotypes. For corpus
#' "taxon_annotations" one API query is issued per term, and the returned
#' count is divided by the size of the corpus (see [corpus_size()]).
#'
#' @param x a vector or list of one or more terms, either as IRIs or as term
#'   objects. If `x` is empty, an empty numeric vector is returned without
#'   querying the API.
#' @param as the category or categories of the input terms (see [term_category()]).
#'   Supported categories are "entity", "quality", and "phenotype". The value
#'   must either be a single category (applying to all terms), or a vector of
#'   categories (of same length as `x`). If provided as "auto" (or NULL), the
#'   category of each term is automatically determined. The default is "auto".
#'   Unsupported category names result in an error.
#' @param corpus the name of the corpus for which to determine frequencies.
#'   Supported values are "taxon_annotations", "taxa", "gene_annotations", and
#'   "genes". (At present, support for "gene_annotations" is pending support in
#'   the Phenoscape API.) The default is "taxon_annotations".
#' @return a vector of frequencies as floating point numbers (between zero
#'   and 1.0), of the same length (and ordering) as the input list of terms.
#' @examples
#' terms <- c("pectoral fin", "pelvic fin", "dorsal fin", "paired fin")
#' IRIs <- sapply(terms, pk_get_iri, as = "anatomy")
#' term_freqs(IRIs)
#'
#' phens <- get_phenotypes(entity = "basihyal bone")
#' term_freqs(phens$id, as = "phenotype", corpus = "taxon_annotations")
#' term_freqs(phens$id, as = "phenotype", corpus = "taxa")
#'
#' @export
term_freqs <- function(x,
                       as = c("auto", "entity", "quality", "phenotype"),
                       corpus = c("taxon_annotations", "taxa", "gene_annotations", "genes")) {
  # match.arg(several.ok = TRUE) silently drops entries that match no choice,
  # which would change the length of 'as' (and hence which term each category
  # applies to). Remember the supplied length so this can be detected.
  n_given <- length(as)
  as <- match.arg(as, several.ok = TRUE)
  corpus <- match.arg(corpus)
  if (n_given > 0 && length(as) != n_given)
    stop("'as' contains unsupported categories; supported are ",
         "'auto', 'entity', 'quality', and 'phenotype'", call. = FALSE)

  # Nothing to look up; avoids API calls and dividing an empty list below.
  if (length(x) == 0)
    return(numeric(0))

  if (as[1] == "auto") {
    as <- term_category(x)
  } else if (length(as) > 1 && length(as) != length(x)) {
    stop("'as' must be a single value, or have the same length as 'x'", call. = FALSE)
  } else if (any(as == "auto")) {
    stop("'auto' can only be applied to all terms", call. = FALSE)
  }
  # Guard against categories that could not be determined; an NA would
  # otherwise surface as a cryptic error in the conditions or queries below.
  if (anyNA(as))
    stop("the category of one or more terms could not be determined; ",
         "provide it explicitly using 'as'", call. = FALSE)

  ctotal <- corpus_size(corpus = corpus)
  if (corpus == "taxa" || corpus == "genes") {
    if (any(as != "phenotype"))
      stop("corpus '", corpus, "' requires phenotype terms", call. = FALSE)
    corpusID <- paste0("http://kb.phenoscape.org/sim/", corpus)
    query <- list(terms = as.character(jsonlite::toJSON(x)),
                  corpus_graph = corpusID)
    freqs <- get_csv_data(pkb_api("/similarity/frequency"), query = query,
                          header = FALSE, row.names = 1, check.names = FALSE)
    # Put the rows returned by the API into the order of the input terms;
    # terms absent from the result yield NA.
    reordering <- match(x, rownames(freqs))
    freqs <- freqs[reordering, 1] / ctotal
  } else if (corpus == "taxon_annotations") {
    # A single category applies to all terms.
    if (length(as) == 1)
      as <- rep(as, length(x))
    counts <- vapply(seq_along(x), function(i) {
      # The category name doubles as the name of the query parameter.
      query <- list(total = TRUE)
      query[[as[[i]]]] <- x[[i]]
      res <- get_json_data(pkb_api("/taxon/annotations"), query = query)
      as.numeric(res$total)
    }, numeric(1))
    freqs <- counts / ctotal
  } else {
    stop("corpus '", corpus, "' is currently unsupported", call. = FALSE)
  }
  unname(freqs)
}

#' Obtain the size of different corpora
#'
#' Obtains the size of a certain number of predefined corpora. The total size
#' of a corpus is important for calculating term frequencies.
#'
#' Corpus sizes are cached per session after they have first been obtained.
#' Thus, if the Phenoscape KB changes, a session needs to be restarted to
#' have those changes be reflected.
#'
#' @param corpus the name of the corpus, currently one of "taxon_annotations",
#'   "taxa", "gene_annotations", and "genes". (At present "gene_annotations" is
#'   pending support by the Phenoscape API.) Unambiguous abbreviations are
#'   acceptable.
#' @return the size of the specified corpus as an integer number.
#' @examples
#' corpus_size("taxa")
#' corpus_size("taxon_annotations")
#' @export
corpus_size <- local({
  # Session-level cache, keyed by corpus name. An environment is used because
  # it has reference semantics: values stored from within the function persist
  # across calls. (Assigning into a list with `<-` inside the function would
  # only modify a copy local to that call, so nothing would ever be cached.)
  .sizes <- new.env(parent = emptyenv())
  function(corpus = c("taxon_annotations", "taxa", "gene_annotations", "genes")) {
    corpus <- match.arg(corpus)
    cached <- .sizes[[corpus]]
    if (!is.null(cached))
      return(cached)

    if (corpus == "taxa" || corpus == "genes") {
      corpusID <- paste0("http://kb.phenoscape.org/sim/", corpus)
      res <- get_json_data(pkb_api("/similarity/corpus_size"),
                           query = list(corpus_graph = corpusID))
    } else if (corpus == "taxon_annotations") {
      res <- get_json_data(pkb_api("/taxon/annotations"), list(total = TRUE))
    } else {
      stop("corpus '", corpus, "' is currently unsupported", call. = FALSE)
    }
    size <- res$total
    # Only cache an actual result, so that a response lacking a total is
    # retried on the next call.
    if (!is.null(size))
      assign(corpus, size, envir = .sizes)
    size
  }
})
31 changes: 31 additions & 0 deletions man/corpus_size.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 48 additions & 0 deletions man/term_freqs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 330fc7d

Please sign in to comment.