From 482f67d03bff84a7778d6eccee449ee8896e1398 Mon Sep 17 00:00:00 2001 From: sdgamboa Date: Tue, 9 Jan 2024 18:15:44 -0500 Subject: [PATCH] Update import of bugphyzz and how to make signatures. --- NAMESPACE | 1 + R/bugphyzz.R | 175 +++++++++++++++++++++++++++++------- inst/extdata/thresholds.tsv | 40 ++++----- man/importBugphyzz.Rd | 12 +-- man/makeSignatures.Rd | 53 +++++++++++ 5 files changed, 220 insertions(+), 61 deletions(-) create mode 100644 man/makeSignatures.Rd diff --git a/NAMESPACE b/NAMESPACE index c2a6edb8..72c46365 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(fattyAcidComposition) export(getBugAnnotations) export(getBugphyzzSignatures) export(importBugphyzz) +export(makeSignatures) export(physiologies) export(showPhys) export(whichAttr) diff --git a/R/bugphyzz.R b/R/bugphyzz.R index 5939f6c0..4279a8d6 100644 --- a/R/bugphyzz.R +++ b/R/bugphyzz.R @@ -14,39 +14,154 @@ #' @examples #' #' bp <- importBugphyzz() -#' -#' ## Check available groups of attributes -#' unique(bp$Attribute_group) -#' -#' ## Filter only for growth temperature -#' gt <- bp[which(bp$Attribute_group == 'growth temperature'), ] -#' -#' ## Create signatures with taxids at the species level -#' gt_sigs <- getBugphyzzSignatures(gt, tax.id.type = 'NCBI_ID', tax.level = 'species')] -#' lapply(gt_sigs, function(x) head(x)) -#' +#' names(bp) #' importBugphyzz <- function(version = 'devel', force_download = FALSE) { - if (version == 'devel') - url <- 'https://github.com/waldronlab/bugphyzzExports/raw/sdgamboa/update-workflow/bugphyzz_export.tsv' - rpath <- .getResource( - rname = 'bugphyzz_export.tsv', url = url, verbose = TRUE, - force = force_download + types <- c("multistate", "binary", "numeric") + urls <- paste0( + "https://github.com/waldronlab/bugphyzzExports/raw/sdgamboa/phylo/bugphyzz_", + types, + ".csv" ) - thr <- .thresholds() - dat <- utils::read.table(rpath, header = TRUE, sep = '\t') |> - dplyr::mutate(Score = round(.data$Score, digits = 3)) |> - dplyr::mutate(Frequency = dplyr::case_when( - .data$Score == 1 ~ 'always', - .data$Score >= 0.9 & .data$Score < 1 ~ 'usually', - .data$Score >= 0.5 & .data$Score < 0.9 ~ 'sometimes', - .data$Score > 0 & .data$Score < 0.5 ~ 'rarely', - .data$Score == 0 ~ 'never' - )) |> - dplyr::mutate( - Attribute_source = ifelse(.data$Evidence == 'inh', NA, .data$Attribute_source) + names(urls) <- types + if (version == 'devel') { + output <- vector("list", length(urls)) + for (i in seq_along(output)) { + message("Importing ", names(urls)[i], " data...") + names(output)[i] <- names(urls)[i] + rpath <- .getResource( + rname = paste0("bugphyzz_", names(urls)[i], ".tsv"), + url = urls[i], verbose = TRUE, force = force_download + ) + output[[i]] <- utils::read.csv(rpath, header = TRUE, skip = 1) + } + } + + output <- lapply(output, function(x) split(x, x$Attribute_group)) + output <- purrr::list_flatten(output) + names(output) <- purrr::map_chr(output, ~ unique(.x$Attribute_group)) + return(output) +} + + +#' Make bugs signatures +#' +#' \code{makeSignatures} Creates signatures for a list of bugphyzz +#' data.frames imported with \code{importBugphyzz} +#' +#' @param dat A data.frame. +#' @param tax_id_type A character string. Valid options: NCBI_ID, Taxon_name. +#' @param tax_level A character vector. Taxonomic rank. Valid options: +#' kingdom, phylum, class, order, family, genus, species, strain. +#' They can be combined. "mixed" is equivalent to select all valid ranks. +#' @param evidence A character vector. Valid options: exp, igc, nas, tas, tax, asr. +#' They can be combined. Default is all. +#' @param frequency A character vector. Valid options: always, usually, +#' sometimes, rarely, unknown. They can be combiend. Default value is all but +#' rarely. +#' @param min_size Minimun number of bugs in a signature. Default is 10. +#' @param min Minimum value inclusive. Only for numeric attributes. Default is NULL. +#' @param max Maximum value inclusive. Only for numeric attributes. Default is NULL. +#' +#' @return A list of character vector with the IDs of the bugs. +#' @export +#' +#' @examples +#' +#' bp <- importBugphyzz() +#' sigs <- lapply(bp, makeSignatures) +#' sigs <- purrr::list_flatten(sigs) +#' +makeSignatures <- function( + dat, tax_id_type = "NCBI_ID", + tax_level = "mixed", + evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), + frequency = c("always", "usually", "sometimes", "unknown"), + min_size = 10, min = NULL, max = NULL +) { + attr_type <- unique(dat$Attribute_type) + if (tax_level == "mixed") { + tax_level <- c( + "kingdom", "phylum", "class", "order", "family", "genus", "species", + "strain" ) - dplyr::left_join(dat, thr, by = c('Attribute_group', 'Attribute')) + } + dat <- dat |> + dplyr::filter(Rank %in% tax_level) |> + dplyr::filter(.data$Evidence %in% evidence) |> + dplyr::filter(.data$Frequency %in% frequency) + if (!nrow(dat)) { + warning( + "Not enough data for creating signatures. Try different filtering options", + call. = FALSE + ) + return(NULL) + } + if (attr_type %in% c("multistate-intersection", "binary")) { + s <- .makeSignaturesDiscrete(dat = dat, tax_id_type = tax_id_type) + } else if (attr_type %in% c("range", "numeric")) { + s <- .makeSignaturesNumeric( + dat = dat, tax_id_type = tax_id_type, min = min, max = max + ) + } + output <- purrr::keep(s, ~ length(.x) >= min_size) + if (!length(output)) { + warning( + "Not enough data for creating signatures. Try different filtering options", + call. = FALSE + ) + } + return(output) +} + +.makeSignaturesDiscrete <- function(dat, tax_id_type = "NCBI_ID") { + dat |> + dplyr::mutate( + Attribute = paste0("bugphyzz:", .data$Attribute, "|", .data$Attribute_value) + ) |> + {\(y) split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[tax_id_type]])) +} + +.makeSignaturesNumeric <- function( + dat, tax_id_type = "NCBI_ID", min = NULL, max = NULL +) { + if (!is.null(min) || !is.null(max)) { + if (is.null(min)) { + message("Minimum unespecified. Using ", min(dat$Attribute_value), ".") + min <- min(dat$Attribute_value) + } + if (is.null(max)) { + message("Maximum unespecified. Using ", max(dat$Attribute_value), ".") + max <- max(dat$Attribute_value) + } + dat <- dat |> + dplyr::filter( + .data$Attribute_value >= min & .data$Attribute_value <= max + ) |> + dplyr::mutate( + Attribute = paste0("bugphyzz:", .data$Attribute, "| >=", min, " & <=", max) + ) + } else { + thr <- .thresholds() |> + dplyr::filter(.data$Attribute_group == unique(dat$Attribute_group)) + attr_name <- thr$Attribute + min_values <- thr$lower + max_values <- thr$upper + dat$tmp_col <- NA + for (i in seq_along(attr_name)) { + if (is.na(min_values[i])) + min_values[i] <- min(dat$Attribute_value) - 0.01 + if (is.na(max_values[i])) + max_values[i] <- max(dat$Attribute_value) + pos <- which(dat$Attribute_value > min_values[i] & dat$Attribute_value <= max_values[i]) + dat$tmp_col[pos] <- attr_name[i] + dat$Attribute[pos] <- paste0("bugphyzz:", dat$Attribute[pos], "|", attr_name[i], "| > ", round(min_values[i], 2), " & <= ", max_values[i]) + } + } + dat |> + {\(y) split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[tax_id_type]])) } #' Get bugphyzz signatures @@ -235,7 +350,7 @@ whichAttrGrp <- function(bp) { unit = ifelse(is.na(.data$unit), '', .data$unit) ) |> dplyr::mutate(Attribute_range = paste0(range, unit)) |> - dplyr::select( + dplyr::relocate( .data$Attribute_group, .data$Attribute, .data$Attribute_range ) } diff --git a/inst/extdata/thresholds.tsv b/inst/extdata/thresholds.tsv index 8fb1e05d..ecc7b4ae 100644 --- a/inst/extdata/thresholds.tsv +++ b/inst/extdata/thresholds.tsv @@ -1,32 +1,32 @@ Attribute_group Attribute lower upper unit coding genes very small NA 473 NA -coding genes small 474 600 NA -coding genes average 601 6000 NA -coding genes very large 60001 NA NA +coding genes small 473 600 NA +coding genes average 600 6000 NA +coding genes very large 6000 NA NA genome size small NA 490885 bp -genome size average 490886 998123 bp -genome size large 998124 6997434 bp -genome size very large 6997435 NA bp -growth temperature psychrophile NA 24.9 C +genome size average 490885 998123 bp +genome size large 998123 6997434 bp +genome size very large 6997434 NA bp +growth temperature psychrophile NA 25 C growth temperature mesophile 25 45 C -growth temperature thermophile 46 60 C -growth temperature hyperthermophile 61 NA C +growth temperature thermophile 45 60 C +growth temperature hyperthermophile 60 NA C length small NA 3.8 μm -length average 3.9 22 μm -length large 23 60 μm -length very large 61 NA μm +length average 3.8 22 μm +length large 22 60 μm +length very large 60 NA μm mutation rate per site per generation slow NA 2.92 NA -mutation rate per site per generation medium 2.93 16 NA -mutation rate per site per generation fast 17 NA NA +mutation rate per site per generation medium 2.92 16 NA +mutation rate per site per generation fast 16 NA NA mutation rate per site per year slow NA 7.5 NA -mutation rate per site per year medium 7.6 20 NA -mutation rate per site per year medium fast 21 54.2 NA -mutation rate per site per year fast 54.3 NA NA +mutation rate per site per year medium 7.5 20 NA +mutation rate per site per year medium fast 20 54.2 NA +mutation rate per site per year fast 54.2 NA NA optimal ph acidic NA 6 NA optimal ph neutral 6 8 NA optimal ph alkaline 8 9.76 NA optimal ph very alkaline 9.76 NA NA width small NA 0.9 μm -width average 0.91 3.5 μm -width large 3.51 12 μm -width very large 13 NA μm +width average 0.9 3.5 μm +width large 3.5 12 μm +width very large 12 NA μm diff --git a/man/importBugphyzz.Rd b/man/importBugphyzz.Rd index 91073aa6..0ae0702e 100644 --- a/man/importBugphyzz.Rd +++ b/man/importBugphyzz.Rd @@ -23,16 +23,6 @@ attributes with categorical and binary (boolean) attributes. \examples{ bp <- importBugphyzz() - -## Check available groups of attributes -unique(bp$Attribute_group) - -## Filter only for growth temperature -gt <- bp[which(bp$Attribute_group == 'growth temperature'), ] - -## Create signatures with taxids at the species level -gt_sigs <- getBugphyzzSignatures(gt, tax.id.type = 'NCBI_ID', tax.level = 'species')] -lapply(gt_sigs, function(x) head(x)) - +names(bp) } diff --git a/man/makeSignatures.Rd b/man/makeSignatures.Rd new file mode 100644 index 00000000..f00f3e3f --- /dev/null +++ b/man/makeSignatures.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bugphyzz.R +\name{makeSignatures} +\alias{makeSignatures} +\title{Make bugs signatures} +\usage{ +makeSignatures( + dat, + tax_id_type = "NCBI_ID", + tax_level = "mixed", + evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), + frequency = c("always", "usually", "sometimes", "unknown"), + min_size = 10, + min = NULL, + max = NULL +) +} +\arguments{ +\item{dat}{A data.frame.} + +\item{tax_id_type}{A character string. Valid options: NCBI_ID, Taxon_name.} + +\item{tax_level}{A character vector. Taxonomic rank. Valid options: +kingdom, phylum, class, order, family, genus, species, strain. +They can be combined. "mixed" is equivalent to select all valid ranks.} + +\item{evidence}{A character vector. Valid options: exp, igc, nas, tas, tax, asr. +They can be combined. Default is all.} + +\item{frequency}{A character vector. Valid options: always, usually, +sometimes, rarely, unknown. They can be combiend. Default value is all but +rarely.} + +\item{min_size}{Minimun number of bugs in a signature. Default is 10.} + +\item{min}{Minimum value inclusive. Only for numeric attributes. Default is NULL.} + +\item{max}{Maximum value inclusive. Only for numeric attributes. Default is NULL.} +} +\value{ +A list of character vector with the IDs of the bugs. +} +\description{ +\code{makeSignatures} Creates signatures for a list of bugphyzz +data.frames imported with \code{importBugphyzz} +} +\examples{ + +bp <- importBugphyzz() +sigs <- lapply(bp, makeSignatures) +sigs <- purrr::list_flatten(sigs) + +}