diff --git a/.gitignore b/.gitignore index 9c464472..74caa1df 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,5 @@ jupyter_notebooks/updated_spore_shape_v2.xlsx protratis_data/ misc + +*.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index 94c015a1..bdbdc88e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -59,7 +59,8 @@ Imports: tidyselect, BiocFileCache, httr2, - tools + tools, + S4Vectors Suggests: DT, forcats, diff --git a/R/bacdive.R b/R/bacdive.R index ca94273f..c0c88556 100644 --- a/R/bacdive.R +++ b/R/bacdive.R @@ -1,9 +1,9 @@ ## Main function for importing BacDive .getBacDive <- function(verbose = FALSE ) { - bacdive_data <- .importBacDiveExcel(verbose = verbose) - colnames(bacdive_data) <- .changeBDColNames(colnames(bacdive_data)) - .getTidyBD(bacdive_data) + bacdiveData <- .importBacDiveExcel(verbose = verbose) + colnames(bacdiveData) <- .changeBDColNames(colnames(bacdiveData)) + .getTidyBD(bacdiveData) } ## Helper function for .getBacDive @@ -34,8 +34,8 @@ } ## Helper function for .getBacDive -.getTidyBD <- function(bacdive_data) { - bacdive_data |> +.getTidyBD <- function(bacdiveData) { + bacdiveData |> tidyr::pivot_longer( # Attributes start in the gram_stain column cols = gram_stain:tidyr::last_col(), @@ -60,10 +60,10 @@ .reshapeBacDive <- function(df) { df[['Attribute_source']] <- 'BacDive' - split_df <- split(df, factor(df[['Attribute']])) + splitDf <- split(df, factor(df[['Attribute']])) ## Attributes that must be changed from character to logical (simplest fix) - attr_names <- c( + attrNames <- c( 'aerophilicity', 'shape', 'country', @@ -72,20 +72,22 @@ 'isolation site' ## colony color (delete) ) - - for (i in seq_along(attr_names)) { - split_df[[attr_names[i]]] <- .catToLog(split_df[[attr_names[i]]]) - if (attr_names[i] %in% c('aerophilicity', 'shape')) { - split_df[[attr_names[i]]]$Attribute_type <- + + ## Modifying an already existing vector rather than creating a list + ## Keeping this for loop + for (i in seq_along(attrNames)) { + 
splitDf[[attrNames[i]]] <- .catToLog(splitDf[[attrNames[i]]]) + if (attrNames[i] %in% c('aerophilicity', 'shape')) { + splitDf[[attrNames[i]]]$Attribute_type <- 'multistate-intersection' } else { - split_df[[attr_names[i]]]$Attribute_type <- 'multistate-union' + splitDf[[attrNames[i]]]$Attribute_type <- 'multistate-union' } } ## aerophilicity #### ## This is only to match the data in the bugphyzz spreadsheet - aer <- split_df[['aerophilicity']] + aer <- splitDf[['aerophilicity']] aer$Attribute <- dplyr::case_when( aer$Attribute == 'aerobe' ~ 'aerobic', aer$Attribute == 'anaerobe' ~ 'anaerobic', @@ -95,51 +97,51 @@ aer$Attribute == 'obligate aerobe' ~ 'obligately aerobic', TRUE ~ aer$Attribute ) - split_df[['aerophilicity']] <- aer + splitDf[['aerophilicity']] <- aer ## animal pathogen #### - pos <- names(split_df) == 'animal pathongen' - names(split_df)[pos] <- 'animal pathogen' - x_ <- split_df[['animal pathogen']][['Attribute_value']] + pos <- names(splitDf) == 'animal pathongen' + names(splitDf)[pos] <- 'animal pathogen' + x_ <- splitDf[['animal pathogen']][['Attribute_value']] x_ <- ifelse(x_ == "yes, in single cases", "yes", x_) x_ <- dplyr::case_when(x_ == 'yes' ~ TRUE, x_ == 'no' ~ FALSE) - split_df[['animal pathogen']][['Attribute_value']] <- x_ - split_df[['animal pathogen']][['Attribute_group']] <- 'animal pathogen' - split_df[['animal pathogen']][['Attribute']] <- 'animal pathogen' - split_df[['animal pathogen']][['Attribute_type']] <- 'binary' + splitDf[['animal pathogen']][['Attribute_value']] <- x_ + splitDf[['animal pathogen']][['Attribute_group']] <- 'animal pathogen' + splitDf[['animal pathogen']][['Attribute']] <- 'animal pathogen' + splitDf[['animal pathogen']][['Attribute_type']] <- 'binary' ## biosafety level #### - y <- split_df[['biosafety level comment']][ + y <- splitDf[['biosafety level comment']][ , c('BacDive_ID', 'Attribute_value') ] colnames(y)[2] <- 'Note' - x <- dplyr::left_join(split_df[['biosafety level']], y, by = 'BacDive_ID') + 
x <- dplyr::left_join(splitDf[['biosafety level']], y, by = 'BacDive_ID') x[['Attribute_value']] <- paste0('biosafety level ', x[['Attribute_value']]) x[['Attribute']] <- x[['Attribute_value']] x[['Attribute_value']] <- TRUE x[['Attribute_group']] <- 'biosafety level' x[['Attribute_type']] <- 'multistate-intersection' - split_df[['biosafety level']] <- x - split_df[['biosafety level comment']] <- NULL + splitDf[['biosafety level']] <- x + splitDf[['biosafety level comment']] <- NULL ## colony color #### ## This one must be removed - split_df[['colony color']] <- NULL + splitDf[['colony color']] <- NULL ## cultivation medium used - growth medium #### - pos <- names(split_df) == 'cultivation medium used' - names(split_df)[pos] <- 'growth medium' - split_df[['growth medium']][['Attribute_group']] <- 'growth medium' + pos <- names(splitDf) == 'cultivation medium used' + names(splitDf)[pos] <- 'growth medium' + splitDf[['growth medium']][['Attribute_group']] <- 'growth medium' ## growth temperature #### ## culture temperature ## culture temperature growth ## culture temperature range (ignore) ## culture temperature type (ignore) - split_df[['culture temperature range']] <- NULL - split_df[['culture temperature type']] <- NULL - a <- split_df[['culture temperature']] - b <- split_df[['culture temperature growth']] + splitDf[['culture temperature range']] <- NULL + splitDf[['culture temperature type']] <- NULL + a <- splitDf[['culture temperature']] + b <- splitDf[['culture temperature growth']] b_ <- b[,c('BacDive_ID', 'Attribute_value')] colnames(b_)[2] <- 'growth' ab <- dplyr::left_join(a, b_, by = 'BacDive_ID') @@ -148,25 +150,25 @@ ab[['Attribute_group']] <- 'growth temperature' ab[['Attribute_type']] <- 'range' ab[['Attribute']] <- 'growth temperature' - split_df[['growth temperature']] <- ab - split_df[['culture temperature']] <- NULL - split_df[['culture temperature growth']] <- NULL + splitDf[['growth temperature']] <- ab + splitDf[['culture temperature']] <- 
NULL + splitDf[['culture temperature growth']] <- NULL ## gram stain #### - gs <- split_df[['gram stain']] + gs <- splitDf[['gram stain']] gs[['Attribute']] <- paste(gs[['Attribute']], gs[['Attribute_value']]) gs[['Attribute_value']] <- TRUE gs[['Attribute_group']] <- 'gram stain' gs[['Attribute_type']] <- 'multistate-intersection' - split_df[['gram stain']] <- gs + splitDf[['gram stain']] <- gs ## halophily #### - valid_terms <- c( + validTerms <- c( 'NaCl', 'KCl', 'MgCl2', 'MgCl2x6H2O', 'Na\\+', 'MgSO4x7H2O', 'Na2SO4', 'Sea salts', 'Chromium \\(Cr6\\+\\)' ) - regex <- paste0('(', paste0(valid_terms, collapse = '|'), ')') - split_df[['halophily']] <- split_df[['halophily']] |> + regex <- paste0('(', paste0(validTerms, collapse = '|'), ')') + splitDf[['halophily']] <- splitDf[['halophily']] |> dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> tidyr::unnest(cols = 'Attribute_value') |> dplyr::filter(!grepl('no growth', Attribute_value)) |> @@ -194,7 +196,7 @@ dplyr::distinct() ## hemolysis #### - split_df[['hemolysis']] <- split_df[['hemolysis']] |> + splitDf[['hemolysis']] <- splitDf[['hemolysis']] |> dplyr::mutate( Attribute_value = strsplit(Attribute_value, ';|/') ) |> @@ -210,21 +212,21 @@ ## incubation period ## This one must be removed - split_df[['incubation period']] <- NULL + splitDf[['incubation period']] <- NULL ## motility #### - split_df[['motility']] <- split_df[['motility']] |> + splitDf[['motility']] <- splitDf[['motility']] |> dplyr::mutate( Attribute_value = dplyr::case_when( Attribute_value == 'yes' ~ TRUE, Attribute_value == 'no' ~ FALSE ) ) - split_df[['motility']][['Attribute_group']] <- 'motility' - split_df[['motility']][['Attribute_type']] <- 'binary' + splitDf[['motility']][['Attribute_group']] <- 'motility' + splitDf[['motility']][['Attribute_type']] <- 'binary' ## pathogenicity human #### - pat <- split_df[['pathogenicity human']] + pat <- splitDf[['pathogenicity human']] pat[['Note']] <- stringr::str_extract( 
pat[['Attribute_value']], 'in single cases' ) @@ -235,10 +237,10 @@ pat <- pat[!is.na(pat[['Attribute_value']]),] pat[['Attribute_group']] <- 'pathogenicity human' pat[['Attribute_type']] <- 'binary' - split_df[['pathogenicity human']] <- pat + splitDf[['pathogenicity human']] <- pat ## metabolite production #### - mp <- split_df[['metabolite production']] + mp <- splitDf[['metabolite production']] mp <- mp |> dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> tidyr::unnest(Attribute_value) @@ -250,12 +252,12 @@ mp[['Attribute']] <- sub(' (yes|no)$', '', mp[['Attribute']]) mp[['Attribute_group']] <- 'metabolite utilization' mp[['Attribute_type']] <- 'multistate-intersection' - split_df[['metabolite production']] <- mp + splitDf[['metabolite production']] <- mp ## metabolite utilization #### - pos <- names(split_df) == 'metabolite utiilization' - names(split_df)[pos] <- 'metabolite utilization' - mu <- split_df[['metabolite utilization']] + pos <- names(splitDf) == 'metabolite utiilization' + names(splitDf)[pos] <- 'metabolite utilization' + mu <- splitDf[['metabolite utilization']] mu <- mu |> dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |> tidyr::unnest(Attribute_value) |> @@ -280,10 +282,10 @@ dplyr::mutate(Attribute_value = as.logical(Attribute_value)) mu[['Attribute_group']] <- 'metabolite utilization' mu[['Attribute_type']] <- 'multistate-intersection' - split_df[['metabolite utilization']] <- mu + splitDf[['metabolite utilization']] <- mu ## spore formation #### - sf <- split_df[['spore formation']] + sf <- splitDf[['spore formation']] sf <- sf |> dplyr::mutate( Attribute_value = dplyr::case_when( @@ -294,9 +296,9 @@ Attribute_type = 'binary' ) |> dplyr::filter(!is.na(Attribute_value)) - split_df[['spore formation']] <- sf + splitDf[['spore formation']] <- sf - split_df <- lapply(split_df, function(x) { + splitDf <- lapply(splitDf, function(x) { x <- as.data.frame(x) x[['NCBI_ID']] <- as.character(x[['NCBI_ID']]) 
x[['Parent_NCBI_ID']] <- as.character(x[['Parent_NCBI_ID']]) @@ -312,7 +314,7 @@ as.data.frame(x) }) - return(split_df) + return(splitDf) } ## Helper function for .reshapeBacDive diff --git a/R/bugphyzz.R b/R/bugphyzz.R index 67eb1ed0..90a06164 100644 --- a/R/bugphyzz.R +++ b/R/bugphyzz.R @@ -10,14 +10,15 @@ utils::globalVariables(c( #' #' \code{importBugphyzz} imports bugphyzz annotations as a list of #' tidy data.frames. To learn more about the structure of the data.frames -#' please check the bugphyzz vignette with `browseVignettes("bugphyzz")`. +#' please check the bugphyzz vignette with `browseVignettes("bugphyzz")` or +#' `vignette("bugphyzz", "bugphyzz")`. #' #' @param version Character string indicating the version. Default is the #' latest release on Zenodo. Options: Zenodo DOI, GitHub commit hash, or devel. -#' @param force_download Logical value. Force a fresh download of the data or +#' @param forceDownload Logical value. Force a fresh download of the data or #' use the one stored in the cache (if available). Default is FALSE. #' @param v Validation value. Default 0.8 (see details). -#' @param exclude_rarely Default is TRUE. Exclude values with +#' @param excludeRarely Default is TRUE. Exclude values with #' Frequency == FALSE (see details). #' #' @details @@ -37,13 +38,18 @@ utils::globalVariables(c( #' imported. The minimum value can be adjusted with the `v` argument (only #' values between 0 and 1). #' -#' ## Frequency (exclude_rarely argument) +#' ## Frequency (excludeRarely argument) #' One of the variables in the bugphyzz data.frames is "Frequency", which #' can adopt values of #' "always", "usually", "sometimes", "rarely", or "never". By default #' "never" and "rarely" are excluded. "rarely" could be included with -#' `exclude_rarely = FALSE`. To learn more about these frequency keywords +#' `excludeRarely = FALSE`. To learn more about these frequency keywords #' please check the bugphyzz vignette with `browseVignettes("bugphyzz")`.
+#' +#' ## Sources +#' By default, the datasets imported with the `importBugphyzz` function +#' return a shortened version of the source. Please use +#' `vignette("sources", "bugphyzz")` to see the full sources. #' #' @return A list of tidy data frames. #' @export @@ -54,34 +60,18 @@ utils::globalVariables(c( #' names(bp) #' importBugphyzz <- function( - version = "10.5281/zenodo.10980813", force_download = FALSE, v = 0.8, - exclude_rarely = TRUE + version = "10.5281/zenodo.12574596", forceDownload = FALSE, v = 0.8, + excludeRarely = TRUE ) { ## output is a list of three data.frames ## one of each: binary, multistate, numeric - output <- .downloadResource(version, force_download) + output <- .downloadResource(version, forceDownload) - ## TODO add release version output <- lapply(output, function(x) split(x, x$Attribute)) output <- purrr::list_flatten(output) - ## TODO correct plant pathogenicity name earlier in the workflow or - ## better yet, directly in the curation - pos <- which(names(output) == "plant pathogenity") - names(output)[pos] <- "plant pathogenicity" - output <- purrr::map(output, ~ { - .x |> - dplyr::mutate( - Attribute = ifelse( - Attribute == "plant pathogenity", - "plant pathogenicity", - Attribute - ) - ) - }) - names(output) <- purrr::map_chr(output, ~ unique(.x$Attribute)) val <- .validationData() |> dplyr::filter(rank == "all") |> @@ -90,13 +80,13 @@ importBugphyzz <- function( dplyr::mutate(attribute = tolower(attribute)) output <- purrr::map(output, ~ { - attr_type <- unique(.x$Attribute_type) - if (attr_type == "binary") { + attrType <- unique(.x$Attribute_type) + if (attrType == "binary") { val <- dplyr::select(val, Attribute = attribute, value) o <- dplyr::left_join(.x, val, by = "Attribute" ) } else if ( - attr_type == "multistate-intersection" || - attr_type == "multistate-union" + attrType == "multistate-intersection" || + attrType == "multistate-union" ) { val <- dplyr::select( val, Attribute = physiology, Attribute_value =
attribute, value @@ -105,7 +95,7 @@ importBugphyzz <- function( dplyr::mutate(.x, Attribute_value = tolower(Attribute_value)), val, by = c("Attribute", "Attribute_value") ) - } else if (attr_type == "numeric") { + } else if (attrType == "numeric") { val <- dplyr::select(val, Attribute = attribute, value) o <- dplyr::left_join(.x, val, by = "Attribute") |> dplyr::rename(NSTI = nsti) @@ -118,7 +108,7 @@ importBugphyzz <- function( dplyr::rename(Validation = value) }) - if (exclude_rarely) { + if (excludeRarely) { output <- purrr::map( output, ~ dplyr::filter(.x, Frequency != "rarely") ) @@ -133,8 +123,8 @@ importBugphyzz <- function( #' run `browseVignettes("bugphyz")` for detailed examples. #' #' @param dat A data.frame. -#' @param tax_id_type A character string. Valid options: NCBI_ID, Taxon_name. -#' @param tax_level A character vector. Taxonomic rank. Valid options: +#' @param taxIdType A character string. Valid options: NCBI_ID, Taxon_name. +#' @param taxLevel A character vector. Taxonomic rank. Valid options: #' superkingdom, kingdom, phylum, class, order, family, genus, species, strain. #' They can be combined. "mixed" is equivalent to select all valid ranks. #' @param evidence A character vector. Valid options: exp, igc, nas, tas, tax, @@ -142,7 +132,7 @@ importBugphyzz <- function( #' @param frequency A character vector. Valid options: always, usually, #' sometimes, rarely, unknown. They can be combined. By default, "rarely" is #' excluded. -#' @param min_size Minimum number of bugs in a signature. Default is 10. +#' @param minSize Minimum number of bugs in a signature. Default is 10. #' @param min Minimum value (inclusive). Only for numeric attributes. #' Default is NULL. #' @param max Maximum value (inclusive). Only for numeric attributes. 
@@ -158,23 +148,31 @@ importBugphyzz <- function( #' sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") #' makeSignatures <- function( - dat, tax_id_type = "NCBI_ID", - tax_level = "mixed", + dat, + taxIdType = c("NCBI_ID", "Taxon_name"), + taxLevel = c("mixed", "superkingdom", "phylum", "class", "order", + "family", "genus", "species", "strain"), evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), frequency = c("always", "usually", "sometimes", "unknown"), - min_size = 10, min = NULL, max = NULL + minSize = 10, min = NULL, max = NULL ) { - attr_type <- unique(dat$Attribute_type) - if ("mixed" %in% tax_level) { - tax_level <- c( - "kingdom", "phylum", "class", "order", "family", "genus", "species", - "strain" + taxIdType <- match.arg(arg = taxIdType, several.ok = FALSE) + taxLevel <- match.arg(arg = taxLevel, several.ok = TRUE) + evidence <- match.arg(arg = evidence, several.ok = TRUE) + frequency <- match.arg(arg = frequency, several.ok = TRUE) + + attrType <- unique(dat$Attribute_type) + if ("mixed" %in% taxLevel) { + taxLevel <- c( + "superkingdom", "phylum", "class", "order", "family", "genus", + "species", "strain" ) } dat <- dat |> - dplyr::filter(Rank %in% tax_level) |> - dplyr::filter(Evidence %in% evidence) |> - dplyr::filter(Frequency %in% frequency) + {\(y) y[which(y$Rank %in% taxLevel),]}() |> + {\(y) y[which(y$Evidence %in% evidence),]}() |> + {\(y) y[which(y$Frequency %in% frequency),]}() + if (!nrow(dat)) { warning( "Not enough data for creating signatures.", @@ -184,16 +182,16 @@ makeSignatures <- function( return(NULL) } if ( - attr_type %in% + attrType %in% c("multistate-intersection", "binary", "multistate-union") ) { - s <- .makeSignaturesDiscrete(dat = dat, tax_id_type = tax_id_type) - } else if (attr_type %in% c("range", "numeric")) { + s <- .makeSignaturesDiscrete(dat = dat, taxIdType = taxIdType) + } else if (attrType %in% c("range", "numeric")) { s <- .makeSignaturesNumeric( - dat = dat, tax_id_type = tax_id_type, min = min, max = 
max + dat = dat, taxIdType = taxIdType, min = min, max = max ) } - output <- purrr::keep(s, ~ length(.x) >= min_size) + output <- purrr::keep(s, ~ length(.x) >= minSize) if (!length(output)) { warning( "Not enough data for creating signatures.", @@ -211,7 +209,7 @@ makeSignatures <- function( #' bugphyzz vignette; please run `browseVignettes("bugphyzz")`. #' #' @param tax A valid NCBI ID or taxon name. If taxon name is used, the -#' argument tax_id_type = "Taxon_name" must also be used. +#' argument taxIdType = "Taxon_name" must also be used. #' @param bp List of data.frames imported with \code{importBugphyzz}. #' @param ... Arguments passed to \code{makeSignatures}. #' @@ -223,7 +221,7 @@ makeSignatures <- function( #' taxonName <- "Escherichia coli" #' bp <- importBugphyzz() #' sig_names_1 <- getTaxonSignatures(taxid, bp) -#' sig_names_2 <- getTaxonSignatures(taxonName, bp, tax_id_type = "Taxon_name") +#' sig_names_2 <- getTaxonSignatures(taxonName, bp, taxIdType = "Taxon_name") #' getTaxonSignatures <- function(tax, bp, ...) { sigs <- purrr::map(bp, makeSignatures, ...) @@ -234,17 +232,17 @@ getTaxonSignatures <- function(tax, bp, ...) 
{ } # Non exported functions ---------------------------------------------------- -.makeSignaturesDiscrete <- function(dat, tax_id_type = "NCBI_ID") { - dat |> - dplyr::mutate( - Attribute = paste0("bugphyzz:", Attribute, "|", Attribute_value) - ) |> - {\(y) split(y, y$Attribute)}() |> - lapply(function(x) unique(x[[tax_id_type]])) +.makeSignaturesDiscrete <- function(dat, taxIdType = "NCBI_ID") { + dat$Attribute <- paste0( + "bugphyzz:", dat$Attribute, "|", dat$Attribute_value + ) + dat |> + {\(y) S4Vectors::split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[taxIdType]])) } .makeSignaturesNumeric <- function( - dat, tax_id_type = "NCBI_ID", min = NULL, max = NULL + dat, taxIdType = "NCBI_ID", min = NULL, max = NULL ) { if (!is.null(min) || !is.null(max)) { if (is.null(min)) { @@ -259,41 +257,41 @@ getTaxonSignatures <- function(tax, bp, ...) { ) max <- max(dat$Attribute_value) } - dat <- dat |> - dplyr::filter( - Attribute_value >= min & Attribute_value <= max - ) |> - dplyr::mutate( - Attribute = paste0( - "bugphyzz:", Attribute, "| >=", min, " & <=", max - ) - ) + + dat <- dat[ + which(dat$Attribute_value >= min & dat$Attribute_value <= max), + ] + dat$Attribute <- paste0( + "bugphyzz:", dat$Attribute, "| >=", min, " & <=", max + ) } else { thr <- .thresholds() |> dplyr::filter(Attribute_group == unique(dat$Attribute)) - attr_name <- thr$Attribute - min_values <- thr$lower - max_values <- thr$upper + attrName <- thr$Attribute + minValues <- thr$lower + maxValues <- thr$upper dat$tmp_col <- NA - for (i in seq_along(attr_name)) { - if (is.na(min_values[i])) - min_values[i] <- min(dat$Attribute_value) - 0.01 - if (is.na(max_values[i])) - max_values[i] <- max(dat$Attribute_value) + ## This for loop modifies a series of vectors rather than creating + ## a list. Result has been pre-allocated in those vectors.
+ for (i in seq_along(attrName)) { + if (is.na(minValues[i])) + minValues[i] <- min(dat$Attribute_value) - 0.01 + if (is.na(maxValues[i])) + maxValues[i] <- max(dat$Attribute_value) pos <- which( - dat$Attribute_value > min_values[i] & - dat$Attribute_value <= max_values[i] + dat$Attribute_value > minValues[i] & + dat$Attribute_value <= maxValues[i] ) - dat$tmp_col[pos] <- attr_name[i] + dat$tmp_col[pos] <- attrName[i] dat$Attribute[pos] <- paste0( - "bugphyzz:", dat$Attribute[pos], "|", attr_name[i], "| > ", - round(min_values[i], 2), " & <= ", max_values[i] + "bugphyzz:", dat$Attribute[pos], "|", attrName[i], "| > ", + round(minValues[i], 2), " & <= ", maxValues[i] ) } } dat |> - {\(y) split(y, y$Attribute)}() |> - lapply(function(x) unique(x[[tax_id_type]])) + {\(y) S4Vectors::split(y, y$Attribute)}() |> + lapply(function(x) unique(x[[taxIdType]])) } .thresholds <- function() { @@ -330,67 +328,59 @@ getTaxonSignatures <- function(tax, bp, ...) { } ## Import a version of bupghyzz -.downloadResource <- function(version, force_download) { +.downloadResource <- function(version, forceDownload) { if (stringr::str_detect(version, "^10.5281/zenodo.[0-9]+$")) { suffix <- sub("^10.5281/zenodo\\.", "", version) - output <- .downloadZ(suffix, force_download) + output <- .downloadZ(suffix, forceDownload) } else if ( version == "devel" || stringr::str_detect(version, stringr::regex("^[:alnum:]{7}$")) ){ - output <- .downloadGH(version, force_download) + output <- .downloadGH(version, forceDownload) } else { stop("Version must be a Zenodo DOI, GitHub commit hash, or 'devel'.") } return(output) } -## Function for downloading data on Zenodo -.downloadZ <- function(record, force_download) { - base_url <- paste0("https://zenodo.org/api/records/", record) - req <- httr2::request(base_url) +## Function for downloading data from Zenodo +.downloadZ <- function(record, forceDownload) { + baseUrl <- paste0("https://zenodo.org/api/records/", record) + req <- httr2::request(baseUrl) res 
<- httr2::req_perform(req) l <- httr2::resp_body_json(res) - - file_names_api <- purrr::map_chr(l$files, ~ .x$links$self) - file_names_url <- sub( - "(^.*)(api/)(.*)(/content$)", "\\1\\3", file_names_api + fileNamesApi <- purrr::map_chr(l$files, ~ .x$links$self) + fileNamesUrl <- sub( + "(^.*)(api/)(.*)(/content$)", "\\1\\3", fileNamesApi ) - rpath <- .getResource( rname = paste0("bugphyzz.zip"), - url = file_names_url, verbose = TRUE, force = force_download + url = fileNamesUrl, verbose = TRUE, force = forceDownload ) - temp_dir <- tempdir() - utils::unzip(zipfile = rpath, exdir = temp_dir, junkpaths = TRUE) - files <- list.files(temp_dir, pattern = "csv", full.names = TRUE) - - output <- vector("list", length(files)) - for (i in seq_along(output)) { - output[[i]] <- utils::read.csv(files[i], header = TRUE, skip = 1) |> - dplyr::mutate(Attribute = tolower(.data$Attribute)) - } - return(output) + tempDir <- tempdir() + utils::unzip(zipfile = rpath, exdir = tempDir, junkpaths = TRUE) + files <- list.files(tempDir, pattern = "csv", full.names = TRUE) + lapply(files, function(x) { + utils::read.csv(x, header = TRUE, skip = 1) |> + dplyr::mutate(Attribute = tolower(Attribute)) + }) } -## Function for downloading data on GitHub -.downloadGH <- function(version, force_download) { - file_suffix <- c("binary", "multistate", "numeric") +## Function for downloading data from GitHub +.downloadGH <- function(version, forceDownload) { + fileSuffix <- c("binary", "multistate", "numeric") urls <- paste0( "https://github.com/waldronlab/bugphyzzExports/raw/", version, - "/bugphyzz_", file_suffix, ".csv" + "/bugphyzz_", fileSuffix, ".csv" ) names(urls) <- c("binary", "multistate", "numeric") - output <- vector("list", length(urls)) - for (i in seq_along(output)) { + lapply(seq_len(length(urls)), function(i) { message("Importing ", names(urls)[i], " data...") - names(output)[i] <- names(urls)[i] rpath <- .getResource( rname = paste0("bugphyzz_", names(urls)[i], ".csv"), - url = 
urls[i], verbose = TRUE, force = force_download + url = urls[i], verbose = TRUE, force = forceDownload ) - output[[i]] <- utils::read.csv(rpath, header = TRUE, skip = 1) |> + utils::read.csv(rpath, header = TRUE, skip = 1) |> dplyr::mutate(Attribute = tolower(Attribute)) - } - return(output) + }) } diff --git a/R/fattyAcidComposition.R b/R/fattyAcidComposition.R index 7e9097e4..082a8c28 100644 --- a/R/fattyAcidComposition.R +++ b/R/fattyAcidComposition.R @@ -1,8 +1,4 @@ - ## Function for importing fatty acid compositions -## TODO This dataset needs more curation. -## TODO Names of the Fatty Acids should be more "user-friendly" -## TODO Maybe a threshold should be decided to consider a FA as present or not. .fattyAcidComposition <- function(){ link <- .customLinks() |> dplyr::filter(functionname == "fattyAcidComposition") |> @@ -14,7 +10,7 @@ names_to = "Attribute_new", values_to = "Attribute_value" ) |> dplyr::mutate(NCBI_ID = as.character(NCBI_ID)) - dplyr::left_join(fac_long, ranks_parents, by = "NCBI_ID") |> + dplyr::left_join(fac_long, ranksParents, by = "NCBI_ID") |> as.data.frame() |> .addSourceInfo() |> purrr::modify_at( @@ -25,7 +21,7 @@ ) |> dplyr::select(-Attribute) |> dplyr::rename(Attribute = Attribute_new) |> - .reorderColumns(attr_type = 'numeric') + .reorderColumns(attrType = 'numeric') } ## Function to import custom links diff --git a/R/physiologies.R b/R/physiologies.R index 651cd795..99472c03 100644 --- a/R/physiologies.R +++ b/R/physiologies.R @@ -8,7 +8,7 @@ #' @param keyword Character vector with one or more valid keywords. #' Valid keyboards can be checked with \code{showPhys}. If 'all', all #' physiologies are imported. -#' @param full_source Logical. If `TRUE`, the Attribute_source column will +#' @param fullSource Logical. If `TRUE`, the Attribute_source column will #' contain full source information. If `FALSE`, the Attribute_source column #' will contain shortened versions of the sources. Default is `FALSE`. 
#' @@ -20,7 +20,7 @@ #' l <- physiologies('all') #' df <- physiologies('aerophilicity')[[1]] #' -physiologies <- function(keyword = 'all', full_source = FALSE) { +physiologies <- function(keyword = 'all', fullSource = FALSE) { keyword <- .checkKeyword(keyword) cond1 <- any(keyword %in% showPhys('spreadsheets')) cond2 <- any(keyword %in% showPhys('bacdive')) @@ -30,23 +30,25 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { spreadsheets <- spreadsheets[names(spreadsheets) %in% keyword] bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) bacdive <- bacdive[names(bacdive) %in% keyword] - physiologies <- vector('list', length(keyword)) - for (i in seq_along(keyword)) { + physiologies <- lapply(seq_along(keyword), function(i) { df1 <- spreadsheets[[keyword[i]]] df2 <- bacdive[[keyword[i]]] - physiologies[[i]] <- dplyr::bind_rows(df1, df2) - names(physiologies)[i] <- keyword[i] + o <- dplyr::bind_rows(df1, df2) message('Finished ', keyword[i], '.') - } + o + }) + names(physiologies) <- keyword } else if (cond1 && !cond2) { spreadsheets <- .importSpreadsheets(keyword = keyword) physiologies <- spreadsheets[names(spreadsheets) %in% keyword] + ## Not creating a vector. Only using the side effect of the for loop. for (i in seq_along(keyword)) { message('Finished ', keyword[i], '.') } } else if (!cond1 && cond2) { bacdive <- .reshapeBacDive(.getBacDive(verbose = FALSE)) physiologies <- bacdive[names(bacdive) %in% keyword] + ## Not creating a vector. Only using the side effect of the for loop.
for (i in seq_along(keyword)) { message('Finished ', keyword[i], '.') } @@ -62,20 +64,19 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { ) |> dplyr::distinct() - if (full_source) { - df$Attribute_source <- df$full_source + if (fullSource) { + df$Attribute_source <- df$full_source } df$full_source <- NULL df <- .reorderColumns( df = df, name = unique(df$Attribute_group), - attr_type = unique(df$Attribute_type) + attrType = unique(df$Attribute_type) ) df <- as.data.frame(df[, vapply(df, \(y) !all(is.na(y)), logical(1))]) - ## TODO this code could be somewhere else if (unique(df$Attribute_group) == 'aerophilicity') { df <- .homogenizeAerophilicityAttributeNames(df) } @@ -92,7 +93,7 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { #' imported with the \code{\link{physiologies}} function. This function #' should be used by developers/curators. #' -#' @param which_names A character string. Options: 'all' (default), +#' @param whichNames A character string. Options: 'all' (default), #' 'spreadsheets', 'bacdive'. #' #' @return A character vector with the names of the physiologies.
@@ -103,20 +104,20 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { #' showPhys('bacdive') #' showPhys('spreadsheets') #' -showPhys <- function(which_names = 'all') { +showPhys <- function(whichNames = 'all') { fname <- system.file( 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' ) links <- utils::read.table(fname, header = TRUE, sep = '\t') - spreadsheet_phys <- links[['physiology']] - if (which_names == 'all') - ## bacdive_phys_names is a character vector saved as internal data - phys_names <- sort(unique(c(spreadsheet_phys, bacdive_phys_names))) - if (which_names == 'spreadsheets') - phys_names <- spreadsheet_phys - if (which_names == 'bacdive') - phys_names <- bacdive_phys_names - return(phys_names) + spreadsheetPhys <- links[['physiology']] + if (whichNames == 'all') + ## bacdivePhysNames is a character vector saved as internal data + physNames <- sort(unique(c(spreadsheetPhys, bacdivePhysNames))) + if (whichNames == 'spreadsheets') + physNames <- spreadsheetPhys + if (whichNames == 'bacdive') + physNames <- bacdivePhysNames + return(physNames) } ## Helper function for physiologies @@ -135,13 +136,13 @@ showPhys <- function(which_names = 'all') { keyword <- showPhys() } } - valid_keywords <- showPhys() - lgl_vct <- keyword %in% valid_keywords - if (any(!lgl_vct) ) { - invalid_keywords <- keyword[!lgl_vct] + validKeywords <- showPhys() + lglVct <- keyword %in% validKeywords + if (any(!lglVct) ) { + invalidKeywords <- keyword[!lglVct] stop( "Invalid keyword(s): ", - paste0(invalid_keywords, collapse = ', '), '.', + paste0(invalidKeywords, collapse = ', '), '.', " Check valid keywords with showPhys() or use 'all' to import all", " physiologies.", call. 
= FALSE @@ -152,51 +153,49 @@ showPhys <- function(which_names = 'all') { ## Helper function for physiologies .importSpreadsheets <- function(keyword) { - parent_col_names <- c('Parent_name', 'Parent_NCBI_ID', 'Parent_rank') + parentColNames <- c('Parent_name', 'Parent_NCBI_ID', 'Parent_rank') fname <- system.file( 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' ) links <- utils::read.table(fname, header = TRUE, sep = '\t') links <- links[links[['physiology']] %in% keyword,] - spreadsheets <- vector('list', nrow(links)) - for (i in seq_along(spreadsheets)) { - phys_name <- links[i, 'physiology', drop = FALSE][[1]] - attr_type <- links[i, 'attribute_type', drop = FALSE][[1]] - names(spreadsheets)[i] <- phys_name + spreadsheets <- lapply(seq_len(nrow(links)), function(i) { + physName <- links[i, 'physiology', drop = FALSE][[1]] + attrType <- links[i, 'attribute_type', drop = FALSE][[1]] url <- links[i, 'link', drop = FALSE][[1]] df <- dplyr::distinct(utils::read.csv(url)) - df[['Attribute_type']] <- attr_type - df[['Attribute_group']] <- phys_name + df[['Attribute_type']] <- attrType + df[['Attribute_group']] <- physName df[['NCBI_ID']] <- as.character(df[['NCBI_ID']]) df <- df[!is.na(df[['Attribute_value']]),] - + if (unique(df[['Attribute_type']]) == 'numeric') { df <- .numericToRange(df) } else if (unique(df[['Attribute_type']] == 'range')) { df <- .modifyRange(df) } else if ( - unique(df[['Attribute_type']] %in% .DISCRETE_ATTRIBUTE_TYPES()) + unique(df[['Attribute_type']] %in% .discreteAttributeTypes()) ) { df <- dplyr::filter( df, Attribute_value == TRUE | Attribute_value == FALSE ) } - - if (all(parent_col_names %in% colnames(df))) { + if (all(parentColNames %in% colnames(df))) { df$Parent_NCBI_ID <- stringr::str_squish( as.character(df$Parent_NCBI_ID) ) } else { - ## ranks_parents is an internal object (data.frame) in bugphyzz + ## ranksParents is an internal object (data.frame) in bugphyzz rp <- purrr::modify_at( - .x = ranks_parents, + .x = 
ranksParents, .at = c('NCBI_ID', 'Parent_NCBI_ID'), .f = as.character ) df <- dplyr::left_join(df, rp, by = "NCBI_ID") } - spreadsheets[[i]] <- df - } + df + }) + names(spreadsheets) <- links$physiology return(spreadsheets) } @@ -264,7 +263,7 @@ showPhys <- function(which_names = 'all') { } ## helper function for .importSpreadsheets -.DISCRETE_ATTRIBUTE_TYPES <- function() { +.discreteAttributeTypes <- function() { fname <- system.file( 'extdata', 'spreadsheet_links.tsv', package = 'bugphyzz' ) @@ -277,33 +276,33 @@ showPhys <- function(which_names = 'all') { fpath <- system.file( 'extdata', 'attribute_sources.tsv', package = 'bugphyzz' ) - source_data <- utils::read.table( + sourceData <- utils::read.table( file = fpath, header = TRUE, sep = '\t', quote = '', check.names = FALSE, comment.char = '' ) - dplyr::left_join(dat, source_data, by = 'Attribute_source') + dplyr::left_join(dat, sourceData, by = 'Attribute_source') } ## Helper function for physiologies -.reorderColumns <- function(df, name = NULL, attr_type) { - col_names <- colnames(df) - req_cols <- .requiredColumns(attr_type) - cols_lgl <- req_cols %in% col_names - if (!all(cols_lgl)) { - missing_cols <- paste0(req_cols[!cols_lgl], collapse = ', ') +.reorderColumns <- function(df, name = NULL, attrType) { + colNames <- colnames(df) + reqCols <- .requiredColumns(attrType) + colsLgl <- reqCols %in% colNames + if (!all(colsLgl)) { + missingCols <- paste0(reqCols[!colsLgl], collapse = ', ') if (!is.null(name)) { msg <- paste0( 'Missing columns in ', name, '.', ' Missing columns are: ', - missing_cols + missingCols ) } else { msg <- paste0( - 'Missing columns.', ' Missing columns are: ', missing_cols + 'Missing columns.', ' Missing columns are: ', missingCols ) } warning(msg, call. 
= FALSE) } - cols <- req_cols[cols_lgl] + cols <- reqCols[colsLgl] df |> dplyr::relocate(dplyr::all_of(cols)) } @@ -321,12 +320,12 @@ showPhys <- function(which_names = 'all') { } ## Required columns -.requiredColumns <- function(attr_type) { +.requiredColumns <- function(attrType) { fname <- system.file("extdata/curation_template.tsv", package = "bugphyzz") df <- utils::read.table(fname, sep = "\t", header = TRUE) - lgl_vct_1 <- df$requiredness == "required" - lgl_vct_2 <- grepl(attr_type, df$attribute_types) - df <- df[lgl_vct_1 & lgl_vct_2,] + lglVct1 <- df$requiredness == "required" + lglVct2 <- grepl(attrType, df$attribute_types) + df <- df[lglVct1 & lglVct2,] df[order(df[["required_column_order"]]), , drop = FALSE] output <- df[['column_name']] return(output) @@ -334,11 +333,11 @@ showPhys <- function(which_names = 'all') { ## Generate a template for a bugphyzz dataset .template <- function(dataset) { - template_tsv <- system.file( + templateTsv <- system.file( "extdata/curation_template.tsv", package = "bugphyzz" ) template <- utils::read.table( - file = template_tsv, sep = "\t", check.names = FALSE, header = TRUE, + file = templateTsv, sep = "\t", check.names = FALSE, header = TRUE, allowEscapes = TRUE ) # template <- readr::read_tsv(template_tsv, show_col_types = FALSE) template[template[["column_name"]] %in% colnames(dataset), ] @@ -359,11 +358,11 @@ showPhys <- function(which_names = 'all') { 'extdata/spreadsheet_links.tsv', package = 'bugphyzz' ) links <- utils::read.table(fname1, header = TRUE, sep = '\t') - select_cols <- c("physiology", "source_link") + selectCols <- c("physiology", "source_link") phys_links <- links |> - dplyr::select(tidyselect::all_of(select_cols)) + dplyr::select(tidyselect::all_of(selectCols)) custom_links <- .customLinks() |> - dplyr::select(tidyselect::all_of((select_cols))) + dplyr::select(tidyselect::all_of((selectCols))) links <- dplyr::bind_rows(phys_links, custom_links) x |> dplyr::left_join(links, by = c("dataset" = 
"physiology")) diff --git a/R/sysdata.rda b/R/sysdata.rda index 7be45a71..419ee423 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/inst/extdata/attributes.tsv b/inst/extdata/attributes.tsv index a674c776..02b76a92 100644 --- a/inst/extdata/attributes.tsv +++ b/inst/extdata/attributes.tsv @@ -171,7 +171,7 @@ "COGEM pathogenicity rating" "COGEM pathogenicity rating" "integer" NA NA "antimicrobial sensitivity" "antimicrobial sensitivity" "logical" NA NA "biofilm formation" "biofilm formation" "logical" "APO:0000159" "The ability to form a layer on solid surfaces with or without other microorganisms." -"Butyrate-Producing Bacteria" "Butyrate-Producing Bacteria" "numeric" "NCIT:C126417" "Any anaerobic bacteria that metabolizes complex carbohydrates to produce butyrate. These microorganisms may form colonies in the intestines of mammals; their presence may aid the host by catabolizing undigested fiber and starch. Additionally, butyrate is metabolized by colonic epithelial cells and may protect the host against ulcerative colitis and cancer." +"Butyrate-Producing Bacteria" "Butyrate-Producing Bacteria;butyrate producing" "numeric" "NCIT:C126417" "Any anaerobic bacteria that metabolizes complex carbohydrates to produce butyrate. These microorganisms may form colonies in the intestines of mammals; their presence may aid the host by catabolizing undigested fiber and starch. Additionally, butyrate is metabolized by colonic epithelial cells and may protect the host against ulcerative colitis and cancer." 
"acetate producing" "acetate producing" "numeric" NA NA "lactate producing" "lactate producing" "numeric" NA NA "mutation rate per site per year" "mutation rates per site per year" "numeric" NA NA @@ -193,7 +193,7 @@ "growth temperature" "growth temperature" "numeric" NA NA "oral" "habitat" "logical" NA NA "optimal ph" "optimal ph" "numeric" NA NA -"plant pathogenity" "plant pathogenicity" "numeric" NA NA +"plant pathogenicity" "plant pathogenicity" "numeric" NA NA "blade of grass from raritan river nj, usa" "isolation site" "logical" NA NA "hot swamp from kunashir island, russia" "isolation site" "logical" NA NA "30-m-tall sulfide mound in the iheya north field, japan" "isolation site" "logical" NA NA diff --git a/inst/extdata/spreadsheet_links.tsv b/inst/extdata/spreadsheet_links.tsv index 4eb3a204..0a1e9c4b 100644 --- a/inst/extdata/spreadsheet_links.tsv +++ b/inst/extdata/spreadsheet_links.tsv @@ -5,7 +5,7 @@ animal pathogen NA discrete binary https://docs.google.com/spreadsheets/d/e/2PAC antimicrobial resistance NA discrete multistate-union https://docs.google.com/spreadsheets/d/e/2PACX-1vQmtXGNF7n8mGZKow2Kj2m5JPXsisau9T4TF_u53ovqeELiIVxrDUjaNbdGi-qiKanTKoO5xfba5LTq/pub?gid=0&single=true&output=csv https://docs.google.com/spreadsheets/d/1ldVVQr6zT9g_Bf6WsJtAjke8yyP-gLp8llMYh6bipag antimicrobial sensitivity NA discrete binary https://docs.google.com/spreadsheets/d/e/2PACX-1vR0FdQpAD-xhkyIXgZMtC9oCmlbx77Cj1ncCyojm9oO410NBwJRhBNFl4hnd4HMlj2okxEKLvA8EPr2/pub?output=csv https://docs.google.com/spreadsheets/d/1eTceiQnE8YRoYGDWRL6uQqG6Wx9PPsBXk2X6Pvq7p6M arrangement NA discrete multistate-intersection https://docs.google.com/spreadsheets/d/e/2PACX-1vSgNXC-VPLmG8dUN8VAun_pbT0Y4-qJo6u9ozHGO8j7Weq6ymPaPw9kJUPl-RpHSYvtUgACo-SVHyk5/pub?output=csv https://docs.google.com/spreadsheets/d/1qkNPdVDjpkdvZ-UtWeIrmWA3aYCInIQrseDDYqXiH-w -biofilm forming NA discrete binary 
https://docs.google.com/spreadsheets/d/e/2PACX-1vRduthxAWAhSLSoOgFNAmI8DX8ORerG1U6OmI8y6-5coe31pONYfISNsUE-e-E_eiyxtQzin6Xq9b0m/pub?output=csv https://docs.google.com/spreadsheets/d/1H8Pon9zmivcN7mtrlLuTLF4IL_lN5wi_k93XRlhWlRg +biofilm formation NA discrete binary https://docs.google.com/spreadsheets/d/e/2PACX-1vRduthxAWAhSLSoOgFNAmI8DX8ORerG1U6OmI8y6-5coe31pONYfISNsUE-e-E_eiyxtQzin6Xq9b0m/pub?output=csv https://docs.google.com/spreadsheets/d/1H8Pon9zmivcN7mtrlLuTLF4IL_lN5wi_k93XRlhWlRg butyrate producing NA discrete binary https://docs.google.com/spreadsheets/d/e/2PACX-1vQPnLOpbFbfLvOb8fvC6A-WfzVNMDYRWOUsT4hIseDNgI7nwZGjVwg3KBkWeFtOBBLK0j-DRiZ-jtqG/pub?output=csv https://docs.google.com/spreadsheets/d/1uPOIyeFD5ClPR2W2XyT6jAdD8FkgGuNBSl_8F2CT_uI COGEM pathogenicity rating NA discrete multistate-intersection https://docs.google.com/spreadsheets/d/e/2PACX-1vQYZjtf0RcacNAe54pLOnzor2q0ZeK4cF19f-gjji7cHfrtOf3VuDPA-jSAKRNDNZoWQanyw2QCugRZ/pub?output=csv https://docs.google.com/spreadsheets/d/19DrBFbAMGNvqf17yJ4HzuKQjJoRlMCVYTCtmyugoDjQ disease association NA discrete multistate-union https://docs.google.com/spreadsheets/d/e/2PACX-1vQvmmfCe2iXNNR3oC73wXTgukNGNj6_i9qU_20kwpBITtrjn_zgNBK_w9-cfJYKsvXmgXWF_2P8ZMZF/pub?gid=546928968&single=true&output=csv https://docs.google.com/spreadsheets/d/1MAtkI-UEc7T33BYvLBAkw4PDmEmNBroMVUi4KwN8cCc diff --git a/inst/extdata/validation_summary.tsv b/inst/extdata/validation_summary.tsv index a5d6087e..93f80828 100644 --- a/inst/extdata/validation_summary.tsv +++ b/inst/extdata/validation_summary.tsv @@ -1,111 +1,111 @@ method rank physiology attribute mcc_mean mcc_sd r2_mean r2_sd ltp_bp bp ltp_bp_phys bp_phys ltp nsti_mean nsti_sd ltp_bp_per ltp_bp_phys_per -phytools-ltp all aerophilicity aerobic 0.71 0.02 NA NA 8530 10771 11728 15872 23501 0.042 0.046 36 50 -phytools-ltp all aerophilicity anaerobic 0.84 0.02 NA NA 2399 3766 11728 15872 23501 0.042 0.046 10 50 -phytools-ltp all aerophilicity facultatively anaerobic 0.23 0.06 NA NA 970 
1575 11728 15872 23501 0.042 0.046 4 50 -phytools-ltp genus aerophilicity aerobic 0.71 0.05 NA NA 631 639 1169 1184 23501 0.088 0.06 3 5 -phytools-ltp genus aerophilicity anaerobic 0.71 0.04 NA NA 392 398 1169 1184 23501 0.088 0.06 2 5 -phytools-ltp genus aerophilicity facultatively anaerobic 0.5 0.1 NA NA 173 174 1169 1184 23501 0.088 0.06 1 5 -phytools-ltp species aerophilicity aerobic 0.68 0.03 NA NA 7152 8418 9078 11268 23501 0.054 0.06 30 39 -phytools-ltp species aerophilicity anaerobic 0.84 0.03 NA NA 1387 2162 9078 11268 23501 0.054 0.06 6 39 -phytools-ltp species aerophilicity facultatively anaerobic 0.21 0.07 NA NA 683 901 9078 11268 23501 0.054 0.06 3 39 -phytools-ltp strain aerophilicity aerobic 0.74 0.07 NA NA 747 1714 1481 3420 23501 0.1 0.065 3 6 -phytools-ltp strain aerophilicity anaerobic 0.83 0.05 NA NA 620 1206 1481 3420 23501 0.1 0.065 3 6 -phytools-ltp strain aerophilicity facultatively anaerobic 0.35 0.07 NA NA 114 500 1481 3420 23501 0.1 0.065 0 6 -phytools-ltp all arrangement branched 0.1 0.16 NA NA 97 109 741 1568 23501 0.119 0.079 0 3 -phytools-ltp all arrangement cell chain 0.37 0.17 NA NA 219 448 741 1568 23501 0.119 0.079 1 3 -phytools-ltp all arrangement cell cluster 0.28 0.23 NA NA 97 130 741 1568 23501 0.119 0.079 0 3 -phytools-ltp all arrangement filamentous 0.33 0.36 NA NA 24 68 741 1568 23501 0.119 0.079 0 3 -phytools-ltp all arrangement paired cells 0.38 0.17 NA NA 376 775 741 1568 23501 0.119 0.079 2 3 -phytools-ltp all arrangement single 0.23 0.17 NA NA 368 771 741 1568 23501 0.119 0.079 2 3 -phytools-ltp all arrangement tetrads 0 0 NA NA 28 44 741 1568 23501 0.119 0.079 0 3 -phytools-ltp genus arrangement branched 0.13 0.19 NA NA 87 90 376 390 23501 0.137 0.086 0 2 -phytools-ltp genus arrangement cell chain -0.01 0.01 NA NA 108 109 376 390 23501 0.137 0.086 0 2 -phytools-ltp genus arrangement cell cluster 0.06 0.22 NA NA 70 73 376 390 23501 0.137 0.086 0 2 -phytools-ltp genus arrangement paired cells 0.09 0.23 NA NA 193 194 376 
390 23501 0.137 0.086 1 2 -phytools-ltp genus arrangement single -0.01 0.02 NA NA 221 228 376 390 23501 0.137 0.086 1 2 -phytools-ltp all biosafety level biosafety level 1 0.58 0.04 NA NA 9076 11160 9928 12477 23501 0.051 0.051 39 42 -phytools-ltp all biosafety level biosafety level 2 0.51 0.05 NA NA 851 1320 9928 12477 23501 0.051 0.051 4 42 -phytools-ltp all biosafety level biosafety level 3 0.24 0.4 NA NA 12 15 9928 12477 23501 0.051 0.051 0 42 -phytools-ltp species biosafety level biosafety level 1 0.54 0.04 NA NA 7623 8195 8208 8907 23501 0.06 0.059 32 35 -phytools-ltp species biosafety level biosafety level 2 0.5 0.05 NA NA 586 717 8208 8907 23501 0.06 0.059 2 35 -phytools-ltp strain biosafety level biosafety level 1 0.52 0.08 NA NA 1453 2965 1720 3570 23501 0.094 0.062 6 7 -phytools-ltp strain biosafety level biosafety level 2 0.5 0.1 NA NA 265 603 1720 3570 23501 0.094 0.062 1 7 +phytools-ltp all aerophilicity aerobic 0.71 0.02 NA NA 8530 10771 11728 15872 23501 0.039 0.045 36 50 +phytools-ltp all aerophilicity anaerobic 0.84 0.02 NA NA 2399 3766 11728 15872 23501 0.039 0.045 10 50 +phytools-ltp all aerophilicity facultatively anaerobic 0.23 0.06 NA NA 970 1575 11728 15872 23501 0.039 0.045 4 50 +phytools-ltp genus aerophilicity aerobic 0.71 0.05 NA NA 631 639 1169 1184 23501 0.085 0.062 3 5 +phytools-ltp genus aerophilicity anaerobic 0.71 0.04 NA NA 392 398 1169 1184 23501 0.085 0.062 2 5 +phytools-ltp genus aerophilicity facultatively anaerobic 0.5 0.1 NA NA 173 174 1169 1184 23501 0.085 0.062 1 5 +phytools-ltp species aerophilicity aerobic 0.69 0.03 NA NA 7152 8418 9078 11268 23501 0.053 0.059 30 39 +phytools-ltp species aerophilicity anaerobic 0.84 0.03 NA NA 1387 2162 9078 11268 23501 0.053 0.059 6 39 +phytools-ltp species aerophilicity facultatively anaerobic 0.21 0.07 NA NA 683 901 9078 11268 23501 0.053 0.059 3 39 +phytools-ltp strain aerophilicity aerobic 0.74 0.07 NA NA 747 1714 1481 3420 23501 0.099 0.066 3 6 +phytools-ltp strain aerophilicity 
anaerobic 0.83 0.05 NA NA 620 1206 1481 3420 23501 0.099 0.066 3 6 +phytools-ltp strain aerophilicity facultatively anaerobic 0.35 0.07 NA NA 114 500 1481 3420 23501 0.099 0.066 0 6 +phytools-ltp all arrangement branched 0.1 0.16 NA NA 97 109 741 1568 23501 0.116 0.079 0 3 +phytools-ltp all arrangement cell chain 0.37 0.17 NA NA 219 448 741 1568 23501 0.116 0.079 1 3 +phytools-ltp all arrangement cell cluster 0.28 0.23 NA NA 97 130 741 1568 23501 0.116 0.079 0 3 +phytools-ltp all arrangement filamentous 0.33 0.36 NA NA 24 68 741 1568 23501 0.116 0.079 0 3 +phytools-ltp all arrangement paired cells 0.38 0.17 NA NA 376 775 741 1568 23501 0.116 0.079 2 3 +phytools-ltp all arrangement single 0.23 0.17 NA NA 368 771 741 1568 23501 0.116 0.079 2 3 +phytools-ltp all arrangement tetrads 0 0 NA NA 28 44 741 1568 23501 0.116 0.079 0 3 +phytools-ltp genus arrangement branched 0.13 0.19 NA NA 87 90 376 390 23501 0.133 0.086 0 2 +phytools-ltp genus arrangement cell chain -0.01 0.01 NA NA 108 109 376 390 23501 0.133 0.086 0 2 +phytools-ltp genus arrangement cell cluster 0.06 0.22 NA NA 70 73 376 390 23501 0.133 0.086 0 2 +phytools-ltp genus arrangement paired cells 0.09 0.23 NA NA 193 194 376 390 23501 0.133 0.086 1 2 +phytools-ltp genus arrangement single -0.01 0.02 NA NA 221 228 376 390 23501 0.133 0.086 1 2 +phytools-ltp all biosafety level biosafety level 1 0.58 0.04 NA NA 9076 11160 9928 12477 23501 0.049 0.05 39 42 +phytools-ltp all biosafety level biosafety level 2 0.51 0.05 NA NA 851 1320 9928 12477 23501 0.049 0.05 4 42 +phytools-ltp all biosafety level biosafety level 3 0.24 0.4 NA NA 12 15 9928 12477 23501 0.049 0.05 0 42 +phytools-ltp species biosafety level biosafety level 1 0.54 0.04 NA NA 7623 8195 8208 8907 23501 0.058 0.059 32 35 +phytools-ltp species biosafety level biosafety level 2 0.5 0.05 NA NA 586 717 8208 8907 23501 0.058 0.059 2 35 +phytools-ltp strain biosafety level biosafety level 1 0.52 0.08 NA NA 1453 2965 1720 3570 23501 0.093 0.062 6 7 
+phytools-ltp strain biosafety level biosafety level 2 0.5 0.1 NA NA 265 603 1720 3570 23501 0.093 0.062 1 7 phytools-ltp all COGEM pathogenicity rating COGEM pathogenicity rating 1 0.74 0.1 NA NA 164 489 450 1036 23501 0.184 0.129 1 2 phytools-ltp all COGEM pathogenicity rating COGEM pathogenicity rating 2 0.71 0.1 NA NA 267 514 450 1036 23501 0.184 0.129 1 2 phytools-ltp all COGEM pathogenicity rating COGEM pathogenicity rating 3 0.72 0.42 NA NA 19 33 450 1036 23501 0.184 0.129 0 2 phytools-ltp species COGEM pathogenicity rating COGEM pathogenicity rating 1 0.74 0.1 NA NA 164 489 450 1036 23501 0.184 0.129 1 2 phytools-ltp species COGEM pathogenicity rating COGEM pathogenicity rating 2 0.71 0.1 NA NA 267 514 450 1036 23501 0.184 0.129 1 2 phytools-ltp species COGEM pathogenicity rating COGEM pathogenicity rating 3 0.72 0.42 NA NA 19 33 450 1036 23501 0.184 0.129 0 2 -phytools-ltp all gram stain gram stain negative 0.93 0.01 NA NA 5515 7508 8856 12116 23501 0.043 0.042 23 38 -phytools-ltp all gram stain gram stain positive 0.91 0.01 NA NA 3223 4466 8856 12116 23501 0.043 0.042 14 38 -phytools-ltp all gram stain gram stain variable 0.1 0.08 NA NA 130 155 8856 12116 23501 0.043 0.042 1 38 -phytools-ltp genus gram stain gram stain negative 0.85 0.05 NA NA 786 816 1243 1278 23501 0.083 0.059 3 5 -phytools-ltp genus gram stain gram stain positive 0.83 0.05 NA NA 423 427 1243 1278 23501 0.083 0.059 2 5 -phytools-ltp genus gram stain gram stain variable 0.12 0.22 NA NA 44 45 1243 1278 23501 0.083 0.059 0 5 -phytools-ltp species gram stain gram stain negative 0.95 0.01 NA NA 4034 5179 6609 8498 23501 0.063 0.067 17 28 -phytools-ltp species gram stain gram stain positive 0.93 0.02 NA NA 2502 3237 6609 8498 23501 0.063 0.067 11 28 -phytools-ltp species gram stain gram stain variable 0.1 0.17 NA NA 75 85 6609 8498 23501 0.063 0.067 0 28 -phytools-ltp strain gram stain gram stain negative 0.87 0.04 NA NA 695 1513 1004 2340 23501 0.11 0.07 3 4 -phytools-ltp strain gram stain 
gram stain positive 0.88 0.03 NA NA 298 802 1004 2340 23501 0.11 0.07 1 4 -phytools-ltp strain gram stain gram stain variable 0.2 0.42 NA NA 11 25 1004 2340 23501 0.11 0.07 0 4 -phytools-ltp all hemolysis alpha 0.27 0.16 NA NA 94 112 435 587 23501 0.272 0.188 0 2 -phytools-ltp all hemolysis beta 0.31 0.17 NA NA 131 195 435 587 23501 0.272 0.188 1 2 -phytools-ltp all hemolysis gamma 0.33 0.13 NA NA 266 345 435 587 23501 0.272 0.188 1 2 -phytools-ltp species hemolysis alpha 0.15 0.2 NA NA 74 77 341 411 23501 0.282 0.188 0 1 -phytools-ltp species hemolysis beta 0.23 0.15 NA NA 115 159 341 411 23501 0.282 0.188 0 1 -phytools-ltp species hemolysis gamma 0.36 0.11 NA NA 201 228 341 411 23501 0.282 0.188 1 1 -phytools-ltp all shape bacillus 0.46 0.07 NA NA 1511 2960 2045 3969 23501 0.071 0.051 6 9 -phytools-ltp all shape coccobacillus 0 0 NA NA 35 37 2045 3969 23501 0.071 0.051 0 9 -phytools-ltp all shape coccus 0.63 0.09 NA NA 441 781 2045 3969 23501 0.071 0.051 2 9 -phytools-ltp all shape elliptic 0.18 0.3 NA NA 109 122 2045 3969 23501 0.071 0.051 0 9 -phytools-ltp all shape filamentous 0.17 0.23 NA NA 45 60 2045 3969 23501 0.071 0.051 0 9 -phytools-ltp all shape spirillum 0.7 0.16 NA NA 96 202 2045 3969 23501 0.071 0.051 0 9 -phytools-ltp all shape tail 1 0 NA NA 15 34 2045 3969 23501 0.071 0.051 0 9 -phytools-ltp genus shape bacillus 0.18 0.08 NA NA 806 840 1113 1177 23501 0.088 0.058 3 5 -phytools-ltp genus shape coccobacillus 0 0 NA NA 35 37 1113 1177 23501 0.088 0.058 0 5 -phytools-ltp genus shape coccus 0.37 0.11 NA NA 292 319 1113 1177 23501 0.088 0.058 1 5 -phytools-ltp genus shape elliptic 0.12 0.27 NA NA 109 122 1113 1177 23501 0.088 0.058 0 5 -phytools-ltp genus shape filamentous 0.06 0.18 NA NA 33 36 1113 1177 23501 0.088 0.058 0 5 -phytools-ltp genus shape spirillum 0.23 0.32 NA NA 53 59 1113 1177 23501 0.088 0.058 0 5 +phytools-ltp all gram stain gram stain negative 0.93 0.01 NA NA 5515 7508 8856 12116 23501 0.041 0.042 23 38 +phytools-ltp all gram stain 
gram stain positive 0.91 0.01 NA NA 3223 4466 8856 12116 23501 0.041 0.042 14 38 +phytools-ltp all gram stain gram stain variable 0.1 0.08 NA NA 130 155 8856 12116 23501 0.041 0.042 1 38 +phytools-ltp genus gram stain gram stain negative 0.85 0.05 NA NA 786 816 1243 1278 23501 0.08 0.06 3 5 +phytools-ltp genus gram stain gram stain positive 0.83 0.05 NA NA 423 427 1243 1278 23501 0.08 0.06 2 5 +phytools-ltp genus gram stain gram stain variable 0.12 0.22 NA NA 44 45 1243 1278 23501 0.08 0.06 0 5 +phytools-ltp species gram stain gram stain negative 0.95 0.01 NA NA 4034 5179 6609 8498 23501 0.062 0.067 17 28 +phytools-ltp species gram stain gram stain positive 0.93 0.02 NA NA 2502 3237 6609 8498 23501 0.062 0.067 11 28 +phytools-ltp species gram stain gram stain variable 0.1 0.17 NA NA 75 85 6609 8498 23501 0.062 0.067 0 28 +phytools-ltp strain gram stain gram stain negative 0.87 0.04 NA NA 695 1513 1004 2340 23501 0.109 0.07 3 4 +phytools-ltp strain gram stain gram stain positive 0.88 0.03 NA NA 298 802 1004 2340 23501 0.109 0.07 1 4 +phytools-ltp strain gram stain gram stain variable 0.2 0.42 NA NA 11 25 1004 2340 23501 0.109 0.07 0 4 +phytools-ltp all hemolysis alpha 0.27 0.16 NA NA 94 112 433 585 23501 0.273 0.19 0 2 +phytools-ltp all hemolysis beta 0.37 0.18 NA NA 131 195 433 585 23501 0.273 0.19 1 2 +phytools-ltp all hemolysis gamma 0.33 0.13 NA NA 266 345 433 585 23501 0.273 0.19 1 2 +phytools-ltp species hemolysis alpha 0.15 0.2 NA NA 74 77 339 409 23501 0.283 0.19 0 1 +phytools-ltp species hemolysis beta 0.3 0.16 NA NA 115 159 339 409 23501 0.283 0.19 0 1 +phytools-ltp species hemolysis gamma 0.36 0.11 NA NA 201 228 339 409 23501 0.283 0.19 1 1 +phytools-ltp all shape bacillus 0.46 0.07 NA NA 1511 2960 2045 3969 23501 0.069 0.052 6 9 +phytools-ltp all shape coccobacillus 0 0 NA NA 35 37 2045 3969 23501 0.069 0.052 0 9 +phytools-ltp all shape coccus 0.63 0.09 NA NA 441 781 2045 3969 23501 0.069 0.052 2 9 +phytools-ltp all shape elliptic 0.18 0.3 NA NA 109 122 
2045 3969 23501 0.069 0.052 0 9 +phytools-ltp all shape filamentous 0.17 0.23 NA NA 45 60 2045 3969 23501 0.069 0.052 0 9 +phytools-ltp all shape spirillum 0.7 0.16 NA NA 96 202 2045 3969 23501 0.069 0.052 0 9 +phytools-ltp all shape tail 1 0 NA NA 15 34 2045 3969 23501 0.069 0.052 0 9 +phytools-ltp genus shape bacillus 0.18 0.08 NA NA 806 840 1113 1177 23501 0.085 0.059 3 5 +phytools-ltp genus shape coccobacillus 0 0 NA NA 35 37 1113 1177 23501 0.085 0.059 0 5 +phytools-ltp genus shape coccus 0.37 0.11 NA NA 292 319 1113 1177 23501 0.085 0.059 1 5 +phytools-ltp genus shape elliptic 0.12 0.27 NA NA 109 122 1113 1177 23501 0.085 0.059 0 5 +phytools-ltp genus shape filamentous 0.06 0.18 NA NA 33 36 1113 1177 23501 0.085 0.059 0 5 +phytools-ltp genus shape spirillum 0.23 0.32 NA NA 53 59 1113 1177 23501 0.085 0.059 0 5 phytools-ltp species shape bacillus 0.96 0.06 NA NA 409 1299 527 1629 23501 0.182 0.131 2 2 phytools-ltp species shape coccus 0.79 0.15 NA NA 67 203 527 1629 23501 0.182 0.131 0 2 phytools-ltp species shape spirillum 1 0 NA NA 26 78 527 1629 23501 0.182 0.131 0 2 phytools-ltp species shape tail 1 0 NA NA 15 34 527 1629 23501 0.182 0.131 0 2 -phytools-ltp strain shape bacillus 0.68 0.1 NA NA 296 821 405 1163 23501 0.163 0.082 1 2 -phytools-ltp strain shape coccus 0.62 0.2 NA NA 82 259 405 1163 23501 0.163 0.082 0 2 -phytools-ltp strain shape filamentous 0.27 0.44 NA NA 10 18 405 1163 23501 0.163 0.082 0 2 -phytools-ltp strain shape spirillum 0.42 0.45 NA NA 17 65 405 1163 23501 0.163 0.082 0 2 -phytools-ltp all spore shape coccus 0 0 NA NA 26 26 300 708 23501 0.366 0.204 0 1 -phytools-ltp all spore shape elliptic 0.19 0.43 NA NA 34 34 300 708 23501 0.366 0.204 0 1 -phytools-ltp all spore shape endospore 0.15 0.23 NA NA 281 689 300 708 23501 0.366 0.204 1 1 -castor-ltp all coding genes coding genes NA NA 0.72 0.03 NA NA 1410 4303 23031 0.101 0.07 NA 6 +phytools-ltp strain shape bacillus 0.68 0.1 NA NA 296 821 405 1163 23501 0.161 0.081 1 2 +phytools-ltp 
strain shape coccus 0.62 0.2 NA NA 82 259 405 1163 23501 0.161 0.081 0 2 +phytools-ltp strain shape filamentous 0.27 0.44 NA NA 10 18 405 1163 23501 0.161 0.081 0 2 +phytools-ltp strain shape spirillum 0.42 0.45 NA NA 17 65 405 1163 23501 0.161 0.081 0 2 +phytools-ltp all spore shape coccus 0 0 NA NA 26 26 300 708 23501 0.364 0.204 0 1 +phytools-ltp all spore shape elliptic 0.19 0.43 NA NA 34 34 300 708 23501 0.364 0.204 0 1 +phytools-ltp all spore shape endospore 0.15 0.23 NA NA 281 689 300 708 23501 0.364 0.204 1 1 +castor-ltp all coding genes coding genes NA NA 0.72 0.03 NA NA 1410 4303 23031 0.101 0.069 NA 6 castor-ltp species coding genes coding genes NA NA 0.66 0.11 NA NA 700 1440 23031 0.135 0.096 NA 3 -castor-ltp strain coding genes coding genes NA NA 0.62 0.11 NA NA 710 2863 23031 0.132 0.077 NA 3 -castor-ltp all genome size genome size NA NA 0.72 0.07 NA NA 1409 4298 23031 0.101 0.07 NA 6 +castor-ltp strain coding genes coding genes NA NA 0.62 0.11 NA NA 710 2863 23031 0.131 0.077 NA 3 +castor-ltp all genome size genome size NA NA 0.72 0.07 NA NA 1409 4298 23031 0.101 0.069 NA 6 castor-ltp species genome size genome size NA NA 0.67 0.12 NA NA 700 1437 23031 0.135 0.096 NA 3 -castor-ltp strain genome size genome size NA NA 0.62 0.11 NA NA 709 2861 23031 0.132 0.077 NA 3 -castor-ltp all growth temperature growth temperature NA NA 0.55 0.04 NA NA 14251 21033 23031 0.036 0.042 NA 62 -castor-ltp species growth temperature growth temperature NA NA 0.47 0.04 NA NA 12433 16916 23031 0.044 0.053 NA 54 -castor-ltp strain growth temperature growth temperature NA NA 0.71 0.09 NA NA 1818 4117 23031 0.093 0.062 NA 8 -castor-ltp all length length NA NA 0.15 0.24 NA NA 605 636 23031 0.118 0.077 NA 3 -castor-ltp genus length length NA NA 0.15 0.24 NA NA 605 636 23031 0.118 0.077 NA 3 -castor-ltp all optimal ph optimal ph NA NA 0.14 0.16 NA NA 290 878 23031 0.18 0.11 NA 1 -castor-ltp species optimal ph optimal ph NA NA 0.14 0.16 NA NA 290 878 23031 0.18 0.11 NA 1 
-castor-ltp all width width NA NA 0.04 0.04 NA NA 756 812 23031 0.105 0.076 NA 3 -castor-ltp genus width width NA NA 0.04 0.04 NA NA 756 812 23031 0.105 0.076 NA 3 +castor-ltp strain genome size genome size NA NA 0.62 0.11 NA NA 709 2861 23031 0.131 0.077 NA 3 +castor-ltp all growth temperature growth temperature NA NA 0.55 0.04 NA NA 14251 21033 23031 0.035 0.04 NA 62 +castor-ltp species growth temperature growth temperature NA NA 0.47 0.04 NA NA 12433 16916 23031 0.043 0.051 NA 54 +castor-ltp strain growth temperature growth temperature NA NA 0.71 0.09 NA NA 1818 4117 23031 0.092 0.062 NA 8 +castor-ltp all length length NA NA 0.15 0.24 NA NA 605 636 23031 0.114 0.078 NA 3 +castor-ltp genus length length NA NA 0.15 0.24 NA NA 605 636 23031 0.114 0.078 NA 3 +castor-ltp all optimal ph optimal ph NA NA 0.14 0.16 NA NA 290 878 23031 0.179 0.11 NA 1 +castor-ltp species optimal ph optimal ph NA NA 0.14 0.16 NA NA 290 878 23031 0.179 0.11 NA 1 +castor-ltp all width width NA NA 0.04 0.04 NA NA 756 812 23031 0.102 0.076 NA 3 +castor-ltp genus width width NA NA 0.04 0.04 NA NA 756 812 23031 0.102 0.076 NA 3 phytools-ltp all animal pathogen animal pathogen 0.47 0.06 NA NA NA NA 999 2071 23501 0.154 0.11 NA 4 phytools-ltp species animal pathogen animal pathogen 0.5 0.06 NA NA NA NA 879 1782 23501 0.157 0.111 NA 4 -phytools-ltp all antimicrobial sensitivity antimicrobial sensitivity 0.5 0.12 NA NA NA NA 304 825 23501 0.192 0.12 NA 1 -phytools-ltp species antimicrobial sensitivity antimicrobial sensitivity 0.5 0.12 NA NA NA NA 304 825 23501 0.192 0.12 NA 1 +phytools-ltp all antimicrobial sensitivity antimicrobial sensitivity 0.5 0.12 NA NA NA NA 304 825 23501 0.191 0.119 NA 1 +phytools-ltp species antimicrobial sensitivity antimicrobial sensitivity 0.5 0.12 NA NA NA NA 304 825 23501 0.191 0.119 NA 1 phytools-ltp all extreme environment extreme environment 0.4 0.13 NA NA NA NA 708 1856 23501 0.147 0.1 NA 3 phytools-ltp species extreme environment extreme environment 0.4 0.13 NA 
NA NA NA 708 1856 23501 0.147 0.1 NA 3 phytools-ltp all host-associated host-associated 0.47 0.06 NA NA NA NA 900 2492 23501 0.133 0.095 NA 4 -phytools-ltp species host-associated host-associated 0.46 0.05 NA NA NA NA 890 2482 23501 0.135 0.095 NA 4 -phytools-ltp all motility motility 0.53 0.03 NA NA NA NA 5793 6529 23501 0.065 0.063 NA 25 -phytools-ltp species motility motility 0.52 0.03 NA NA NA NA 5122 5331 23501 0.071 0.069 NA 22 -phytools-ltp strain motility motility 0.43 0.07 NA NA NA NA 671 1198 23501 0.13 0.085 NA 3 +phytools-ltp species host-associated host-associated 0.46 0.05 NA NA NA NA 890 2482 23501 0.134 0.095 NA 4 +phytools-ltp all motility motility 0.53 0.03 NA NA NA NA 5793 6529 23501 0.064 0.063 NA 25 +phytools-ltp species motility motility 0.52 0.03 NA NA NA NA 5122 5331 23501 0.07 0.069 NA 22 +phytools-ltp strain motility motility 0.43 0.07 NA NA NA NA 671 1198 23501 0.129 0.084 NA 3 phytools-ltp all plant pathogenicity plant pathogenicity 0.57 0.26 NA NA NA NA 568 1476 23501 0.164 0.113 NA 2 phytools-ltp species plant pathogenicity plant pathogenicity 0.57 0.26 NA NA NA NA 568 1476 23501 0.164 0.113 NA 2 -phytools-ltp all spore formation spore formation 0.88 0.03 NA NA NA NA 4177 4731 23501 0.086 0.087 NA 18 -phytools-ltp species spore formation spore formation 0.89 0.03 NA NA NA NA 3673 3809 23501 0.096 0.097 NA 16 +phytools-ltp all spore formation spore formation 0.88 0.03 NA NA NA NA 4177 4731 23501 0.085 0.086 NA 18 +phytools-ltp species spore formation spore formation 0.89 0.03 NA NA NA NA 3673 3809 23501 0.095 0.097 NA 16 phytools-ltp strain spore formation spore formation 0.76 0.13 NA NA NA NA 504 922 23501 0.146 0.094 NA 2 diff --git a/inst/scripts/README.md b/inst/scripts/README.md index 0d5f8d5f..6468bb64 100644 --- a/inst/scripts/README.md +++ b/inst/scripts/README.md @@ -121,10 +121,10 @@ https://raw.githubusercontent.com/waldronlab/taxPProValidation/a7a31b0/validatio The code used, which was executed directly in the extdata 
directory, was: -This is version 1.0.2 +This is version 1.0.4 ```bash -wget https://raw.githubusercontent.com/waldronlab/taxPProValidation/a7a31b0/validation_summary.tsv +wget https://raw.githubusercontent.com/waldronlab/taxPProValidation/cc45761/validation_summary.tsv ``` | Column name | Description | diff --git a/inst/scripts/sysdata.R b/inst/scripts/sysdata.R index 9b185082..31d7b132 100644 --- a/inst/scripts/sysdata.R +++ b/inst/scripts/sysdata.R @@ -9,25 +9,25 @@ library(stringr) getParentRank <- function(x) { ranks <- taxizedb::taxid2rank(x, db = 'ncbi', verbose = FALSE) - lowest_ranks <- c( + lowestRanks <- c( 'biotype', 'isolate', 'serogroup', 'serotype', 'strain', 'subspecies' ) dplyr::case_when( - ranks %in% lowest_ranks ~ 'species', + ranks %in% lowestRanks ~ 'species', ranks == 'species' ~ 'genus', ranks == 'genus' ~ 'family', TRUE ~ NA ) } -tax_ranks <- c( +taxRanks <- c( "superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain" ) phys <- physiologies() -ncbi_ids <- phys |> +ncbiIds <- phys |> map( ~ pull(.x, NCBI_ID)) |> flatten_chr() |> unique() |> @@ -38,59 +38,59 @@ ncbi_ids <- phys |> sort(decreasing = TRUE) tim <- system.time({ - taxonomies <- taxizedb::classification(ncbi_ids, db = "ncbi") - lgl_vct <- !map_lgl(taxonomies, ~ all(is.na(.x))) - taxonomies <- taxonomies[lgl_vct] - ncbi_ids <- ncbi_ids[lgl_vct] + taxonomies <- taxizedb::classification(ncbiIds, db = "ncbi") + lglVct <- !map_lgl(taxonomies, ~ all(is.na(.x))) + taxonomies <- taxonomies[lglVct] + ncbiIds <- ncbiIds[lglVct] }) print(tim) ## Check names and taxid match all(names(taxonomies) == map_chr(taxonomies, ~ as.character(tail(.x$id, 1)))) -parents_ranks <- getParentRank(ncbi_ids) -lgl_vct <- !is.na(parents_ranks) -ncbi_ids <- ncbi_ids[lgl_vct] -parents_ranks <- parents_ranks[lgl_vct] -taxonomies <- taxonomies[lgl_vct] +parentsRanks <- getParentRank(ncbiIds) +lglVct <- !is.na(parentsRanks) +ncbiIds <- ncbiIds[lglVct] +parentsRanks <- parentsRanks[lglVct] 
+taxonomies <- taxonomies[lglVct] -parent_ids <- map2(taxonomies, parents_ranks, ~{ - parent_rank <- .x |> - filter(rank %in% tax_ranks) |> +parentIds <- map2(taxonomies, parentsRanks, ~{ + parentRank <- .x |> + filter(rank %in% taxRanks) |> pull(rank) |> {\(y) y[-length(y)]}() |> ## Need to remove the current rank tail(1) - parent_id <- .x |> - filter(rank %in% tax_ranks) |> + parentId <- .x |> + filter(rank %in% taxRanks) |> pull(id) |> {\(y) y[-length(y)]}() |> ## Need to remove the current rank tail(1) - names(parent_id) <- parent_rank - ifelse(names(parent_id) == .y, parent_id, NA) + names(parentId) <- parentRank + ifelse(names(parentId) == .y, parentId, NA) }) -lgl_vct <- !is.na(parent_ids) -ncbi_ids <- ncbi_ids[lgl_vct] -parent_ids <- parent_ids[lgl_vct] +lglVct <- !is.na(parentIds) +ncbiIds <- ncbiIds[lglVct] +parentIds <- parentIds[lglVct] -ranks_parents <- data.frame( - NCBI_ID = ncbi_ids, - # Taxon_name = taxizedb::taxid2name(ncbi_ids, db = 'ncbi'), - Rank = taxizedb::taxid2rank(ncbi_ids, db = 'ncbi'), - Parent_NCBI_ID = unlist(parent_ids), - Parent_name = taxizedb::taxid2name(unlist(parent_ids), db = 'ncbi'), - Parent_rank = taxizedb::taxid2rank(unlist(parent_ids), db = 'ncbi') +ranksParents <- data.frame( + NCBI_ID = ncbiIds, + # Taxon_name = taxizedb::taxid2name(ncbiIds, db = 'ncbi'), + Rank = taxizedb::taxid2rank(ncbiIds, db = 'ncbi'), + Parent_NCBI_ID = unlist(parentIds), + Parent_name = taxizedb::taxid2name(unlist(parentIds), db = 'ncbi'), + Parent_rank = taxizedb::taxid2rank(unlist(parentIds), db = 'ncbi') ) -rownames(ranks_parents) <- NULL +rownames(ranksParents) <- NULL # BacDive ----------------------------------------------------------------- bacdive <- bugphyzz:::.getBacDive() |> bugphyzz:::.reshapeBacDive() -bacdive_phys_names <- names(bacdive) +bacdivePhysNames <- names(bacdive) ## Save data ------------------------------------------------------------- usethis::use_data( - ranks_parents, - bacdive_phys_names, + ranksParents, + 
bacdivePhysNames, overwrite = TRUE, internal = TRUE ) diff --git a/man/getTaxonSignatures.Rd b/man/getTaxonSignatures.Rd index e6b5dde1..68dc4eeb 100644 --- a/man/getTaxonSignatures.Rd +++ b/man/getTaxonSignatures.Rd @@ -8,7 +8,7 @@ getTaxonSignatures(tax, bp, ...) } \arguments{ \item{tax}{A valid NCBI ID or taxon name. If taxon name is used, the -argument tax_id_type = "Taxon_name" must also be used.} +argument taxIdType = "Taxon_name" must also be used.} \item{bp}{List of data.frames imported with \code{importBugphyzz}.} @@ -27,6 +27,6 @@ taxid <- "562" taxonName <- "Escherichia coli" bp <- importBugphyzz() sig_names_1 <- getTaxonSignatures(taxid, bp) -sig_names_2 <- getTaxonSignatures(taxonName, bp, tax_id_type = "Taxon_name") +sig_names_2 <- getTaxonSignatures(taxonName, bp, taxIdType = "Taxon_name") } diff --git a/man/importBugphyzz.Rd b/man/importBugphyzz.Rd index 699e3b0d..57e08685 100644 --- a/man/importBugphyzz.Rd +++ b/man/importBugphyzz.Rd @@ -5,22 +5,22 @@ \title{Import bugphyzz} \usage{ importBugphyzz( - version = "10.5281/zenodo.10980813", - force_download = FALSE, + version = "10.5281/zenodo.12574596", + forceDownload = FALSE, v = 0.8, - exclude_rarely = TRUE + excludeRarely = TRUE ) } \arguments{ \item{version}{Character string indicating the version. Default is the latest release on Zenodo. Options: Zenodo DOI, GitHub commit hash, or devel.} -\item{force_download}{Logical value. Force a fresh download of the data or +\item{forceDownload}{Logical value. Force a fresh download of the data or use the one stored in the cache (if available). Default is FALSE.} \item{v}{Validation value. Default 0.8 (see details).} -\item{exclude_rarely}{Default is TRUE. Exclude values with +\item{excludeRarely}{Default is TRUE. Exclude values with Frequency == FALSE (see details).} } \value{ @@ -29,7 +29,8 @@ A list of tidy data frames. \description{ \code{importBugphyzz} imports bugphyzz annotations as a list of tidy data.frames. 
To learn more about the structure of the data.frames -please check the bugphyzz vignette with \code{browseVignettes("bugphyzz")}. +please check the bugphyzz vignette with \code{browseVignettes("bugphyzz")} or +\code{vignette("bugphyzz", "bugphyzz")}. } \details{ \subsection{Data structure}{ @@ -51,15 +52,22 @@ imported. The minimum value can be adjusted with the \code{v} argument (only values between 0 and 1). } -\subsection{Frequency (exclude_rarely argument)}{ +\subsection{Frequency (excludeRarely argument)}{ One of the variables in the bugphyzz data.frames is "Frequency", which can adopt values of "always", "usually", "sometimes", "rarely", or "never". By default "never" and "rarely" are excluded. "rarely" could be included with -\code{exclude_rarely = FALSE}. To learn more about these frequency keywords +\code{excludeRarely = FALSE}. To learn more about these frequency keywords please check the bugphyzz vignette with \code{browseVignettes("bugphyzz")}. } + +\subsection{Sources}{ + +By default, the datasets imported with the \code{importBugphyzz} function +will always return a shortened version of the source. Please use +\code{vignette("sources", "bugphyzz")} to see the full sources. +} } \examples{ diff --git a/man/makeSignatures.Rd b/man/makeSignatures.Rd index 5eb2b59e..a6410f4d 100644 --- a/man/makeSignatures.Rd +++ b/man/makeSignatures.Rd @@ -6,11 +6,12 @@ \usage{ makeSignatures( dat, - tax_id_type = "NCBI_ID", - tax_level = "mixed", + taxIdType = c("NCBI_ID", "Taxon_name"), + taxLevel = c("mixed", "superkingdom", "phylum", "class", "order", "family", "genus", + "species", "strain"), evidence = c("exp", "igc", "tas", "nas", "tax", "asr"), frequency = c("always", "usually", "sometimes", "unknown"), - min_size = 10, + minSize = 10, min = NULL, max = NULL ) @@ -18,9 +19,9 @@ makeSignatures( \arguments{ \item{dat}{A data.frame.} -\item{tax_id_type}{A character string. Valid options: NCBI_ID, Taxon_name.} +\item{taxIdType}{A character string. 
Valid options: NCBI_ID, Taxon_name.} -\item{tax_level}{A character vector. Taxonomic rank. Valid options: +\item{taxLevel}{A character vector. Taxonomic rank. Valid options: superkingdom, kingdom, phylum, class, order, family, genus, species, strain. They can be combined. "mixed" is equivalent to select all valid ranks.} @@ -31,7 +32,7 @@ asr. They can be combined. Default is all.} sometimes, rarely, unknown. They can be combined. By default, "rarely" is excluded.} -\item{min_size}{Minimum number of bugs in a signature. Default is 10.} +\item{minSize}{Minimum number of bugs in a signature. Default is 10.} \item{min}{Minimum value (inclusive). Only for numeric attributes. Default is NULL.} diff --git a/man/physiologies.Rd b/man/physiologies.Rd index fe7fc3b5..20ab0676 100644 --- a/man/physiologies.Rd +++ b/man/physiologies.Rd @@ -4,14 +4,14 @@ \alias{physiologies} \title{Import physiologies (for devs)} \usage{ -physiologies(keyword = "all", full_source = FALSE) +physiologies(keyword = "all", fullSource = FALSE) } \arguments{ \item{keyword}{Character vector with one or more valid keywords. Valid keyboards can be checked with \code{showPhys}. If 'all', all physiologies are imported.} -\item{full_source}{Logical. If \code{TRUE}, the Attribute_source column will +\item{fullSource}{Logical. If \code{TRUE}, the Attribute_source column will contain full source information. If \code{FALSE}, the Attribute_source column will contain shortened versions of the sources. Default is \code{FALSE}.} } diff --git a/man/showPhys.Rd b/man/showPhys.Rd index 4e40c1f9..2794e2a8 100644 --- a/man/showPhys.Rd +++ b/man/showPhys.Rd @@ -4,10 +4,10 @@ \alias{showPhys} \title{Show list of available physiologies (for devs)} \usage{ -showPhys(which_names = "all") +showPhys(whichNames = "all") } \arguments{ -\item{which_names}{A character string. Options: 'all' (default), +\item{whichNames}{A character string. 
Options: 'all' (default), 'spreadsheets', 'bacdive'.} } \value{ diff --git a/tests/testthat/test-getTaxonSignatures.R b/tests/testthat/test-getTaxonSignatures.R index a31e1858..4a109d23 100644 --- a/tests/testthat/test-getTaxonSignatures.R +++ b/tests/testthat/test-getTaxonSignatures.R @@ -3,7 +3,7 @@ taxName <- "Escherichia coli" bp <- importBugphyzz() sigs_ids <- getTaxonSignatures(taxID, bp) sigs_tax <- getTaxonSignatures( - tax = taxName, bp = bp, tax_id_type = "Taxon_name" + tax = taxName, bp = bp, taxIdType = "Taxon_name" ) test_that("getTaxonSignatures works with IDs", { expect_gt(length(sigs_ids), 0) diff --git a/tests/testthat/test-importBugphyzz.R b/tests/testthat/test-importBugphyzz.R index a9478842..e8ef16d5 100644 --- a/tests/testthat/test-importBugphyzz.R +++ b/tests/testthat/test-importBugphyzz.R @@ -168,7 +168,7 @@ checkUniqueAnnotations <- function(x) { # tests ------------------------------------------------------------------- test_that("importBugphyzz works with devel", { - bp <- importBugphyzz(version = "devel", force_download = TRUE) + bp <- importBugphyzz(version = "devel", forceDownload = TRUE) expect_true(all("data.frame" == map_chr(bp, class))) expect_true(all(map_lgl(bp, ~ nrow(.x) > 0))) expect_true(all(map_lgl(bp, checkColumnNames))) @@ -179,7 +179,7 @@ test_that("importBugphyzz works with devel", { }) test_that("importBugphyzz works with hash", { - bp <- importBugphyzz(version = "c2d34c0", force_download = TRUE) + bp <- importBugphyzz(version = "8a09b46", forceDownload = TRUE) expect_true(all("data.frame" == map_chr(bp, class))) expect_true(all(map_lgl(bp, ~ nrow(.x) > 0))) expect_true(all(map_lgl(bp, checkColumnNames))) @@ -190,7 +190,7 @@ test_that("importBugphyzz works with hash", { }) test_that("importBugphyzz works with Zenodo DOI", { - bp <- importBugphyzz(version = "10.5281/zenodo.10980813", force_download = TRUE) + bp <- importBugphyzz(version = "10.5281/zenodo.12574596", forceDownload = TRUE) expect_true(all("data.frame" == 
map_chr(bp, class))) expect_true(all(map_lgl(bp, ~ nrow(.x) > 0))) expect_true(all(map_lgl(bp, checkColumnNames))) @@ -202,5 +202,5 @@ test_that("importBugphyzz works with Zenodo DOI", { ## TODO create test for using Zenodo test_that("importBugphyzz doesn't work with other words", { - expect_error(importBugphyzz(version = "abcd-1234", force_download = TRUE)) + expect_error(importBugphyzz(version = "abcd-1234", forceDownload = TRUE)) }) diff --git a/tests/testthat/test-makeSignatures-DataFrame.R b/tests/testthat/test-makeSignatures-DataFrame.R new file mode 100644 index 00000000..7a7bbaf9 --- /dev/null +++ b/tests/testthat/test-makeSignatures-DataFrame.R @@ -0,0 +1,14 @@ +library(purrr) +library(S4Vectors) +bp <- map(importBugphyzz(), DataFrame) +sigsNames <- map(bp, ~ makeSignatures(.x, taxIdType = "Taxon_name")) |> + list_flatten(name_spec = "{inner}") +sigsIDs <- map(bp, ~ makeSignatures(.x, taxIdType = "NCBI_ID")) |> + list_flatten(name_spec = "{inner}") + +test_that("makeSignatures works with IDs", { + expect_true(all(map_lgl(sigsIDs, is.integer))) +}) +test_that("makeSignatures works with taxon names", { + expect_true(all(map_lgl(sigsNames, is.character))) +}) diff --git a/tests/testthat/test-makeSignatures.R b/tests/testthat/test-makeSignatures.R index a8fc7d25..f758844e 100644 --- a/tests/testthat/test-makeSignatures.R +++ b/tests/testthat/test-makeSignatures.R @@ -1,8 +1,8 @@ library(purrr) bp <- importBugphyzz() -sigsNames <- map(bp, ~ makeSignatures(.x, tax_id_type = "Taxon_name")) |> +sigsNames <- map(bp, ~ makeSignatures(.x, taxIdType = "Taxon_name")) |> list_flatten(name_spec = "{inner}") -sigsIDs <- map(bp, ~ makeSignatures(.x, tax_id_type = "NCBI_ID")) |> +sigsIDs <- map(bp, ~ makeSignatures(.x, taxIdType = "NCBI_ID")) |> list_flatten(name_spec = "{inner}") test_that("makeSignatures works with IDs", { diff --git a/vignettes/articles/attributes.Rmd b/vignettes/articles/attributes.Rmd index 5aa89fcf..4d8a36fc 100644 --- 
a/vignettes/articles/attributes.Rmd +++ b/vignettes/articles/attributes.Rmd @@ -26,8 +26,6 @@ library(dplyr) library(purrr) ``` -## Sources - ```{r, echo=FALSE} sources_fname <- system.file( "extdata", "attribute_sources.tsv", package = "bugphyzz", mustWork = TRUE @@ -40,8 +38,6 @@ sources <- readr::read_tsv(sources_fname, show_col_types = FALSE) |> ) ``` -The `r nrow(sources)` sources of annotations in bugphyzz: - ```{r, echo=FALSE} DT::datatable( data = sources, rownames = FALSE, diff --git a/vignettes/bugphyzz.Rmd b/vignettes/bugphyzz.Rmd index a66429a4..a5da0a1e 100644 --- a/vignettes/bugphyzz.Rmd +++ b/vignettes/bugphyzz.Rmd @@ -217,7 +217,7 @@ attribute (discrete): ```{r} aer_sigs_g <- makeSignatures( - dat = bp[["aerophilicity"]], tax_id_type = "Taxon_name", tax_level = "genus" + dat = bp[["aerophilicity"]], taxIdType = "Taxon_name", taxLevel = "genus" ) map(aer_sigs_g, head) ``` @@ -227,8 +227,8 @@ temperature attribute (numeric): ```{r} gt_sigs_sp <- makeSignatures( - dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", - tax_level = 'species' + dat = bp[["growth temperature"]], taxIdType = "Taxon_name", + taxLevel = 'species' ) map(gt_sigs_sp, head) ``` @@ -238,8 +238,8 @@ attribute (numeric): ```{r} gt_sigs_mix <- makeSignatures( - dat = bp[["growth temperature"]], tax_id_type = "Taxon_name", - tax_level = "mixed", min = 0, max = 25 + dat = bp[["growth temperature"]], taxIdType = "Taxon_name", + taxLevel = "mixed", min = 0, max = 25 ) map(gt_sigs_mix, head) ``` @@ -248,8 +248,8 @@ map(gt_sigs_mix, head) ```{r} ap_sigs_mix <- makeSignatures( - dat = bp[["animal pathogen"]], tax_id_type = "NCBI_ID", - tax_level = "mixed", evidence = c("exp", "igc", "nas", "tas") + dat = bp[["animal pathogen"]], taxIdType = "NCBI_ID", + taxLevel = "mixed", evidence = c("exp", "igc", "nas", "tas") ) map(ap_sigs_mix, head) ``` @@ -304,17 +304,47 @@ tse_subset <- tse_genus[rowSums(assay(tse_genus) >= 1) >= min_n_samples,] tse_subset ``` -Perform differential abundance 
(DA) analysis to get sets of microbes: +Let's use the edgeR method for differential abundance analysis and +obtain sets of microbes. +Subgingival plaque will be used as reference +or "control", so negative values will mean enrichment in the subgingival plaque +and positive values will mean enrichment in the supragingival plaque. + +Perform differential abundance (DA) analysis: ```{r} tse_subset$GROUP <- ifelse( tse_subset$body_subsite == 'subgingival_plaque', 0, 1 ) se <- EnrichmentBrowser::deAna( - expr = tse_subset, de.method = 'limma', padj.method = 'fdr', + expr = tse_subset, de.method = 'edgeR', padj.method = 'fdr', filter.by.expr = FALSE, ) +``` + +It's recommended to perform a normalization step of the counts before +running GSEA. From the original [GSEA user guide](https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideTEXT.htm): +"GSEA does not normalize RNA-seq data. +RNA-seq data must be normalized for between-sample comparisons using an +external normalization procedure (e.g. those in DESeq2 or Voom)." + +In this example, we are treating the microbiome +data as RNA-seq (see: https://link.springer.com/article/10.1186/s13059-020-02104-1). +Let's use the `limma::voom` function. + +A glimpse of the assay stored in the SE: + +```{r} +assay(se)[1:5, 1:5] # counts +``` +From the `?limma::voom` documentation, input should be "a numeric matrix +containing raw counts...". Note that the assay in the SummarizedExperiment +will be replaced with normalized counts. + +Perform normalization step: + +```{r} dat <- data.frame(colData(se)) design <- stats::model.matrix(~ GROUP, data = dat) assay(se) <- limma::voom( @@ -322,12 +352,22 @@ assay(se) <- limma::voom( )$E ``` +The output is a "numeric matrix of normalized expression values on the +log2 scale" as described in the `?limma::voom` documentation. This output +is ready for GSEA. 
+ +```{r} +assay(se)[1:5, 1:5] # normalized counts +``` + Perform GSEA and display the results: ```{r, message=FALSE} gsea <- EnrichmentBrowser::sbea( method = 'gsea', se = se, gs = aer_sigs_g, perm = 1000, - alpha = 0.1 + # Alpha is the FDR threshold (calculated above) to consider a feature as + # significant. + alpha = 0.1 ) gsea_tbl <- as.data.frame(gsea$res.tbl) |> mutate( diff --git a/vignettes/sources.Rmd b/vignettes/sources.Rmd new file mode 100644 index 00000000..4dbe36c3 --- /dev/null +++ b/vignettes/sources.Rmd @@ -0,0 +1,48 @@ +--- +title: "Sources" +output: + rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Sources} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + message = FALSE, + warning = FALSE, + echo = FALSE +) +``` + +```{r setup} +library(DT) +library(bugphyzz) +library(dplyr) +library(purrr) +``` + +## Sources + +```{r, echo=FALSE} +sources_fname <- system.file( + "extdata", "attribute_sources.tsv", package = "bugphyzz", mustWork = TRUE +) +sources <- readr::read_tsv(sources_fname, show_col_types = FALSE) |> + dplyr::rename( + Source = Attribute_source, + `Confidence in curation` = Confidence_in_curation, + `Full source` = full_source + ) +``` + +The `r nrow(sources)` sources of annotations in bugphyzz. +exp = experimental data; igc = inferred from genome context; tas = traceable +author statement; nas = non-traceable author statement. + +```{r, echo=FALSE} +knitr::kable(sources) +```