Skip to content

Commit

Permalink
Update script for sysdata. Remove Unnecessary taxonomy information; l…
Browse files Browse the repository at this point in the history
…eave parent information only.
  • Loading branch information
sdgamboa committed Dec 14, 2023
1 parent c561c6e commit b006b48
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 115 deletions.
Binary file modified R/sysdata.rda
Binary file not shown.
174 changes: 59 additions & 115 deletions inst/scripts/sysdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@ library(dplyr)
library(magrittr)
library(stringr)

getParentRank <- function(x) {
ranks <- taxizedb::taxid2rank(x, db = 'ncbi', verbose = FALSE)
lowest_ranks <- c(
'biotype', 'isolate', 'serogroup', 'serotype', 'strain', 'subspecies'
)
dplyr::case_when(
ranks %in% lowest_ranks ~ 'species',
ranks == 'species' ~ 'genus',
ranks == 'genus' ~ 'family',
TRUE ~ NA
)
}

tax_ranks <- c(
"superkingdom", "phylum", "class", "order", "family", "genus",
"species", "strain"
)

phys <- physiologies()

ncbi_ids <- phys |>
Expand All @@ -14,128 +32,55 @@ ncbi_ids <- phys |>
unique() |>
str_squish() |>
str_to_lower() |>
{\(y) y[!is.na(y)]}() |>
{\(y) y[y != 'unknown']}() |>
{\(y) y[!is.na(y)]}() |>
sort(decreasing = TRUE)

parent_ids <- getParentID(ncbi_ids)


taxonomies <- taxizedb::classification(ncbi_ids, db = "ncbi")
taxonomies <- discard(taxonomies, ~ all(is.na(.x))) # disc+arded taxids migth need to be updated
tim <- system.time({
taxonomies <- taxizedb::classification(ncbi_ids, db = "ncbi")
lgl_vct <- !map_lgl(taxonomies, ~ all(is.na(.x)))
taxonomies <- taxonomies[lgl_vct]
ncbi_ids <- ncbi_ids[lgl_vct]
})
print(tim)

## Check names and taxid match
all(names(taxonomies) == map_chr(taxonomies, ~ as.character(tail(.x$id, 1))))

table(taxizedb::taxid2rank(names(taxonomies), db = 'ncbi'))


getParentID <- function(taxid) {
below_sp <- c(
'biotype', 'isolate', 'serotype', 'strain', 'subspecies'
)
rank <- taxizedb::taxid2rank(taxid, db = 'ncbi')
parent_rank <- dplyr::case_when(
rank %in% below_sp ~ 'species',
rank == 'species' ~ 'genus',
rank == 'genus' ~ 'family',
TRUE ~ NA
)
remove_pos <- which(is.na(rank) | is.na(parent_rank))
taxid <- taxid[-remove_pos]
rank <- rank[-remove_pos]
parent_rank <- parent_rank[-remove_pos]
parent_id <- map2(.x = taxid, .y = parent_rank, ~ {
taxizedb::taxa_at(x = .x, rank = .y, db = 'ncbi', verbose = FALSE)
})
data.frame(
taxid = taxid,
rank = rank,
parent_it = parent_id,
parent_rank = parent_rank
)
}



taxizedb::taxa_at(x = '562', rank = 'family', db = 'ncbi', verbose = FALSE)

x = getParentID(names(taxonomies))

table(x, useNA = 'always')


x = map(taxonomies, ~ {
count(.x, rank)
}) |>
bind_rows()



## >>>>>>>> this is not part of the output <<<<<<<<<<<<<<<
## Check that names correspond to the right output
name_value <- names(taxonomies) == map_chr(
taxonomies, ~ tryCatch(tail(.x$id, 1), error = function(e) NA)
)
invalid_taxonomy_ids <- name_value[is.na(name_value)]
sum(name_value, na.rm = TRUE) / length(name_value) * 100
## In the lines above, most names correspond to the right output
## >>>>>>>> this is not part of the output <<<<<<<<<<<<<<<

taxonomies <- taxonomies[!is.na(taxonomies)]
valid_ranks <- c(
"superkingdom", "phylum", "class", "order", "family", "genus",
"species", "strain"
)

taxonomies <- map(taxonomies, ~ filter(.x, rank %in% valid_ranks))
taxonomies <- discard(taxonomies, ~ nrow(.x) <= 2)

## Get parents
parents <- taxonomies %>%
map( ~ {
.x %>%
head(-1) %>%
filter(rank %in% valid_ranks) %>%
tail(1) %>%
set_colnames(paste0("Parent_", names(.)))
}) %>%
bind_rows(.id = "NCBI_ID")

## Get ranks
ranks <- taxonomies %>%
map_chr(~ tail(.x$rank, 1)) %>%
as_tibble(rownames = "NCBI_ID") %>%
set_colnames(c("NCBI_ID", "Rank"))

ranks_parents <- full_join(ranks, parents, by = "NCBI_ID") %>%
relocate("Parent_rank", .after = "Parent_id") %>%
rename(Parent_NCBI_ID = Parent_id)


# full taxonomy annotations -----------------------------------------------
tax_regex <- valid_ranks %>%
c("query") %>%
paste0(., collapse = "|") %>%
paste0("^(", ., ")(_id)*$")

attr(taxonomies, "class") <- "classification"
taxonomy_table <- cbind(taxonomies)

taxonomyAnnotations <- taxonomy_table %>%
dplyr::select_at(grep(tax_regex, colnames(taxonomy_table), value = TRUE)) %>%
dplyr::rename(NCBI_ID = query) %>%
dplyr::left_join(ranks, by = "NCBI_ID") %>%
dplyr::relocate("NCBI_ID", "Rank") %>%
dplyr::rename(rank = Rank) %>%
dplyr::distinct() %>%
magrittr::set_colnames(paste0("bugphyzz_", colnames(.)))

ranks_parents <- purrr::modify_if(
ranks_parents,
.p = grepl("NCBI_ID", colnames(ranks_parents)),
.f = ~ as.character(.x)
parents_ranks <- getParentRank(ncbi_ids)
lgl_vct <- !is.na(parents_ranks)
ncbi_ids <- ncbi_ids[lgl_vct]
parents_ranks <- parents_ranks[lgl_vct]
taxonomies <- taxonomies[lgl_vct]

parent_ids <- map2(taxonomies, parents_ranks, ~{
parent_rank <- .x |>
filter(rank %in% tax_ranks) |>
pull(rank) |>
{\(y) y[-length(y)]}() |> ## Need to remove the current rank
tail(1)
parent_id <- .x |>
filter(rank %in% tax_ranks) |>
pull(id) |>
{\(y) y[-length(y)]}() |> ## Need to remove the current rank
tail(1)
names(parent_id) <- parent_rank
ifelse(names(parent_id) == .y, parent_id, NA)
})

lgl_vct <- !is.na(parent_ids)
ncbi_ids <- ncbi_ids[lgl_vct]
parent_ids <- parent_ids[lgl_vct]

ranks_parents <- data.frame(
NCBI_ID = ncbi_ids,
# Taxon_name = taxizedb::taxid2name(ncbi_ids, db = 'ncbi'),
Rank = taxizedb::taxid2rank(ncbi_ids, db = 'ncbi'),
Parent_NCBI_ID = unlist(parent_ids),
Parent_name = taxizedb::taxid2name(unlist(parent_ids), db = 'ncbi'),
Parent_rank = taxizedb::taxid2rank(unlist(parent_ids), db = 'ncbi')
)
rownames(ranks_parents) <- NULL

# BacDive -----------------------------------------------------------------
bacdive <- bugphyzz:::.getBacDive() |>
Expand All @@ -145,7 +90,6 @@ bacdive_phys_names <- names(bacdive)
## Save data -------------------------------------------------------------
usethis::use_data(
ranks_parents,
taxonomyAnnotations,
bacdive_phys_names,
overwrite = TRUE, internal = TRUE
)

0 comments on commit b006b48

Please sign in to comment.