Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes requested for inclusion in Bioconductor #250

Merged
merged 20 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2c5d399
remove TODOs related to fattyAcidCompositions. These have been docume…
sdgamboa Jun 20, 2024
9c74491
makeSignatures works with DataFrame
sdgamboa Jun 20, 2024
09da92f
Add unit tests to make sure makeSignatures works with DataFrame
sdgamboa Jun 20, 2024
25a41cc
Add S4Vectors dependency
sdgamboa Jun 20, 2024
7d25e61
homogenize function calls and variable names and arguments to camelCase
sdgamboa Jun 21, 2024
a511505
Fix bug in a test file after homogenizing variable names
sdgamboa Jun 21, 2024
aa4fd47
fix bug in tests for makeSignatures related to homogenization of vari…
sdgamboa Jun 21, 2024
b9cf2d0
fix bugs in .fattyAcidCompositions and R code in .downloadZ
sdgamboa Jun 21, 2024
c034916
update vignette clarifying the use of counts and normalized counts pr…
sdgamboa Jun 24, 2024
d8fe5a6
add vignette('sources', 'bugphyzz')
sdgamboa Jun 24, 2024
71e3b0e
use match.arg to check input parameters
sdgamboa Jun 24, 2024
b5d6a15
remove unnecessary for loops and use lapply instead
sdgamboa Jun 24, 2024
d710bf3
fix plant pathogenity to plant pathogenicity in the curation file
sdgamboa Jun 24, 2024
ad7af59
remove TODO that was already completed
sdgamboa Jun 24, 2024
347b46a
Change biofilm forming to biofilm formation in spreadsheets link
sdgamboa Jun 25, 2024
b3b4b39
change Butyrate-producing Bacteria to butyrate producing in the attri…
sdgamboa Jun 25, 2024
685467c
Update validation file to v1.0.4
sdgamboa Jun 27, 2024
c9cf709
Update attributes.tsv file to accept both butyrate producing and buty…
sdgamboa Jun 27, 2024
36f9d78
update Zenodo version to the most recent
sdgamboa Jun 27, 2024
5c29708
Add Rproj file to .gitignore
sdgamboa Jun 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ jupyter_notebooks/updated_spore_shape_v2.xlsx
protratis_data/

misc

*.Rproj
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ Imports:
tidyselect,
BiocFileCache,
httr2,
tools
tools,
S4Vectors
Suggests:
DT,
forcats,
Expand Down
120 changes: 61 additions & 59 deletions R/bacdive.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

## Main function for importing BacDive
.getBacDive <- function(verbose = FALSE ) {
bacdive_data <- .importBacDiveExcel(verbose = verbose)
colnames(bacdive_data) <- .changeBDColNames(colnames(bacdive_data))
.getTidyBD(bacdive_data)
bacdiveData <- .importBacDiveExcel(verbose = verbose)
colnames(bacdiveData) <- .changeBDColNames(colnames(bacdiveData))
.getTidyBD(bacdiveData)
}

## Helper function for .getBacDive
Expand Down Expand Up @@ -34,8 +34,8 @@
}

## Helper function for .getBacDive
.getTidyBD <- function(bacdive_data) {
bacdive_data |>
.getTidyBD <- function(bacdiveData) {
bacdiveData |>
tidyr::pivot_longer(
# Attributes start in the gram_stain column
cols = gram_stain:tidyr::last_col(),
Expand All @@ -60,10 +60,10 @@
.reshapeBacDive <- function(df) {

df[['Attribute_source']] <- 'BacDive'
split_df <- split(df, factor(df[['Attribute']]))
splitDf <- split(df, factor(df[['Attribute']]))

## Attributes that must be changed from character to logical (simplest fix)
attr_names <- c(
attrNames <- c(
'aerophilicity',
'shape',
'country',
Expand All @@ -72,20 +72,22 @@
'isolation site'
## colony color (delete)
)

for (i in seq_along(attr_names)) {
split_df[[attr_names[i]]] <- .catToLog(split_df[[attr_names[i]]])
if (attr_names[i] %in% c('aerophilicity', 'shape')) {
split_df[[attr_names[i]]]$Attribute_type <-

## Modifying an already existing vector rather than creating a list
## Keeping this for loop
for (i in seq_along(attrNames)) {
splitDf[[attrNames[i]]] <- .catToLog(splitDf[[attrNames[i]]])
if (attrNames[i] %in% c('aerophilicity', 'shape')) {
splitDf[[attrNames[i]]]$Attribute_type <-
'multistate-intersection'
} else {
split_df[[attr_names[i]]]$Attribute_type <- 'multistate-union'
splitDf[[attrNames[i]]]$Attribute_type <- 'multistate-union'
}
}

## aerophilicity ####
## This is only to match the data in the bugphyzz spreadsheet
aer <- split_df[['aerophilicity']]
aer <- splitDf[['aerophilicity']]
aer$Attribute <- dplyr::case_when(
aer$Attribute == 'aerobe' ~ 'aerobic',
aer$Attribute == 'anaerobe' ~ 'anaerobic',
Expand All @@ -95,51 +97,51 @@
aer$Attribute == 'obligate aerobe' ~ 'obligately aerobic',
TRUE ~ aer$Attribute
)
split_df[['aerophilicity']] <- aer
splitDf[['aerophilicity']] <- aer

## animal pathogen ####
pos <- names(split_df) == 'animal pathongen'
names(split_df)[pos] <- 'animal pathogen'
x_ <- split_df[['animal pathogen']][['Attribute_value']]
pos <- names(splitDf) == 'animal pathongen'
names(splitDf)[pos] <- 'animal pathogen'
x_ <- splitDf[['animal pathogen']][['Attribute_value']]
x_ <- ifelse(x_ == "yes, in single cases", "yes", x_)
x_ <- dplyr::case_when(x_ == 'yes' ~ TRUE, x_ == 'no' ~ FALSE)
split_df[['animal pathogen']][['Attribute_value']] <- x_
split_df[['animal pathogen']][['Attribute_group']] <- 'animal pathogen'
split_df[['animal pathogen']][['Attribute']] <- 'animal pathogen'
split_df[['animal pathogen']][['Attribute_type']] <- 'binary'
splitDf[['animal pathogen']][['Attribute_value']] <- x_
splitDf[['animal pathogen']][['Attribute_group']] <- 'animal pathogen'
splitDf[['animal pathogen']][['Attribute']] <- 'animal pathogen'
splitDf[['animal pathogen']][['Attribute_type']] <- 'binary'

## biosafety level ####
y <- split_df[['biosafety level comment']][
y <- splitDf[['biosafety level comment']][
, c('BacDive_ID', 'Attribute_value')
]
colnames(y)[2] <- 'Note'
x <- dplyr::left_join(split_df[['biosafety level']], y, by = 'BacDive_ID')
x <- dplyr::left_join(splitDf[['biosafety level']], y, by = 'BacDive_ID')
x[['Attribute_value']] <- paste0('biosafety level ', x[['Attribute_value']])
x[['Attribute']] <- x[['Attribute_value']]
x[['Attribute_value']] <- TRUE
x[['Attribute_group']] <- 'biosafety level'
x[['Attribute_type']] <- 'multistate-intersection'
split_df[['biosafety level']] <- x
split_df[['biosafety level comment']] <- NULL
splitDf[['biosafety level']] <- x
splitDf[['biosafety level comment']] <- NULL

## colony color ####
## This one must be removed
split_df[['colony color']] <- NULL
splitDf[['colony color']] <- NULL

## cultivation medium used - growth medium ####
pos <- names(split_df) == 'cultivation medium used'
names(split_df)[pos] <- 'growth medium'
split_df[['growth medium']][['Attribute_group']] <- 'growth medium'
pos <- names(splitDf) == 'cultivation medium used'
names(splitDf)[pos] <- 'growth medium'
splitDf[['growth medium']][['Attribute_group']] <- 'growth medium'

## growth temperature ####
## culture temperature
## culture temperature growth
## culture temperature range (ignore)
## culture temperature type (ignore)
split_df[['culture temperature range']] <- NULL
split_df[['culture temperature type']] <- NULL
a <- split_df[['culture temperature']]
b <- split_df[['culture temperature growth']]
splitDf[['culture temperature range']] <- NULL
splitDf[['culture temperature type']] <- NULL
a <- splitDf[['culture temperature']]
b <- splitDf[['culture temperature growth']]
b_ <- b[,c('BacDive_ID', 'Attribute_value')]
colnames(b_)[2] <- 'growth'
ab <- dplyr::left_join(a, b_, by = 'BacDive_ID')
Expand All @@ -148,25 +150,25 @@
ab[['Attribute_group']] <- 'growth temperature'
ab[['Attribute_type']] <- 'range'
ab[['Attribute']] <- 'growth temperature'
split_df[['growth temperature']] <- ab
split_df[['culture temperature']] <- NULL
split_df[['culture temperature growth']] <- NULL
splitDf[['growth temperature']] <- ab
splitDf[['culture temperature']] <- NULL
splitDf[['culture temperature growth']] <- NULL

## gram stain ####
gs <- split_df[['gram stain']]
gs <- splitDf[['gram stain']]
gs[['Attribute']] <- paste(gs[['Attribute']], gs[['Attribute_value']])
gs[['Attribute_value']] <- TRUE
gs[['Attribute_group']] <- 'gram stain'
gs[['Attribute_type']] <- 'multistate-intersection'
split_df[['gram stain']] <- gs
splitDf[['gram stain']] <- gs

## halophily ####
valid_terms <- c(
validTerms <- c(
'NaCl', 'KCl', 'MgCl2', 'MgCl2x6H2O', 'Na\\+', 'MgSO4x7H2O', 'Na2SO4',
'Sea salts', 'Chromium \\(Cr6\\+\\)'
)
regex <- paste0('(', paste0(valid_terms, collapse = '|'), ')')
split_df[['halophily']] <- split_df[['halophily']] |>
regex <- paste0('(', paste0(validTerms, collapse = '|'), ')')
splitDf[['halophily']] <- splitDf[['halophily']] |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(cols = 'Attribute_value') |>
dplyr::filter(!grepl('no growth', Attribute_value)) |>
Expand Down Expand Up @@ -194,7 +196,7 @@
dplyr::distinct()

## hemolysis ####
split_df[['hemolysis']] <- split_df[['hemolysis']] |>
splitDf[['hemolysis']] <- splitDf[['hemolysis']] |>
dplyr::mutate(
Attribute_value = strsplit(Attribute_value, ';|/')
) |>
Expand All @@ -210,21 +212,21 @@

## incubation period
## This one must be removed
split_df[['incubation period']] <- NULL
splitDf[['incubation period']] <- NULL

## motility ####
split_df[['motility']] <- split_df[['motility']] |>
splitDf[['motility']] <- splitDf[['motility']] |>
dplyr::mutate(
Attribute_value = dplyr::case_when(
Attribute_value == 'yes' ~ TRUE,
Attribute_value == 'no' ~ FALSE
)
)
split_df[['motility']][['Attribute_group']] <- 'motility'
split_df[['motility']][['Attribute_type']] <- 'binary'
splitDf[['motility']][['Attribute_group']] <- 'motility'
splitDf[['motility']][['Attribute_type']] <- 'binary'

## pathogenicity human ####
pat <- split_df[['pathogenicity human']]
pat <- splitDf[['pathogenicity human']]
pat[['Note']] <- stringr::str_extract(
pat[['Attribute_value']], 'in single cases'
)
Expand All @@ -235,10 +237,10 @@
pat <- pat[!is.na(pat[['Attribute_value']]),]
pat[['Attribute_group']] <- 'pathogenicity human'
pat[['Attribute_type']] <- 'binary'
split_df[['pathogenicity human']] <- pat
splitDf[['pathogenicity human']] <- pat

## metabolite production ####
mp <- split_df[['metabolite production']]
mp <- splitDf[['metabolite production']]
mp <- mp |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(Attribute_value)
Expand All @@ -250,12 +252,12 @@
mp[['Attribute']] <- sub(' (yes|no)$', '', mp[['Attribute']])
mp[['Attribute_group']] <- 'metabolite utilization'
mp[['Attribute_type']] <- 'multistate-intersection'
split_df[['metabolite production']] <- mp
splitDf[['metabolite production']] <- mp

## metabolite utilization ####
pos <- names(split_df) == 'metabolite utiilization'
names(split_df)[pos] <- 'metabolite utilization'
mu <- split_df[['metabolite utilization']]
pos <- names(splitDf) == 'metabolite utiilization'
names(splitDf)[pos] <- 'metabolite utilization'
mu <- splitDf[['metabolite utilization']]
mu <- mu |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(Attribute_value) |>
Expand All @@ -280,10 +282,10 @@
dplyr::mutate(Attribute_value = as.logical(Attribute_value))
mu[['Attribute_group']] <- 'metabolite utilization'
mu[['Attribute_type']] <- 'multistate-intersection'
split_df[['metabolite utilization']] <- mu
splitDf[['metabolite utilization']] <- mu

## spore formation ####
sf <- split_df[['spore formation']]
sf <- splitDf[['spore formation']]
sf <- sf |>
dplyr::mutate(
Attribute_value = dplyr::case_when(
Expand All @@ -294,9 +296,9 @@
Attribute_type = 'binary'
) |>
dplyr::filter(!is.na(Attribute_value))
split_df[['spore formation']] <- sf
splitDf[['spore formation']] <- sf

split_df <- lapply(split_df, function(x) {
splitDf <- lapply(splitDf, function(x) {
x <- as.data.frame(x)
x[['NCBI_ID']] <- as.character(x[['NCBI_ID']])
x[['Parent_NCBI_ID']] <- as.character(x[['Parent_NCBI_ID']])
Expand All @@ -312,7 +314,7 @@
as.data.frame(x)
})

return(split_df)
return(splitDf)
}

## Helper function for .reshapeBacDive
Expand Down
Loading
Loading