Pull request #33: Feature/230fordashboard (HTTK-196)

Merge in HTTK/httk-dev from feature/230fordashboard to dev * commit '0252dd895cd1b6fcb2b8a0ac7433ca5aa8e88360': update the NEWS file to add in the Fabsgut variable change not previously provided in the NEWS file and provide better clarification fix typo - Wetmore et al. 2012 and 2015 ref Cleaned up dashboard script Correctly split RMSLE figures New predictions for CCD New predictions for CCD Split benchmark plots Finished running all chemicals Loading all QSARs introduces duplicates
USEPA · Feb 7, 2024 · 6e3da44 · 6e3da44
2 parents e52b2ff + 0252dd8
commit 6e3da44
Show file tree

Hide file tree

Showing 8 changed files with 136,732 additions and 123,765 deletions.
diff --git a/httk/NEWS.md b/httk/NEWS.md
@@ -39,6 +39,10 @@ permeability chemicals and therefore predicts only three values
 for calculating systemic bioavailability as ***Fbio = Fabs * Fgut * Fhep*** 
 where first-pass hepatic metabolism was already available from 
 `calc_hep_bioavailability`.
+* Changed the name of the variable describing fraction absorbed from the gut
+prior to first-pass hepatic metabolism to ***Fabsgut*** to reflect that
+***Fabs*** and ***Fgut*** are now modeled separately
+(that is, ***Fabsgut = Fabs * Fgut***).
 * Integrated ***Fabs*** and ***Fgut*** into oral exposure for all TK models and 
 integrated into population variability and uncertainty functions within 
 `invitro_uv`

diff --git a/httk/R/benchmark_httk.R b/httk/R/benchmark_httk.R
@@ -104,7 +104,14 @@
 #' calculated. If the units are correct the ratio should be 1 (within the 
 #' precision of the functions -- usually four significant figures). \cr
 #'
-#'   rmsle.plot \tab A ggplot2 figure showing RMSLE tests of various functions. 
+#'   invivo.rmsle.plot \tab A ggplot2 figure comparing
+#' model predictions to in vivo measured values. 
+#' Output generated is the root mean square log10 error for parameters estimated
+#' by the package. \cr
+#'
+#'   model.rmsle.plot \tab A ggplot2 figure comparing various functions values
+#' against values predicted by other models (chiefly SimCyp predictions from
+#' Wetmore et al. 2012 and 2015. 
 #' Output generated is the root mean square log10 error for parameters estimated
 #' by the package. \cr
 #'
@@ -423,13 +430,6 @@ benchmark_httk <- function(
                                             as.numeric(FitData2$Pred.Cmax) /
                                             as.numeric(FitData2$Cmax)))
                                         )
-
-#    write.table(FitData[,c(1,3,5,6,29,31)], 
-#                file=paste("invivovcsscheck-",
-#                           packageVersion("httk"),
-#                           ".txt",sep=""),
-#                           row.names=FALSE,
-#                           sep="\t")
   }
 
   #
@@ -616,15 +616,26 @@ benchmark_httk <- function(
       theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
     benchmarks[["units.plot"]] <- units.plot
 
+    model.rmsle.plot <- 
+      ggplot2::ggplot(subset(plot.table, Benchmark %in%
+                             c("RMSLE.Wetmore","RMSLE.noMC")),
+             aes(x=Version, y=Value, color=Benchmark, group=Benchmark)) + 
+      geom_point() +
+      geom_line() +
+      ylab("Root Mean Squared Log10 Error (RMSLE)") +
+      theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
+    benchmarks[["model.rmsle.plot"]] <- model.rmsle.plot
 
-    rmsle.plot <- 
-      ggplot2::ggplot(subset(plot.table, regexpr("RMSLE", Benchmark)!=-1),
+    invivo.rmsle.plot <- 
+      ggplot2::ggplot(subset(plot.table, Benchmark %in%
+                             c("RMSLE.InVivoCss","RMSLE.InVivoAUC",
+                               "RMSLE.InVivoCmax","RMSLE.TissuePC")),
              aes(x=Version, y=Value, color=Benchmark, group=Benchmark)) + 
       geom_point() +
       geom_line() +
       ylab("Root Mean Squared Log10 Error (RMSLE)") +
       theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
-    benchmarks[["rmsle.plot"]] <- rmsle.plot
+    benchmarks[["invivo.rmsle.plot"]] <- invivo.rmsle.plot
 
     count.plot <- 
       ggplot2::ggplot(subset(plot.table, regexpr("N.", Benchmark)!=-1),

diff --git a/models/mod.RENAMEtoEXE b/models/mod.RENAMEtoEXE
diff --git a/scripts/Dashboard-HTTK-CssunitsmgpL-change.txt b/scripts/Dashboard-HTTK-CssunitsmgpL-change.txt
diff --git a/scripts/Dashboard-HTTK-CssunitsmgpL.txt b/scripts/Dashboard-HTTK-CssunitsmgpL.txt
diff --git a/scripts/Dashboard-HTTK-stamp.RData b/scripts/Dashboard-HTTK-stamp.RData
diff --git a/scripts/calc-mc-css-fold-change.png b/scripts/calc-mc-css-fold-change.png
diff --git a/scripts/httk-for-dashboard.R b/scripts/httk-for-dashboard.R
@@ -1,3 +1,5 @@
+#R CMD BATCH --vanilla httk-for-dashboard.R
+
 library(httk)
 library(parallel)
 library(data.table)
@@ -31,7 +33,7 @@ SPECIES.LIST <- c("Human","Rat")
 MODELS.LIST <- c("3compartmentss","PBTK")
 
 # How many processors are available for parallel computing:
-NUM.CPU <- 6
+NUM.CPU <- detectCores()-1
 
 # Add the in silico predictions:
 # Categorical QSPR:
@@ -43,6 +45,51 @@ load_pradeep2020()
 # Caco-2 QSPR:
 load_honda2023()
 
+# Find all the duplicates:
+# First eliminat NA DTXSIDs (can't do logical tests on them)
+chem.physical_and_invitro.data <- subset(chem.physical_and_invitro.data,
+                                         !is.na(DTXSID))
+# Now check for duplicated DTXSIDs:
+dup.chems <- chem.physical_and_invitro.data$DTXSID[
+  duplicated((chem.physical_and_invitro.data$DTXSID))]
+length(dup.chems)
+# Now check for duplicated CASRN and grab the DTXSIDs of any chemicals with
+# duplicated CAS (but unique DTXSIDs):
+dup.chems <- unique(dup.chems,
+                    subset(chem.physical_and_invitro.data, CAS %in%
+                    chem.physical_and_invitro.data$CAS[
+                      duplicated((chem.physical_and_invitro.data$CAS))])$DTXSID)
+length(dup.chems)
+
+# Among the duplicates, try to keep the predictions from Dawson 2021:
+for (this.chem in dup.chems)
+{
+  these.dup.rows <- which(chem.physical_and_invitro.data$DTXSID==this.chem)
+  keep.row <- NULL
+  for(this.row in these.dup.rows)
+    if (regexpr("Dawson",
+                chem.physical_and_invitro.data[this.row,
+                                               "Human.Clint.Reference"])!=-1
+      ) keep.row <- this.row
+  these.dup.rows <- these.dup.rows[these.dup.rows!=keep.row]
+  chem.physical_and_invitro.data <- chem.physical_and_invitro.data[
+    -these.dup.rows,]
+}
+
+# Check to make sure no duplicates left:
+chem.physical_and_invitro.data <- subset(chem.physical_and_invitro.data,
+                                         !is.na(DTXSID))
+dup.chems <- chem.physical_and_invitro.data$DTXSID[
+  duplicated((chem.physical_and_invitro.data$DTXSID))]
+length(dup.chems)
+dup.chems <- unique(dup.chems,
+                    subset(chem.physical_and_invitro.data, CAS %in%
+                    chem.physical_and_invitro.data$CAS[
+                      duplicated((chem.physical_and_invitro.data$CAS))])$DTXSID)
+# This should be true:
+length(dup.chems) == 0
+
+
 # Organize HTTK data by species:
 HTTK.data.list <- list()
 all.ids <- NULL
@@ -62,12 +109,6 @@ for (this.species in SPECIES.LIST)
   all.ids <- sort(unique(c(all.ids,HTTK.data.list[[this.species]]$DTXSID)))
 }
 
-# temporilay make it run fast:
-ivpkfit <- read.csv("invivoPKfit-params.for.dashboard.txt")
-short.list <- ivpkfit$Chemical[1:25]
-all.ids <- all.ids[all.ids %in% short.list]
-
-
 # We want one parameter per line, but the code is pretty different wrt how we
 # retrieve/calculate these values:
 param.list <- c("Clint","Fup","Vd","Days.Css","TK.Half.Life","Css")
@@ -79,8 +120,6 @@ units.list[["Days.Css"]] <- "Days"
 units.list[["TK.Half.Life"]] <- "hours"
 units.list[["Css"]] <- "mg/L"
 
-
-
 # Function to create all the rows of info for a particular chemical:
 make.ccd.table <- function(
   this.id,
@@ -238,33 +277,13 @@ make.ccd.table <- function(
   return(dashboard.table)
 }
 
-# Create a multicore cluster:
-cl <- parallel::makeCluster(detectCores()-1)
-
-# Load httk on all cores:
-clusterEvalQ(cl, library(httk))
-# Clear memory all cores:
-clusterEvalQ(cl, rm(list=ls()))
-# Load ADMet predicitons:
-clusterEvalQ(cl, load_sipes2017())
-# Define the table creator function on all cores:
-clusterExport(cl, "make.ccd.table")
-# Share data with all cores:
-clusterExport(cl, c(
-  "HTTK.data.list",
-  "SPECIES.LIST",
-  "MODELS.LIST",
-  "param.list",
-  "units.list",
-  "all.ids",
-  "RANDOM.SEED",
-  "WHICH.QUANTILES",
-  "NUM.SAMPLES"))
-
-# Create a list with one table per chemical:
-dashboard.list <- clusterApply(cl,all.ids,function(x)
-  make.ccd.table(
-    this.id=x,
+if (NUM.CPU == 1)
+{
+# Non-parallel version:
+dashboard.list <- NULL
+for (this.id in all.ids)
+  dashboard.list[[this.id]] <-   make.ccd.table(
+    this.id=this.id,
     HTTK.data.list=HTTK.data.list,
     species.list=SPECIES.LIST,
     model.list=MODELS.LIST,
@@ -274,9 +293,60 @@ dashboard.list <- clusterApply(cl,all.ids,function(x)
     RANDOM.SEED=RANDOM.SEED,
     which.quantiles=WHICH.QUANTILES,
     num.samples=NUM.SAMPLES
-    ))
+  )
+} else {
+  # Create a multicore cluster:
+  cl <- parallel::makeCluster(NUM.CPU)
+
+  # Load httk on all cores:
+  clusterEvalQ(cl, library(httk))
+  # Clear memory all cores:
+  clusterEvalQ(cl, rm(list=ls()))
+
+  # Dawson 2021:
+  clusterEvalQ(cl, load_dawson2021())
+  # ADmet Predictor:
+  clusterEvalQ(cl, load_sipes2017())
+  # Machine learning model:
+  clusterEvalQ(cl, load_pradeep2020())
+  # Caco-2 QSPR:
+  clusterEvalQ(cl, load_honda2023())
+
+  # Define the table creator function on all cores:
+  clusterExport(cl, "make.ccd.table")
+  # Share data with all cores:
+  clusterExport(cl, c(
+    "chem.physical_and_invitro.data",
+    "HTTK.data.list",
+    "SPECIES.LIST",
+    "MODELS.LIST",
+    "param.list",
+    "units.list",
+    "all.ids",
+    "RANDOM.SEED",
+    "WHICH.QUANTILES",
+    "NUM.SAMPLES"))
+
+  # Create a list with one table per chemical:
+  dashboard.list <- clusterApply(cl,
+                                 all.ids,
+                                 function(x)
+    make.ccd.table(
+      this.id=x,
+      HTTK.data.list=HTTK.data.list,
+      species.list=SPECIES.LIST,
+      model.list=MODELS.LIST,
+      param.list=param.list,
+      units.list=units.list,
+      all.ids=all.ids,
+      RANDOM.SEED=RANDOM.SEED,
+      which.quantiles=WHICH.QUANTILES,
+      num.samples=NUM.SAMPLES
+      ))
+
+  stopCluster(cl)
+}
 
-stopCluster(cl)
 # Combine all the individual tables into a single table:
 dashboard.table <- rbindlist(dashboard.list)
 
@@ -355,11 +425,11 @@ for (this.parameter in c(
               ivpkfit[ivpkfit.row,"Ref"]
             dashboard.table[dashboard.row,"Measured"] <- ivpkfit[ivpkfit.row,
                             this.parameter]
-        }
+          }
+        }  
+      }
     }  
   }
-}  
-
 
 #
 #
@@ -495,20 +565,18 @@ png("calc-mc-css-fold-change.png")
 print(Fig)
 dev.off()
 
+# Duplicate column names in the two merged tables (dashboard.table, prev.table) 
+# have .x and .y assigned to them respectively.
 tmp <- tmp[,colnames(tmp)[c(1:5,7:8,12:13,16)]]
-colnames(tmp) <- gsub("x","prev",colnames(tmp))
-colnames(tmp) <- gsub("y","new",colnames(tmp))
+colnames(tmp) <- gsub("x","new",colnames(tmp))
+colnames(tmp) <- gsub("y","prev",colnames(tmp))
 write.table(
   tmp,
   file=paste(
     "Dashboard-HTTK-CssunitsmgpL-change.txt",sep=""),
   row.names=F,
   quote=F,
   sep="\t")
-
-
-
-
-
-
+
+sessionInfo()