Skip to content

Commit

Permalink
Merge pull request #13 from carriedaymont/fix-github-tests
Browse files Browse the repository at this point in the history
Fix GitHub tests, including cluster generation addressing parallel processing on windows, fixing #12
  • Loading branch information
dchud authored Nov 27, 2020
2 parents fbd61a7 + a58d21b commit a6e30ba
Show file tree
Hide file tree
Showing 10 changed files with 53 additions and 37 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Imports:
Hmisc (>= 4.4-0),
labelled (>= 2.5.0),
magrittr (>= 1.5)
Depends:
R (>= 2.10)
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ import(dplyr, except = c(last, first, summarize, src, between))
import(foreach)
import(labelled)
import(magrittr)
import(parallel)
import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
import(tidyr, except = extract)
6 changes: 0 additions & 6 deletions R/cdc.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,6 @@ ext_bmiz <- function(data,
bmi = "bmi",
adjust.integer.age = T,
ref.data.path = "") {
library(data.table, quietly = T)
library(dplyr, quietly = T)
library(Hmisc, quietly = T)
library(magrittr, quietly = T)
library(labelled, quietly = T)

setDT(data)

setnames(data,
Expand Down
33 changes: 21 additions & 12 deletions R/growth.R
Original file line number Diff line number Diff line change
Expand Up @@ -1598,6 +1598,7 @@ cleanbatch <- function(data.df,
#' matching same ageday measurements for the other parameter. Options include "default" (standard growthcleanr approach),
#' and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both
#' for exclusion if beyond threshold)
#' @param height.tolerance.cm maximum decrease in height tolerated for sequential measurements
#' @param error.load.mincount minimum count of exclusions on parameter before
#' considering excluding all measurements. Defaults to 2.
#' @param error.load.threshold threshold of percentage of excluded measurement count to included measurement
Expand Down Expand Up @@ -1648,6 +1649,7 @@ cleanbatch <- function(data.df,
#' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
#' @import foreach
#' @import doParallel
#' @import parallel
#' @examples
#' # Run calculation using a small subset of given data
#' df_stats <- as.data.frame(syngrowth)
Expand All @@ -1660,17 +1662,17 @@ cleanbatch <- function(data.df,
#' measurement = df_stats$measurement)
#'
#' # Once processed you can filter data based on result value
#' df_stats <- cbind(df_stats, "clean_result" == clean_stats)
#' clean_df_stats <- df_stats[, df_stats$clean_result == "Include"]
#' df_stats <- cbind(df_stats, "clean_result" = clean_stats)
#' clean_df_stats <- df_stats[df_stats$clean_result == "Include",]
#'
#' # Parallel processing: run using 3 cores and batches
#' df_stats<-cleangrowth(subjid = df_stats$subjid,
#' param = df_stats$param,
#' agedays = df_stats$agedays,
#' sex = df_stats$sex,
#' measurement = df_stats$measurement,
#' parallel = TRUE,
#' num.batches = 2)
#' # Parallel processing: run using 2 cores and batches
#' clean_stats <- cleangrowth(subjid = df_stats$subjid,
#' param = df_stats$param,
#' agedays = df_stats$agedays,
#' sex = df_stats$sex,
#' measurement = df_stats$measurement,
#' parallel = TRUE,
#' num.batches = 2)
cleangrowth <- function(subjid,
param,
agedays,
Expand Down Expand Up @@ -1707,10 +1709,17 @@ cleangrowth <- function(subjid,

# if parallel processing is desired, load additional modules
if (parallel) {
registerDoParallel(cores = num.batches)
if (is.na(num.batches)) {
num.batches <- getDoParWorkers()
}
# variables needed for parallel workers
var_for_par <- c("temporary_duplicates", "valid", "swap_parameters",
"na_as_false", "ewma", "read_anthro", "as_matrix_delta",
"sd_median")

cl <- makeCluster(num.batches)
clusterExport(cl = cl, varlist = var_for_par, envir = environment())
registerDoParallel(cl)
} else {
if (is.na(num.batches))
num.batches <- 1
Expand Down Expand Up @@ -1986,7 +1995,7 @@ cleangrowth <- function(subjid,
error.load.threshold = error.load.threshold,
error.load.mincount = error.load.mincount
)
stopImplicitCluster()
stopCluster(cl)
}
if (!quietly)
cat(sprintf("[%s] Done!\n", Sys.time()))
Expand Down
7 changes: 4 additions & 3 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @param df data frame to split
#' @param fname new name for each of the split files to start with
#' @param fdir directory to put each of the split files (default working directory)
#' @param min_row minimum number of rows for each split file (default 10000)
#' @param min_nrow minimum number of rows for each split file (default 10000)
#' @param keepcol the column name (default "subjid") to use to keep records with the same values together in the same single split file
#'
#' @return the count number referring to the last split file written
Expand All @@ -33,11 +33,11 @@
splitinput <-
function(df,
fname = deparse(substitute(df)),
fdir = "",
fdir = ".",
min_nrow = 10000,
keepcol = 'subjid') {
# first, check if the given directory exists
if (fdir != "" & is.character(fdir) & !dir.exists(fdir)){
if (fdir != "." & is.character(fdir) & !dir.exists(fdir)){
stop("invalid directory")
}

Expand Down Expand Up @@ -144,6 +144,7 @@ recode_sex <- function(input_data,
#' @param agedays name of age (in days) descriptor column
#' @param param name of parameter column to identify each type of measurement
#' @param measurement name of measurement column containing the actual measurement data
#' @param clean_value name of column of cleaned values from growthcleanr::cleangrowth()
#' @param include_all Determines whether the function keeps all exclusion codes. If TRUE, all exclusion types are kept and the inclusion_types argument is ignored. Defaults to FALSE.
#' @param inclusion_types Vector indicating which exclusion codes from the cleaning algorithm should be included in the data, given that include_all is FALSE. For all options, see growthcleanr::cleangrowth(). Defaults to c("Include").
#'
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# growthcleanr

![R-CMD-check](https://github.com/carriedaymont/growthcleanr/workflows/R-CMD-check/badge.svg?branch=main)

R package for cleaning data from Electronic Health Record systems, focused on
cleaning height and weight measurements.

Expand Down Expand Up @@ -66,6 +68,7 @@ following packages:
* `data.table`
* `foreach`
* `doParallel`
* `parallel`
* `dplyr`
* `Hmisc`
* `labelled`
Expand Down
24 changes: 13 additions & 11 deletions man/cleangrowth.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/longwide.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/splitinput.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions man/syngrowth.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a6e30ba

Please sign in to comment.