Merge pull request #32 from openpharma/018_improve_data_loading

Ensure external metadata file can be used. Improve data loading.
openpharma · Jun 18, 2024 · 42afd79 · 42afd79
2 parents d4f8848 + 2f65717
commit 42afd79
Show file tree

Hide file tree

Showing 57 changed files with 565 additions and 905 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -14,4 +14,5 @@ _\.new\.png$
 ^docs$
 ^dev$
 ^Meta$
-^pkgdown$
+^pkgdown$
+^test_data$
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ inst/doc
 /doc/
 /Meta/
 *.sqlite
-/pkgdown/
+/pkgdown/
+/test_data/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: clinsight
 Title: ClinSight
-Version: 0.0.0.9001
+Version: 0.0.0.9002
 Authors@R: c(
     person("Leonard Daniël", "Samson", , "[email protected]", role = c("cre", "aut")),
     person("GCP-Service International Ltd.& Co. KG", role = "fnd")

diff --git a/NAMESPACE b/NAMESPACE
@@ -13,7 +13,6 @@ export(adjust_colnames)
 export(app_ui)
 export(bind_rows_custom)
 export(check_appdata)
-export(check_available_data)
 export(clean_dates)
 export(collapse_column_vals)
 export(collapse_fct_levels)
@@ -47,6 +46,7 @@ export(get_db_connection)
 export(get_ggplot_layer_names)
 export(get_max_time)
 export(get_meta_vars)
+export(get_metadata)
 export(get_raw_data)
 export(get_review_data)
 export(get_static_overview_data)

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 ## Changed
 
+- Improved metadata so that external file can be used. 
+- Improved data loading by using a config file.
 - Created two renv profiles, one for development and one for production. Goal is 
 to minimize the package dependencies of the production version.
 - Removed development package dependencies (for example devtools) that were not needed to run the application.  

diff --git a/R/app_config.R b/R/app_config.R
@@ -16,10 +16,12 @@ app_sys <- function(...) {
 #' Read App Config
 #'
 #' @param value Value to retrieve from the config file.
-#' @param config GOLEM_CONFIG_ACTIVE value. If unset, R_CONFIG_ACTIVE.
-#' If unset, "default".
+#' @param config GOLEM_CONFIG_ACTIVE value. If unset, R_CONFIG_ACTIVE. If unset,
+#'   "default".
 #' @param use_parent Logical, scan the parent directory for config file.
-#' @param file Location of the config file
+#' @param file Location of the config file. Can be set with the option
+#'   'CONFIG_PATH' so that a study-specific config.yml file can be provided at
+#'   runtime, without rebuilding the application's package from source.
 #'
 #' @noRd
 get_golem_config <- function(
@@ -33,7 +35,7 @@ get_golem_config <- function(
   ),
   use_parent = TRUE,
   # Modify this if your config file is somewhere else
-  file = app_sys("golem-config.yml")
+  file = Sys.getenv("CONFIG_PATH", app_sys("golem-config.yml"))
 ) {
   config::get(
     value = value,

diff --git a/R/app_server.R b/R/app_server.R
@@ -19,33 +19,23 @@ app_server <- function(
 ){
   meta <- golem::get_golem_options("meta")
   merged_data <- golem::get_golem_options("data")
-  if(is.character(merged_data)) merged_data <- readRDS(merged_data)
   user_db <- golem::get_golem_options("user_db")
   credentials_db <- golem::get_golem_options("credentials_db")
   test_mode <- golem::get_golem_options("test_mode")
 
-  app_data <- get_appdata(merged_data)
+  app_data <- get_appdata(merged_data, meta = meta)
   app_vars <- get_meta_vars(data = app_data, meta = meta)
   app_tables <- lapply(
     setNames(names(app_data), names(app_data)), \(x){
       create_table(app_data[[x]], expected_columns = names(app_vars$items[[x]]))
     })
   check_appdata(app_data, meta)
 
-  if(!file.exists(user_db)){
-    warning("no database found. New database will be created")
-    db_create(get_review_data(merged_data), db_path = user_db)
-  } else{
-    if(!test_mode){
-      db_update(get_review_data(merged_data), db_path = user_db, data_synched = FALSE) 
-    }
-  }
-
   res_auth <- authenticate_server(
     test_mode = test_mode,
     sites = app_vars$Sites$site_code,  
     credentials_db = credentials_db,
-    credentials_pwd =  golem::get_golem_options("credentials_pwd")
+    credentials_pwd = golem::get_golem_options("credentials_pwd")
     )
 
   output$user_info <- renderText({
@@ -84,7 +74,7 @@ app_server <- function(
   static_overview_data <- get_static_overview_data(
     data = app_data,
     expected_general_columns = unique(
-      with(metadata$items_expanded, item_name[item_group == "General"])
+      with(meta$items_expanded, item_name[item_group == "General"])
       )
   )
   # think of using the pool package, but functions such as row_update are not yet supported.
@@ -191,13 +181,20 @@ app_server <- function(
 
   lapply(app_vars$groups, \(x){mod_study_forms_server(
     id = paste0("sf_", simplify_string(x)), r = r, form = x,
-    form_items = app_vars$items[[x]], table_names = app_vars$table_names
+    form_items = app_vars$items[[x]], table_names = app_vars$table_names,
+    item_info = meta$groups[meta$groups$item_group == x, ]
   ) }) |>
     unlist(recursive = FALSE)
 
   mod_start_page_server("start_page_1", r, rev_data, navinfo, app_vars$all_forms,
                         app_vars$table_names)
-  mod_header_widgets_server("header_widgets_1", r, rev_data, navinfo)
+  mod_header_widgets_server(
+    id = "header_widgets_1", 
+    r = r, 
+    rev_data = rev_data, 
+    navinfo = navinfo, 
+    events = meta$events
+    )
 
   # Only initiate the sidebar after successful login, because it contains a
   # modal that pops up if data is out of synch. Modals interfere with shinymanager.
@@ -257,7 +254,6 @@ app_server <- function(
     app_vars$all_forms,
     table_names = app_vars$table_names
   )
-
   shiny::exportTestValues(
     user_db = user_db,
     active_participant = r$subject_id,

diff --git a/R/fct_SQLite.R b/R/fct_SQLite.R
@@ -48,8 +48,8 @@ db_temp_connect <- function(db_path, code, drv = RSQLite::SQLite()){
 #' Creates application database. To create a database with all data flagged as
 #' 'new', use the default settings of `reviewed`, `reviewer`, and `status`.
 #'
-#' @param data Either a data frame with review data (Usually created with
-#'   [get_review_data()]), or a character path to the raw data files.
+#' @param data A data frame with review data (Usually created with
+#'   [get_review_data()]).
 #' @param db_path A character vector with the path to the database to be
 #'   created.
 #' @param reviewed Character vector. Sets the reviewed tag in the review
@@ -74,11 +74,6 @@ db_create <- function(
   stopifnot(!file.exists(db_path))
   stopifnot(reviewed %in% c("Yes", "No", ""))
   stopifnot(is.data.frame(data) || is.character(data))
-  if(!is.data.frame(data)){
-    data <- get_raw_data(data) |> 
-      merge_meta_with_data() |> 
-      get_review_data() 
-  }
   df <- data |> 
     dplyr::mutate(
       reviewed = reviewed, 
@@ -147,7 +142,7 @@ db_update <- function(
   cat("Start adding new rows to database\n")
   updated_review_data <- update_review_data(
     review_df = review_data,
-    latest_review_data = data, #get_review_data(merge_meta_with_data(data), common_vars), 
+    latest_review_data = data,
     common_vars = common_vars,
     edit_time_var = edit_time_var
   )

diff --git a/R/fct_appdata.R b/R/fct_appdata.R
@@ -42,15 +42,6 @@ get_raw_data <- function(
     dplyr::rename(
       setNames(column_specs$name_raw, column_specs$name_new)
       ) |> 
-    dplyr::mutate(
-      db_update_time = max(edit_date_time, na.rm = T),
-      region = dplyr::case_when(
-        grepl("^AU", site_code)  ~ "AUS",
-        grepl("^DE", site_code)  ~ "GER",
-        grepl("^FR", site_code)  ~ "FRA",
-        TRUE                    ~ NA_character_
-      )
-    ) |> 
     dplyr::mutate(
       day = event_date - min(event_date, na.rm = TRUE), 
       vis_day = ifelse(event_id %in% c("SCR", "VIS", "VISEXT", "VISVAR", "FU1", "FU2"), day, NA),
@@ -72,10 +63,6 @@ get_raw_data <- function(
       ),
       .by = subject_id
     ) |> 
-    # Add a fix for MC in raw dataset. 
-    # Otherwise, we have to repeat this calculation multiple times when creating 
-    # other datasets from the raw data
-    fix_multiple_choice_vars() |> 
     dplyr::arrange(
       factor(site_code, levels = order_string(site_code)),
       factor(subject_id, levels = order_string(subject_id))
@@ -104,13 +91,24 @@ get_raw_data <- function(
 #' @export
 #' 
 merge_meta_with_data <- function(
-    data = raw_data,
-    meta = metadata,
+    data,
+    meta,
     expected_columns = c("LBORNR_Lower", "LBORNR_Upper", "LBORRESU", 
                          "LBORRESUOTH", "LBREASND", "unit", 
                          "lower_limit", "upper_limit", "LBCLSIG")
 ){
+  stopifnot(is.data.frame(data))
+  stopifnot(inherits(meta, "list"))
+  missing_colnames <- with(meta$column_specs, name_new[!name_new %in% names(data)]) |> 
+    paste0(collapse = ", ")
+  if(nchar(missing_colnames) > 0) stop(
+    paste0("The following columns are defined in the metadata ", 
+           "(column_specs$name_new) but are missing in the study data:\n", 
+           missing_colnames, ".")
+    )
   merged_data <- data |> 
+    # fix MC values before merging:
+    fix_multiple_choice_vars(expected_vars = meta$items_expanded$var) |> 
     dplyr::right_join(meta$items_expanded, by = "var") |> 
     dplyr::filter(!is.na(item_value)) |> 
     dplyr::mutate(
@@ -161,7 +159,7 @@ merge_meta_with_data <- function(
 apply_study_specific_fixes <- function(
     data, 
     form_id_vars = c("subject_id", "event_name", "item_group")
-    ){
+){
   ## apply study-specific fixes:
   # fix significance in ECG before proceeding (stored in its own separate variable):
   ECG_significance <- data |> 
@@ -204,7 +202,17 @@ apply_study_specific_fixes <- function(
       ),
       .by = c(subject_id, form_repeat)
     ) 
-  data
+
+  # Add regions: 
+  data |> 
+    dplyr::mutate(
+      region = dplyr::case_when(
+        grepl("^AU", site_code)  ~ "AUS",
+        grepl("^DE", site_code)  ~ "GER",
+        grepl("^FR", site_code)  ~ "FRA",
+        TRUE                    ~ NA_character_
+      )
+    )
 }
 
 #' Get appdata

diff --git a/R/fct_data_helpers.R b/R/fct_data_helpers.R
@@ -7,11 +7,13 @@
 #' @param expand_tab_items Character vector with the names of the tabs of which
 #'   the items need to be expanded. If not empty, a new data frame will be
 #'   created named 'expanded_items', containing all items in the tabs of
-#'   `expand_tab_items`.
+#'   `expand_tab_items`. Will abort if a tab name is provided that does not
+#'   exist in the metadata.
 #' @param expand_cols Column names containing the columns for expansion. Will be
 #'   ignored if the variable `expand_tab_items` is left empty.
 #'
 #' @return A list with data frames.
+#' @export
 #' 
 get_metadata <- function(
     filepath,
@@ -33,7 +35,6 @@ get_metadata <- function(
     message("'items_expanded' already present. Expanding items aborted.")
     meta
   })
-
   missing_tab_items <- expand_tab_items[!expand_tab_items %in% names(meta)]
   if(length(missing_tab_items) > 0) {
     stop_message <- paste0(
@@ -63,32 +64,60 @@ get_metadata <- function(
 }
 
 #' Correct multiple choice variables
-#' 
-#' Function to correct multiple choice variables in the data.
 #'
-#' @param data data frame (typically the raw data)
-#' @param meta metadata, list of data frames.
+#' In some EDC systems, if there is a multiple choice variable in which multiple
+#' answers are possible, the variable will be renamed with a suffix with the
+#' multiple answers in it. For example var1, var2, for answers 1 and 2. This
+#' function cleans this specific output so that the variable name remains
+#' consistent.
+#'
+#' @param data A data frame. 
+#' @param expected_vars Character vector containing the expected names of the
+#'   variables.
 #' @param var_column column name in which the variable names are stored
-#' @param value_column column name in which the values of the variables are stored
+#' @param value_column column name in which the values of the variables are
+#'   stored
 #' @param suffix Multiple choice suffix. Used to define multiple choice values
 #' @param common_vars variables used for identifying unique rows in the dataset.
-#' @param collapse_with character value to collapse the multiple choice options with. 
-#' If this value is NULL, the rows will be left as is. 
+#' @param collapse_with character value to collapse the multiple choice options
+#'   with. If this value is NULL, the rows will be left as is.
 #'
 #' @return data frame with corrected multiple choice variables
+#' @examples
+#'  df <- data.frame(
+#'   ID = "Subj1",
+#'   var = c("Age", paste0("MH_TRT", 1:4)),
+#'   item_value = as.character(c(95, 67, 58, 83, 34))
+#'  )
+#'  fix_multiple_choice_vars(df, common_vars = "ID")
 #' @export
+#' 
 fix_multiple_choice_vars <- function(
-    data = raw_data,
-    meta = metadata,
+    data,
+    expected_vars = metadata$items_expanded$var,
     var_column = "var",
     value_column = "item_value",
     suffix = "[[:digit:]]+$",
     common_vars = c("subject_id", "event_repeat", "event_date", "form_repeat"),
     collapse_with = "; "
 ){
+  stopifnot(is.data.frame(data))
+  stopifnot(is.character(expected_vars))
+  stopifnot("var_column should be a vector of length 1" = {
+    is.character(var_column) & length(var_column) == 1
+    })
+  stopifnot("suffix should be a character vector of length 1" = {
+    is.character(suffix) & length(suffix) == 1
+  })
+  stopifnot(is.character(common_vars))
+  if(!is.null(collapse_with)){
+    stopifnot("collapse_with should be a character vector of length 1" = {
+      is.character(collapse_with) & length(collapse_with) == 1
+    })
+  }
 
   all_vars <- unique(data[[var_column]])
-  expected_vars <- meta$items_expanded$var
+
   missing_vars <- expected_vars[!expected_vars %in% all_vars]
   if(length(missing_vars) == 0) return(data)
 
@@ -137,7 +166,7 @@ fix_multiple_choice_vars <- function(
 #'
 get_meta_vars <- function(data = appdata, meta = metadata){
   stopifnot(inherits(data, "list"))
-  stopifnot(inherits(metadata, "list"))
+  stopifnot(inherits(meta, "list"))
   if(length(data) == 0) stop("Empty list with data provided")
   vars <- list()
   # add metadata variables:
@@ -148,7 +177,7 @@ get_meta_vars <- function(data = appdata, meta = metadata){
     split(~item_group) |> 
     lapply(\(x){setNames(simplify_string(x$item_name), x$item_name)})
   vars$groups <- meta$groups$item_group
-  common_forms <- c("Adverse events", "Medical History", "Medication", "Conc. Procedures")
+  common_forms <- unique(meta$common_forms$item_group)
   vars$all_forms <- data.frame(
     "main_tab" = c(
       rep("Common events", times = length(common_forms)),
@@ -161,7 +190,7 @@ get_meta_vars <- function(data = appdata, meta = metadata){
   vars$subject_id <- order_string(get_unique_vars(data, "subject_id")[[1]])
   vars$Sites     <- get_unique_vars(data, c("site_code", "region")) |> 
     dplyr::arrange(factor(site_code, levels = order_string(site_code)))
-  vars$table_names <- setNames(metadata$table_names$raw_name, metadata$table_names$table_name) 
+  vars$table_names <- setNames(meta$table_names$raw_name, meta$table_names$table_name) 
   vars
 }