metrumresearchgroup · andersone1 · Jan 31, 2025 · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# mrgda development
+
+## New features and changes
+
+- `write_derived` now writes out columns that have a unique value per subject. (#230)
+
 # mrgda 0.12.1
 
 ## New features and changes

diff --git a/R/identify-subject-cols.R b/R/identify-subject-cols.R
@@ -0,0 +1,38 @@
+#' Identify columns that are constant within subjects
+#'
+#' Identifies the columns of a data frame whose value does not vary within any
+#' subject, where subjects are defined by `.subject_col`.
+#'
+#' @param .df A data frame to analyze.
+#' @param .subject_col A string specifying the name of the subject variable.
+#'
+#' @return A sorted character vector of the column names (excluding
+#'   `.subject_col`) that are constant within every subject, or `"none"` if
+#'   no column qualifies.
+#'
+#' @keywords internal
+identify_subject_cols <- function(.df, .subject_col) {
+  if (!.subject_col %in% names(.df)) {
+    stop("Subject column '", .subject_col, "' does not exist in the data frame.")
+  }
+
+  constant_cols <-
+    .df %>%
+    dplyr::group_by(!!rlang::sym(.subject_col)) %>%
+    # One row per subject: TRUE where the column has a single distinct value
+    dplyr::summarise(
+      dplyr::across(tidyselect::everything(), ~ dplyr::n_distinct(.x) == 1)
+    ) %>%
+    dplyr::ungroup() %>%
+    # Collapse over subjects; namespace-qualified so dplyr need not be attached
+    dplyr::summarise(
+      dplyr::across(-!!rlang::sym(.subject_col), ~ all(.x))
+    ) %>%
+    tidyr::pivot_longer(cols = tidyselect::everything()) %>%
+    dplyr::filter(value) %>%
+    dplyr::pull(name) %>%
+    sort()
+
+  # Return the sentinel "none" when no column is constant within subjects
+  if (length(constant_cols) == 0) "none" else constant_cols
+}
diff --git a/R/mrgda-package.R b/R/mrgda-package.R
@@ -60,7 +60,8 @@ globalVariables(
     "MRGDA_PARSE_DATETIME",
     "ID_SORT",
     ".output_dir",
-    "Previous Revision"
+    "Previous Revision",
+    "all_constant"
   )
 )
 
diff --git a/R/write-derived.R b/R/write-derived.R
@@ -98,6 +98,11 @@ write_derived <- function(.data, .spec, .file, .comment = NULL, .subject_col = "
     stop("Defined .subject_col '", .subject_col, "' not found in data")
   }
 
+  # Determine and write out subject columns ---------------------------------
+  subject_columns <- identify_subject_cols(.df = .data, .subject_col = .subject_col)
+
+  yaml::write_yaml(subject_columns, file = file.path(.meta_data_folder, "subject-columns.yml"))
+
   # Execute data diffs ------------------------------------------------------
   compare_df <- read_csv_dots(.file)
 

diff --git a/man/identify_subject_cols.Rd b/man/identify_subject_cols.Rd
diff --git a/tests/testthat/test-identify-subject-cols.R b/tests/testthat/test-identify-subject-cols.R
@@ -0,0 +1,55 @@
+test_that("Identifies columns constant within subjects", {
+  # x and z never change within a subject; y does
+  input <- data.frame(
+    subject = c(1, 1, 2, 2),
+    z = rep("a", 4),
+    x = rep(5, 4),
+    y = c(1, 2, 1, 2)
+  )
+  expect_equal(sort(identify_subject_cols(input, "subject")), c("x", "z"))
+})
+
+test_that("Returns none when no columns are constant", {
+  # Every non-subject column changes within each subject
+  input <- data.frame(
+    subject = c(1, 1, 2, 2),
+    x = c(1, 2, 3, 4),
+    y = c(5, 6, 7, 8)
+  )
+  expect_equal(identify_subject_cols(input, "subject"), "none")
+})
+
+test_that("Handles missing values appropriately", {
+  # Only `a` mixes NA with a non-missing value inside one subject
+  input <- data.frame(
+    subject = c(1, 1, 2, 2),
+    x = rep(NA, 4),
+    y = c(1, 1, NA, NA),
+    z = rep(1, 4),
+    a = c(1, NA, 1, 1)
+  )
+  expect_equal(sort(identify_subject_cols(input, "subject")), c("x", "y", "z"))
+})
+
+test_that("Handles various data types", {
+  input <- data.frame(
+    subject = c(1, 1, 2, 2),
+    num = rep(5, 4),
+    char = rep(c("a", "b"), each = 2),
+    factor = factor(rep(c("d", "e"), each = 2)),
+    logical = rep(c(TRUE, FALSE), each = 2)
+  )
+  found <- sort(identify_subject_cols(input, "subject"))
+  expect_equal(found, c("char", "factor", "logical", "num"))
+})
+
+test_that("Errors on nonexistent subject column", {
+  # "nonexistent_col" is not a column of the input
+  input <- data.frame(
+    subject = c(1, 1, 2, 2),
+    x = c(1, 2, 3, 4)
+  )
+  expected_msg <-
+    "Subject column 'nonexistent_col' does not exist in the data frame."
+  expect_error(identify_subject_cols(input, "nonexistent_col"), expected_msg)
+})