refactor muac prevalence to work on a multiple geographical area acco…

…rding to status of mfaz SD and age ratio
nutriverse · Jul 11, 2024 · 91faf56 · 91faf56
1 parent 46447fe
commit 91faf56
Show file tree

Hide file tree

Showing 23 changed files with 340 additions and 207 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -31,7 +31,8 @@ Imports:
     scales,
     srvyr,
     stats,
-    zscorer
+    zscorer,
+    tibble
 Suggests: 
     spelling,
     testthat (>= 3.0.0)

diff --git a/R/age.R b/R/age.R
@@ -92,7 +92,7 @@ process_age <- function(df, svdate = NULL, birdate = NULL, age) {
         age_days = round({{ age }} * 30.44, 2)
       )
   }
-  dplyr::as_tibble(df)
+  tibble::as_tibble(df)
 }
 
 #'

diff --git a/R/prevalence_helpers.R → R/case_definitions.R b/R/prevalence_helpers.R → R/case_definitions.R
@@ -1,32 +1,3 @@
-
-
-# Function to identify the type of treatment for prevalence --------------------
-#'
-#' A helper function to tell how to go about MUAC prevalence analysis based on
-#' on the output of age ratio and standard deviation test results
-#'
-#' @param age_ratio_class,sd_class Character vectors storing age ratio's p-values
-#' and standard deviation's classification, respectively.
-#'
-#' @returns A character vector of the same length containing the indication of
-#' what to do for the MUAC prevalence analysis: "weighted", "unweighted" and
-#' "missing". If "weighted", the CDC weighting approach is applied to correct for
-#' age bias. If "unweighted" a normal complex sample analysis is applied, and for
-#' the latter, NA are thrown.
-#'
-#'
-tell_muac_analysis_strategy <- function(age_ratio_class, sd_class) {
-  case_when(
-    age_ratio_class == "Problematic" & sd_class != "Problematic" ~ "weighted",
-    age_ratio_class != "Problematic" & sd_class == "Problematic" ~ "missing",
-    age_ratio_class == "Problematic" & sd_class == "Problematic" ~ "missing",
-    .default = "unweighted"
-  )
-}
-
-
-# Function on acute malnutrition case-definition -------------------------------
-
 #'
 #' Case-Definition: is an observation acutely malnourished?
 #'
@@ -255,3 +226,36 @@ define_wasting <- function(df, zscore = NULL, muac = NULL, edema = NULL,
     }
   )
 }
+
+#'
+#' A helper function to classify nutritional status into SAM, MAM or not wasted
+#'
+#' `classify_wasting_for_cdc_approach()` is used a helper inside
+#' [apply_cdc_age_weighting()] to classify nutritional status into "sam", "mam"
+#' or "not wasted" and then the vector returned is used downstream to calculate
+#' the proportions of children with severe and moderate acute malnutrition.
+#'
+#' @param muac An integer vector containing MUAC values. They should be in
+#' millimeters.
+#'
+#' @param .edema Optional. Its a vector containing data on bilateral pitting
+#' edema coded as "y" for yes and "n" for no.
+#'
+#'
+classify_wasting_for_cdc_approach <- function(muac, .edema = NULL) {
+  if (!is.null(.edema)) {
+    #edema <- ifelse(edema == 1, "y", "n")
+    x <- case_when(
+      muac < 115 | {{ .edema }} == "y" ~ "sam",
+      muac >= 115 & muac < 125 & {{ .edema }} == "n" ~ "mam",
+      .default = "not wasted"
+    )
+  } else {
+    x <- case_when(
+      muac < 115 ~ "sam",
+      muac >= 115 & muac < 125 ~ "mam",
+      .default = "not wasted"
+    )
+  }
+  x
+}
diff --git a/R/data.R b/R/data.R
@@ -73,10 +73,63 @@
 
 
 #'
-#' District level survey data with Standard Deviation classified as problematic
+#' District level SMART surveys conducted in four district in Mozambique
+#'
+#' @description
+#' This example data contains survey data of four districts. Two of them have their WFHZ
+#' standard deviation classified as problematic, and the are other two within range of
+#' acceptable standard deviation. The data is used to test the performance of WFHZ based
+#' prevalence when used on a data set with multiple survey areas that may or not have
+#' different classification for standard deviation that may warrant different analysis
+#' approach, as the function is designed for.
+#'
+#' @format A tibble of 943 x 9.
+#'
+#' |**Variable** | **Description** |
+#' | :--- | :---|
+#' | *district* | The administrative unit (admin 1) where data was collected. |
+#' | *cluster* | Primary sampling unit |
+#' | *team* | Survey teams |
+#' | *sex* | Sex, "m" = boys, "f" = girls |
+#' | *age* | calculated age in months with two decimal places |
+#' | *weight* | Weight (kg) |
+#' | *height* | Height (cm) |
+#' | *edema* | Edema, "n" = no, "y" = yes |
+#' | *muac* | Mid-upper arm circumference (mm) |
 #'
 "anthro.03"
 
+
+#'
+#' MUAC data from a community-based sentinel site from an anonymized location
+#'
+#' @description
+#' Data in `anthro.04` was generated from a community-based sentinel site of three provinces.
+#' Each province data set holds different scenarios that informs the appropriate analysis
+#' approach to follow. One province (province 3) has its MFAZ standard deviation and age
+#' ratio tests classified as problematic. Another province (province 2) has its age ratio
+#' classified as problematic, but with a within range standard deviation. Lastly, province 1
+#' has both tests falling within range of nor problematic. The data is used to test the
+#' performance of `[compute_muac_prevalence()]` based when used on a multiple survey areas
+#' data that may or not have on the aforementioned test that may then warrant a different
+#' analysis approach, as the function is designed for.
+#'
+#' @format A tibble of 3,002 x 8.
+#'
+#' |**Variable** | **Description** |
+#' | :--- | :---|
+#' | *province* |
+#' | *cluster* | Primary sampling unit |
+#' | *sex* | Sex, "m" = boys, "f" = girls |
+#' | *age* | calculated age in months with two decimal places |
+#' | *muac* | Mid-upper arm circumference (mm) |
+#' | *edema* | Edema, "n" = no, "y" = yes |
+#' | *mfaz* | MUAC-for-age z-scores with 3 decimal places |
+#' | *flag_mfaz* | Flagged observations. 1=flagged, 0=not flagged |
+#'
+"anthro.04"
+
+
 #'
 #' A SMART survey data with standard deviation on weight-for-height zscores
 #' classified as problematic

diff --git a/R/prevalence_muac.R b/R/prevalence_muac.R
@@ -1,36 +1,24 @@
-# Function to classify nutritional status  -------------------------------------
-
-#'
-#' A helper function to classify nutritional status into SAM, MAM or not wasted
 #'
-#' `classify_wasting_for_cdc_approach()` is used a helper inside
-#' [apply_cdc_age_weighting()] to classify nutritional status into "sam", "mam"
-#' or "not wasted" and then the vector returned is used downstream to calculate
-#' the proportions of children with severe and moderate acute malnutrition.
+#' A helper function to tell how to go about MUAC prevalence analysis based on
+#' on the output of age ratio and standard deviation test results
 #'
-#' @param muac An integer vector containing MUAC values. They should be in
-#' millimeters.
+#' @param age_ratio_class,sd_class Character vectors storing age ratio's p-values
+#' and standard deviation's classification, respectively.
 #'
-#' @param .edema Optional. Its a vector containing data on bilateral pitting
-#' edema coded as "y" for yes and "n" for no.
+#' @returns A character vector of the same length containing the indication of
+#' what to do for the MUAC prevalence analysis: "weighted", "unweighted" and
+#' "missing". If "weighted", the CDC weighting approach is applied to correct for
+#' age bias. If "unweighted" a normal complex sample analysis is applied, and for
+#' the latter, NA are thrown.
 #'
 #'
-classify_wasting_for_cdc_approach <- function(muac, .edema = NULL) {
-  if (!is.null(.edema)) {
-    #edema <- ifelse(edema == 1, "y", "n")
-    x <- case_when(
-      muac < 115 | {{ .edema }} == "y" ~ "sam",
-      muac >= 115 & muac < 125 & {{ .edema }} == "n" ~ "mam",
-      .default = "not wasted"
-    )
-  } else {
-    x <- case_when(
-      muac < 115 ~ "sam",
-      muac >= 115 & muac < 125 ~ "mam",
-      .default = "not wasted"
-    )
-  }
-  x
+tell_muac_analysis_strategy <- function(age_ratio_class, sd_class) {
+  case_when(
+    age_ratio_class == "Problematic" & sd_class != "Problematic" ~ "weighted",
+    age_ratio_class != "Problematic" & sd_class == "Problematic" ~ "missing",
+    age_ratio_class == "Problematic" & sd_class == "Problematic" ~ "missing",
+    .default = "unweighted"
+  )
 }
 
 # The CDC approach to correct for age bias in MUAC -----------------------------
@@ -99,18 +87,16 @@ compute_weighted_prevalence <- function(df, .edema=NULL, .summary_by = NULL) {
   if (!is.null(.summary_by)) {
     df <- df |>
       filter(.data$flag_mfaz == 0) |>
-      mutate(muac = recode_muac(.data$muac, unit = "mm")) |>
-      group_by(!!.summary_by) |>
+      #mutate(muac = recode_muac(.data$muac, unit = "cm")) |>
       summarise(
         sam = apply_cdc_age_weighting(.data$muac, .data$age, {{ .edema }}, status = "sam"),
         mam = apply_cdc_age_weighting(.data$muac, .data$age, {{ .edema }}, status = "mam"),
         gam = sum(.data$sam, .data$mam),
-        .groups = "drop"
+        .by = !!.summary_by
       ) |>
-      mutate(
-        gam_p = gam, sam_p = sam, mam_p = mam,
-        gam = NA, sam = NA, mam = NA
-      ) |> dplyr::select(!2:4)
+      dplyr::rename(
+        gam_p = .data$gam, sam_p = .data$sam, mam_p = .data$mam
+      )
   } else {
     df <- df |>
       filter(.data$flag_mfaz == 0) |>
@@ -120,10 +106,9 @@ compute_weighted_prevalence <- function(df, .edema=NULL, .summary_by = NULL) {
         mam = apply_cdc_age_weighting(.data$muac, .data$age, {{ .edema }}, status = "mam"),
         gam = sum(.data$sam, .data$mam)
       ) |>
-      mutate(
-        gam_p = gam, sam_p = sam, mam_p = mam,
-        gam = NA, sam = NA, mam = NA
-      ) |> dplyr::select(!2:4)
+      dplyr::rename(
+        gam_p = .data$gam, sam_p = .data$sam, mam_p = .data$mam
+      )
   }
   df
 }
@@ -189,60 +174,74 @@ compute_muac_prevalence <- function(df,
 
   ## Get and classify age ratio and standard deviation ----
   if (!rlang::quo_is_null(.summary_by)) {
-    x <- group_by(df, !!.summary_by) |>
+    x <- df |>
+      group_by(!!.summary_by) |>
       summarise(
         age_ratio = classify_age_sex_ratio(age_ratio_test(.data$age, .expectedP = 0.66)$p),
         std = classify_sd(sd(remove_flags(as.numeric(.data$mfaz), "zscore"), na.rm = TRUE)),
-        analysis_approach = tell_muac_analysis_strategy(age_ratio, std),
+        analysis_approach = tell_muac_analysis_strategy(.data$age_ratio, .data$std),
         .groups = "drop"
       )
   } else {
-    x <- summarise(
-      df,
-      age_ratio = classify_age_sex_ratio(age_ratio_test(.data$age, .expectedP = 0.66)$p),
-      std = classify_sd(sd(remove_flags(as.numeric(.data$mfaz), "zscore"), na.rm = TRUE)),
-      analysis_approach = tell_muac_analysis_strategy(age_ratio, std)
-    )
+    x <- df |>
+      summarise(
+        age_ratio = classify_age_sex_ratio(age_ratio_test(.data$age, .expectedP = 0.66)$p),
+        std = classify_sd(sd(remove_flags(as.numeric(.data$mfaz), "zscore"), na.rm = TRUE)),
+        analysis_approach = tell_muac_analysis_strategy(.data$age_ratio, .data$std)
+      )
   }
 
-  for (i in seq_along(nrow(x))) {
+  ## Iterate over data frame to compute prevalence according to analysis_approach ----
+  for (i in seq_len(nrow(x))) {
     if (!rlang::quo_is_null(.summary_by)) {
       area <- dplyr::pull(x, !!.summary_by)[i]
-      data <- filter(df, !!sym(rlang::quo_name(.summary_by)) == !!area)
+      data <- filter(df, !!sym(rlang::quo_name(.summary_by)) == area)
     } else {
       data <- df
     }
 
     analysis_approach <- x$analysis_approach[i]
 
     if (analysis_approach == "unweighted") {
+      ### Compute standard complex sample based prevalence analysis ----
       output <- compute_pps_based_muac_prevalence(data, {{ .wt }}, {{ .edema }}, !!.summary_by)
     } else if (analysis_approach == "weighted") {
       if (!rlang::quo_is_null(.summary_by)) {
+        ### Compute weighted prevalence summarized at .summary_by ----
         output <- compute_weighted_prevalence(data, .edema = {{ .edema }}, !!.summary_by)
       } else {
+        ### Compute non-summarized weighted prevalence ----
         output <- compute_weighted_prevalence(data, .edema = {{ .edema }})
       }
     } else {
-      output <- tibble::tibble(
-        gam_n = NA,
-        sam_n = NA,
-        mam_n = NA,
-        gam_p = NA,
-        sam_p = NA,
-        mam_p = NA,
-        gam_p_low = NA,
-        gam_p_upp = NA,
-        sam_p_low = NA,
-        sam_p_upp = NA,
-        mam_p_low = NA,
-        mam_p_upp = NA,
-        gam_p_deff = NA,
-        sam_p_deff = NA,
-        mam_p_deff = NA
-      )
+      if (!rlang::quo_is_null(.summary_by)) {
+        ### Add NA's to the summarized tibble accordingly ----
+        output <- summarise(
+          data,
+          gam_p = NA_real_,
+          sam_p = NA_real_,
+          mam_p = NA_real_,
+          .by = !!.summary_by
+        )
+      } else {
+        ### Return a non-summarised tibble ----
+        output <- tibble::tibble(
+          gam_p = NA_real_,
+          sam_p = NA_real_,
+          mam_p = NA_real_
+        )
+      }
     }
     results[[i]] <- output
   }
-  dplyr::bind_rows(results)
+  ### Ensure that all geographical aras are added to the tibble ----
+  if (!rlang::quo_is_null(.summary_by)) {
+    results <- dplyr::bind_rows(results) |>
+      dplyr::relocate(.data$gam_p, .after = .data$gam_n) |>
+      dplyr::relocate(.data$sam_p, .after = .data$sam_n) |>
+      dplyr::relocate(.data$mam_p, .after = .data$mam_n)
+  } else {
+    results <- dplyr::bind_rows(results)
+  }
+  results
 }
diff --git a/R/prevalence_wfhz.R b/R/prevalence_wfhz.R
@@ -109,7 +109,7 @@ apply_probit_approach <- function(x, .status = c("gam", "sam")) {
 #' Compute global, severe and moderate acute malnutrition prevalence using PROBIT approach.
 #'
 #' @description
-#' This function is a helper function inside the main fuction [compute_wfhz_prevalence()].
+#' This function is a helper function inside the main function [compute_wfhz_prevalence()].
 #' It is used to compute PROBIT based prevalence depending on the status of standard deviation.
 #'
 #' @param df A data frame object returned by [process_wfhz_data()]. this will contain the
@@ -131,11 +131,11 @@ compute_probit_prevalence <- function(df, .summary_by = NULL) {
       df,
       gam = apply_probit_approach(.data$wfhz, .status = "gam"),
       sam = apply_probit_approach(.data$wfhz, .status = "sam"),
-      mam = gam - sam,
+      mam = .data$gam - .data$sam,
       .by = !!.summary_by
       ) |>
       mutate(
-        gam_p = gam, sam_p = sam, mam_p = mam,
+        gam_p = .data$gam, sam_p = .data$sam, mam_p = .data$mam,
         gam = NA, sam = NA, mam = NA
       ) |>
       dplyr::select(!2:4) ## To make it fit in the tibble structure from the main function
@@ -144,10 +144,10 @@ compute_probit_prevalence <- function(df, .summary_by = NULL) {
       df,
       gam = apply_probit_approach(.data$wfhz, .status = "gam"),
       sam = apply_probit_approach(.data$wfhz, .status = "sam"),
-      mam = gam - sam
+      mam = .data$gam - .data$sam
       ) |>
       mutate(
-        gam_p = gam, sam_p = sam, mam_p = mam,
+        gam_p = .data$gam, sam_p = .data$sam, mam_p = .data$mam,
         gam = NA, sam = NA, mam = NA
       ) |>
       dplyr::select(!2:4) ## To make it fit in the tibble structure from the main function
@@ -183,8 +183,8 @@ compute_probit_prevalence <- function(df, .summary_by = NULL) {
 #' .summary_by = NULL (default).
 #'
 #' @returns A tibble. The length vary depending on .summary_by. If set to NULL, a tibble of
-#' 1 x 16 is returned, otherwise, a tibble of n rowns (depending on the number of geographical
-#' areas in the dataset) x 17.
+#' 1 x 16 is returned, otherwise, a tibble of n rows (depending on the number of geographical
+#' areas in the data set) x 17.
 #'
 #' @examples
 #'
-Original file line number
+Diff line change
@@ Expand Up @@
             age_days = round({{ age }} * 30.44, 2)
           )
       }
-      dplyr::as_tibble(df)
+      tibble::as_tibble(df)
     }
     #'
@@ Expand Down @@