From 0c586ba98bdb76e7419ca326c191091ad3e6b231 Mon Sep 17 00:00:00 2001 From: Mohamed Yusuf Date: Wed, 11 Sep 2024 13:55:17 +0300 Subject: [PATCH] Add new functionality to validate AFRO raw data :star2: --- DESCRIPTION | 6 +- NAMESPACE | 3 + R/globals.R | 33 +- R/validate_data.R | 578 +++++++++++++++++++++++----- man/check_coords.Rd | 55 ++- man/check_data.Rd | 5 +- man/create_summary_by_group.Rd | 5 +- man/summarize_validation_results.Rd | 51 +++ man/validate_afro.Rd | 110 ++++++ man/validate_date.Rd | 26 +- man/validate_dates.Rd | 28 +- man/validate_polis.Rd | 5 +- 12 files changed, 745 insertions(+), 160 deletions(-) create mode 100644 man/summarize_validation_results.Rd create mode 100644 man/validate_afro.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d080ee3..840e3ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,11 @@ Suggests: yaml, scales, webshot, - gt + gt, + parzer, + readr, + zoo, + ggplot2 Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true diff --git a/NAMESPACE b/NAMESPACE index f2b86e1..c9012b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(create_gt_table) export(create_summary_by_group) export(create_summary_table) export(detect_date_format) +export(export_and_read_mdb) export(find_nearest_coord) export(get_missing_ids) export(get_multi_ona_data) @@ -24,7 +25,9 @@ export(prep_match_names) export(prep_mdb_table_extract) export(read) export(save) +export(summarize_validation_results) export(validate_admin_hierarchy) +export(validate_afro) export(validate_date) export(validate_dates) export(validate_polis) diff --git a/R/globals.R b/R/globals.R index 6c743c8..0c0edaf 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,27 +1,30 @@ # Declare global variables for package check globalVariables( - c(".data", ":=", "AdminToClean", "Algorithm", "Distance", "MatchRank", - "MatchedNames", "X", "country", "district_prepped", "district", + c( + ".data", ":=", "AdminToClean", "Algorithm", "Distance", "MatchRank", + "MatchedNames", 
"X", "country", "district_prepped", "district", "province_prepped", "country_prepped", "long_geo", - "dot_name", "joined_adm", "level", "matches", "name_alias", "replacement", - "name_corrected", "name_corrected2", "province", "_id", "matched_names", + "dot_name", "joined_adm", "level", "matches", "name_alias", "replacement", + "name_corrected", "name_corrected2", "province", "_id", "matched_names", "date_last_updated", "form_id", "form_id_num", "last_date_in_chunk", "x", - "ADM0_NAME", "ADM1_NAME", "ADM2_NAME", "name_to_match", "match_rank", + "ADM0_NAME", "ADM1_NAME", "ADM2_NAME", "name_to_match", "match_rank", "algorithm_name", "created_time", "distance", "name_of_creator", "level0_prepped", "level1_prepped", "level2_prepped", "log_message", - "longname_corrected", "longname_to_match", "log_results", "lat_precision", - "CENTER_LAT", "CENTER_LON", "ENDDATE", "low_precision", "lon_precision", + "longname_corrected", "longname_to_match", "log_results", "lat_precision", + "CENTER_LAT", "CENTER_LON", "ENDDATE", "low_precision", "lon_precision", "dist_correct", "dist_flipped", "flipped", "temp_lat", "temp_lon", - "new_columns", "new_rows", "total_columns", "total_rows", "key_order", - "check_on_water", "httpcode", "missing_coords", "on_water", "current_year", + "new_columns", "new_rows", "total_columns", "total_rows", "key_order", + "check_on_water", "httpcode", "missing_coords", "on_water", "current_year", "potentially_flipped", "total_coords", "priority", "out_of_bounds", - "coord_diff_correct", "coord_diff_input", "missing_count", + "coord_diff_correct", "coord_diff_input", "missing_count", "Column", "Is Completely Null", "Missing Count", "Missing Percent", - "Priority column", "column", "Priority", "non_unique_count", + "Priority column", "column", "Priority", "non_unique_count", "proportion_unique", "total_count", "unique_count", "Data_Value", "Geo_Column", "Missing_in_Shapefile", "DataType", "VDPV1", "VDPV2", "VdpvClassifications", "WILD1", - "geo_id_cols", 
"geo_name_cols", "header", "Missing in Shapefile", - "Non-unique Count", "Variable", "ID", "Test", "Value", "null_count" - ) -) \ No newline at end of file + "geo_id_cols", "geo_name_cols", "header", "Missing in Shapefile", + "Non-unique Count", "Variable", "ID", "Test", "Value", "null_count", + "Lat_parzer", "Long_parzer", "parse_failed", "parsed_date", "parsed_date1", + "parsed_date2", "Total" + ) +) diff --git a/R/validate_data.R b/R/validate_data.R index 77e92a3..1b1d2a2 100644 --- a/R/validate_data.R +++ b/R/validate_data.R @@ -618,6 +618,7 @@ join_and_check_mismatches <- function(data, shapefile_data = NULL, #' @param run_geo_mismatch_check Logical, whether to run geographic mismatch #' check. #' @param run_coordinate_checks Logical, whether to run coordinate checks. +#' @param run_detections Logical, whether to run virus detections checks. #' @param coordinate_checks Character vector specifying which coordinate checks #' to run. #' @@ -681,10 +682,11 @@ check_data <- function(data, run_geo_hierarchy_check = TRUE, run_geo_mismatch_check = TRUE, run_coordinate_checks = TRUE, + run_detections = TRUE, coordinate_checks = c( "flip", "on_water", "missing", "out_of_bounds", "precision", - "null_coords" + "null_coords", "parse" )) { # Initialize results list full_results <- list() @@ -733,7 +735,7 @@ check_data <- function(data, ) |> dplyr::filter(is.na(Value)) |> dplyr::distinct(!!rlang::sym(id_col), Column) |> - dplyr::mutate(`Column Type` = "Geo") |> + dplyr::mutate(`Column Type` = "Geo") |> dplyr::mutate(Test = glue::glue("{Column} is missing")) |> dplyr::select(-Column) |> as.data.frame() @@ -898,7 +900,7 @@ check_data <- function(data, summary_table = TRUE, checks = coordinate_checks ) - + coord_check_results$var1 # add coordinate_checks to results full_results$coordinate_checks <- coord_check_results @@ -916,68 +918,88 @@ check_data <- function(data, ) full_results$coord_issue_id <- dplyr::bind_rows( - coord_check_results |> - dplyr::filter(on_water != 
"Land") |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "on_water"), - coord_check_results |> - dplyr::filter(potentially_flipped == TRUE) |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "potentially_flipped"), - coord_check_results |> - dplyr::filter(null_count == TRUE) |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "null_count"), - coord_check_results |> - dplyr::filter(out_of_bounds == TRUE) |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "out_of_bounds"), - coord_check_results |> - dplyr::filter(low_precision == TRUE) |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "low_precision"), - coord_check_results |> - dplyr::filter(missing_coords == TRUE) |> - dplyr::distinct(!!rlang::sym(id_col)) |> - dplyr::mutate(Test = "missing_coords") + if ("parse" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(parse_failed == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "parse_failed") + }, + if ("on_water" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(on_water != "Land") |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "on_water") + }, + if ("flip" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(potentially_flipped == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "potentially_flipped") + }, + if ("null_coords" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(null_count == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "null_count") + }, + if ("out_of_bounds" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(out_of_bounds == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "out_of_bounds") + }, + if ("precision" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(low_precision == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + 
dplyr::mutate(Test = "low_precision") + }, + if ("missing" %in% coordinate_checks) { + coord_check_results |> + dplyr::filter(missing_coords == TRUE) |> + dplyr::distinct(!!rlang::sym(id_col)) |> + dplyr::mutate(Test = "missing_coords") + } ) |> dplyr::mutate(`Column Type` = "Coordinate") } - # set up data identify detections - data2 <- data |> - dplyr::mutate( - cVDPV1 = if ("VDPV1" %in% names(data)) { - dplyr::if_else(VDPV1 == TRUE & VdpvClassifications == "Circulating", - TRUE, FALSE, missing = FALSE - ) - }, - cVDPV2 = if ("VDPV2" %in% names(data)) { - dplyr::if_else(VDPV2 == TRUE & VdpvClassifications == "Circulating", - TRUE, FALSE, missing = FALSE - ) - }, - WPV1 = if ("WILD1" %in% names(data)) { - dplyr::if_else(WILD1 == TRUE, TRUE, FALSE, missing = FALSE) - } else { - FALSE - } - ) + if (run_detections) { + # set up data identify detections + data2 <- data |> + dplyr::mutate( + cVDPV1 = if ("VDPV1" %in% names(data)) { + dplyr::if_else(VDPV1 == TRUE & VdpvClassifications == "Circulating", + TRUE, FALSE, missing = FALSE + ) + }, + cVDPV2 = if ("VDPV2" %in% names(data)) { + dplyr::if_else(VDPV2 == TRUE & VdpvClassifications == "Circulating", + TRUE, FALSE, missing = FALSE + ) + }, + WPV1 = if ("WILD1" %in% names(data)) { + dplyr::if_else(WILD1 == TRUE, TRUE, FALSE, missing = FALSE) + } else { + FALSE + } + ) - full_results$wpv1_count <- if ("WPV1" %in% names(data2)) { - sum(data2$WPV1, na.rm = TRUE) - } else { - 0 - } - full_results$cvdpv1_count <- if ("cVDPV1" %in% names(data2)) { - sum(data2$cVDPV1, na.rm = TRUE) - } else { - 0 - } - full_results$cvdpv2_count <- if ("cVDPV2" %in% names(data2)) { - sum(data2$cVDPV2, na.rm = TRUE) - } else { - 0 + full_results$wpv1_count <- if ("WPV1" %in% names(data2)) { + sum(data2$WPV1, na.rm = TRUE) + } else { + 0 + } + full_results$cvdpv1_count <- if ("cVDPV1" %in% names(data2)) { + sum(data2$cVDPV1, na.rm = TRUE) + } else { + 0 + } + full_results$cvdpv2_count <- if ("cVDPV2" %in% names(data2)) { + sum(data2$cVDPV2, 
na.rm = TRUE) + } else { + 0 + } } full_results$total_columns <- ncol(data) @@ -1161,6 +1183,7 @@ create_summary_table <- function(data) { Column == "Total Columns" ~ "Total Columns", Column == "Total Duplicate" ~ "Total Duplicate Rows", Column == "Missing Coords" ~ "Missing Coordinates", + Column == "Parse Failed" ~ "Coordinates Failed to Parse", Column == "Null Coords" ~ "Null Coordinates (Latitude and Longitude are 0, 0)", Column == "Out of Bounds" ~ @@ -1234,6 +1257,7 @@ create_summary_table <- function(data) { ) |> dplyr::mutate( Test = dplyr::case_when( + Test == "parse_failed" ~ "Coordinates Failed to Parse", Test == "missing_coords" ~ "Missing Coordinate Values", Test == "null_count" ~ "Null Coordinates (Latitude And Longitude Are 0, 0)", @@ -1312,6 +1336,7 @@ create_summary_table <- function(data) { #' check (default is TRUE). #' @param run_coordinate_checks Logical; if TRUE, runs coordinate validation #' checks (default is TRUE). +#' @param run_detections Logical, whether to run virus detections checks. #' @param coordinate_checks Character vector specifying which coordinate checks #' to run. #' @param n_groups The number of groups to include in the summary (def is 4). 
@@ -1357,12 +1382,13 @@ create_summary_by_group <- function(data, group_var, id_col, geo_name_cols, run_geo_hierarchy_check = TRUE, run_geo_mismatch_check = TRUE, run_coordinate_checks = TRUE, + run_detections = FALSE, coordinate_checks = c( "flip", "on_water", "missing", "out_of_bounds", "precision", - "null_coords" + "null_coords", "parse" ), n_groups = 4, decreasing = FALSE) { # Get unique groups and sort them @@ -1394,7 +1420,8 @@ create_summary_by_group <- function(data, group_var, id_col, geo_name_cols, run_geo_hierarchy_check = run_geo_hierarchy_check, run_geo_mismatch_check = run_geo_mismatch_check, run_coordinate_checks = run_coordinate_checks, - coordinate_checks = coordinate_checks + coordinate_checks = coordinate_checks, + run_detections = run_detections ) |> create_summary_table() } @@ -1547,55 +1574,56 @@ create_gt_table <- function(summary_data, #' Validate POLIS Data #' -#' This function performs data quality checks on POLIS (Polio Information +#' This function performs data quality checks on POLIS (Polio Information #' System) data. #' #' @param data A dataframe containing the POLIS data to be validated. #' @param type Character string specifying the data type. Either "AFP" or "ES". -#' @param group_var Optional. The column name to group the data by. If NULL, +#' @param group_var Optional. The column name to group the data by. If NULL, #' uses default. -#' @param id_col Optional. The column name for the unique identifier. If NULL, +#' @param id_col Optional. The column name for the unique identifier. If NULL, #' uses default. -#' @param geo_name_cols Optional. A vector of column names for geographic names. +#' @param geo_name_cols Optional. A vector of column names for geographic names. #' If NULL, uses default. -#' @param geo_id_cols Optional. A vector of column names for geographic IDs. +#' @param geo_id_cols Optional. A vector of column names for geographic IDs. #' If NULL, uses default. -#' @param lat_long_cols Optional. 
A vector of column names for latitude and +#' @param lat_long_cols Optional. A vector of column names for latitude and #' longitude. If NULL, uses default. -#' @param date_cols Optional. A vector of column names for date fields. If NULL, +#' @param date_cols Optional. A vector of column names for date fields. If NULL, #' uses default. -#' @param date_pair_cols Optional. A list of date column pairs for comparison. +#' @param date_pair_cols Optional. A list of date column pairs for comparison. #' If NULL, uses default. -#' @param n_groups Integer. The number of groups to display in the summary. +#' @param n_groups Integer. The number of groups to display in the summary. #' Default is 8. -#' @param decreasing Logical. Whether to sort the groups in decreasing order. +#' @param decreasing Logical. Whether to sort the groups in decreasing order. #' Default is FALSE. -#' @param plots_path Optional. The file path to save output plots. Required if +#' @param plots_path Optional. The file path to save output plots. Required if #' save_output is TRUE. -#' @param polis_version Character string. The version of POLIS being used. +#' @param polis_version Character string. The version of POLIS being used. #' Default is "2.37.1". #' @param custom_title Optional. A custom title for the output table. -#' @param save_output Logical. Whether to save the output as HTML and PNG. +#' @param save_output Logical. Whether to save the output as HTML and PNG. #' Default is FALSE. -#' @param vheight Integer. The height of the output image in pixels. +#' @param vheight Integer. The height of the output image in pixels. #' Default is 1400. -#' @param vwidth Integer. The width of the output image in pixels. Default +#' @param vwidth Integer. The width of the output image in pixels. Default #' is 1550. +#' @param ... Additional arguments passed to internal functions. 
#' #' @return A list containing two elements: #' \item{gt_table}{A gt table object with the validation summary} -#' \item{id_data}{A dataframe with detailed information for each unique +#' \item{id_data}{A dataframe with detailed information for each unique #' identifier} #' #' @details -#' This function performs various data quality checks on POLIS data, including -#' checks for missing values, geographic data consistency, date field validity, -#' and more. It allows for customization of column names and grouping -#' variables, making it flexible for different data structures within the +#' This function performs various data quality checks on POLIS data, including +#' checks for missing values, geographic data consistency, date field validity, +#' and more. It allows for customization of column names and grouping +#' variables, making it flexible for different data structures within the #' POLIS system. #' -#' The function will use default parameters based on the specified data type -#' (AFP or ES) if custom parameters are not provided. It also checks if all +#' The function will use default parameters based on the specified data type +#' (AFP or ES) if custom parameters are not provided. It also checks if all #' specified columns exist in the dataset before proceeding with the analysis. #' #' @examples @@ -1627,8 +1655,7 @@ validate_polis <- function(data, type = "AFP", plots_path = NULL, polis_version = "2.37.1", custom_title = NULL, save_output = FALSE, - vheight = 1400, vwidth = 1550) { - + vheight = 1400, vwidth = 1550, ...) { # Conditional loading for packages required_packages <- c("scales", "gt", "glue", "webshot") @@ -1756,7 +1783,8 @@ validate_polis <- function(data, type = "AFP", date_cols = params$date_cols, date_pair_cols = params$date_pair_cols, n_groups = n_groups, - decreasing = decreasing + decreasing = decreasing, + ... 
) title <- if (is.null(custom_title)) { @@ -1795,3 +1823,379 @@ validate_polis <- function(data, type = "AFP", return(list(gt_table = gt_table, id_data = summary$id_data)) } + +#' Validate AFRO Data +#' +#' This function performs data quality checks on AFRO (African Regional Office) +#' data before it is sent to POLIS (Polio Information System). +#' +#' @param data A dataframe containing the AFRO data to be validated. +#' @param type Character string specifying the data type. Either "AFP" or "ES". +#' @param group_var Optional. The column name to group the data by. If NULL, +#' uses default. +#' @param id_col Optional. The column name for the unique identifier. If NULL, +#' uses default. +#' @param geo_name_cols Optional. A vector of column names for geographic names. +#' If NULL, uses default. +#' @param geo_id_cols Optional. A vector of column names for geographic IDs. +#' If NULL, uses default. +#' @param lat_long_cols Optional. A vector of column names for latitude and +#' longitude. If NULL, uses default. +#' @param date_cols Optional. A vector of column names for date fields. If NULL, +#' uses default. +#' @param date_pair_cols Optional. A list of date column pairs for comparison. +#' If NULL, uses default. +#' @param n_groups Integer. The number of groups to display in the summary. +#' Default is 8. +#' @param decreasing Logical. Whether to sort the groups in decreasing order. +#' Default is FALSE. +#' @param plots_path Optional. The file path to save output plots. Required if +#' save_output is TRUE. +#' @param custom_title Optional. A custom title for the output table. +#' @param save_output Logical. Whether to save the output as HTML and PNG. +#' Default is FALSE. +#' @param vheight Integer. The height of the output image in pixels. +#' Default is 1400. +#' @param vwidth Integer. The width of the output image in pixels. Default +#' is 1550. +#' @param ... Additional arguments passed to internal functions. 
+#' +#' @return A list containing two elements: +#' \item{gt_table}{A gt table object with the validation summary} +#' \item{id_data}{A dataframe with detailed information for each unique +#' identifier} +#' +#' @details +#' This function performs various data quality checks on AFRO data before it is +#' sent to POLIS, including checks for missing values, geographic data +#' consistency, date field validity, and more. It allows for customization of +#' column names and grouping variables, making it flexible for different data +#' structures within the AFRO system. +#' +#' The function will use default parameters based on the specified data type +#' (AFP or ES) if custom parameters are not provided. It also checks if all +#' specified columns exist in the dataset before proceeding with the analysis. +#' +#' @examples +#' # Assuming afro_data is your dataset and you have the necessary dependencies +#' # result <- validate_afro(afro_data, type = "AFP", +#' # group_var = "ReportingYear", +#' # n_groups = 8, +#' # decreasing = FALSE, +#' # plots_path = "/path/to/save/plots", +#' # save_output = TRUE, +#' # vheight = 1500, vwidth = 1600) +#' +#' # Access the GT table and ID data +#' # gt_table <- result$gt_table +#' # id_data <- result$id_data +#' +#' @export +validate_afro <- function(data, type = "AFP", + group_var = NULL, + id_col = NULL, + geo_name_cols = NULL, + geo_id_cols = NULL, + lat_long_cols = NULL, + date_cols = NULL, + date_pair_cols = NULL, + n_groups = 8, + decreasing = FALSE, + plots_path = NULL, + custom_title = NULL, save_output = FALSE, + vheight = 1400, vwidth = 1550, ...) { + # Conditional loading for packages + required_packages <- c( + "scales", "zoo", "gt", "glue", "webshot" + ) + + missing_packages <- required_packages[!sapply( + required_packages, requireNamespace, + quietly = TRUE + )] + + if (length(missing_packages) > 0) { + stop( + paste0( + "Package(s) ", paste(missing_packages, collapse = ", "), + " required but not installed. 
Please install them." + ), + call. = FALSE + ) + } + + # Define parameters based on data type + default_params <- list( + AFP = list( + group_var = group_var, + id_col = "EpidNumber", + geo_name_cols = c("ctry", "Province", "District"), + geo_id_cols = NULL, + date_cols = c( + "DateReceived", "DateOfOnset", + "DateNotified", "DateCaseinvestigated", + "Date1stStool", "Date2ndStool", + "DateStoolSentolab", "DateSpecRecbyNatLab", + "DateFinalCellcultureResults" + ), + date_pair_cols = list( + c("DateOfOnset", "DateNotified"), + c("DateNotified", "DateCaseinvestigated"), + c("DateOfOnset", "DateCaseinvestigated"), + c("DateOfOnset", "Date1stStool"), + c("DateOfOnset", "Date2ndStool"), + c("Date1stStool", "Date2ndStool"), + c("DateSpecRecbyNatLab", "DateStoolSentolab"), + c("DateSpecRecbyNatLab", "DateOfOnset"), + c("DateFinalCellcultureResults", "DateOfOnset"), + c("DateSpecRecbyNatLab", "DateCaseinvestigated"), + c("DateFinalCellcultureResults", "DateCaseinvestigated") + ), + lat_long_cols = c("Latitude", "Longitude") + ) + ) + + if (!type %in% names(default_params)) { + stop( + "Invalid data type. 
Supported types are: ", + paste(names(default_params), collapse = ", ") + ) + } + + # Use provided parameters if not NULL, otherwise use defaults + params <- list( + group_var = if (!is.null(group_var)) { + group_var + } else { + default_params[[type]]$group_var + }, + id_col = if (!is.null(id_col)) { + id_col + } else { + default_params[[type]]$id_col + }, + geo_name_cols = if (!is.null(geo_name_cols)) { + geo_name_cols + } else { + default_params[[type]]$geo_name_cols + }, + geo_id_cols = if (!is.null(geo_id_cols)) { + geo_id_cols + } else { + default_params[[type]]$geo_id_cols + }, + lat_long_cols = if (!is.null(lat_long_cols)) { + lat_long_cols + } else { + default_params[[type]]$lat_long_cols + }, + date_cols = if (!is.null(date_cols)) { + date_cols + } else { + default_params[[type]]$date_cols + }, + date_pair_cols = if (!is.null(date_pair_cols)) { + date_pair_cols + } else { + default_params[[type]]$date_pair_cols + } + ) + + # Check if all specified columns exist in the dataset + all_cols <- c( + params$group_var, params$id_col, params$geo_name_cols, + params$geo_id_cols, params$lat_long_cols, params$date_cols, + unlist(params$date_pair_cols) + ) + missing_cols <- setdiff(all_cols, names(data)) + + if (length(missing_cols) > 0) { + stop(paste( + "The following columns are not present in the dataset:", + paste(missing_cols, collapse = ", ") + )) + } + + + summary <- create_summary_by_group( + data = data, + group_var = params$group_var, + id_col = params$id_col, + geo_name_cols = params$geo_name_cols, + geo_id_cols = params$geo_id_cols, + lat_long_cols = params$lat_long_cols, + date_cols = params$date_cols, + date_pair_cols = params$date_pair_cols, + n_groups = n_groups, + decreasing = decreasing, + ... 
+  )
+
+  # set up time lab
+  time_labs <- paste0(
+    "For ", zoo::as.yearmon(Sys.Date()),
+    " in Epiweek ",
+    lubridate::epiweek(Sys.Date())
+  )
+
+  title <- if (is.null(custom_title)) {
+    glue::glue(
+      "AFRO {type} Data Quality Checks ",
+      "{time_labs}"
+    )
+  } else {
+    custom_title
+  }
+
+  gt_table <- create_gt_table(
+    summary$summary_table |> dplyr::filter(
+      Column != "Total Null Columns"
+    ),
+    title = title
+  )
+
+  if (save_output) {
+    if (is.null(plots_path)) {
+      stop("plots_path must be provided when save_output is TRUE")
+    }
+
+    today <- format(Sys.Date(), "%Y%m%d")
+    file_prefix <- glue::glue(
+      "afro_quality_check_{type}_validation_{today}"
+    )
+
+    html_path <- file.path(plots_path, glue::glue("{file_prefix}.html"))
+    png_path <- file.path(plots_path, glue::glue("{file_prefix}.png"))
+
+    gt_table |> gt::gtsave(html_path)
+
+    webshot::webshot(html_path, png_path, vheight = vheight, vwidth = vwidth)
+
+    file.remove(html_path)
+  }
+
+  return(list(gt_table = gt_table, id_data = summary$id_data))
+}
+
+#' Summarize Validation Results
+#'
+#' Summarizes validation results and optionally creates a plot.
+#'
+#' @param data Data frame containing validation results.
+#' @param metadata Data frame with test metadata.
+#' @param id_col Character. Column name for unique identifiers.
+#' @param test_type_name Character. Name of test type to summarize.
+#' @param agg_vars Character vector. Column(s) to aggregate results by.
+#' @param create_plot Logical. Whether to create a summary plot (default FALSE).
+#' @param return_cols Character vector. Additional columns to return for culprit
+#' identification (default NULL).
+#' +#' @return List with: +#' \item{summary}{Data frame of summarized results, grouped by agg_vars.} +#' \item{plot}{ggplot object if create_plot is TRUE, else NULL.} +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' result <- summarize_validation_results( +#' data, metadata, "ID", "missing coord", +#' c("Region", "Year"), +#' create_plot = TRUE +#' ) +#' print(result$summary) +#' if (!is.null(result$plot)) print(result$plot) +#' } +summarize_validation_results <- function(data, + metadata, id_col, + test_type_name, agg_vars, + create_plot = FALSE, + return_cols = NULL) { + # Validate inputs + if (!id_col %in% names(data)) { + stop("id_col not found in the dataset") + } + + if (length(agg_vars) == 0) { + stop("At least one aggregation variable must be provided") + } + for (var in agg_vars) { + if (!var %in% names(data)) { + stop(paste("Aggregation variable", var, "not found in the dataset")) + } + } + if (!test_type_name %in% metadata$Test) { + stop("test_type not found in metadata") + } + + # Get relevant IDs for the specified test type + ids <- metadata |> + dplyr::filter(Test %in% test_type_name) |> + dplyr::pull(!!rlang::sym(id_col)) + + # Summarize test results + summary <- data |> + dplyr::filter(!!rlang::sym(id_col) %in% ids) |> + dplyr::select(!!!rlang::syms(c(agg_vars, id_col))) |> + dplyr::group_by(!!!rlang::syms(agg_vars)) |> + dplyr::summarise(Total = dplyr::n_distinct(!!rlang::sym(id_col))) |> + dplyr::arrange(dplyr::desc(Total)) |> + dplyr::ungroup() + + # Get culprit columns if return_cols is provided + culprits <- NULL + if (!is.null(return_cols)) { + culprits <- data |> + dplyr::filter(!!rlang::sym(id_col) %in% ids) |> + dplyr::select(!!rlang::sym(id_col), !!!rlang::syms(return_cols)) + } + + plot <- NULL + if (create_plot) { + if (length(agg_vars) == 1) { + plot <- summary |> + dplyr::arrange(dplyr::desc(Total)) |> + ggplot2::ggplot(ggplot2::aes( + x = stats::reorder(!!rlang::sym(agg_vars), Total), + y = Total, + fill = !!rlang::sym(agg_vars) + 
)) + + ggplot2::geom_bar(stat = "identity", show.legend = FALSE) + + ggplot2::coord_flip() + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Summary of", test_type_name), + x = NULL, + y = "Total" + ) + + ggplot2::scale_y_continuous(labels = scales::comma) + } else if (length(agg_vars) == 2) { + plot <- summary |> + dplyr::group_by(!!rlang::sym(agg_vars[2])) |> + dplyr::arrange(dplyr::desc(Total), .by_group = TRUE) |> + dplyr::ungroup() |> + ggplot2::ggplot(ggplot2::aes( + x = stats::reorder(!!rlang::sym(agg_vars[1]), Total), + y = Total, + fill = !!rlang::sym(agg_vars[1]) + )) + + ggplot2::geom_bar(stat = "identity", show.legend = FALSE) + + ggplot2::facet_wrap( + ggplot2::vars(!!rlang::sym(agg_vars[2])), + scales = "free_y" + ) + + ggplot2::coord_flip() + + ggplot2::theme_minimal() + + ggplot2::labs( + title = paste("Summary of", test_type_name), + x = NULL, + y = "Total" + ) + + ggplot2::scale_y_continuous(labels = scales::comma) + } else { + warning("Plot can only be created for one or two aggregation variables.") + } + } + + list(summary = summary, plot = plot, culprits = culprits) +} diff --git a/man/check_coords.Rd b/man/check_coords.Rd index f985ea2..69c442e 100644 --- a/man/check_coords.Rd +++ b/man/check_coords.Rd @@ -13,7 +13,8 @@ check_coords( lon_col, correct_lat_col = NULL, correct_lon_col = NULL, - checks = c("flip", "on_water", "missing", "out_of_bounds", "precision", "null_coords"), + checks = c("flip", "on_water", "missing", "out_of_bounds", "precision", "null_coords", + "parse"), aggregate_by = NULL, summary_table = FALSE, min_precision = 4 @@ -22,46 +23,42 @@ check_coords( \arguments{ \item{data}{A dataframe containing the geographic data to be checked.} -\item{shapefile_data}{Optional. A dataframe containing reference shapefile -data. If NULL, \code{poliprep::shp_global} will be used.} +\item{shapefile_data}{Optional. A dataframe with reference shapefile data. 
+If NULL, \code{poliprep::shp_global} is used.} -\item{join_key_a}{The column name in data to join with the shapefile data.} +\item{join_key_a}{Column name in data to join with shapefile data.} -\item{join_key_b}{Optional. The column name in shapefile_data to join with -data. If NULL and shapefile_data is NULL, defaults to "ADM2_GUID".} +\item{join_key_b}{Optional. Column name in shapefile_data to join with data. +Defaults to "ADM2_GUID" if shapefile_data is NULL.} -\item{lat_col}{The name of the latitude column in the data.} +\item{lat_col}{Name of the latitude column in the data.} -\item{lon_col}{The name of the longitude column in the data.} +\item{lon_col}{Name of the longitude column in the data.} -\item{correct_lat_col}{Optional. The name of the correct latitude column in -shapefile_data. If NULL and shapefile_data is NULL, defaults to -"CENTER_LAT".} +\item{correct_lat_col}{Optional. Name of correct latitude column in +shapefile_data. Defaults to "CENTER_LAT" if shapefile_data is NULL.} -\item{correct_lon_col}{Optional. The name of the correct longitude column in -shapefile_data. If NULL and shapefile_data is NULL, defaults to -"CENTER_LON".} +\item{correct_lon_col}{Optional. Name of correct longitude column in +shapefile_data. Defaults to "CENTER_LON" if shapefile_data is NULL.} -\item{checks}{A character vector specifying which checks to perform. -Options are "flip", "on_water", "missing", "out_of_bounds", -"precision", and "null_coords". Default is all checks.} +\item{checks}{Character vector specifying checks to perform. Options: +"flip", "on_water", "missing", "out_of_bounds", "precision", +"null_coords", "parse". Default is all checks.} -\item{aggregate_by}{Optional. A column name to aggregate results by (e.g., -"country" or "year").} +\item{aggregate_by}{Optional. Column name(s) to aggregate results by.} -\item{summary_table}{Logical. If TRUE, returns a summary table instead of CLI -output.} +\item{summary_table}{Logical. 
If TRUE, returns a summary table instead of +detailed results. Default is FALSE.} -\item{min_precision}{Numeric. The minimum number of decimal places required -for coordinate precision. Default is 4.} +\item{min_precision}{Numeric. Minimum decimal places for coordinate +precision check. Default is 4.} } \value{ -If summary_table is FALSE, returns the input data with additional -check columns. If summary_table is TRUE, returns a summary table of -the checks. +If summary_table is FALSE, returns input data with additional check +columns. If TRUE, returns a summary table of the checks. } \description{ -This function performs various checks on geographic data, including -coordinate flipping, land/water checks, and missing coordinate checks. It -provides CLI output and an optional table summary. +Performs various checks on geographic data, including coordinate flipping, +land/water checks, missing coordinate checks, and more. Provides CLI output +and an optional table summary. } diff --git a/man/check_data.Rd b/man/check_data.Rd index ca9f232..25837ae 100644 --- a/man/check_data.Rd +++ b/man/check_data.Rd @@ -23,8 +23,9 @@ check_data( run_geo_hierarchy_check = TRUE, run_geo_mismatch_check = TRUE, run_coordinate_checks = TRUE, + run_detections = TRUE, coordinate_checks = c("flip", "on_water", "missing", "out_of_bounds", "precision", - "null_coords") + "null_coords", "parse") ) } \arguments{ @@ -76,6 +77,8 @@ check.} \item{run_coordinate_checks}{Logical, whether to run coordinate checks.} +\item{run_detections}{Logical, whether to run virus detections checks.} + \item{coordinate_checks}{Character vector specifying which coordinate checks to run.} } diff --git a/man/create_summary_by_group.Rd b/man/create_summary_by_group.Rd index 042353a..a38e787 100644 --- a/man/create_summary_by_group.Rd +++ b/man/create_summary_by_group.Rd @@ -24,8 +24,9 @@ create_summary_by_group( run_geo_hierarchy_check = TRUE, run_geo_mismatch_check = TRUE, run_coordinate_checks = TRUE, + 
run_detections = FALSE, coordinate_checks = c("flip", "on_water", "missing", "out_of_bounds", "precision", - "null_coords"), + "null_coords", "parse"), n_groups = 4, decreasing = FALSE ) @@ -80,6 +81,8 @@ check (default is TRUE).} \item{run_coordinate_checks}{Logical; if TRUE, runs coordinate validation checks (default is TRUE).} +\item{run_detections}{Logical, whether to run virus detections checks.} + \item{coordinate_checks}{Character vector specifying which coordinate checks to run.} diff --git a/man/summarize_validation_results.Rd b/man/summarize_validation_results.Rd new file mode 100644 index 0000000..9dd94f2 --- /dev/null +++ b/man/summarize_validation_results.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_data.R +\name{summarize_validation_results} +\alias{summarize_validation_results} +\title{Summarize Validation Results} +\usage{ +summarize_validation_results( + data, + metadata, + id_col, + test_type_name, + agg_vars, + create_plot = FALSE, + return_cols = NULL +) +} +\arguments{ +\item{data}{Data frame containing validation results.} + +\item{metadata}{Data frame with test metadata.} + +\item{id_col}{Character. Column name for unique identifiers.} + +\item{test_type_name}{Character. Name of test type to summarize.} + +\item{agg_vars}{Character vector. Column(s) to aggregate results by.} + +\item{create_plot}{Logical. Whether to create a summary plot (default FALSE).} + +\item{return_cols}{Character vector. Additional columns to return for culprit +identification (default NULL).} +} +\value{ +List with: +\item{summary}{Data frame of summarized results, grouped by agg_vars.} +\item{plot}{ggplot object if create_plot is TRUE, else NULL.} +} +\description{ +Summarizes validation results and optionally creates a plot. 
+} +\examples{ +\dontrun{ +result <- summarize_validation_results( + data, metadata, "ID", "missing coord", + c("Region", "Year"), + create_plot = TRUE +) +print(result$summary) +if (!is.null(result$plot)) print(result$plot) +} +} diff --git a/man/validate_afro.Rd b/man/validate_afro.Rd new file mode 100644 index 0000000..cf0a04a --- /dev/null +++ b/man/validate_afro.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_data.R +\name{validate_afro} +\alias{validate_afro} +\title{Validate AFRO Data} +\usage{ +validate_afro( + data, + type = "AFP", + group_var = NULL, + id_col = NULL, + geo_name_cols = NULL, + geo_id_cols = NULL, + lat_long_cols = NULL, + date_cols = NULL, + date_pair_cols = NULL, + n_groups = 8, + decreasing = FALSE, + plots_path = NULL, + custom_title = NULL, + save_output = FALSE, + vheight = 1400, + vwidth = 1550, + ... +) +} +\arguments{ +\item{data}{A dataframe containing the AFRO data to be validated.} + +\item{type}{Character string specifying the data type. Either "AFP" or "ES".} + +\item{group_var}{Optional. The column name to group the data by. If NULL, +uses default.} + +\item{id_col}{Optional. The column name for the unique identifier. If NULL, +uses default.} + +\item{geo_name_cols}{Optional. A vector of column names for geographic names. +If NULL, uses default.} + +\item{geo_id_cols}{Optional. A vector of column names for geographic IDs. +If NULL, uses default.} + +\item{lat_long_cols}{Optional. A vector of column names for latitude and +longitude. If NULL, uses default.} + +\item{date_cols}{Optional. A vector of column names for date fields. If NULL, +uses default.} + +\item{date_pair_cols}{Optional. A list of date column pairs for comparison. +If NULL, uses default.} + +\item{n_groups}{Integer. The number of groups to display in the summary. +Default is 8.} + +\item{decreasing}{Logical. Whether to sort the groups in decreasing order. 
+Default is FALSE.} +\item{plots_path}{Optional. The file path to save output plots. Required if +save_output is TRUE.} +\item{custom_title}{Optional. A custom title for the output table.} +\item{save_output}{Logical. Whether to save the output as HTML and PNG. +Default is FALSE.} +\item{vheight}{Integer. The height of the output image in pixels. +Default is 1400.} +\item{vwidth}{Integer. The width of the output image in pixels. Default +is 1550.} +\item{...}{Additional arguments passed to internal functions.} +} +\value{ +A list containing two elements: +\item{gt_table}{A gt table object with the validation summary} +\item{id_data}{A dataframe with detailed information for each unique +identifier} +} +\description{ +This function performs data quality checks on AFRO (WHO Regional Office for +Africa) data before it is sent to POLIS (Polio Information System). +} +\details{ +This function performs various data quality checks on AFRO data before it is +sent to POLIS, including checks for missing values, geographic data +consistency, date field validity, and more. It allows for customization of +column names and grouping variables, making it flexible for different data +structures within the AFRO system. + +The function will use default parameters based on the specified data type +(AFP or ES) if custom parameters are not provided. It also checks if all +specified columns exist in the dataset before proceeding with the analysis. 
+} +\examples{ +# Assuming afro_data is your dataset and you have the necessary dependencies +# result <- validate_afro(afro_data, type = "AFP", +# group_var = "ReportingYear", +# n_groups = 8, +# decreasing = FALSE, +# plots_path = "/path/to/save/plots", +# save_output = TRUE, +# vheight = 1500, vwidth = 1600) + +# Access the GT table and ID data +# gt_table <- result$gt_table +# id_data <- result$id_data + +} diff --git a/man/validate_date.Rd b/man/validate_date.Rd index 8da7468..acf539c 100644 --- a/man/validate_date.Rd +++ b/man/validate_date.Rd @@ -32,30 +32,32 @@ leap year validity, and date formatting. } \examples{ data <- data.frame( - country = c("Rwanda", "Burundi", "Ethiopia", "Zambia", "Zambia", - "Chad", "Niger", "Angola"), + country = c( + "Rwanda", "Burundi", "Ethiopia", "Zambia", "Zambia", + "Chad", "Niger", "Angola" + ), date = c( "2023-06-15", "2024-07-20", NA, "1999-12-31", "2025-08-22", "2020/23/10", "2020-02-29", "2019-02-29" ) ) -# check whether dates column is valid -res <- validate_date(data, "date") +# check whether dates column is valid +res <- validate_date(data, "date") # Check for countries where there is a non-date issue in the date column -res |> - dplyr::filter(date_non_date == TRUE) |> +res |> + dplyr::filter(date_non_date == TRUE) |> dplyr::count(country) # Check for countries where there is an invalid leap year issue -res |> - dplyr::filter(date_leap_issue == TRUE) |> +res |> + dplyr::filter(date_leap_issue == TRUE) |> dplyr::count(country) -# Check for countries where there is improper formatting of the date -res |> - dplyr::filter(date_format_issue == TRUE) |> +# Check for countries where there is improper formatting of the date +res |> + dplyr::filter(date_format_issue == TRUE) |> dplyr::count(country) - + } diff --git a/man/validate_dates.Rd b/man/validate_dates.Rd index 1ab8f7f..af41acc 100644 --- a/man/validate_dates.Rd +++ b/man/validate_dates.Rd @@ -38,8 +38,10 @@ if the first date is before the second date. 
} \examples{ data <- data.frame( - country = c("Rwanda", "Burundi", "Ethiopia", "Zambia", "Zambia", - "Chad", "Niger", "Angola"), + country = c( + "Rwanda", "Burundi", "Ethiopia", "Zambia", "Zambia", + "Chad", "Niger", "Angola" + ), date1 = c( "2024-06-15", "2024-07-20", NA, "1999-12-31", "2025-08-22", "2020/23/10", "2020-02-29", "2024-02-29" @@ -50,26 +52,26 @@ data <- data.frame( ) ) -res <- validate_dates(data, "date1", "date2") +res <- validate_dates(data, "date1", "date2") # Check for countries where there is a non-date issue in the date1 column -res |> - dplyr::filter(date1_non_date == TRUE) |> +res |> + dplyr::filter(date1_non_date == TRUE) |> dplyr::count(country) # Check for countries where there is an invalid leap year issue in date1 -res |> - dplyr::filter(date1_leap_issue == TRUE) |> +res |> + dplyr::filter(date1_leap_issue == TRUE) |> dplyr::count(country) -# Check for countries where there is improper formatting of date1 col -res |> - dplyr::filter(date1_format_issue == TRUE) |> +# Check for countries where there is improper formatting of date1 col +res |> + dplyr::filter(date1_format_issue == TRUE) |> dplyr::count(country) # Check for countries where date1 is after date2 -res |> - dplyr::filter(date1_invalid_order == TRUE) |> +res |> + dplyr::filter(date1_invalid_order == TRUE) |> dplyr::count(country) - + } diff --git a/man/validate_polis.Rd b/man/validate_polis.Rd index 1ff71f7..9361502 100644 --- a/man/validate_polis.Rd +++ b/man/validate_polis.Rd @@ -21,7 +21,8 @@ validate_polis( custom_title = NULL, save_output = FALSE, vheight = 1400, - vwidth = 1550 + vwidth = 1550, + ... ) } \arguments{ @@ -72,6 +73,8 @@ Default is 1400.} \item{vwidth}{Integer. The width of the output image in pixels. Default is 1550.} + +\item{...}{Additional arguments passed to internal functions.} } \value{ A list containing two elements: