diff --git a/.gitignore b/.gitignore index 68933491..ffaf1f85 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ test_function.R _snaps testing_log.txt + +# test data from AK +AK_EPATADA_troubleshooting diff --git a/R/ResultFlagsIndependent.R b/R/ResultFlagsIndependent.R index e95d3f0b..5bc70fcd 100644 --- a/R/ResultFlagsIndependent.R +++ b/R/ResultFlagsIndependent.R @@ -153,11 +153,11 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) { #' results are included in the output. When flaggedonly = TRUE, the dataframe #' will be filtered to include only the rows flagged as "Continuous" results. #' @param time_difference Numeric argument defining the maximum time difference -#' in hours between measurements taken on the same day. This is used to search for -#' continuous time series data (i.e., if there are multiple measurements on the same -#' day within the selected time_difference, then the row will be flagged as -#' continuous). The default time window is 4 hours. The time_difference can be -#' adjusted by the user. +#' in hours between measurements of the same TADA.ComparableDataIdentifier taken at the same +#' latitude, longitude, and depth. This is used to search for +#' continuous time series data (i.e., if there are multiple measurements within the selected +#' time_difference, then the row will be flagged as continuous). The default time window is 4 hours. +#' The time_difference can be adjusted by the user. #' @return The default is clean = FALSE and flaggedonly = FALSE. #' When clean = FALSE and flaggedonly = FALSE (default), a new column, #' "TADA.ContinuousData.Flag", is appended to the input data set which @@ -194,24 +194,9 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) { #' # Remove continuous data in dataframe #' Data_Nutrients_UT_clean <- TADA_FlagContinuousData(Data_Nutrients_UT, clean = TRUE) #' unique(Data_Nutrients_UT_clean$TADA.ContinuousData.Flag) -#' -#' data(Data_R5_TADAPackageDemo) -#' -#' # Flag continuous data in new column titled "TADA.ContinuousData.Flag" -#' Data_R5_TADAPackageDemo_flags <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE) -#' unique(Data_R5_TADAPackageDemo_flags$TADA.ContinuousData.Flag) -#' -#' # Show only rows flagged as continuous data -#' Data_R5_TADAPackageDemo_flaggedonly <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE, flaggedonly = TRUE) -#' -#' # Remove continuous data in dataframe -#' Data_R5_TADAPackageDemo_clean <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = TRUE) -#' unique(Data_R5_TADAPackageDemo_clean$TADA.ContinuousData.Flag) #' } #' TADA_FlagContinuousData <- function(.data, clean = FALSE, flaggedonly = FALSE, time_difference = 4) { - # start.time <- Sys.time() - # check .data is data.frame TADA_CheckType(.data, "data.frame", "Input object") # check clean is boolean @@ -269,124 +254,100 @@ TADA_FlagContinuousData <- function(.data, clean = FALSE, flaggedonly = FALSE, t (SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(ResultTimeBasisText)) | (SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(StatisticalBaseCode)) | (SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Calculated") | - (SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Estimated"))) + (SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Estimated"))) %>% + dplyr::mutate(TADA.ContinuousData.Flag = "Continuous") # everything not YET in cont dataframe noncont.data <- subset(.data, !.data$ResultIdentifier %in% cont.data$ResultIdentifier) # if time field is not NA, find time difference between results + if (length(noncont.data) >= 1) { - for (i in 1:nrow(noncont.data)) { - if (!is.na(noncont.data$ActivityStartDateTime[i])) { - # find samples with the same date, lat/long, organization name, comparable data identifier, and depth - info_match <- which( - noncont.data$TADA.LatitudeMeasure == noncont.data$TADA.LatitudeMeasure[i] & - noncont.data$TADA.LongitudeMeasure == noncont.data$TADA.LongitudeMeasure[i] & - noncont.data$OrganizationIdentifier == noncont.data$OrganizationIdentifier[i] & - noncont.data$TADA.ComparableDataIdentifier == noncont.data$TADA.ComparableDataIdentifier[i] & - ((noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue[i]))) & - ((noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue == noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue[i]))) & - ((noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue[i]))) & - ((noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue[i]))) & - ((noncont.data$ActivityRelativeDepthName == noncont.data$ActivityRelativeDepthName[i]) | (is.na(noncont.data$ActivityRelativeDepthName) & is.na(noncont.data$ActivityRelativeDepthName[i]))) - ) + info_match <- noncont.data %>% + dplyr::group_by( + TADA.LatitudeMeasure, TADA.LongitudeMeasure, + OrganizationIdentifier, TADA.ComparableDataIdentifier, + TADA.ActivityDepthHeightMeasure.MeasureValue, + TADA.ResultDepthHeightMeasure.MeasureValue, + TADA.ActivityBottomDepthHeightMeasure.MeasureValue, + TADA.ActivityTopDepthHeightMeasure.MeasureValue, + ActivityRelativeDepthName + ) %>% + dplyr::mutate(n_records = length(TADA.ResultMeasureValue)) %>% + dplyr::mutate(group_id = dplyr::cur_group_id()) %>% + dplyr::filter(n_records > 1) %>% + dplyr::ungroup() %>% + dplyr::group_by(group_id) %>% + dplyr::arrange(ActivityStartDateTime, .by_group = TRUE) %>% + dplyr::mutate( + time_diff_lag = abs(difftime(ActivityStartDateTime, dplyr::lag(ActivityStartDateTime), units = "hours")), + time_diff_lead = abs(difftime(ActivityStartDateTime, dplyr::lead(ActivityStartDateTime), units = "hours")) + ) %>% + dplyr::ungroup() - time_diff <- abs(difftime(noncont.data$ActivityStartDateTime[i], noncont.data$ActivityStartDateTime[info_match], units = "hours")) + # find results where the time differences is <= time_difference (default is 4 hours) + within_window <- info_match %>% + dplyr::filter(time_diff_lead <= time_difference | + time_diff_lag <= time_difference) - # samples where the time differences is <= time_difference (default is 4 hours) - within_window <- info_match[time_diff <= time_difference] + rm(info_match) - # keep the samples with times within the window - info_match <- intersect(info_match, within_window) + # if matches are identified change flag to continuous + noncont.data <- noncont.data %>% + dplyr::mutate(TADA.ContinuousData.Flag = ifelse(ResultIdentifier %in% within_window$ResultIdentifier, + "Continuous", TADA.ContinuousData.Flag + )) - # if matches are identified change flag to continuous - if (length(info_match) >= 1) { - noncont.data$TADA.ContinuousData.Flag[info_match] <- "Continuous" - } - } - } + rm(within_window) } - # remove continuous results from noncont.data and create new df for these (more.cont.data) - more.cont.data <- noncont.data %>% dplyr::filter(TADA.ContinuousData.Flag == "Continuous") - - # add additional continuous data (more.cont.data) to cont.data - all.cont.data <- plyr::rbind.fill(cont.data, more.cont.data) - - # filter noncont.data to ONLY discrete results - noncont.data <- noncont.data %>% dplyr::filter(TADA.ContinuousData.Flag == "Discrete") - - # if there is continuous data in the data set - # flag output - if (nrow(all.cont.data) != 0) { - # change contents of ContDataFlag column - all.cont.data$TADA.ContinuousData.Flag <- "Continuous" - # join all.cont.data to flag.data - flag.data <- plyr::rbind.fill(all.cont.data, noncont.data) - - # flagged output, all data - if (clean == FALSE & flaggedonly == FALSE) { - flag.data <- TADA_OrderCols(flag.data) - - # end.time <- Sys.time() - # time.taken <- round(end.time - start.time, 2) - # print(time.taken) - - return(flag.data) - } - - # clean output - if (clean == TRUE & flaggedonly == FALSE) { - # filter out invalid characteristic-unit-media combinations - clean.data <- dplyr::filter(flag.data, !(TADA.ContinuousData.Flag %in% "Continuous")) + flag.data <- cont.data %>% + dplyr::full_join(noncont.data, by = c(names(cont.data))) - # remove TADA.AggregatedContinuousData column - # clean.data <- dplyr::select(clean.data, -TADA.ContinuousData.Flag) - clean.data <- TADA_OrderCols(clean.data) + # flagged output, all data + if (clean == FALSE & flaggedonly == FALSE) { + flag.data <- TADA_OrderCols(flag.data) - # end.time <- Sys.time() - # time.taken <- round(end.time - start.time, 2) - # print(time.taken) + return(flag.data) + } - return(clean.data) - } + # clean output + if (clean == TRUE & flaggedonly == FALSE) { + # filter out invalid characteristic-unit-media combinations + clean.data <- flag.data %>% + dplyr::filter(!(TADA.ContinuousData.Flag %in% "Continuous")) %>% + dplyr::select(-TADA.ContinuousData.Flag) %>% + TADA_OrderCols() - # flagged output, only aggregated continuous data - if (clean == FALSE & flaggedonly == TRUE) { - # filter to show only invalid characteristic-unit-media combinations + return(clean.data) + } - onlycont.data <- dplyr::filter(flag.data, TADA.ContinuousData.Flag == "Continuous") - onlycont.data <- TADA_OrderCols(onlycont.data) + # flagged output, only aggregated continuous data + if (clean == FALSE & flaggedonly == TRUE) { + # filter to show only invalid characteristic-unit-media combinations - # end.time <- Sys.time() - # time.taken <- round(end.time - start.time, 2) - # print(time.taken) + onlycont.data <- flag.data %>% + dplyr::filter(TADA.ContinuousData.Flag == "Continuous") %>% + TADA_OrderCols() - return(onlycont.data) - } + return(onlycont.data) } + # if no aggregated continuous data is in the data set - if (nrow(all.cont.data) == 0) { + if (nrow(flag.data[flag.data$TADA.ContinuousData.Flag == "Continuous", ]) == 0) { if (flaggedonly == FALSE) { print("No evidence of aggregated continuous data in your dataframe. Returning the input dataframe with TADA.ContinuousData.Flag column for tracking.") .data <- TADA_OrderCols(.data) - # end.time <- Sys.time() - # time.taken <- round(end.time - start.time, 2) - # print(time.taken) - return(.data) } if (flaggedonly == TRUE) { print("This dataframe is empty because we did not find any aggregated continuous data in your dataframe") - all.cont.data <- TADA_OrderCols(all.cont.data) - - # end.time <- Sys.time() - # time.taken <- round(end.time - start.time, 2) - # print(time.taken) + all.cont.data <- flag.data %>% + dplyr::filter(TADA.ContinuousData.Flag == "Continuous") return(all.cont.data) } diff --git a/data/Data_6Tribes_5y_Harmonized.rda b/data/Data_6Tribes_5y_Harmonized.rda index 0272b72a..26f7883e 100644 Binary files a/data/Data_6Tribes_5y_Harmonized.rda and b/data/Data_6Tribes_5y_Harmonized.rda differ diff --git a/data/Data_NCTCShepherdstown_HUC12.rda b/data/Data_NCTCShepherdstown_HUC12.rda index 22a65b3b..d5a5ca42 100644 Binary files a/data/Data_NCTCShepherdstown_HUC12.rda and b/data/Data_NCTCShepherdstown_HUC12.rda differ diff --git a/data/Data_Nutrients_UT.rda b/data/Data_Nutrients_UT.rda index 91750ba7..2dedc98d 100644 Binary files a/data/Data_Nutrients_UT.rda and b/data/Data_Nutrients_UT.rda differ diff --git a/data/Data_R5_TADAPackageDemo.rda b/data/Data_R5_TADAPackageDemo.rda index 85440c88..e7c5447b 100644 Binary files a/data/Data_R5_TADAPackageDemo.rda and b/data/Data_R5_TADAPackageDemo.rda differ diff --git a/inst/extdata/AKAllotments.dbf b/inst/extdata/AKAllotments.dbf index 108a6447..71e69198 100644 Binary files a/inst/extdata/AKAllotments.dbf and b/inst/extdata/AKAllotments.dbf differ diff --git a/inst/extdata/AKVillages.dbf b/inst/extdata/AKVillages.dbf index 3a911cf0..eebc2f6a 100644 Binary files a/inst/extdata/AKVillages.dbf and b/inst/extdata/AKVillages.dbf differ diff --git a/inst/extdata/AmericanIndian.dbf b/inst/extdata/AmericanIndian.dbf index 4134af2b..cf4e8694 100644 Binary files a/inst/extdata/AmericanIndian.dbf and b/inst/extdata/AmericanIndian.dbf differ diff --git a/inst/extdata/OKTribe.dbf b/inst/extdata/OKTribe.dbf index 64951ff2..a1b0c303 100644 Binary files a/inst/extdata/OKTribe.dbf and b/inst/extdata/OKTribe.dbf differ diff --git a/inst/extdata/OffReservation.dbf b/inst/extdata/OffReservation.dbf index 0fcaba11..00526d31 100644 Binary files a/inst/extdata/OffReservation.dbf and b/inst/extdata/OffReservation.dbf differ diff --git a/inst/extdata/VATribe.dbf b/inst/extdata/VATribe.dbf index a64fc453..481c271d 100644 Binary files a/inst/extdata/VATribe.dbf and b/inst/extdata/VATribe.dbf differ diff --git a/inst/extdata/WQXCharacteristicRef.csv b/inst/extdata/WQXCharacteristicRef.csv index 620be61d..f96e1794 100644 --- a/inst/extdata/WQXCharacteristicRef.csv +++ b/inst/extdata/WQXCharacteristicRef.csv @@ -28,6 +28,7 @@ "(-)-1-Perfluoropropylethanol","Accepted","" "(-)-Perfluoro(2-propoxypropionic) acid","Accepted","" "(-)-cis-Permethrin","Accepted","" +"(-)-delta9-THC-d3","Accepted","" "(-)-trans-4-(4-Fluorophenyl)-3-(4-hydroxy-3-methoxyphenoxymethyl)piperidine","Accepted","" "(-)-trans-Permethrin","Accepted","" "(.alpha.R,7.alpha.)-3-Hydroxy-.alpha.-((perfluorobutyl)propyl)-17-oxo-estra-1,3,5(10)-triene-7-decanoic acid","Accepted","" diff --git a/inst/extdata/WQXMeasureQualifierCodeRef.csv b/inst/extdata/WQXMeasureQualifierCodeRef.csv index bf94ac64..949b2f01 100644 --- a/inst/extdata/WQXMeasureQualifierCodeRef.csv +++ b/inst/extdata/WQXMeasureQualifierCodeRef.csv @@ -16,6 +16,7 @@ "Result Measure Qualifier(MeasureQualifierCode)",822,"AP","The analyte was positively identified and the associated numerical value is the approximate concentration of the analyte in the sample. And serial dilution acceptance criteria not met","4/10/2020 6:07:22 PM","Pass" "Result Measure Qualifier(MeasureQualifierCode)",1121,"AR","counts outside acceptable range","8/16/2021 11:05:27 AM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",5,"B","Detection in blank, Analyte found in sample and associated blank","4/24/2008 8:28:30 AM","Suspect" +"Result Measure Qualifier(MeasureQualifierCode)",1200,"B3","Target analyte detected in calibration blank at or above the method reporting limit","8/1/2024 7:33:52 PM","Not Reviewed" "Result Measure Qualifier(MeasureQualifierCode)",124,"BAC","Correction Factor, background","3/16/2016 1:04:24 PM","Pass" "Result Measure Qualifier(MeasureQualifierCode)",125,"BQL","Below Quantitation Limit","7/6/2016 4:09:14 PM","Non-Detect" "Result Measure Qualifier(MeasureQualifierCode)",126,"BRL","Below Reporting Limit","7/6/2016 4:09:17 PM","Non-Detect" @@ -56,6 +57,7 @@ "Result Measure Qualifier(MeasureQualifierCode)",19,"EE","Identifies compounds whose concentration exceed the calibration range addition of the instrument for that specific analysis.","8/28/2009 5:04:35 PM","Over-Detect" "Result Measure Qualifier(MeasureQualifierCode)",788,"EER","No Result Reported, entry error; Original value is known to be incorrect due to a data entry error. The correct value could not be determined. No result value was reported","1/23/2020 2:12:06 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",59,"EFAI","Equipment failure","8/22/2013 1:44:00 PM","Suspect" +"Result Measure Qualifier(MeasureQualifierCode)",1196,"EMCL","Value Exceeds Maximum Contaminant Level","8/1/2024 7:33:52 PM","Not Reviewed" "Result Measure Qualifier(MeasureQualifierCode)",137,"EMPC","Estimated Maximum Possible Concentration","3/16/2016 1:04:25 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",138,"ESD","Estimated Value, serial dilution difference","3/16/2016 1:04:25 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",139,"EST","Estimated Value, outside limit of precision","3/16/2016 1:04:25 PM","Suspect" @@ -131,6 +133,8 @@ "Result Measure Qualifier(MeasureQualifierCode)",161,"JCN","Sample Container Damaged, no sample lost","3/16/2016 1:04:25 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",162,"JCW","Sample Container Damaged, sample lost","3/16/2016 1:04:25 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",66,"K","Value below the detection Limit. For BOD: depletion is less than 1.0","8/22/2013 1:44:00 PM","Non-Detect" +"Result Measure Qualifier(MeasureQualifierCode)",1198,"K5","The dilution water D.O. depletion was > 0.2 mg/L","8/1/2024 7:33:52 PM","Not Reviewed" +"Result Measure Qualifier(MeasureQualifierCode)",1197,"K6","Glucose/glutamic acid BOD was below method acceptance criteria","8/1/2024 7:33:52 PM","Not Reviewed" "Result Measure Qualifier(MeasureQualifierCode)",163,"KCF","Known Contamination, field","3/16/2016 1:04:25 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",1061,"KCX","Known Contamination, unknown","7/10/2020 12:31:13 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",18,"KK","True bacterial concentration is assumed to be less than the reported value. ","8/28/2009 5:04:14 PM","Suspect" @@ -157,6 +161,7 @@ "Result Measure Qualifier(MeasureQualifierCode)",834,"LTGTE","Result is less than the MQL but greater than or equal to the MDL","4/10/2020 6:07:23 PM","Non-Detect" "Result Measure Qualifier(MeasureQualifierCode)",33,"LVER","low calibration verification standard recovery, potential low bias","5/19/2010 11:24:47 AM","Pass" "Result Measure Qualifier(MeasureQualifierCode)",1188,"M3","Spike recovery value unusable since the analyte concentration in the sample is disproportionate to the spike level","7/18/2024 8:49:50 PM","Not Reviewed" +"Result Measure Qualifier(MeasureQualifierCode)",1199,"M4","The analysis of the spiked sample required a dilution such that the spike recovery calculation does not provide useful information","8/1/2024 7:33:52 PM","Not Reviewed" "Result Measure Qualifier(MeasureQualifierCode)",168,"M6F","More Than 6 Flags Applied","3/16/2016 1:04:26 PM","Pass" "Result Measure Qualifier(MeasureQualifierCode)",813,"MDL","Method Detection Level","4/10/2020 6:07:22 PM","Suspect" "Result Measure Qualifier(MeasureQualifierCode)",1189,"MHA","Due to high levels of analyte in the sample, the MS/MSD calculation does not provide useful spike recovery information","7/18/2024 8:49:50 PM","Not Reviewed" diff --git a/man/TADA_FlagContinuousData.Rd b/man/TADA_FlagContinuousData.Rd index 4207223c..c4488a86 100644 --- a/man/TADA_FlagContinuousData.Rd +++ b/man/TADA_FlagContinuousData.Rd @@ -24,11 +24,11 @@ results are included in the output. When flaggedonly = TRUE, the dataframe will be filtered to include only the rows flagged as "Continuous" results.} \item{time_difference}{Numeric argument defining the maximum time difference -in hours between measurements taken on the same day. This is used to search for -continuous time series data (i.e., if there are multiple measurements on the same -day within the selected time_difference, then the row will be flagged as -continuous). The default time window is 4 hours. The time_difference can be -adjusted by the user.} +in hours between measurements of the same TADA.ComparableDataIdentifier taken at the same +latitude, longitude, and depth. This is used to search for +continuous time series data (i.e., if there are multiple measurements within the selected +time_difference, then the row will be flagged as continuous). The default time window is 4 hours. +The time_difference can be adjusted by the user.} } \value{ The default is clean = FALSE and flaggedonly = FALSE. @@ -78,19 +78,6 @@ Data_Nutrients_UT_flaggedonly <- TADA_FlagContinuousData(Data_Nutrients_UT, clea # Remove continuous data in dataframe Data_Nutrients_UT_clean <- TADA_FlagContinuousData(Data_Nutrients_UT, clean = TRUE) unique(Data_Nutrients_UT_clean$TADA.ContinuousData.Flag) - -data(Data_R5_TADAPackageDemo) - -# Flag continuous data in new column titled "TADA.ContinuousData.Flag" -Data_R5_TADAPackageDemo_flags <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE) -unique(Data_R5_TADAPackageDemo_flags$TADA.ContinuousData.Flag) - -# Show only rows flagged as continuous data -Data_R5_TADAPackageDemo_flaggedonly <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE, flaggedonly = TRUE) - -# Remove continuous data in dataframe -Data_R5_TADAPackageDemo_clean <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = TRUE) -unique(Data_R5_TADAPackageDemo_clean$TADA.ContinuousData.Flag) } }