Skip to content

Commit

Permalink
Merge pull request #501 from USEPA/494-epatada-continuous-data-flag-e…
Browse files Browse the repository at this point in the history
…rror

494 epatada continuous data flag error
  • Loading branch information
hillarymarler authored Aug 6, 2024
2 parents 00dbc2e + ca2dc83 commit 6ac2171
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 124 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ test_function.R
_snaps

testing_log.txt

# test data from AK
AK_EPATADA_troubleshooting
173 changes: 67 additions & 106 deletions R/ResultFlagsIndependent.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) {
#' results are included in the output. When flaggedonly = TRUE, the dataframe
#' will be filtered to include only the rows flagged as "Continuous" results.
#' @param time_difference Numeric argument defining the maximum time difference
#' in hours between measurements taken on the same day. This is used to search for
#' continuous time series data (i.e., if there are multiple measurements on the same
#' day within the selected time_difference, then the row will be flagged as
#' continuous). The default time window is 4 hours. The time_difference can be
#' adjusted by the user.
#' in hours between measurements of the same TADA.ComparableDataIdentifier taken at the same
#' latitude, longitude, and depth. This is used to search for
#' continuous time series data (i.e., if there are multiple measurements within the selected
#' time_difference, then the row will be flagged as continuous). The default time window is 4 hours.
#' The time_difference can be adjusted by the user.
#' @return The default is clean = FALSE and flaggedonly = FALSE.
#' When clean = FALSE and flaggedonly = FALSE (default), a new column,
#' "TADA.ContinuousData.Flag", is appended to the input data set which
Expand Down Expand Up @@ -194,24 +194,9 @@ TADA_FlagMethod <- function(.data, clean = TRUE, flaggedonly = FALSE) {
#' # Remove continuous data in dataframe
#' Data_Nutrients_UT_clean <- TADA_FlagContinuousData(Data_Nutrients_UT, clean = TRUE)
#' unique(Data_Nutrients_UT_clean$TADA.ContinuousData.Flag)
#'
#' data(Data_R5_TADAPackageDemo)
#'
#' # Flag continuous data in new column titled "TADA.ContinuousData.Flag"
#' Data_R5_TADAPackageDemo_flags <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE)
#' unique(Data_R5_TADAPackageDemo_flags$TADA.ContinuousData.Flag)
#'
#' # Show only rows flagged as continuous data
#' Data_R5_TADAPackageDemo_flaggedonly <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = FALSE, flaggedonly = TRUE)
#'
#' # Remove continuous data in dataframe
#' Data_R5_TADAPackageDemo_clean <- TADA_FlagContinuousData(Data_R5_TADAPackageDemo, clean = TRUE)
#' unique(Data_R5_TADAPackageDemo_clean$TADA.ContinuousData.Flag)
#' }
#'
TADA_FlagContinuousData <- function(.data, clean = FALSE, flaggedonly = FALSE, time_difference = 4) {
# start.time <- Sys.time()

# check .data is data.frame
TADA_CheckType(.data, "data.frame", "Input object")
# check clean is boolean
Expand Down Expand Up @@ -269,124 +254,100 @@ TADA_FlagContinuousData <- function(.data, clean = FALSE, flaggedonly = FALSE, t
(SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(ResultTimeBasisText)) |
(SampleCollectionEquipmentName == "Probe/Sensor" & !is.na(StatisticalBaseCode)) |
(SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Calculated") |
(SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Estimated")))
(SampleCollectionEquipmentName == "Probe/Sensor" & ResultValueTypeName == "Estimated"))) %>%
dplyr::mutate(TADA.ContinuousData.Flag = "Continuous")

# everything not YET in cont dataframe
noncont.data <- subset(.data, !.data$ResultIdentifier %in% cont.data$ResultIdentifier)

# if time field is not NA, find time difference between results

if (length(noncont.data) >= 1) {
for (i in 1:nrow(noncont.data)) {
if (!is.na(noncont.data$ActivityStartDateTime[i])) {
# find samples with the same date, lat/long, organization name, comparable data identifier, and depth
info_match <- which(
noncont.data$TADA.LatitudeMeasure == noncont.data$TADA.LatitudeMeasure[i] &
noncont.data$TADA.LongitudeMeasure == noncont.data$TADA.LongitudeMeasure[i] &
noncont.data$OrganizationIdentifier == noncont.data$OrganizationIdentifier[i] &
noncont.data$TADA.ComparableDataIdentifier == noncont.data$TADA.ComparableDataIdentifier[i] &
((noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityDepthHeightMeasure.MeasureValue[i]))) &
((noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue == noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ResultDepthHeightMeasure.MeasureValue[i]))) &
((noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityTopDepthHeightMeasure.MeasureValue[i]))) &
((noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue == noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue[i]) | (is.na(noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue) & is.na(noncont.data$TADA.ActivityBottomDepthHeightMeasure.MeasureValue[i]))) &
((noncont.data$ActivityRelativeDepthName == noncont.data$ActivityRelativeDepthName[i]) | (is.na(noncont.data$ActivityRelativeDepthName) & is.na(noncont.data$ActivityRelativeDepthName[i])))
)
info_match <- noncont.data %>%
dplyr::group_by(
TADA.LatitudeMeasure, TADA.LongitudeMeasure,
OrganizationIdentifier, TADA.ComparableDataIdentifier,
TADA.ActivityDepthHeightMeasure.MeasureValue,
TADA.ResultDepthHeightMeasure.MeasureValue,
TADA.ActivityBottomDepthHeightMeasure.MeasureValue,
TADA.ActivityTopDepthHeightMeasure.MeasureValue,
ActivityRelativeDepthName
) %>%
dplyr::mutate(n_records = length(TADA.ResultMeasureValue)) %>%
dplyr::mutate(group_id = dplyr::cur_group_id()) %>%
dplyr::filter(n_records > 1) %>%
dplyr::ungroup() %>%
dplyr::group_by(group_id) %>%
dplyr::arrange(ActivityStartDateTime, .by_group = TRUE) %>%
dplyr::mutate(
time_diff_lag = abs(difftime(ActivityStartDateTime, dplyr::lag(ActivityStartDateTime), units = "hours")),
time_diff_lead = abs(difftime(ActivityStartDateTime, dplyr::lead(ActivityStartDateTime), units = "hours"))
) %>%
dplyr::ungroup()

time_diff <- abs(difftime(noncont.data$ActivityStartDateTime[i], noncont.data$ActivityStartDateTime[info_match], units = "hours"))
# find results where the time differences is <= time_difference (default is 4 hours)
within_window <- info_match %>%
dplyr::filter(time_diff_lead <= time_difference |
time_diff_lag <= time_difference)

# samples where the time differences is <= time_difference (default is 4 hours)
within_window <- info_match[time_diff <= time_difference]
rm(info_match)

# keep the samples with times within the window
info_match <- intersect(info_match, within_window)
# if matches are identified change flag to continuous
noncont.data <- noncont.data %>%
dplyr::mutate(TADA.ContinuousData.Flag = ifelse(ResultIdentifier %in% within_window$ResultIdentifier,
"Continuous", TADA.ContinuousData.Flag
))

# if matches are identified change flag to continuous
if (length(info_match) >= 1) {
noncont.data$TADA.ContinuousData.Flag[info_match] <- "Continuous"
}
}
}
rm(within_window)
}

# remove continuous results from noncont.data and create new df for these (more.cont.data)
more.cont.data <- noncont.data %>% dplyr::filter(TADA.ContinuousData.Flag == "Continuous")

# add additional continuous data (more.cont.data) to cont.data
all.cont.data <- plyr::rbind.fill(cont.data, more.cont.data)

# filter noncont.data to ONLY discrete results
noncont.data <- noncont.data %>% dplyr::filter(TADA.ContinuousData.Flag == "Discrete")

# if there is continuous data in the data set
# flag output
if (nrow(all.cont.data) != 0) {
# change contents of ContDataFlag column
all.cont.data$TADA.ContinuousData.Flag <- "Continuous"
# join all.cont.data to flag.data
flag.data <- plyr::rbind.fill(all.cont.data, noncont.data)

# flagged output, all data
if (clean == FALSE & flaggedonly == FALSE) {
flag.data <- TADA_OrderCols(flag.data)

# end.time <- Sys.time()
# time.taken <- round(end.time - start.time, 2)
# print(time.taken)

return(flag.data)
}

# clean output
if (clean == TRUE & flaggedonly == FALSE) {
# filter out invalid characteristic-unit-media combinations
clean.data <- dplyr::filter(flag.data, !(TADA.ContinuousData.Flag %in% "Continuous"))
flag.data <- cont.data %>%
dplyr::full_join(noncont.data, by = c(names(cont.data)))

# remove TADA.AggregatedContinuousData column
# clean.data <- dplyr::select(clean.data, -TADA.ContinuousData.Flag)
clean.data <- TADA_OrderCols(clean.data)
# flagged output, all data
if (clean == FALSE & flaggedonly == FALSE) {
flag.data <- TADA_OrderCols(flag.data)

# end.time <- Sys.time()
# time.taken <- round(end.time - start.time, 2)
# print(time.taken)
return(flag.data)
}

return(clean.data)
}
# clean output
if (clean == TRUE & flaggedonly == FALSE) {
# filter out invalid characteristic-unit-media combinations
clean.data <- flag.data %>%
dplyr::filter(!(TADA.ContinuousData.Flag %in% "Continuous")) %>%
dplyr::select(-TADA.ContinuousData.Flag) %>%
TADA_OrderCols()

# flagged output, only aggregated continuous data
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only invalid characteristic-unit-media combinations
return(clean.data)
}

onlycont.data <- dplyr::filter(flag.data, TADA.ContinuousData.Flag == "Continuous")
onlycont.data <- TADA_OrderCols(onlycont.data)
# flagged output, only aggregated continuous data
if (clean == FALSE & flaggedonly == TRUE) {
# filter to show only invalid characteristic-unit-media combinations

# end.time <- Sys.time()
# time.taken <- round(end.time - start.time, 2)
# print(time.taken)
onlycont.data <- flag.data %>%
dplyr::filter(TADA.ContinuousData.Flag == "Continuous") %>%
TADA_OrderCols()

return(onlycont.data)
}
return(onlycont.data)
}


# if no aggregated continuous data is in the data set
if (nrow(all.cont.data) == 0) {
if (nrow(flag.data[flag.data$TADA.ContinuousData.Flag == "Continuous", ]) == 0) {
if (flaggedonly == FALSE) {
print("No evidence of aggregated continuous data in your dataframe. Returning the input dataframe with TADA.ContinuousData.Flag column for tracking.")
.data <- TADA_OrderCols(.data)

# end.time <- Sys.time()
# time.taken <- round(end.time - start.time, 2)
# print(time.taken)

return(.data)
}

if (flaggedonly == TRUE) {
print("This dataframe is empty because we did not find any aggregated continuous data in your dataframe")

all.cont.data <- TADA_OrderCols(all.cont.data)

# end.time <- Sys.time()
# time.taken <- round(end.time - start.time, 2)
# print(time.taken)
all.cont.data <- flag.data %>%
dplyr::filter(TADA.ContinuousData.Flag == "Continuous")

return(all.cont.data)
}
Expand Down
Binary file modified data/Data_6Tribes_5y_Harmonized.rda
Binary file not shown.
Binary file modified data/Data_NCTCShepherdstown_HUC12.rda
Binary file not shown.
Binary file modified data/Data_Nutrients_UT.rda
Binary file not shown.
Binary file modified data/Data_R5_TADAPackageDemo.rda
Binary file not shown.
Binary file modified inst/extdata/AKAllotments.dbf
Binary file not shown.
Binary file modified inst/extdata/AKVillages.dbf
Binary file not shown.
Binary file modified inst/extdata/AmericanIndian.dbf
Binary file not shown.
Binary file modified inst/extdata/OKTribe.dbf
Binary file not shown.
Binary file modified inst/extdata/OffReservation.dbf
Binary file not shown.
Binary file modified inst/extdata/VATribe.dbf
Binary file not shown.
1 change: 1 addition & 0 deletions inst/extdata/WQXCharacteristicRef.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"(-)-1-Perfluoropropylethanol","Accepted",""
"(-)-Perfluoro(2-propoxypropionic) acid","Accepted",""
"(-)-cis-Permethrin","Accepted",""
"(-)-delta9-THC-d3","Accepted",""
"(-)-trans-4-(4-Fluorophenyl)-3-(4-hydroxy-3-methoxyphenoxymethyl)piperidine","Accepted",""
"(-)-trans-Permethrin","Accepted",""
"(.alpha.R,7.alpha.)-3-Hydroxy-.alpha.-((perfluorobutyl)propyl)-17-oxo-estra-1,3,5(10)-triene-7-decanoic acid","Accepted",""
Expand Down
5 changes: 5 additions & 0 deletions inst/extdata/WQXMeasureQualifierCodeRef.csv
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"Result Measure Qualifier(MeasureQualifierCode)",822,"AP","The analyte was positively identified and the associated numerical value is the approximate concentration of the analyte in the sample. And serial dilution acceptance criteria not met","4/10/2020 6:07:22 PM","Pass"
"Result Measure Qualifier(MeasureQualifierCode)",1121,"AR","counts outside acceptable range","8/16/2021 11:05:27 AM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",5,"B","Detection in blank, Analyte found in sample and associated blank","4/24/2008 8:28:30 AM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",1200,"B3","Target analyte detected in calibration blank at or above the method reporting limit","8/1/2024 7:33:52 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",124,"BAC","Correction Factor, background","3/16/2016 1:04:24 PM","Pass"
"Result Measure Qualifier(MeasureQualifierCode)",125,"BQL","Below Quantitation Limit","7/6/2016 4:09:14 PM","Non-Detect"
"Result Measure Qualifier(MeasureQualifierCode)",126,"BRL","Below Reporting Limit","7/6/2016 4:09:17 PM","Non-Detect"
Expand Down Expand Up @@ -56,6 +57,7 @@
"Result Measure Qualifier(MeasureQualifierCode)",19,"EE","Identifies compounds whose concentration exceed the calibration range addition of the instrument for that specific analysis.","8/28/2009 5:04:35 PM","Over-Detect"
"Result Measure Qualifier(MeasureQualifierCode)",788,"EER","No Result Reported, entry error; Original value is known to be incorrect due to a data entry error. The correct value could not be determined. No result value was reported","1/23/2020 2:12:06 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",59,"EFAI","Equipment failure","8/22/2013 1:44:00 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",1196,"EMCL","Value Exceeds Maximum Contaminant Level","8/1/2024 7:33:52 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",137,"EMPC","Estimated Maximum Possible Concentration","3/16/2016 1:04:25 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",138,"ESD","Estimated Value, serial dilution difference","3/16/2016 1:04:25 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",139,"EST","Estimated Value, outside limit of precision","3/16/2016 1:04:25 PM","Suspect"
Expand Down Expand Up @@ -131,6 +133,8 @@
"Result Measure Qualifier(MeasureQualifierCode)",161,"JCN","Sample Container Damaged, no sample lost","3/16/2016 1:04:25 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",162,"JCW","Sample Container Damaged, sample lost","3/16/2016 1:04:25 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",66,"K","Value below the detection Limit. For BOD: depletion is less than 1.0","8/22/2013 1:44:00 PM","Non-Detect"
"Result Measure Qualifier(MeasureQualifierCode)",1198,"K5","The dilution water D.O. depletion was > 0.2 mg/L","8/1/2024 7:33:52 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",1197,"K6","Glucose/glutamic acid BOD was below method acceptance criteria","8/1/2024 7:33:52 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",163,"KCF","Known Contamination, field","3/16/2016 1:04:25 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",1061,"KCX","Known Contamination, unknown","7/10/2020 12:31:13 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",18,"KK","True bacterial concentration is assumed to be less than the reported value. ","8/28/2009 5:04:14 PM","Suspect"
Expand All @@ -157,6 +161,7 @@
"Result Measure Qualifier(MeasureQualifierCode)",834,"LTGTE","Result is less than the MQL but greater than or equal to the MDL","4/10/2020 6:07:23 PM","Non-Detect"
"Result Measure Qualifier(MeasureQualifierCode)",33,"LVER","low calibration verification standard recovery, potential low bias","5/19/2010 11:24:47 AM","Pass"
"Result Measure Qualifier(MeasureQualifierCode)",1188,"M3","Spike recovery value unusable since the analyte concentration in the sample is disproportionate to the spike level","7/18/2024 8:49:50 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",1199,"M4","The analysis of the spiked sample required a dilution such that the spike recovery calculation does not provide useful information","8/1/2024 7:33:52 PM","Not Reviewed"
"Result Measure Qualifier(MeasureQualifierCode)",168,"M6F","More Than 6 Flags Applied","3/16/2016 1:04:26 PM","Pass"
"Result Measure Qualifier(MeasureQualifierCode)",813,"MDL","Method Detection Level","4/10/2020 6:07:22 PM","Suspect"
"Result Measure Qualifier(MeasureQualifierCode)",1189,"MHA","Due to high levels of analyte in the sample, the MS/MSD calculation does not provide useful spike recovery information","7/18/2024 8:49:50 PM","Not Reviewed"
Expand Down
23 changes: 5 additions & 18 deletions man/TADA_FlagContinuousData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 6ac2171

Please sign in to comment.