Skip to content
This repository has been archived by the owner on Jun 30, 2023. It is now read-only.

Commit

Permalink
Merge pull request #115 from lekoenig/114-omit-wqp-dups
Browse files Browse the repository at this point in the history
Omit duplicated observations from WQP data
  • Loading branch information
lkoenig-usgs authored Mar 18, 2022
2 parents 7e875be + 4625bfb commit 30126d4
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ _targets

#Other local files
archive/*
1_fetch/in/NLCDs_2001_2019/*

#R files
# History files
Expand Down
8 changes: 7 additions & 1 deletion 2_process.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,13 @@ p2_targets_list <- list(
# Subset discrete SC data from harmonized WQP
tar_target(
p2_wqp_SC_data,
subset_wqp_SC_data(p2_filtered_wqp_data)
subset_wqp_SC_data(p2_filtered_wqp_data, omit_dups = TRUE)
),

# Subset duplicated discrete SC observations from the harmonized WQP dataset
tar_target(
p2_wqp_SC_dups,
subset_wqp_SC_dups(p2_filtered_wqp_data)
),

# Aggregate instantaneous SC data to hourly averages
Expand Down
72 changes: 67 additions & 5 deletions 2_process/src/filter_wqp_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ filter_wqp_salinity_data <- function(data,major_ion_names,wqp_vars_select,omit_w
}


subset_wqp_SC_data <- function(filtered_data){
subset_wqp_SC_data <- function(filtered_data,omit_dups = TRUE){
#'
#' @description Function to subset the filtered WQP salinity dataset for specific conductance
#'
#' @param filtered_data a data frame containing the filtered DRB multisource surface-water-quality dataset.
#' `filtered_data` is the output from filter_wqp_salinity_data().
#' @param omit_dups logical indicating whether to omit duplicated observations for a unique site/date-time/
#' lat-lon location/collecting organization; defaults to TRUE
#'
#' @value A data frame containing discrete specific conductance samples from the Delaware River Basin
#' @examples
#' subset_wqp_SC_data(filtered_data = filtered_wqp_data)
#' subset_wqp_SC_data(filtered_data = filtered_wqp_data, omit_dups = TRUE)

# Filter out specific conductance param values "min" and "max"
SC_params <- c("Specific conductance, field",
Expand All @@ -50,14 +52,74 @@ subset_wqp_SC_data <- function(filtered_data){
"Specific conductance, lab")

SC_data_subset <- filtered_data %>%
# Omit samples originally entered as "conductivity" since we can't be sure these reflect temperature-corrected conductance
filter(param %in% SC_params,CharacteristicName!="Conductivity")
# Omit samples originally entered as "conductivity" since we can't be sure these reflect
# temperature-corrected conductance
filter(param %in% SC_params,CharacteristicName!="Conductivity") %>%
# Fill in date-time stamp so that if sampling time is missing, assume some value (12:00:00)
# that we can use to look for duplicated date-times
mutate(ActivityStartDateTime = na_if(ActivityStartDateTime, "NA")) %>%
mutate(ActivityStartDateTime_filled = if_else(is.na(ActivityStartDateTime),
paste(ActivityStartDate,"12:00:00",sep=" "),
ActivityStartDateTime))

if(omit_dups == "TRUE"){
# When duplicate observations exist for a unique combination of [site name & date-time &
# geographic location & collecting organization], select one observation based on
# the `param` attribute
SC_data_subset_omit_dups <- SC_data_subset %>%
group_by(MonitoringLocationIdentifier,
ActivityStartDateTime_filled,
OrganizationIdentifier,
LongitudeMeasure, LatitudeMeasure) %>%
# Preferentially retain samples with param equals "Specific conductance, lab" first
# and "Specific conductance, field, mean" last
arrange(match(param, c("Specific conductance, lab",
"Specific conductance",
"Specific conductance, field",
"Specific conductance, field, mean"))) %>%
slice(1) %>%
ungroup()

return(SC_data_subset_omit_dups)
} else {
return(SC_data_subset)
}

}

return(SC_data_subset)


subset_wqp_SC_dups <- function(filtered_data){
  #'
  #' @description Function to subset duplicated observations within the filtered
  #' WQP salinity dataset for specific conductance. Observations are considered
  #' duplicated when more than one row shares the same monitoring location,
  #' filled date-time stamp, collecting organization, and longitude/latitude.
  #'
  #' @param filtered_data a data frame containing the filtered DRB multisource
  #' surface-water-quality dataset. `filtered_data` is the output from
  #' filter_wqp_salinity_data().
  #'
  #' @value A data frame containing only those discrete specific conductance
  #' samples that belong to a group of two or more observations sharing the same
  #' site/date-time/organization/lat-lon key. All rows of each duplicated group
  #' are returned, and the result is ungrouped.
  #' @examples
  #' subset_wqp_SC_dups(filtered_data = filtered_wqp_data)

  # Start from the full SC subset with duplicates retained; this call also adds
  # the `ActivityStartDateTime_filled` column used in the grouping key below
  SC_data_subset <- subset_wqp_SC_data(filtered_data, omit_dups = FALSE)

  # Isolate the duplicated observations. Filtering directly on the group size
  # avoids comparing a logical flag against the string "TRUE" and avoids
  # creating (and then dropping) a temporary `dup` column that could clobber a
  # same-named column in the input.
  SC_data_subset_dups <- SC_data_subset %>%
    group_by(MonitoringLocationIdentifier,
             ActivityStartDateTime_filled,
             OrganizationIdentifier,
             LongitudeMeasure, LatitudeMeasure) %>%
    filter(n() > 1) %>%
    ungroup()

  return(SC_data_subset_dups)

}



subset_wqp_nontidal <- function(wqp_data,site_list_w_segs,mainstem_segs){
#'
#' @description Function to filter discrete samples potentially influenced by tides from the WQP specific conductance dataset
Expand Down
11 changes: 6 additions & 5 deletions 2_process/src/match_sites_reaches.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,12 @@ get_site_flowlines <- function(reach_sf, sites, sites_crs, max_matches = 1, sear
#rejoin to original reaches df
sites_w_reach_ids <- sites %>%
left_join(flowline_indices, by = "id") %>%
select(-id)

# add `segidnat` column
sites_w_reach_ids <- sites_w_reach_ids %>%
left_join(select(reach_sf, c(subsegid, segidnat)))
select(-id) %>%
# add `segidnat` column
left_join(.,reach_sf %>%
st_drop_geometry() %>%
select(subsegid,segidnat),
by = "subsegid")

return(sites_w_reach_ids)
}
Expand Down
10 changes: 5 additions & 5 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ dir.create("3_visualize/out/nhdv2_attr_png/",showWarnings = FALSE)

# Define columns of interest for harmonized WQP data
wqp_vars_select <- c("MonitoringLocationIdentifier","MonitoringLocationName","LongitudeMeasure","LatitudeMeasure",
"MonitoringLocationTypeName","OrganizationIdentifier","ActivityStartDate","ActivityStartTime.Time",
"ActivityEndDate","CharacteristicName","param_group","param","USGSPCode","ActivityMediaName",
"ResultSampleFractionText","HydrologicCondition","HydrologicEvent","resultVal2","resultUnits2",
"ResultDetectionConditionText","ResultTemperatureBasisText","PrecisionValue","ResultStatusIdentifier",
"final")
"MonitoringLocationTypeName","OrganizationIdentifier","ActivityStartDate","ActivityStartDateTime",
"ActivityStartTime.Time","ActivityStartTime.TimeZoneCode","CharacteristicName","param_group","param",
"USGSPCode","ActivityMediaName","ResultSampleFractionText","HydrologicCondition","HydrologicEvent",
"resultVal2","resultUnits2","ResultDetectionConditionText","ResultTemperatureBasisText",
"PrecisionValue","ResultStatusIdentifier","final")

# Define water quality major ions of interest
major_ion_names = c("Chloride","Sodium")
Expand Down

0 comments on commit 30126d4

Please sign in to comment.