Skip to content
This repository has been archived by the owner on Jun 30, 2023. It is now read-only.

Edit search radius used for site-to-reach matching #116

Merged
merged 2 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 2_process.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,12 @@ p2_targets_list <- list(
hucs=drb_huc8s,crs_out="NAD83")
),

# Match PRMS stream segments to observation site ids and return subset of sites within
# the distance specified by search_radius (in meters)
tar_target(
p2_sites_w_segs,
get_site_flowlines(p1_reaches_sf, p2_site_list, sites_crs = 4269, max_matches = 1, search_radius = 0.1)
get_site_flowlines(p1_reaches_sf, p2_site_list, sites_crs = 4269, max_matches = 1,
search_radius = bird_dist_cutoff_m, retain_sites = retain_nwis_sites)
),

# Pair PRMS segments with intersecting NHDPlusV2 reaches and contributing NHDPlusV2 catchments
Expand Down
54 changes: 37 additions & 17 deletions 2_process/src/match_sites_reaches.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,26 @@
#' from https://code.usgs.gov/wwatkins/national-site-reach/-/blob/master/R/match_sites_reaches.R
#' modified by: Jeff Sadler
#' Match each site with a reach (seg_id/COMID)
get_site_flowlines <- function(reach_sf, sites, sites_crs, max_matches = 1, search_radius = 0.1) {
get_site_flowlines <- function(reach_sf, sites, sites_crs, max_matches = 1, search_radius = 500, retain_sites = NULL) {
#'
#' @description Function to match reaches (flowlines) to point sites
#'
#' @param reach_sf sf object of reach polylines with column "subsegid" and in WGS84
#' @param sites dataframe with columns "lat" "lon"
#' @param sites_crs the crs os the sites table (i.e., 4269 for NAD83)
#' @param reach_sf sf object of reach polylines with column "subsegid"
#' @param sites dataframe with columns "site_id", "lat", and "lon"
#' @param sites_crs the crs of the sites table (i.e., 4269 for NAD83)
#' @param max_matches the maximum number of segments that a point can match to
#' @param search_radius the maximum radius in same units as sf object
#' within which a segment will match (segments outside of the radius will not match)
#' @param search_radius the maximum radius in meters to match a point to a segment;
#' segments outside of search_radius will not match
#' @param retain_sites character vector indicating particular sites to retain, even if
#' distance between the site and the nearest flowline segment exceeds the search_radius;
#' the site is indicated by the `site_id` column within the sites data frame.
#'
#' @value A data frame the same columns as the sites input dataframe with additional columns
#' of "segidnat", "subsegid" and "offset" where "offset" is the distance (in degrees) between the point
#' and matching flowline
#' of "segidnat", "subsegid" and "bird_dist_to_subseg_m" where "bird_dist_to_subseg_m" is the
#' distance (in meters) between the point and matching flowline

# set up NHDPlus fields used by get_flowline_index
# Note: that we are renaming the subsegid column to COMID because the nhdplusTools
# Note: we are renaming the subsegid column to COMID because the nhdplusTools
# function requires an sf object with a "COMID" column. This does not have anything
# to do with the actual nhd COMIDs
# Note: the `ToMeas` and `FromMeas` are also required columns for the nhdplusTools
Expand All @@ -27,36 +30,53 @@ get_site_flowlines <- function(reach_sf, sites, sites_crs, max_matches = 1, sear
reaches_nhd_fields <- reach_sf %>%
rename(COMID = subsegid) %>%
mutate(REACHCODE = COMID, ToMeas = 100, FromMeas = 100) %>%
st_as_sf()
# project reaches to Albers Equal Area Conic so that offsets returned by
# get_flowline_index are in meters rather than degrees
st_transform(5070)

sites_sf <- sites %>% rowwise() %>%
filter(across(c(lon, lat), ~ !is.na(.x))) %>%
mutate(Shape = list(st_point(c(lon, lat), dim = "XY"))) %>%
st_as_sf() %>% st_set_crs(sites_crs) %>%
st_transform(st_crs(reaches_nhd_fields)) %>%
st_geometry()

# Indicate the site numbers (id's) to retain based on retain_sites
retain_rows <- which(sites$site_id %in% retain_sites)

message('matching flowlines with reaches...')
# Below, precision indicates the resolution of measure precision (in meters) in the output;
# since we are interested in a more accurate estimate of the `offset` distance between a
# point and the matched reach, set precision to 1 m.
# Conduct initial search using a larger radius (search_radius*2) than specified to
# account for any uncertainty in the RANN:nn2 radius search. Then
# filter sites to include those within the specified search_radius.
flowline_indices <- nhdplusTools::get_flowline_index(flines = reaches_nhd_fields,
points = sites_sf,
max_matches = max_matches,
search_radius = search_radius) %>%
search_radius = search_radius*2,
Copy link
Member

@jds485 jds485 Mar 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why *2 now? Is this to allow NWIS sites to be matched and later retained? If that's why, I think it would be better to add a separate search radius column for the NWIS sites to retain. This would allow for still retaining NWIS sites when the search_radius is reduced to something smaller than would match the NWIS sites

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, good question. I made this change to initially use a wider search radius when we apply nhdplusTools::get_flowline_index(). That function uses a radius search implemented in RANN::nn2() and I noticed when looking at DO data that a couple sites were not being picked up with a 500 m search radius (but were picked up with a 1000 m radius) even though the distance between those sites was < 500 m, as confirmed by the offset returned and separately using {sf}. More details are here.

So this doesn't have anything to do with the two NWIS sites but rather, to ensure we're picking up all the sites we would expect initially, which we then filter to the requested search_radius a couple lines down. I'm open to other suggestions, too, so happy to hear your thoughts!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strange that those 2 sites do not get picked up! I'm not familiar with the nn2 function, and the source is in C, so I'm less inclined to look into if it's a matter of using a different argument in get_flowline_index

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I wasn't worried about those two sites in particular, but the behavior was puzzling to me. The distances returned seem to be correct, so for now I decided to just adjust the argument in get_flowline_index to make sure we're capturing all the sites we'd expect. I tried to explain this reasoning in the comments because I know the different argument to search_radius is a little odd.

precision = 1) %>%
select(COMID, id, offset) %>%
rename(subsegid = COMID)
rename(subsegid = COMID, bird_dist_to_subseg_m = offset) %>%
filter(bird_dist_to_subseg_m <= search_radius | id %in% retain_rows)

# nhdplusTools returns an "id" column which is just an index from 1 to
# the number of sites. To later join to the site-ids, we need to add
# a matching index column.
sites <- rowid_to_column(sites, "id")

message("rejoining with other geometries")
#rejoin to original reaches df
message("rejoining with other geometries")

sites_w_reach_ids <- sites %>%
left_join(flowline_indices, by = "id") %>%
# only retain sites that got matched to flowlines and are
# within specified search_radius
right_join(flowline_indices, by = "id") %>%
select(-id) %>%
# add `segidnat` column
left_join(.,reach_sf %>%
st_drop_geometry() %>%
select(subsegid,segidnat),
left_join(reach_sf %>%
select(subsegid, segidnat) %>%
sf::st_drop_geometry(),
by = "subsegid")

return(sites_w_reach_ids)
Expand Down
6 changes: 6 additions & 0 deletions _targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ site_tp_select <- c("ST","ST-CA","SP")
# sites 01412350, 01484272, and 01482695 coded as site type "ST" but appear to be tidally-influenced
omit_nwis_sites <- c("01412350","01484272","01482695")

# Define search radius to match sites to PRMS segments (in meters)
bird_dist_cutoff_m <- 500

# Retain NWIS sites even if nearest PRMS segment is outside of radius set by bird_dist_cutoff_m
retain_nwis_sites <- c("01474703","01477050")

# Define PRMS mainstem segments thought to be tidally-influenced (used for filtering sites used in daily/sub-daily models)
mainstem_reaches_tidal <- c("2771_1","2769_1","2768_1","2767_1","2764_1","2762_1","2763_1","2759_1","2757_1","2752_1",
"2753_1","2755_1","2772_1","388_1","389_1","385_1","386_1","383_1","382_1","377_1","378_1",
Expand Down