Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Internal geog data #121

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,21 @@ Suggests:
roxygen2,
rmarkdown,
dplyr,
curl,
glue,
tibble,
sf
cincy
Remotes:
geomarker-io/cincy,
cole-brokamp/dpkg
geomarker-io/cincy
Config/testthat/edition: 3
URL: https://github.com/geomarker-io/codec,
http://geomarker.io/codec/
BugReports: https://github.com/geomarker-io/codec/issues
Imports:
rlang,
stringr,
glue,
purrr (>= 1.0.0),
cincy (>= 1.1.0),
dpkg (>= 0.5.1)
dpkg,
sf
Depends:
R (>= 2.10)
LazyData: true
8 changes: 6 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Generated by roxygen2: do not edit by hand

export(as_codec_dpkg)
export(cincy_census_geo)
export(cincy_city_geo)
export(cincy_county_geo)
export(cincy_neighborhood_geo)
export(cincy_zcta_geo)
export(codec_colors)
export(codec_dpkg_as_sf)
export(codec_dpkg_s3_put)
export(get_codec_dpkg)
export(is_codec_dpkg)
import(cincy)
195 changes: 195 additions & 0 deletions R/cincy_geographies.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#' Cincy census tracts and block groups
#'
#' Read tract and block group ("bg") geographies from the online Census
#' [TIGER/Line](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
#' files into R
#' @param geography which type of cincy census geography to return
#' @param vintage a character vector of a year corresponding to the vintage of TIGER/Line data
#' @details
#' Compressed shapefiles are downloaded from TIGER into an R user data directory and will be cached
#' for use across other R sessions (see `?dpkg::stow` for more details).
#' @returns a simple features object with a geographic identifier column (`geoid`)
#' and a geometry column (`s2_geography`)
#' @export
#' @examples
#' cincy_census_geo("tract", "2024")
#' cincy_census_geo("tract", "2020")
#' cincy_census_geo("tract", "2019")
#' cincy_census_geo("bg", "2020")
#' cincy_census_geo("bg", "2019")
cincy_census_geo <- function(geography = c("tract", "bg"), vintage = as.character(2024:2013)) {
geography <- rlang::arg_match(geography)
vintage <- rlang::arg_match(vintage)
tiger_url <- glue::glue(
"https://www2.census.gov/geo/tiger/TIGER{vintage}",
"/{toupper(geography)}/tl_{vintage}_39_{geography}.zip"
)
tiger_local <- dpkg::stow_url(tiger_url)
out <-
sf::read_sf(glue::glue("/vsizip/", tiger_local),
query = glue::glue("SELECT GEOID FROM tl_{vintage}_39_{geography} WHERE COUNTYFP = '061'")
)
names(out) <- tolower(names(out))
out$s2_geography <- sf::st_as_s2(out$geometry)
out <- sf::st_drop_geometry(out)
out <- sf::st_as_sf(out)
return(out)
}

#' Cincy county
#' @rdname cincy_census_geo
#' @export
#' @examples
#' cincy_county_geo("2024")
cincy_county_geo <- function(vintage = as.character(2024:2013)) {
vintage <- rlang::arg_match(vintage)
tiger_url <- glue::glue("https://www2.census.gov/geo/tiger/TIGER{vintage}/COUNTY/tl_{vintage}_us_county.zip")
tiger_local <- dpkg::stow_url(tiger_url)
out <-
sf::read_sf(glue::glue("/vsizip/", tiger_local),
query = glue::glue("SELECT GEOID FROM tl_{vintage}_us_county WHERE GEOID = '39061'")
)
return(sf::st_as_s2(out$geometry))
}

#' Install CAGIS GIS database
#'
#' This installs the CAGIS Open Data GIS database (`.gdb`) into the data
#' directory for the codec package. Once downloaded, it will be reused
#' across R sessions on the same computer.
#' The geodatabase contains many [layers](https://www.cagis.org/Opendata/Quarterly_GIS_Data/OpenData_Layer_List.txt) that are
#' updated quarterly. (Historical geodatabases are not available here.)
#' @seealso This function is called by `cincy_neighborhood_geo()`, `cincy_city_geo()` and others that import individual layers.
#' @param cagis_data_url the url to the CAGIS Open Data .gdb.zip file; this changes quarterly, so
#' [check](https://www.cagis.org/Opendata/Quarterly_GIS_Data) for something more recent if the file cannot be found
#' @examples
#' options(timeout = max(2500, getOption("timeout")), download.file.method = "libcurl")
#' install_cagis_data()
#' sf::st_layers(install_cagis_data())$name
install_cagis_data <- function(cagis_data_url = "https://www.cagis.org/Opendata/Quarterly_GIS_Data/CAGISOpenDataQ4_2024.gdb.zip") {
cagis_gdb_name <- tools::file_path_sans_ext(basename(cagis_data_url))
dest <- file.path(tools::R_user_dir(package = "codec", "data"), cagis_gdb_name)
if (file.exists(dest)) {
return(dest)
}
tmp <- tempfile(fileext = ".zip")
utils::download.file(cagis_data_url, destfile = tmp, mode = "wb")
unzip(tmp, exdir = dirname(dest))
return(dest)
}

#' Cincy neighborhood geographies
#'
#' CAGIS data (see `install_cagis_data()`) provides community council boundaries, but these boundaries can
#' overlap and do not align with census geographies or ZIP codes.
#' By default, the statistical neighborhood approximations are instead returned,
#' which are calculated by aggregating census tracts into 50 matching neighborhoods.
#' @param geography which type of cincy neighborhood geography to return
#' @returns a simple features object with a geographic identifier column (`geoid`)
#' and a geometry column (`s2_geography`)
#' @export
#' @examples
#' cincy_neighborhood_geo("statistical_neighborhood_approximations")
#' cincy_neighborhood_geo("community_council")
cincy_neighborhood_geo <- function(geography = c("statistical_neighborhood_approximations", "community_council")) {
geography <- rlang::arg_match(geography)
if (geography == "statistical_neighborhood_approximations") {
noi <- c("Cincinnati_Statistical_Neighborhood_Approximations" = "SNA_NAME")
}
if (geography == "community_council") {
noi <- c("Cincinnati_Community_Council_Neighborhoods" = "NEIGH")
}
d <- sf::st_read(install_cagis_data(), names(noi), quiet = TRUE)
out <- tibble::tibble(
geoid = sf::st_drop_geometry(d)[, noi],
s2_geography = sf::st_as_s2(sf::st_cast(sf::st_zm(d$SHAPE), "MULTIPOLYGON"))
) |>
sf::st_as_sf()
return(out)
}

#' cincy_city_geo()
#' @export
#' @rdname cincy_neighorhood_geo
cincy_city_geo <- function() {
cagis_db <- install_cagis_data()
out <- sf::st_read(cagis_db, layer = "Cincinnati_City_Boundary", quiet = TRUE)
return(sf::st_as_s2(out$SHAPE))
}


#' Cincy census tracts and block groups
#'
#' Read tract and block group ("bg") geographies from the online Census
#' [TIGER/Line](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
#' files into R
#' @param geography which type of cincy census geography to return
#' @param vintage a character vector of a year corresponding to the vintage of TIGER/Line data
#' @details
#' Compressed shapefiles are downloaded from TIGER into an R user data directory and will be cached
#' for use across other R sessions (see `?dpkg::stow` for more details).
#' @returns a simple features object with a geographic identifier column (`geoid`)
#' and a geometry column (`s2_geography`)
#' @export
#' @examples
#' cincy_census_geo("tract", "2024")
#' cincy_census_geo("tract", "2020")
#' cincy_census_geo("tract", "2019")
#' cincy_census_geo("bg", "2020")
#' cincy_census_geo("bg", "2019")

#' Cincy ZIP Code Tabulation Areas
#'
#' Read [ZIP Code Tabulation Areas (ZCTAs)](https://www.census.gov/programs-surveys/geography/guidance/geo-areas/zctas.html)
#' geographies from the online Census
#' [TIGER/Line](https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
#' files into R
#' @param vintage a character vector of a year corresponding to the vintage of TIGER/Line data
#' @export
#' @returns a simple features object with a geographic identifier column (`geoid`)
#' and a geometry column (`s2_geography`)
#' @examples
#' cincy_zcta_geo()
#' cincy_zcta_geo("2018")
cincy_zcta_geo <- function(vintage = as.character(2024:2013)) {
vintage <- rlang::arg_match(vintage)
is_vintage_old <- vintage %in% as.character(2013:2019)
tiger_url <- glue::glue(
"https://www2.census.gov/geo/tiger/TIGER{vintage}/",
ifelse(is_vintage_old, "ZCTA5", "ZCTA520"),
"/tl_{vintage}_us_zcta",
ifelse(is_vintage_old, "510", "520"),
".zip"
)
tiger_local <- dpkg::stow_url(tiger_url)
out <-
sf::read_sf(glue::glue("/vsizip/", tiger_local),
query = glue::glue(
"SELECT ",
ifelse(is_vintage_old, "GEOID10", "GEOID20"),
" FROM tl_{vintage}_us_zcta",
ifelse(is_vintage_old, "510", "520"),
" WHERE ",
ifelse(is_vintage_old, "GEOID10", "GEOID20"),
" IN ({paste(paste0(\"'\", cincy_zip_codes, \"'\"), collapse = \", \")})"
)
)
names(out) <- gsub("[0-9]", "", tolower(names(out)))
out$s2_geography <- sf::st_as_s2(out$geometry)
out <- sf::st_drop_geometry(out)
out <- sf::st_as_sf(out)
return(out)
}

# from cincy::zcta_tiger_2020 (version 1.1.0) on 2024-11-08
cincy_zip_codes <-
c(
"45214", "45208", "45236", "45247", "45225", "45205", "45220",
"45206", "45223", "45232", "45174", "45207", "45209", "45212",
"45213", "45217", "45218", "45229", "45238", "45242", "45051",
"45002", "45227", "45211", "45215", "45216", "45219", "45224",
"45033", "45237", "45239", "45248", "45041", "45267", "45030",
"45252", "45244", "45202", "45249", "45255", "45226", "45203",
"45246", "45111", "45147", "45052", "45240", "45241", "45243",
"45251", "45001", "45204", "45231", "45230", "45233"
)
98 changes: 98 additions & 0 deletions R/cincy_interpolate.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#' Spatially interpolate community-level data
#'
#' Weights at the census block-level are used to spatially interpolate different geographies.
#' Block-level total population, total number of homes, or total land area from the 2020 Census
#' can be chosen to use for the weights.
#' @param from a CoDEC data package
#' @param to ; if NULL, tract geographies are returned without data interpolation
#' @details Tract identifers do not change across decennial censuses, but the digital representation of their boundaries
#' may be improved over time. Here, data packages using 2010 tract identifers use the TIGER/Line 2019 tract shapefiles
#' and data packages using 2020 tract identifiers use the TIGER/Line 2020 tract shapefiles
#' @returns a simple features object with a geographic identifier column (`geoid`)
#' and a geometry column (`s2_geography`) in addition to the (interpolated) columns in `from`
#' @examples
#' codec_interpolate(from = get_codec_dpkg("property_code_enforcements-v0.2.0"))
#' codec_interpolate(get_codec_dpkg("property_code_enforcements-v0.2.0"), to = "zcta")
codec_interpolate <- function(from, to = NULL, weights = c("pop", "homes", "area")) {
weights <- rlang::arg_match(weights)
if (!is_codec_dpkg(from)) rlang::abort("from must be a CoDEC data package")
md <- dpkg::dpkg_meta(from)
gd_vintage <- ifelse(any(grepl("census_tract_id_2010", names(from), fixed = TRUE)), "2019", "2020")
gd <-
cincy_census_geo("tract", gd_vintage) |>
dplyr::left_join(from, by = c("geoid" = paste0("census_tract_id_", ifelse(gd_vintage == "2019", "2010", "2020"))))
if (is.null(to)) {
return(gd)
}
return("interpolation is under construction.....")
}

cincy_block_weights <- function() {
tiger_url <- "https://www2.census.gov/geo/tiger/TIGER2020/TABBLOCK20/tl_2020_39_tabblock20.zip"
tiger_local <- dpkg::stow_url(tiger_url)
rd <-
sf::read_sf(glue::glue("/vsizip/", tiger_local),
query = glue::glue("SELECT GEOID20,ALAND20,HOUSING20,POP20 FROM tl_2020_39_tabblock20 WHERE COUNTYFP20 = '061'")
)
out <-
rd |>
sf::st_transform(5072) |>
sf::st_point_on_surface() |>
suppressWarnings() |>
dplyr::select(pop = POP20, homes = HOUSING20, area = ALAND20)
out$s2_geography <- sf::st_as_s2(out$geometry)
out <- sf::st_drop_geometry(out)
out <- sf::st_as_sf(out)
return(out)
}

from <-
cincy_census_geo("tract", "2020") |>
sf::st_transform(5072) |>
dplyr::mutate(n = 0.1)
to <- sf::st_transform(cincy_zcta_geo("2020"), 5072)

bw <-
cincy_block_weights() |>
sf::st_transform(5072) |>
dplyr::select(weight = pop, s2_geography)

# from 'total weights'
fromm <-
sf::st_join(from, bw, left = FALSE) |>
sf::st_drop_geometry() |>
dplyr::summarize(total_weight = sum(weight, na.rm = TRUE), .by = "geoid") |>
dplyr::right_join(from, by = "geoid") |>
sf::st_as_sf()

# calculate intersections and intersection proportions
intersections <-
fromm |>
sf::st_intersection(to) |>
dplyr::filter(sf::st_is(s2_geography, c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION"))) |>
dplyr::mutate(intersection_id = dplyr::row_number())

weights <-
sf::st_join(bw, intersections) |>
sf::st_drop_geometry() |>
dplyr::summarize(
weight = sum(weight, na.rm = TRUE),
weight_coef = weight / total_weight,
.by = "intersection_id"
) |>
dplyr::distinct(weight_coef, .keep_all = TRUE) |>
dplyr::right_join(intersections, by = "intersection_id") |>
dplyr::select(geoid, geoid.1, weight, weight_coef)

# non-extensive (same total)
fromm |>
sf::st_drop_geometry() |>
dplyr::left_join(weights, by = "geoid") |>
dplyr::summarize(n_interpolated = sum(n * weight_coef, na.rm = TRUE), .by = "geoid.1")

# extensive (same average)
fromm |>
sf::st_drop_geometry() |>
dplyr::left_join(weights, by = "geoid") |>
dplyr::summarize(n_interpolated = weighted.mean(n, weight, na.rm = TRUE), .by = "geoid.1")

Loading
Loading