diff --git a/tmd/areas/targets/prepare/R/libraries.R b/tmd/areas/targets/prepare/R/libraries.R index ec03d8d9..ea2b7bb8 100644 --- a/tmd/areas/targets/prepare/R/libraries.R +++ b/tmd/areas/targets/prepare/R/libraries.R @@ -1,6 +1,7 @@ # libraries --------------------------------------------------------------- library(renv) +library(here) library(DT) library(fs) diff --git a/tmd/areas/targets/prepare/_quarto.yml b/tmd/areas/targets/prepare/_quarto.yml index 45415576..a5bc307e 100644 --- a/tmd/areas/targets/prepare/_quarto.yml +++ b/tmd/areas/targets/prepare/_quarto.yml @@ -25,7 +25,7 @@ execute: eval: true echo: true output: true - freeze: auto # auto: during global project renders, re-render only when source changes + freeze: false # false: always re-render; set to auto to re-render only when source changes during global project renders book: title: "Develop targets for subnational areas" @@ -44,8 +44,8 @@ book: - cd_download_soi_data.qmd - cd_construct_soi_variable_documentation.qmd - cd_construct_long_soi_data_file.qmd - - cd_create_basefile_for_cd_target_files.qmd - - cd_create_crosswalk_from_cd117th_to_cd118th.qmd + - cd_create_basefile_for_117Congress_cd_target_files.qmd + - cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd - cd_map_tcvars_and_extract_target_files.qmd appendices: - cd_issues_and_TODOs.qmd diff --git a/tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd similarity index 99% rename from tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd rename to tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd index 923bb220..28f5231e 100644 --- a/tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd +++ b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd @@ -263,7 +263,7 @@ cdbasefile |> count(basevname) cdbasefile |> filter(statecd=="WY00", agistub==0, basevname=="v00100") cdbasefile |>
filter(statecd=="WY00", agistub==0, basevname=="v00101") -write_csv(cdbasefile, fs::path(CDINTERMEDIATE, "cdbasefile.csv")) +write_csv(cdbasefile, fs::path(CDINTERMEDIATE, "cdbasefile_117.csv")) ``` diff --git a/tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd b/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd similarity index 76% rename from tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd rename to tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd index b0196f17..5d18affb 100644 --- a/tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd +++ b/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd @@ -6,7 +6,7 @@ editor_options: # Prepare crosswalk from Congressional district boundaries for the 117th Congressional session to those for the 118th session -IRS SOI data currently available for Congressional districts is based on 117th Congressional session boundaries, which were drawn using information from the 2010 decennial census. These sometimes differ signficantly from current district boundaries, for the 118th Congressional session, which were drawn based on data from the 2020 decennial census. +IRS SOI data currently (October 2024) available for Congressional districts is based on 117th Congressional session boundaries, which were drawn using information from the 2010 decennial census. These sometimes differ significantly from the current district boundaries for the 118th Congressional session, which were drawn based on data from the 2020 decennial census. To address this, we develop estimates for 118th session districts by allocating targets for 117th session districts to 118th session districts based on the fraction of each 117th district's 2020 population that is present in different 118th session districts.
@@ -33,14 +33,15 @@ source(here::here("R", "constants.R")) #| label: get-data #| output: false -cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv")) +cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv")) -df <- read_csv(fs::path(CDRAW, "geocorr2022_2428906586.csv")) -glimpse(df) +xwalk1 <- read_csv(fs::path(CDRAW, "geocorr2022_2428906586.csv")) +glimpse(xwalk1) -(xwlabs <- unlist(df[1, ], use.names = TRUE)) # variable names and labels -xwalk1 <- df |> +(xwalk_labels <- unlist(xwalk1[1, ], use.names = TRUE)) # variable names and labels + +xwalk2 <- xwalk1 |> filter(row_number() != 1) |> rename_with(str_to_lower) |> rename(stabbr=stab, pop2020=pop20) |> @@ -54,49 +55,50 @@ xwalk1 <- df |> ```{r} -#| label: data-checks +#| label: interactive-data-checks +#| eval: false #| output: false -count(xwalk1, stabbr) # 52 including DC and PR -count(xwalk1, stabbr) |> filter(n==1) -xwalk1 |> filter(stabbr=="PR") +count(xwalk2, stabbr) # 52 including DC and PR +count(xwalk2, stabbr) |> filter(n==1) +xwalk2 |> filter(stabbr=="PR") # check numbers of districts -cd117codes <- unique(xwalk1$statecd117) |> sort() # 438 -- why? -cd118codes <- unique(xwalk1$statecd118) |> sort() # 437 -- why? +cd117codes <- unique(xwalk2$statecd117) |> sort() # 438 -- why? +cd118codes <- unique(xwalk2$statecd118) |> sort() # 437 -- why? cd117codes # DC98 instead of DC00; PR98; NC seems to have bad codes < 10 cd118codes # DC98 instead of DC00; PR98; NC codes look ok here # do the shares of statecd117 given to various statecd118s add to 1? -xwalk1 |> +xwalk2 |> summarise(af117to118=sum(af117to118), .by=statecd117) |> filter(af117to118 != 1) |> arrange(desc(abs(af117to118 - 1))) # minimal differences from 1 # do the shares of statecd118 given to various statecd117s add to 1? 
-xwalk1 |> +xwalk2 |> summarise(af118to117=sum(af118to117), .by=statecd118) |> filter(af118to117 != 1) |> arrange(desc(abs(af118to117 - 1))) # minimal differences from 1 # do the individual shares of af117to118 match what we get with population? -xwalk1 |> +xwalk2 |> mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117) |> mutate(diff=share117to118 - af117to118) |> relocate(af117to118, .before=share117to118) |> arrange(desc(abs(diff))) # good, they match within small tolerances -# use our calculated amounts +# use our calculated amounts, which have greater precision # do the individual shares of af118to117 match what we get with population? -xwalk1 |> +xwalk2 |> mutate(share118to117=pop2020 / sum(pop2020), .by=statecd118) |> mutate(diff=share118to117 - af118to117) |> relocate(af118to117, .before=share118to117) |> arrange(desc(abs(diff))) # good, they match within small tolerances # how well do the cds match against our 117th cd data? -xwalk2 <- xwalk1 |> +xwalk3 <- xwalk2 |> filter(stabbr != "PR") |> filter(cd117 != "-") |> # not sure what this is and pop2020 is only 13 # redo codes @@ -114,16 +116,16 @@ xwalk2 <- xwalk1 |> statecd118 == "DC98" ~ "DC00", .default = statecd118)) -xwalk2 |> +xwalk3 |> filter(cd117 != oldcd117) |> relocate(oldcd117, .after=cd117) -xwalk2 |> +xwalk3 |> filter(statecd118 == "NC14") # how do the 117th CDs match up? 
usoi <- cd117$statecd |> unique() -ugeo <- xwalk2$statecd117 |> unique() +ugeo <- xwalk3$statecd117 |> unique() usoi ugeo @@ -141,16 +143,36 @@ check <- xwalk2 |> #| label: make-save-final-xwalk #| output: false -# calc pop shares (so we have more precision than in the source data) and save +# recreate xwalk3 as it won't be available when data checks have eval: false + xwalk3 <- xwalk2 |> + filter(stabbr != "PR") |> + filter(cd117 != "-") |> # not sure what this is and pop2020 is only 13 + # redo codes + mutate( + oldcd117 = cd117, + cd117 = case_when(stabbr=="NC" & nchar(cd117) != 2 ~ + str_pad(as.integer(cd117), width=2, side="left", pad="0"), + .default = cd117), + statecd117=paste0(stabbr, cd117), + statecd118=paste0(stabbr, cd118), + statecd117 = case_when( + statecd117 == "DC98" ~ "DC00", + .default = statecd117), + statecd118 = case_when( + statecd118 == "DC98" ~ "DC00", + .default = statecd118)) + +# calc pop shares (so we have more precision than in the source data) and save +xwalk4 <- xwalk3 |> mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117) -xwalk3 |> +xwalk4 |> mutate(diff=share117to118 - af117to118) |> relocate(af117to118, .before=share117to118) |> arrange(desc(abs(diff))) # good, they match within small tolerances -xwalk_final <- xwalk3 |> +xwalk_final <- xwalk4 |> select(stabbr, cd117, cd118, statecd117, statecd118, share117to118) write_csv(xwalk_final, fs::path(CDINTERMEDIATE, "xwalk_final.csv")) @@ -186,41 +208,5 @@ glimpse(cd118v2) summary(cd118v2) write_csv(cd118v2, fs::path(CDINTERMEDIATE, "cdbasefile_118.csv")) - -``` - - -```{r} - - -cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv")) -cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv")) - -stack <- bind_rows( - cd117 |> mutate(session="s117"), - cd118 |> mutate(session="s118") -) - -glimpse(stack) -states <- stack |> - summarise(target=sum(target), - .by=c(session, stabbr, src, rectype, - agistub, agilo, agihi, basevname, - scope, fstatus, count, vname, 
description, agirange)) - -states |> - pivot_wider(names_from = session, - values_from = target) |> - mutate(diff=s118 - s117, - pdiff=diff / s117) |> - arrange(desc(abs(pdiff))) # good all the state sums work - -write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) - ``` - - - - - diff --git a/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd b/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd index 8b37fab5..6b87e0dc 100644 --- a/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd +++ b/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd @@ -18,12 +18,34 @@ source(here::here("R", "constants.R")) ``` -## Get needed data +## Combine cd117 and cd118 into a stacked cd file ```{r} #| label: get-cdbasefile -cdbase <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv")) +cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv")) +cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv")) + +stack <- bind_rows( + cd117 |> mutate(session="s117"), + cd118 |> mutate(session="s118") +) + +glimpse(stack) +states <- stack |> + summarise(target=sum(target), + .by=c(session, stabbr, src, rectype, + agistub, agilo, agihi, basevname, + scope, fstatus, count, vname, description, agirange)) + +states |> + pivot_wider(names_from = session, + values_from = target) |> + mutate(diff=s118 - s117, + pdiff=diff / s117) |> + arrange(desc(abs(pdiff))) # good all the state sums work + +write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) ``` @@ -32,7 +54,7 @@ cdbase <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv")) ```{r} #| label: tc-soi-variablemap -soivars <- count(cdbase, basevname) +soivars <- count(stack, basevname) soivars$basevname # the MARS mappings let us get counts by filing status by agi range @@ -54,7 +76,7 @@ e26270, v26270 ```{r} #| label: mapped-file -mapped <- cdbase |> +mapped <- stack |> filter(basevname %in% vmap$soivar) |> 
mutate(varname=factor(basevname, levels=vmap$soivar, labels=vmap$tcvar)) @@ -76,9 +98,11 @@ phase4cds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", " # statecds <- "NY21" statecds <- phase4cds +session_number <- 117 extracted <- mapped |> filter(statecd %in% statecds) |> + filter(session==paste0("s", session_number)) |> arrange(statecd, src, scope, fstatus, basevname, count, agistub) # to be safe count(extracted, statecd) diff --git a/tmd/areas/targets/prepare/index.qmd b/tmd/areas/targets/prepare/index.qmd index bc4e2029..03bffc59 100644 --- a/tmd/areas/targets/prepare/index.qmd +++ b/tmd/areas/targets/prepare/index.qmd @@ -14,4 +14,4 @@ This R project constructs subnational area targets for: In addition, it creates a local web page that a user can upload to a web service if desired. An in-progress version can be viewed on the web [here](https://tmd-areas-prepare-targets.netlify.app/). During development, it may not be identical to what your clone creates. -It is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/". +This R project is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/". diff --git a/tmd/areas/targets/prepare/usage.qmd b/tmd/areas/targets/prepare/usage.qmd index 00c99b2e..1c407cf6 100644 --- a/tmd/areas/targets/prepare/usage.qmd +++ b/tmd/areas/targets/prepare/usage.qmd @@ -4,7 +4,7 @@ editor_options: chunk_output_type: console --- -# Usage: How to create Congressional District target files and the associated local web page +# How to create Congressional District target files and the associated local web page ## Prerequisites @@ -26,7 +26,7 @@ editor_options: The first time the project is rendered, it will create needed intermediates files and put them in the "../cds/intermediate" folder. 
-Note that the \_quarto.yml file sets the `freeze` execution option to `auto`, which means .qmd files will not be rerendered if they have not changed. For a full re-rendering, set `freeze: false`, which will rerender everything regardless of whether code has changed (except that it will not re-render chunks with the option `eval: false`). +Note that the \_quarto.yml file sets the `freeze` execution option to `false`, which means .qmd files will be re-rendered even if they have not changed (except that Quarto will not re-render chunks with the option `eval: false`). For incremental re-rendering of changed files only, set `freeze: auto`. Use `freeze: auto` cautiously: a file whose source has not changed is not re-rendered even when the intermediate data files it reads have changed, so its results can be stale. At present the code prepares target files with targets we believe are useful and practical. Users who want different targets will have to modify code to do so. However, as described in overall repo documentation, users can comment-out individual targets.