PSLmodels · donboyd5 · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024
diff --git a/tmd/areas/targets/prepare/R/libraries.R b/tmd/areas/targets/prepare/R/libraries.R
@@ -1,6 +1,7 @@
 # libraries ---------------------------------------------------------------
 
 library(renv)
+library(here)
 
 library(DT)
 library(fs)

diff --git a/tmd/areas/targets/prepare/_quarto.yml b/tmd/areas/targets/prepare/_quarto.yml
@@ -25,7 +25,7 @@ execute:
   eval: true
   echo: true
   output: true
-  freeze: auto  # auto: during global project renders, re-render only when source changes
+  freeze: false  # false: always re-render; set to auto to re-render only when source changes during global project renders
 
 book:
   title: "Develop targets for subnational areas"
@@ -44,8 +44,8 @@ book:
         - cd_download_soi_data.qmd
         - cd_construct_soi_variable_documentation.qmd
         - cd_construct_long_soi_data_file.qmd
-        - cd_create_basefile_for_cd_target_files.qmd
-        - cd_create_crosswalk_from_cd117th_to_cd118th.qmd
+        - cd_create_basefile_for_117Congress_cd_target_files.qmd
+        - cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd
         - cd_map_tcvars_and_extract_target_files.qmd
   appendices:
     - cd_issues_and_TODOs.qmd  

diff --git a/...d_create_basefile_for_cd_target_files.qmd → ...efile_for_117Congress_cd_target_files.qmd b/...d_create_basefile_for_cd_target_files.qmd → ...efile_for_117Congress_cd_target_files.qmd
@@ -263,7 +263,7 @@ cdbasefile |> count(basevname)
 cdbasefile |> filter(statecd=="WY00", agistub==0, basevname=="v00100")
 cdbasefile |> filter(statecd=="WY00", agistub==0, basevname=="v00101")
 
-write_csv(cdbasefile, fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
+write_csv(cdbasefile, fs::path(CDINTERMEDIATE, "cdbasefile_117.csv"))
 
 ```
 

diff --git a/...ate_crosswalk_from_cd117th_to_cd118th.qmd → ..._117_118_crosswalk_and_cdbasefile_118.qmd b/...ate_crosswalk_from_cd117th_to_cd118th.qmd → ..._117_118_crosswalk_and_cdbasefile_118.qmd
@@ -6,7 +6,7 @@ editor_options:
 
 # Prepare crosswalk from Congressional district boundaries for the 117th Congressional session to those for the 118th session
 
-IRS SOI data currently available for Congressional districts is based on 117th Congressional session boundaries, which were drawn using information from the 2010 decennial census. These sometimes differ signficantly from current district boundaries, for the 118th Congressional session, which were drawn based on data from the 2020 decennial census.
+IRS SOI data currently (October 2024) available for Congressional districts is based on 117th Congressional session boundaries, which were drawn using information from the 2010 decennial census. These sometimes differ significantly from current district boundaries, for the 118th Congressional session, which were drawn based on data from the 2020 decennial census.
 
 To address this, we develop estimates for 118th session districts by allocating targets for 117th session districts to 118th session districts based on the fraction of each 117th district's 2020 population that is present in different 118th session districts.
 
@@ -33,14 +33,15 @@ source(here::here("R", "constants.R"))
 #| label: get-data
 #| output: false
 
-cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
+cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv"))
 
-df <- read_csv(fs::path(CDRAW, "geocorr2022_2428906586.csv"))
-glimpse(df)
+xwalk1 <- read_csv(fs::path(CDRAW, "geocorr2022_2428906586.csv"))
+glimpse(xwalk1)
 
-(xwlabs <- unlist(df[1, ], use.names = TRUE))  # variable names and labels
 
-xwalk1 <- df |> 
+(xwalk_labels <- unlist(xwalk1[1, ], use.names = TRUE))  # variable names and labels
+
+xwalk2 <- xwalk1 |> 
   filter(row_number() != 1) |> 
   rename_with(str_to_lower) |> 
   rename(stabbr=stab, pop2020=pop20) |> 
@@ -54,49 +55,50 @@ xwalk1 <- df |>
 
 
 ```{r}
-#| label: data-checks
+#| label: interactive-data-checks
+#| eval: false
 #| output: false
 
-count(xwalk1, stabbr) # 52 including DC and PR
-count(xwalk1, stabbr) |> filter(n==1)
-xwalk1 |> filter(stabbr=="PR")
+count(xwalk2, stabbr) # 52 including DC and PR
+count(xwalk2, stabbr) |> filter(n==1)
+xwalk2 |> filter(stabbr=="PR")
 
 # check numbers of districts
-cd117codes <- unique(xwalk1$statecd117) |> sort() # 438 -- why?
-cd118codes <- unique(xwalk1$statecd118) |> sort() # 437 -- why?
+cd117codes <- unique(xwalk2$statecd117) |> sort() # 438 -- why?
+cd118codes <- unique(xwalk2$statecd118) |> sort() # 437 -- why?
 
 cd117codes # DC98 instead of DC00; PR98; NC seems to have bad codes < 10
 cd118codes # DC98 instead of DC00; PR98; NC codes look ok here
 
 # do the shares of statecd117 given to various statecd118s add to 1?
-xwalk1 |> 
+xwalk2 |> 
   summarise(af117to118=sum(af117to118), .by=statecd117) |> 
   filter(af117to118 != 1) |> 
   arrange(desc(abs(af117to118 - 1))) # minimal differences from 1
 
 # do the shares of statecd118 given to various statecd117s add to 1?
-xwalk1 |> 
+xwalk2 |> 
   summarise(af118to117=sum(af118to117), .by=statecd118) |> 
   filter(af118to117 != 1) |> 
   arrange(desc(abs(af118to117 - 1))) # minimal differences from 1
 
 # do the individual shares of af117to118 match what we get with population?
-xwalk1 |> 
+xwalk2 |> 
   mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117) |> 
   mutate(diff=share117to118 - af117to118) |> 
   relocate(af117to118, .before=share117to118) |> 
   arrange(desc(abs(diff))) # good, they match within small tolerances
-# use our calculated amounts
+# use our calculated amounts, which have greater precision
 
 # do the individual shares of af118to117 match what we get with population?
-xwalk1 |> 
+xwalk2 |> 
   mutate(share118to117=pop2020 / sum(pop2020), .by=statecd118) |> 
   mutate(diff=share118to117 - af118to117) |> 
   relocate(af118to117, .before=share118to117) |> 
   arrange(desc(abs(diff))) # good, they match within small tolerances
 
 # how well do the cds match against our 117th cd data?
-xwalk2 <- xwalk1 |> 
+xwalk3 <- xwalk2 |> 
   filter(stabbr != "PR") |> 
   filter(cd117 != "-") |> # not sure what this is and pop2020 is only 13
   # redo codes
@@ -114,16 +116,16 @@ xwalk2 <- xwalk1 |>
       statecd118 == "DC98" ~ "DC00",
     .default = statecd118))
 
-xwalk2 |> 
+xwalk3 |> 
   filter(cd117 != oldcd117) |> 
   relocate(oldcd117, .after=cd117)
 
-xwalk2 |> 
+xwalk3 |> 
   filter(statecd118 == "NC14")
 
 # how do the 117th CDs match up?
 usoi <- cd117$statecd |> unique()
-ugeo <- xwalk2$statecd117 |> unique()
+ugeo <- xwalk3$statecd117 |> unique()
 
 usoi
 ugeo
@@ -141,16 +143,36 @@ check <- xwalk2 |>
 #| label: make-save-final-xwalk
 #| output: false
 
-# calc pop shares (so we have more precision than in the source data) and save
+# recreate xwalk3 as it won't be available when data checks have eval: false
+
 xwalk3 <- xwalk2 |> 
+  filter(stabbr != "PR") |> 
+  filter(cd117 != "-") |> # not sure what this is and pop2020 is only 13
+  # redo codes
+  mutate(
+    oldcd117 = cd117,
+    cd117 = case_when(stabbr=="NC" & nchar(cd117) != 2 ~ 
+                         str_pad(as.integer(cd117), width=2, side="left", pad="0"),
+                       .default = cd117),
+    statecd117=paste0(stabbr, cd117),
+    statecd118=paste0(stabbr, cd118),
+    statecd117 = case_when(
+      statecd117 == "DC98" ~ "DC00",
+      .default = statecd117),
+    statecd118 = case_when(
+      statecd118 == "DC98" ~ "DC00",
+    .default = statecd118))
+
+# calc pop shares (so we have more precision than in the source data) and save
+xwalk4 <- xwalk3 |> 
   mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117)
 
-xwalk3 |> 
+xwalk4 |> 
   mutate(diff=share117to118 - af117to118) |> 
   relocate(af117to118, .before=share117to118) |> 
   arrange(desc(abs(diff))) # good, they match within small tolerances
 
-xwalk_final <- xwalk3 |> 
+xwalk_final <- xwalk4 |> 
   select(stabbr, cd117, cd118, statecd117, statecd118, share117to118)
 
 write_csv(xwalk_final, fs::path(CDINTERMEDIATE, "xwalk_final.csv"))
@@ -186,41 +208,5 @@ glimpse(cd118v2)
 summary(cd118v2)
 write_csv(cd118v2, fs::path(CDINTERMEDIATE, "cdbasefile_118.csv"))
 
-
-```
-
-
-```{r}
-
-
-cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
-cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv"))
-
-stack <- bind_rows(
-  cd117 |> mutate(session="s117"),
-  cd118 |> mutate(session="s118")
-)
-
-glimpse(stack)
-states <- stack |> 
-  summarise(target=sum(target), 
-            .by=c(session, stabbr, src, rectype,
-                     agistub, agilo, agihi, basevname,
-                     scope, fstatus, count, vname, description, agirange))
-
-states |> 
-  pivot_wider(names_from = session,
-              values_from = target) |> 
-  mutate(diff=s118 - s117,
-         pdiff=diff / s117) |> 
-  arrange(desc(abs(pdiff))) # good all the state sums work
-
-write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv"))
-
 ```
 
-
-
-
-
-
diff --git a/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd b/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd
@@ -18,12 +18,34 @@ source(here::here("R", "constants.R"))
 
 ```
 
-## Get needed data
+## Combine cd117 and cd118 into a stacked cd file
 
 ```{r}
 #| label: get-cdbasefile
 
-cdbase <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
+cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv"))
+cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv"))
+
+stack <- bind_rows(
+  cd117 |> mutate(session="s117"),
+  cd118 |> mutate(session="s118")
+)
+
+glimpse(stack)
+states <- stack |> 
+  summarise(target=sum(target), 
+            .by=c(session, stabbr, src, rectype,
+                     agistub, agilo, agihi, basevname,
+                     scope, fstatus, count, vname, description, agirange))
+
+states |> 
+  pivot_wider(names_from = session,
+              values_from = target) |> 
+  mutate(diff=s118 - s117,
+         pdiff=diff / s117) |> 
+  arrange(desc(abs(pdiff))) # good all the state sums work
+
+write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv"))
 
 ```
 
@@ -32,7 +54,7 @@ cdbase <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
 ```{r}
 #| label: tc-soi-variablemap
 
-soivars <- count(cdbase, basevname)
+soivars <- count(stack, basevname)
 soivars$basevname
 
 # the MARS mappings let us get counts by filing status by agi range
@@ -54,7 +76,7 @@ e26270, v26270
 ```{r}
 #| label: mapped-file
 
-mapped <- cdbase |> 
+mapped <- stack |> 
   filter(basevname %in% vmap$soivar) |> 
   mutate(varname=factor(basevname, levels=vmap$soivar, labels=vmap$tcvar))
 
@@ -76,9 +98,11 @@ phase4cds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "
 # statecds <- "NY21"
 
 statecds <- phase4cds
+session_number <- 117
 
 extracted <- mapped |> 
   filter(statecd %in% statecds) |> 
+  filter(session==paste0("s", session_number)) |> 
   arrange(statecd, src, scope, fstatus, basevname, count, agistub) # to be safe
 
 count(extracted, statecd)

diff --git a/tmd/areas/targets/prepare/index.qmd b/tmd/areas/targets/prepare/index.qmd
@@ -14,4 +14,4 @@ This R project constructs subnational area targets for:
 
 In addition, it creates a local web page that a user can upload to a web service if desired. An in-progress version can be viewed on the web [here](https://tmd-areas-prepare-targets.netlify.app/). During development, it may not be identical to what your clone creates.
 
-It is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/".
+This R project is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/".
diff --git a/tmd/areas/targets/prepare/usage.qmd b/tmd/areas/targets/prepare/usage.qmd
@@ -4,7 +4,7 @@ editor_options:
  chunk_output_type: console
 ---
 
-# Usage: How to create Congressional District target files and the associated local web page
+# How to create Congressional District target files and the associated local web page
 
 ## Prerequisites
 
@@ -26,7 +26,7 @@ editor_options:
 
 The first time the project is rendered, it will create needed intermediates files and put them in the "../cds/intermediate" folder.
 
-Note that the \_quarto.yml file sets the `freeze` execution option to `auto`, which means .qmd files will not be rerendered if they have not changed. For a full re-rendering, set `freeze: false`, which will rerender everything regardless of whether code has changed (except that it will not re-render chunks with the option `eval: false`).
+Note that the \_quarto.yml file sets the `freeze` execution option to `false`, which means .qmd files will be re-rendered even if they have not changed (except that Quarto will not re-render chunks with the option `eval: false`). For incremental re-rendering of changed files only, set `freeze: auto`. Use `freeze: auto` cautiously: a file whose source is unchanged is not re-executed even when the data or upstream files it depends on have changed, so its results can be stale.
 
 At present the code prepares target files with targets we believe are useful and practical. Users who want different targets will have to modify code to do so. However, as described in overall repo documentation, users can comment-out individual targets.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -14,4 +14,4 @@ This R project constructs subnational area targets for:

		In addition, it creates a local web page that a user can upload to a web service if desired. An in-progress version can be viewed on the web [here](https://tmd-areas-prepare-targets.netlify.app/). During development, it may not be identical to what your clone creates.

		It is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/".
		This R project is part of the larger tax-microdata-benchmarking project, and is in the subfolder "tmd/areas/targets/prepare/".