From 837ae2900ba458a4829ebd7598cc76fb38f30666 Mon Sep 17 00:00:00 2001
From: donboyd5 <donboyd5@gmail.com>
Date: Tue, 29 Oct 2024 19:01:49 -0400
Subject: [PATCH 1/2] add .gitignore, hml, Rproj, qmd and R files

---
 tmd/areas/weights/examine/.gitignore          |  19 +
 tmd/areas/weights/examine/R/constants.R       |  26 ++
 tmd/areas/weights/examine/R/libraries.R       |  79 ++++
 tmd/areas/weights/examine/_quarto.yml         |  63 ++++
 tmd/areas/weights/examine/cd_prepare_data.qmd | 180 ++++++++++
 .../examine/cd_results_vs_targets_tables.qmd  | 338 ++++++++++++++++++
 .../weights/examine/cd_simple_tables.qmd      | 273 ++++++++++++++
 tmd/areas/weights/examine/examine.Rproj       |  13 +
 tmd/areas/weights/examine/index.qmd           |  10 +
 tmd/areas/weights/examine/play.R              | 268 ++++++++++++++
 10 files changed, 1269 insertions(+)
 create mode 100644 tmd/areas/weights/examine/.gitignore
 create mode 100644 tmd/areas/weights/examine/R/constants.R
 create mode 100644 tmd/areas/weights/examine/R/libraries.R
 create mode 100644 tmd/areas/weights/examine/_quarto.yml
 create mode 100644 tmd/areas/weights/examine/cd_prepare_data.qmd
 create mode 100644 tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd
 create mode 100644 tmd/areas/weights/examine/cd_simple_tables.qmd
 create mode 100644 tmd/areas/weights/examine/examine.Rproj
 create mode 100644 tmd/areas/weights/examine/index.qmd
 create mode 100644 tmd/areas/weights/examine/play.R

diff --git a/tmd/areas/weights/examine/.gitignore b/tmd/areas/weights/examine/.gitignore
new file mode 100644
index 0000000..58c3fe9
--- /dev/null
+++ b/tmd/areas/weights/examine/.gitignore
@@ -0,0 +1,19 @@
+# examine
+
+# folders to ignore
+.Rproj.user/
+.quarto/
+_examine/
+_freeze/
+site_libs/
+temp_data/
+
+# files to ignore
+.Rhistory
+*.html
+
+# Local Netlify folder
+.netlify
+
+
+/.quarto/
diff --git a/tmd/areas/weights/examine/R/constants.R b/tmd/areas/weights/examine/R/constants.R
new file mode 100644
index 0000000..103f31b
--- /dev/null
+++ b/tmd/areas/weights/examine/R/constants.R
@@ -0,0 +1,26 @@
+
+# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output
+TMDDIR <- here::here("..", "..", "..", "storage", "output")
+# list.files(TMDDIR)
+
+TARGETSDIR <- here::here("..", "..", "targets")
+WEIGHTSDIR <- here::here("..")
+# list.files(TARGETSDIR)
+# list.files(WEIGHTSDIR)
+
+# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip"
+# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx"
+
+# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine
+# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare
+# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare")
+# print(TARGETSPREPDIR)  # Should print the absolute path to the folder
+# list.files(TARGETSPREPDIR)
+
+# CDDIR <- here::here("cds")
+# CDDIR <- fs::path(TARGETSPREPDIR, "cds")
+# CDRAW <- fs::path(CDDIR, "raw_data")
+# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate")
+# CDFINAL <- fs::path(CDDIR, "final")
+# list.files(CDFINAL)
+# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx"
diff --git a/tmd/areas/weights/examine/R/libraries.R b/tmd/areas/weights/examine/R/libraries.R
new file mode 100644
index 0000000..914fb45
--- /dev/null
+++ b/tmd/areas/weights/examine/R/libraries.R
@@ -0,0 +1,79 @@
+# libraries ---------------------------------------------------------------
+
+library(DT)
+library(fs)
+library(gt)
+library(knitr)
+library(readxl)
+library(skimr)
+library(stringr)
+library(tidyverse)
+# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr
+
+tprint <- 75  # default tibble print
+options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
+
+library(janitor)
+# census_api_key("b27cb41e46ffe3488af186dd80c64dce66bd5e87", install = TRUE) # stored in .Renviron
+# libraries needed for census population
+library(sf)
+library(tidycensus)
+library(tigris)
+options(tigris_use_cache = TRUE)
+library(vroom)
+
+
+# possible libraries ------------------------------------------------------
+
+# library(rlang)
+# library(tidyverse)
+# tprint <- 75  # default tibble print
+# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
+#  
+# library(fs)
+ 
+# tools
+# library(vroom)
+# library(readxl)
+# library(openxlsx) # for writing xlsx files
+# library(lubridate)
+# library(RColorBrewer)
+# library(RcppRoll)
+# library(fredr)
+# library(tidycensus)
+# library(googledrive)
+# library(arrow)
+# 
+# library(jsonlite)
+# library(tidyjson)
+# 
+# 
+# # boyd libraries
+# # library(btools)
+# # library(bdata)
+# # library(bggtools)
+# # library(bmaps)
+# 
+# # graphics
+# library(scales)
+# library(ggbeeswarm)
+# library(patchwork)
+# library(gridExtra)
+# library(ggrepel)
+# library(ggbreak)
+# 
+# # tables
+# library(knitr)
+# library(kableExtra)
+# library(DT)
+# library(gt)
+# library(gtExtras)
+# library(janitor)
+# library(skimr)
+# library(vtable)
+# 
+# # maps
+# library(maps)
+# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html
+# library(usmap)
+
diff --git a/tmd/areas/weights/examine/_quarto.yml b/tmd/areas/weights/examine/_quarto.yml
new file mode 100644
index 0000000..8ef2635
--- /dev/null
+++ b/tmd/areas/weights/examine/_quarto.yml
@@ -0,0 +1,63 @@
+project:
+  type: book
+  output-dir: _examine
+
+# https://prerelease.quarto.org/  # quarto documentation at this link
+
+# publishing with netllify cli:
+#  open terminal in examine
+# quarto render && netlify deploy --prod --dir=_examine
+
+#  quarto render # inspect to be sure it is as desired
+#  netlify deploy --prod --dir=_examine
+
+# or step by step
+#  netlify deploy # to test it, give _examine as publish directory
+#  netlify deploy --prod   # to deploy, give _examine as publish directory
+
+execute:
+  eval: true
+  echo: true
+  output: true
+  freeze: auto  # auto: during global project renders, re-render only when source changes
+
+book:
+  title: "Examine area weights creation results"
+  subtitle: "Create csv file"
+  # author: "Don Boyd"
+  date: today
+  date-format: long
+  chapters:
+    - index.qmd
+    - part: "IRS Congressional District data"
+      chapters:
+        # - cd_overall_documentation.qmd
+        - cd_prepare_data.qmd
+        - cd_simple_tables.qmd
+        - cd_results_vs_targets_tables.qmd
+
+format:
+  html:
+    theme: cosmo
+    code-fold: true    
+
+editor_options:
+  chunk_output_type: console
+
+# R packages using old 209 libxml
+#  gt, 
+
+
+# rendering commands
+#   quarto render
+#   quarto publish netlify --no-prompt --no-render --no-browser
+
+# possibly use this at start of each doc
+# ---
+# output: html_document
+# editor_options: 
+#   chunk_output_type: console
+# ---
+
+  
+  
\ No newline at end of file
diff --git a/tmd/areas/weights/examine/cd_prepare_data.qmd b/tmd/areas/weights/examine/cd_prepare_data.qmd
new file mode 100644
index 0000000..eb9560f
--- /dev/null
+++ b/tmd/areas/weights/examine/cd_prepare_data.qmd
@@ -0,0 +1,180 @@
+---
+output: html_document
+editor_options: 
+ chunk_output_type: console
+---
+
+# Read tmd 2021, area targets, area weights and prepare data
+
+
+## Setup
+
+```{r}
+#| label: setup
+#| output: false
+
+source(here::here("R", "libraries.R"))
+source(here::here("R", "constants.R"))
+
+phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
+
+```
+
+
+```{r}
+#| label: functions
+#| output: false
+
+ns <- function(obj){
+  sort(names(obj))
+}
+
+```
+
+## Download files from google drive
+
+Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder.
+
+```{r}
+#| label: hookup-googledrive
+#| eval: false
+
+library(googledrive)
+drive_auth() # authenticate
+
+```
+
+
+```{r}
+#| label: download-files
+#| eval: false
+
+# /AFPI_2024/Phase 4
+# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder
+folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder
+
+files <- drive_ls(as_id(folder_id))
+files
+
+f <- function(gdfname){
+  fpath <- here::here("temp_data", gdfname)
+  print(fpath)
+  drive_download(gdfname, path = fpath, overwrite = TRUE)
+}
+# f(files$name[[1]])
+
+files |> 
+  pull(name) |> 
+  walk(\(gdfname) f(gdfname))
+
+```
+
+## Prepare target files
+
+Get all targets prepared
+
+```{r}
+#| label: targets-all
+#| eval: false
+#| output: false
+
+# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine   # project dir
+# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate  # cdbasefile
+HERE <- here::here()
+CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate")
+# list.files(CDTARGETSDIR)
+
+targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv"))
+glimpse(targets_data)
+
+saveRDS(targets_data, here::here("temp_data", "targets_data.rds"))
+
+```
+
+
+Get targets used in the optimization
+
+```{r}
+#| label: targets-used
+#| eval: false
+#| output: false
+
+targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets.csv")
+
+targets_used <- vroom(targetfiles, id="src") |> 
+  mutate(src=path_file(src) |> str_sub(1, 4)) |> 
+  mutate(active=!(str_sub(varname, 1, 1) == "#"),
+         varname = ifelse(!active, 
+                          varname |> str_remove("#") |> str_trim(),
+                          varname))
+saveRDS(targets_used, here::here("temp_data", "targets_used.rds"))
+
+glimpse(targets_used)
+count(targets_used, src)
+count(targets_used, active)
+count(targets_used, varname)
+count(targets_used, varname, active)
+
+targets_used |> filter(src == "ak00")
+targets_used |> filter(src == "de00")
+
+```
+
+
+## Get and prepare tmd data and area weights
+
+```{r}
+#| label: get-tmd-2021
+#| eval: false
+#| output: false
+
+# fpath <-  fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv
+fpath <- here::here("temp_data", "djbout.csv")
+tmd2021 <- read_csv(fpath)
+ns(tmd2021)
+
+# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py
+saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds"))
+
+sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units
+
+# con <- unz(zpath, "21incd.csv")
+# data <- read_csv(con)
+
+us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz"))
+sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100
+saveRDS(us_weights, here::here("temp_data", "us_weights.rds"))
+
+tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021
+ns(tmd_base)
+saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds"))
+
+
+```
+
+
+```{r}
+#| label: prep-weights
+#| eval: false
+#| output: false
+
+# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz")
+wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") #  |> path_file()
+
+df <- read_csv(wtfiles[1])
+sum(df$WT2021)
+
+area_weights <- vroom(wtfiles, id="src") |> 
+  mutate(src = str_sub(path_file(src), 1, 4),
+         across(-src, \(x) x / 100.))
+glimpse(area_weights)
+count(area_weights, src)
+
+area_weights |> 
+  select(src, WT2021) |> 
+  summarise(wtdn=sum(WT2021), .by=src)
+
+saveRDS(area_weights, here::here("temp_data", "area_weights.rds"))
+
+```
+
diff --git a/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd b/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd
new file mode 100644
index 0000000..43b0f81
--- /dev/null
+++ b/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd
@@ -0,0 +1,338 @@
+---
+output: html_document
+editor_options: 
+ chunk_output_type: console
+---
+
+# Compare results to targets
+
+## Setup
+
+```{r}
+#| label: setup
+#| output: false
+
+source(here::here("R", "libraries.R"))
+source(here::here("R", "constants.R"))
+
+phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
+
+```
+
+
+## Get saved data
+
+```{r}
+#| label: get-data
+#| output: false
+
+# pick one or the other
+# OLD tmd2021 <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from
+
+tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds"))
+targets_data <- readRDS(here::here("temp_data", "targets_data.rds"))
+targets_used <- readRDS(here::here("temp_data", "targets_used.rds"))
+us_weights <- readRDS(here::here("temp_data", "us_weights.rds"))
+area_weights <- readRDS(here::here("temp_data", "area_weights.rds"))
+
+```
+
+
+
+## Prepare data
+
+## Prepare weighted and summarized microdata
+
+Add weights and make long file
+
+```{r}
+#| label: prepdata-microdata-for-tables
+#| eval: false
+#| output: false
+
+# ns(tmd2021)
+
+# 0 = Total
+# 1 = Under $1
+# 2 = $1 under $10,000
+# 3 = $10,000 under $25,000
+# 4 = $25,000 under $50,000
+# 5 = $50,000 under $75,000
+# 6 = $75,000 under $100,000
+# 7 = $100,000 under $200,000
+# 8 = $200,000 under $500,000
+# 9 = $500,000 or more
+
+icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf)
+areas <- c("us", unique(area_weights$src))
+keepvars <- c("XTOT", "c00100", "e00200", "e26270")
+targvars <- c("wtdn", keepvars)
+
+# prep weights
+wts2021 <- area_weights |> 
+  select(src, WT2021) |> 
+  mutate(row = row_number(), .by=src) |> 
+  pivot_wider(names_from = src, values_from = WT2021)
+
+wts2021 |> summarise(across(-row, \(x) sum(x)))
+
+# prep tmd
+tmd2 <- tmd2021 |> 
+  select(RECID, data_source, fstatus=MARS, s006, all_of(keepvars)) |>
+  mutate(scope=ifelse(data_source==1, 1, 2)) |> 
+  select(-data_source) |> 
+  # define irange and extend it to allow (later) for totals
+  mutate(irange=cut(c00100, icuts, right = FALSE, ordered_result = TRUE),
+         irange = factor(irange, 
+                         levels = levels(irange), # Ensure ordering is maintained
+                         labels = str_replace(levels(irange), ",", ", ")),
+         irange = fct_expand(irange, "total"),
+         irange = fct_relevel(irange, "total"))
+
+glimpse(tmd2)
+count(tmd2, irange)
+sum(tmd2$s006)
+
+check <- tmd2 |> 
+  select(RECID, scope, fstatus, c00100, s006, irange) |> 
+  mutate(agistub = as.integer(irange) - 1,
+         topagi = c00100 >= 500e3,
+         topbin = agistub == 9,
+         both = topagi & topbin) |> 
+  filter(topagi | topbin)
+check |> filter(!both)
+
+
+tmd3 <- tmd2 |> 
+  bind_cols(wts2021) |> 
+  select(-row) |> 
+  rename(us = s006)
+
+```
+
+Make long file
+
+```{r}
+#| label: long-file
+#| eval: false
+#| output: false
+
+tmd_long1 <- tmd3 |> 
+  # flip the areas
+  pivot_longer(cols=all_of(areas),
+               names_to = "area",
+               values_to = "weight") |> 
+  # flip the variables
+  mutate(wtdn = 1) |> 
+  pivot_longer(cols=all_of(targvars),
+               names_to = "varname",
+               values_to = "amount") |> 
+  # flip the variable type (count or amount)
+  # mutate(count=ifelse(amount != 0, 1, 0)) |>  # djb fix this later!!!
+  mutate(count=1) |> # djb TEMPORARY count ALWAYS is ALL returns
+  pivot_longer(cols=c(amount, count),
+               names_to = "vartype",
+               values_to = "value") |> 
+  mutate(wtdvalue=weight * value)
+  
+# now we are ready to summarize
+
+```
+
+Summarize microdata, then construct and concatenate totals
+
+```{r}
+#| label: summarise-save
+#| eval: false
+#| output: false
+
+details <- tmd_long1 |>
+  summarise(wtdvalue = sum(wtdvalue),
+            .by=c(area, scope, fstatus, varname, vartype, irange)) |> 
+  arrange(area, scope, fstatus, varname, irange)
+
+glimpse(details)
+count(details, irange)
+count(details, area)
+count(details, scope)
+count(details, fstatus)
+count(details, scope, fstatus)
+
+details |> filter(area=="ak00", scope==1)
+details |> filter(area=="ak00", scope==0)
+
+# calculate a series of subtotals records that drop one or more of the other variables
+
+# totals over all income ranges
+irangesums <- details |> 
+  summarise(wtdvalue = sum(wtdvalue),
+            .by=c(area, scope, fstatus, varname, vartype)) |> 
+  mutate(irange="total",
+         irange = factor(irange, levels = levels(details$irange), ordered = TRUE))
+
+details2 <- bind_rows(details, irangesums)
+glimpse(details2)
+count(details2, irange)
+
+# totals over all scopes
+scopesums <- details2 |> 
+  summarise(wtdvalue = sum(wtdvalue),
+            .by=c(area, fstatus, varname, vartype, irange)) |> 
+  mutate(scope=0)
+
+details3 <- bind_rows(details2, scopesums)
+glimpse(details3)
+count(details3, scope)
+
+# totals over all filing statuses
+fstatussums <- details3 |> 
+  summarise(wtdvalue = sum(wtdvalue),
+            .by=c(area, scope, varname, vartype, irange)) |> 
+  mutate(fstatus=0)
+
+details4 <- bind_rows(details3, fstatussums) |> 
+  mutate(agistub=as.integer(irange) - 1)
+glimpse(details4)
+count(details4, fstatus)
+count(details4, agistub, irange)
+
+saveRDS(details4, here::here("temp_data", "area_details.rds"))
+# rm(tmd_long, tmd_long1, tmd2, tmd3, tmd4, details, details2, details3, details4)
+
+```
+
+
+### Prepare targets for tables
+
+```{r}
+#| label: prepdata-targets-for-tables
+#| output: false
+
+agibins <- read_delim(
+delim=";",
+trim_ws = TRUE,
+file="agistub; agirange; agilo; agihi
+0; Total; -9e99; 9e99
+1; Under $1; -9e99; 1
+2; $1 under $10,000; 1; 10e3
+3; $10,000 under $25,000; 10e3; 25e3
+4; $25,000 under $50,000; 25e3; 50e3
+5; $50,000 under $75,000; 50e3; 75e3
+6; $75,000 under $100,000; 75e3; 100e3
+7; $100,000 under $200,000; 100e3; 200e3
+8; $200,000 under $500,000; 200e3; 500e3
+9; $500,000 or more; 500e3; 9e99
+")
+
+# area  scope fstatus varname vartype irange              wtdvalue
+
+targ1 <- targets_used |> 
+  rename(area=src) |> 
+  mutate(vartype=ifelse(count==0, "amount", "count")) |> 
+  select(-count) |> 
+  left_join(agibins,
+            by = join_by(agilo, agihi))
+count(targ1, agistub, agirange, agilo, agihi)
+
+targ1 |> filter(area=="ak00", agistub==9, varname=="e00200")
+
+targ2 <- targ1 |> 
+  select(area, scope, fstatus, varname, vartype, active, agistub, agirange, target)
+glimpse(targ2)
+
+targ2 |> filter(area=="ak00", agistub==9, varname=="e00200")
+
+```
+
+
+Comparison file
+
+```{r}
+#| label: comp-file
+#| output: false
+
+area_details <- readRDS(here::here("temp_data", "area_details.rds"))
+
+compfile <- targ2 |> 
+  left_join(area_details,
+            by = join_by(area, scope, fstatus, varname, vartype, agistub)) |> 
+  select(-irange) |> 
+  mutate(diff = wtdvalue - target,
+         pdiff = diff / target) |> 
+  mutate(sort = row_number(), .by=area) |> 
+  mutate(across(c(scope, fstatus, agistub, sort),
+                as.factor))
+
+summary(compfile)
+summary(compfile |> filter(active))
+
+errors <- compfile |> 
+  mutate(apdiff = abs(pdiff)) |> 
+  filter(apdiff > 0.04)
+
+summary(errors)
+count(errors, agistub)
+errors |> filter(agistub != 9) |> arrange(desc(apdiff))
+errors |> filter(agistub == 9) |> arrange(desc(apdiff))
+errors |> filter(agistub == 9) |> arrange(varname, area)
+
+```
+
+
+## Show results vs. targets (VERY PRELIMINARY)
+
+Units:
+
+-   Dollar amounts are in $ millions (varname==amount for target, wtdvalue, and diff)
+-   Counts (including XTOT) are actual numbers
+
+scope:
+
+-   0 = total population
+-   1 = filers
+-   2 = nonfilers (none currently in the table)
+
+fstatus:
+
+-   0 = sum of all statuses
+-   1 = married joint
+-   2 = single
+-   3 = married filing separately (not targeted)
+-   4 = head of household
+
+active:
+
+-  true = item was targeted
+-  false = item was in target file but was commented out (e26270 for DE-00)
+
+Dropdown boxes and search fields allow narrowing down the records that are displayed.
+
+**NOTE**: Weighted values for agistub 9 are not within our tolerances although the optimization solver reported that they are for the transformed problem it solved. We will investigate this and resolve it in Phase 5.
+
+
+```{r}
+#| label: show-comps
+#| eval: true
+#| column: page
+
+
+compfile |>
+  # select(-type) |> 
+  mutate(across(c(target, wtdvalue, diff), 
+                \(x) ifelse(vartype=="amount" & varname != "XTOT", x / 1e6, x))) |> 
+  mutate(varname = as.factor(varname)) |> 
+  DT::datatable(rownames = FALSE,
+                options = list(order = list(0, "asc"), # use 1st column (0) for sorting
+                               scrollX = TRUE, scrollY = TRUE, paging = TRUE, pageLength = 20,
+                               autoWidth = TRUE),
+                filter="top",
+                escape = FALSE) |>
+  formatCurrency(columns = c("target", "wtdvalue", "diff"), currency="", digits=1) |> 
+  formatPercentage(columns = c("pdiff"),  digits = 1)
+
+```
+
+
+
+
+
diff --git a/tmd/areas/weights/examine/cd_simple_tables.qmd b/tmd/areas/weights/examine/cd_simple_tables.qmd
new file mode 100644
index 0000000..073afdc
--- /dev/null
+++ b/tmd/areas/weights/examine/cd_simple_tables.qmd
@@ -0,0 +1,273 @@
+---
+output: html_document
+editor_options: 
+ chunk_output_type: console
+---
+
+# Simple summary tables
+
+
+## Setup
+
+```{r}
+#| label: setup
+#| output: false
+
+source(here::here("R", "libraries.R"))
+source(here::here("R", "constants.R"))
+
+phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
+
+```
+
+
+## Get prepared data
+
+```{r}
+#| label: get-data
+#| output: false
+
+tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds"))
+targets_data <- readRDS(here::here("temp_data", "targets_data.rds"))
+targets_used <- readRDS(here::here("temp_data", "targets_used.rds"))
+us_weights <- readRDS(here::here("temp_data", "us_weights.rds"))
+area_weights <- readRDS(here::here("temp_data", "area_weights.rds"))
+
+```
+
+
+## Construct weighted totals
+
+```{r}
+#| label: prepdata-for-tables
+#| output: false
+
+# ns(tmd2021)
+
+# 0 = Total
+# 1 = Under $1
+# 2 = $1 under $10,000
+# 3 = $10,000 under $25,000
+# 4 = $25,000 under $50,000
+# 5 = $50,000 under $75,000
+# 6 = $75,000 under $100,000
+# 7 = $100,000 under $200,000
+# 8 = $200,000 under $500,000
+# 9 = $500,000 or more
+
+
+icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf)
+
+# prep tmd
+tmd2 <- tmd2021 |> 
+  select(RECID, data_source, s006, c00100, e00200, e26270, iitax) |> 
+  mutate(irange=cut(c00100, icuts, right = FALSE, ordered_result = TRUE),
+         irange = factor(irange, 
+                         levels = levels(irange), # Ensure ordering is maintained
+                         labels = str_replace(levels(irange), ",", ", ")))
+count(tmd2, irange)
+glimpse(tmd2)
+sum(tmd2$s006)
+
+# prep weights
+wts2 <- area_weights |> 
+  select(src, WT2021) |> 
+  mutate(row = row_number(), .by=src) |> 
+  pivot_wider(names_from = src, values_from = WT2021)
+wts2 |> summarise(across(-row, \(x) sum(x)))
+
+areas <- c("us", unique(area_weights$src))
+tmd3 <- tmd2 |> 
+  bind_cols(wts2) |> 
+  select(-row) |> 
+  rename(us = s006) |> 
+  pivot_longer(cols=all_of(areas),
+               names_to = "area",
+               values_to = "weight")
+
+details <- tmd3 |> 
+  summarise(n=n(), wtdn = sum(weight), 
+            agi=sum(weight * c00100),
+            wages = sum(weight * e00200),
+            scorppartner = sum(weight * e26270),
+            iitax = sum(weight * iitax),
+            .by=c(irange, area)) |> 
+  mutate(irange = fct_expand(irange, "total")) |> 
+  mutate(irange = fct_relevel(irange, "total")) |> 
+  arrange(area, irange)
+
+count(details, irange)
+
+areasums <- details |> 
+  summarise(across(c(wtdn, agi, wages, scorppartner, iitax),
+                   \(x) sum(x)),
+            .by=area) |> 
+  mutate(irange="total",
+         irange = factor(irange, levels = levels(details$irange), ordered = TRUE))
+
+tmd4 <- bind_rows(details, areasums) |> 
+  arrange(area, irange)
+count(tmd4, irange)
+tmd4
+
+```
+
+
+## Selected tables
+
+### Number of tax units
+
+```{r}
+#| label: tables-wtdn
+#| output: true
+
+tmd4 |>  
+  select(irange, area, wtdn) |> 
+  pivot_wider(names_from = area,
+              values_from = wtdn) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Number of tax units, thousands, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_number(columns = -c(irange, us),
+             scale=1e-3,
+             decimals = 1) |> 
+    fmt_number(columns = us,
+             scale=1e-3,
+             decimals = 0)
+
+# tmd4 |>  
+#   filter(data_source==1) |> 
+#   select(irange, area, wtdn) |> 
+#   pivot_wider(names_from = area,
+#               values_from = wtdn) |> 
+#   relocate(us, .after = irange) |> 
+#   gt() |> 
+#   tab_header("Number of tax filers, thousands, 2021 tax year",
+#              subtitle = "data_source==1") |> 
+#   fmt_number(columns = -c(irange, us),
+#              scale=1e-3,
+#              decimals = 1) |> 
+#     fmt_number(columns = us,
+#              scale=1e-3,
+#              decimals = 0)
+
+```
+
+
+### Percentage distribution of tax units
+
+```{r}
+#| label: tables-pctdist
+#| output: true
+
+tmd4 |> 
+  mutate(pct=wtdn / wtdn[irange=="total"], 
+         .by=area) |> 
+  select(irange, area, pct) |> 
+  pivot_wider(names_from = area,
+              values_from = pct) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Number of tax units as % of area total, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_percent(columns = -irange,
+             decimals = 1)
+
+```
+
+
+### Average adjusted gross income
+
+```{r}
+#| label: tables-avgagi
+#| output: true
+
+tmd4 |> 
+  select(-n) |> 
+   mutate(across(c(agi, wages, iitax),
+                \(x) x / wtdn)) |> 
+  select(irange, area, value=agi) |> 
+  pivot_wider(names_from = area,
+              values_from = value) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Average AGI in $, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_number(columns = -irange,
+             decimals = 0)
+
+```
+
+### Average wages
+
+```{r}
+#| label: tables-avgwages
+#| output: true
+
+tmd4 |> 
+  select(-n) |> 
+   mutate(across(c(agi, wages, iitax),
+                \(x) x / wtdn)) |> 
+  select(irange, area, value=wages) |> 
+  pivot_wider(names_from = area,
+              values_from = value) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Average wages in $, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_number(columns = -irange,
+             decimals = 0)
+
+
+```
+
+
+### Average S Corporation and partnership income (net)
+
+```{r}
+#| label: tables-scorppartner
+#| output: true
+
+tmd4 |> 
+  select(-n) |> 
+   mutate(across(c(agi, wages, scorppartner, iitax),
+                \(x) x / wtdn)) |> 
+  select(irange, area, value=scorppartner) |> 
+  pivot_wider(names_from = area,
+              values_from = value) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Average S Corporation and partnership income in $, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_number(columns = -irange,
+             decimals = 0)
+
+
+```
+
+
+### Average iitax
+
+```{r}
+#| label: tables-avgiitax
+#| output: true
+
+tmd4 |> 
+  select(-n) |> 
+   mutate(across(c(agi, wages, iitax),
+                \(x) x / wtdn)) |> 
+  select(irange, area, value=iitax) |> 
+  pivot_wider(names_from = area,
+              values_from = value) |> 
+  relocate(us, .after = irange) |> 
+  gt() |> 
+  tab_header("Average iitax, 2021 tax year",
+             subtitle = "Filers and nonfilers") |> 
+  fmt_number(columns = -irange,
+             decimals = 0)
+  
+
+```
+
+
diff --git a/tmd/areas/weights/examine/examine.Rproj b/tmd/areas/weights/examine/examine.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/tmd/areas/weights/examine/examine.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/tmd/areas/weights/examine/index.qmd b/tmd/areas/weights/examine/index.qmd
new file mode 100644
index 0000000..d117723
--- /dev/null
+++ b/tmd/areas/weights/examine/index.qmd
@@ -0,0 +1,10 @@
+---
+output: html_document
+editor_options: 
+ chunk_output_type: console
+---
+
+# Introduction
+
+TO COME.
+
diff --git a/tmd/areas/weights/examine/play.R b/tmd/areas/weights/examine/play.R
new file mode 100644
index 0000000..45f442d
--- /dev/null
+++ b/tmd/areas/weights/examine/play.R
@@ -0,0 +1,268 @@
+
+
+# libraries ---------------------------------------------------------------
+
+source(here::here("R", "libraries.R"))
+source(here::here("R", "constants.R"))
+
+phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
+
+
+# get data ----------------------------------------------------------------
+
+# OLD: djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py
+tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds")) # now based on djbout.csv
+sum(tmd2021$s006) # is what we want to see
+ns(tmd2021)
+tmdbase <- readRDS(here::here("temp_data", "tmd_base.rds"))
+usweights <- readRDS(here::here("temp_data", "us_weights.rds"))
+ns(usweights)
+area_weights <- readRDS(here::here("temp_data", "area_weights.rds"))
+
+
+# debug ak00 agistub 9 c00100 amount (target 10, row 11) --------------------------------------------------------
+
+#.. get data ----
+# djbout 225,256 data rows
+# same for weights
+# same for masks
+
+# ...national_population 334283385.27000004
+# ...scale 2.0948260728756473e-10
+
+# target 10
+national_population <- 334283385.27000004
+cdpopulation <- 732673 # row 2 xtot
+unscaled_target <- 4773666000 # unscaled_target 4773666000 good
+(initial_weights_scale = cdpopulation / national_population)
+# 1 / cdpopulation
+1 / unscaled_target
+scale <- 2.0948260728756473e-10 # good
+(scaled_target <- unscaled_target * scale)
+
+iweights <- read_csv(here::here("temp_data", "unmasked_varray.csv"),
+                col_names = "umv") # this should be c00100
+
+xvalues <- read_csv(here::here("temp_data", "xvalues.csv"),
+                     col_names = "x") # this should be c00100
+
+umv <- read_csv(here::here("temp_data", "unmasked_varray.csv"),
+                col_names = "umv") # this should be c00100
+
+mask <- read_csv(here::here("temp_data", "mask.csv"),
+                 col_names = "mask")
+
+smv <- read_csv(here::here("temp_data", "scaled_masked_varray.csv"),
+                col_names = "smv")
+
+
+# ..check some of the data ------------------------------------------------
+class(umv); class(mask); class(smv)
+
+akweights <- area_weights |> filter(src=="ak00")
+
+combo <- cbind(tmd2021 |> 
+                 select(data_source, s006, XTOT, c00100), 
+               xvalues, umv, mask, smv,
+               akweights |> select(fweight=WT2021)
+               ) |> 
+  mutate(row=row_number(),
+         iweight = s006 * initial_weights_scale) |> 
+  relocate(row)
+
+# population check - good
+combo |> 
+  summarise(pop=sum(s006 * XTOT)) # 334,283,385 vs expected 334,283,385.27000004
+
+# c00100 sums - good
+combo |> 
+  mutate(diff = umv - c00100,
+         pdiff = diff / c00100) |> 
+  filter(pdiff != 0) |> 
+  arrange(desc(abs(pdiff))) # super minor diffs
+
+sum(umv) # 638131482635
+sum(tmd2021$c00100) # 638131482635
+
+# mask check
+combo |> 
+  mutate(maskcheck = c00100 )
+
+# final weights check
+check <- combo |> 
+  mutate(fweight_check = iweight * x,
+         diff=fweight_check - fweight,
+         pdiff=diff / fweight)
+check |> arrange(desc(abs(pdiff)))
+check |> 
+  summarise(fweight=sum(fweight),
+            fweight_check=sum(fweight), 
+            .by=data_source)
+
+
+# smv looks good
+smvcheck <- combo |> 
+  mutate(smvcheck = c00100 * mask * scale,
+         diff = smv - smvcheck,
+         pdiff = diff / smvcheck) |> 
+  filter(pdiff != 0)
+sum(abs(smvcheck$pdiff)) # 1.118748e-12
+  
+smvcheck <- tmd2021$c00100 * mask * scale
+class(smvcheck)
+sum(smv) # 130.6638
+sum(smvcheck) # 130.6638
+
+tibble(smv=as.numeric(smv), check=as.numeric(smvcheck)) |> 
+  mutate(diff=smv - check) |> 
+  filter(diff != 0)
+
+
+
+sum(mask$mask)
+smv |> 
+  mutate(nz=(smv != 0) * 1) |> 
+  summarise(n=n(), smv=sum(smv), .by=nz)
+
+nrow(mask)
+nrow(smv)
+
+combo |> 
+  filter(mask == 1) |> 
+  mutate(fweight_round=round(iweight * x, 2)) |> 
+  summarise(totmod=sum(umv * fweight),
+            totcalc=sum(c00100 * iweight * x),
+            totcalc_round=sum(c00100 * fweight_round))
+
+# 4,773,666,000
+
+
+
+
+# weights analysis --------------------------------------------------------
+
+check <- combo |> 
+  mutate(fweight_calc = iweight * x,
+         fweight_calcround = round(fweight, 2),
+         rnddiff=fweight - fweight_calcround,
+         rndpdiff=rnddiff / fweight_calcround,
+         calcdiff=fweight - fweight_calc,
+         calcpdiff=calcdiff / fweight_calc)
+
+check |> 
+  summarise(mdns006=median(s006),
+            mdnfwcalc=median(fweight_calc),
+            mdnfweight=median(fweight),
+            .by=mask)
+
+
+# analysis ----------------------------------------------------------------
+
+
+
+glimpse(tmd3)
+
+tmp <- tmd3 |> 
+  mutate(agistub=as.integer(irange) - 1) |> 
+  filter(scope==1, agistub==1)
+
+tmp |> 
+  summarise(c00100a = sum(de00 * c00100),
+            c00100n = sum(de00 * (c00100 != 0)),
+            nrets = sum(de00),
+            e00200a = sum(de00 * e00200))
+
+# c00100a agistub 1 amount
+# -248171000 target
+# -248057066. result
+
+# c00100n agistub 1 count
+# 8760 target
+# 6420 result
+# 8757 if we just summarize returns
+
+# e00200a agistub 1 amount
+# 40998000 target
+# 40805367 result
+
+tmp |>
+  filter(fstatus==1) |> 
+  summarise(mars1 = sum(de00))
+
+# mars1 agistub 1 count
+# 6040 target
+# 6038  result
+
+
+tmp |>
+  filter(fstatus==1, c00100 != 0) |> 
+  summarise(mars1 = sum(de00))
+
+# mars1 agistub 1 count
+# 6040 target
+# 4300 result
+
+
+# compare tmd2021 to djbout -----------------------------------------------
+
+setdiff(names(tmd2021), names(djbout))
+
+# simple checks on tmd2021 vs. tmdbase
+sum(tmd2021$s006) # 184,024,650 why not identical to other files?
+sum(tmdbase$s006) # 184,024,657 same as in US weights
+sum(usweights$WT2021) / 100. # 184,024,656.95 
+sum(round(usweights$WT2021 / 100)) # 184,023,729
+sum(djbout$s006) # 184,024,657
+
+sum(tmd2021$e00200) # 126,004,562,344
+sum(tmdbase$e00200) # 126,004,434,333
+sum(djbout$e00200) # 126,004,434,333
+
+checkstub9 <- bind_rows(tmd2021 |> filter(c00100 >= 500e3, c00100 < 9e99) |> mutate(src="tmd"),
+                        djbout |> filter(c00100 >= 500e3, c00100 < 9e99) |> mutate(src="djb")) |> 
+  summarise(n=n(), agisum=sum(c00100), wtsum=sum(s006), wtdagisum=sum(s006 * c00100), .by=src)
+
+checkstub9 |> gt()
+checkstub9 |> 
+  pivot_longer(-src) |> 
+  pivot_wider(names_from = src) |> 
+  mutate(diff = djb - tmd,
+         pdiff = diff / tmd)
+
+baddjb <- djbout |> 
+  mutate(n=n(), .by=RECID) |> 
+  filter(n!=1) |> 
+  arrange(RECID) |> 
+  relocate(RECID, n, data_source, c00100)
+count(baddjb, data_source)
+
+baddjbds0 <- djbout |> 
+  filter(data_source==0) |>  # CPS records
+  mutate(n=n(), .by=RECID) |> 
+  filter(n!=1) |> 
+  arrange(RECID) |> 
+  relocate(RECID, n, data_source, c00100)
+
+baddjbds1 <- djbout |> 
+  filter(data_source==1) |> # PUF records
+  mutate(n=n(), .by=RECID) |> 
+  filter(n!=1) |> 
+  arrange(RECID) |> 
+  relocate(RECID, n, data_source, c00100)
+
+comp <- bind_rows(
+  tmd2021 |> select(RECID, data_source, c00100, s006) |> mutate(src="tmd"),
+  djbout |> select(RECID, data_source, c00100, s006) |> mutate(src="djb")) |> 
+  arrange(RECID, desc(src))
+
+count(comp, src)
+
+bad <- comp |> 
+  mutate(n=n(), .by=RECID) |> 
+  filter(n != 2)
+
+
+comp |> 
+  pivot_longer(-c(RECID, src)) |> 
+  pivot_wider(names_from = src, values_from = value)
+

From 05520bffb59dee3a3aafcb23d5d5bcdb133913ab Mon Sep 17 00:00:00 2001
From: donboyd5 <donboyd5@gmail.com>
Date: Tue, 29 Oct 2024 19:03:21 -0400
Subject: [PATCH 2/2] don't gitignore temp_data but do gitignore its contents

---
 tmd/areas/weights/examine/.gitignore           | 1 -
 tmd/areas/weights/examine/temp_data/.gitignore | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 tmd/areas/weights/examine/temp_data/.gitignore

diff --git a/tmd/areas/weights/examine/.gitignore b/tmd/areas/weights/examine/.gitignore
index 58c3fe9..6dfc158 100644
--- a/tmd/areas/weights/examine/.gitignore
+++ b/tmd/areas/weights/examine/.gitignore
@@ -6,7 +6,6 @@
 _examine/
 _freeze/
 site_libs/
-temp_data/
 
 # files to ignore
 .Rhistory
diff --git a/tmd/areas/weights/examine/temp_data/.gitignore b/tmd/areas/weights/examine/temp_data/.gitignore
new file mode 100644
index 0000000..86d3d6f
--- /dev/null
+++ b/tmd/areas/weights/examine/temp_data/.gitignore
@@ -0,0 +1,5 @@
+# Ignore everything in this directory
+*
+
+# Allow the .gitignore file itself
+!.gitignore