From 837ae2900ba458a4829ebd7598cc76fb38f30666 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Tue, 29 Oct 2024 19:01:49 -0400 Subject: [PATCH 1/2] add .gitignore, hml, Rproj, qmd and R files --- tmd/areas/weights/examine/.gitignore | 19 + tmd/areas/weights/examine/R/constants.R | 26 ++ tmd/areas/weights/examine/R/libraries.R | 79 ++++ tmd/areas/weights/examine/_quarto.yml | 63 ++++ tmd/areas/weights/examine/cd_prepare_data.qmd | 180 ++++++++++ .../examine/cd_results_vs_targets_tables.qmd | 338 ++++++++++++++++++ .../weights/examine/cd_simple_tables.qmd | 273 ++++++++++++++ tmd/areas/weights/examine/examine.Rproj | 13 + tmd/areas/weights/examine/index.qmd | 10 + tmd/areas/weights/examine/play.R | 268 ++++++++++++++ 10 files changed, 1269 insertions(+) create mode 100644 tmd/areas/weights/examine/.gitignore create mode 100644 tmd/areas/weights/examine/R/constants.R create mode 100644 tmd/areas/weights/examine/R/libraries.R create mode 100644 tmd/areas/weights/examine/_quarto.yml create mode 100644 tmd/areas/weights/examine/cd_prepare_data.qmd create mode 100644 tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd create mode 100644 tmd/areas/weights/examine/cd_simple_tables.qmd create mode 100644 tmd/areas/weights/examine/examine.Rproj create mode 100644 tmd/areas/weights/examine/index.qmd create mode 100644 tmd/areas/weights/examine/play.R diff --git a/tmd/areas/weights/examine/.gitignore b/tmd/areas/weights/examine/.gitignore new file mode 100644 index 0000000..58c3fe9 --- /dev/null +++ b/tmd/areas/weights/examine/.gitignore @@ -0,0 +1,19 @@ +# examine + +# folders to ignore +.Rproj.user/ +.quarto/ +_examine/ +_freeze/ +site_libs/ +temp_data/ + +# files to ignore +.Rhistory +*.html + +# Local Netlify folder +.netlify + + +/.quarto/ diff --git a/tmd/areas/weights/examine/R/constants.R b/tmd/areas/weights/examine/R/constants.R new file mode 100644 index 0000000..103f31b --- /dev/null +++ b/tmd/areas/weights/examine/R/constants.R @@ -0,0 +1,26 @@ + +# 
\\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output +TMDDIR <- here::here("..", "..", "..", "storage", "output") +# list.files(TMDDIR) + +TARGETSDIR <- here::here("..", "..", "targets") +WEIGHTSDIR <- here::here("..") +# list.files(TARGETSDIR) +# list.files(WEIGHTSDIR) + +# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip" +# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx" + +# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine +# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare +# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare") +# print(TARGETSPREPDIR) # Should print the absolute path to the folder +# list.files(TARGETSPREPDIR) + +# CDDIR <- here::here("cds") +# CDDIR <- fs::path(TARGETSPREPDIR, "cds") +# CDRAW <- fs::path(CDDIR, "raw_data") +# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate") +# CDFINAL <- fs::path(CDDIR, "final") +# list.files(CDFINAL) +# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx" diff --git a/tmd/areas/weights/examine/R/libraries.R b/tmd/areas/weights/examine/R/libraries.R new file mode 100644 index 0000000..914fb45 --- /dev/null +++ b/tmd/areas/weights/examine/R/libraries.R @@ -0,0 +1,79 @@ +# libraries --------------------------------------------------------------- + +library(DT) +library(fs) +library(gt) +library(knitr) +library(readxl) +library(skimr) +library(stringr) +library(tidyverse) +# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr + +tprint <- 75 # default tibble print +options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows + +library(janitor) +# census_api_key("<YOUR_CENSUS_API_KEY>", install = TRUE) # run once; key is then stored in .Renviron -- never commit the key itself +# libraries needed for census population +library(sf)
+library(tidycensus) +library(tigris) +options(tigris_use_cache = TRUE) +library(vroom) + + +# possible libraries ------------------------------------------------------ + +# library(rlang) +# library(tidyverse) +# tprint <- 75 # default tibble print +# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows +# +# library(fs) + +# tools +# library(vroom) +# library(readxl) +# library(openxlsx) # for writing xlsx files +# library(lubridate) +# library(RColorBrewer) +# library(RcppRoll) +# library(fredr) +# library(tidycensus) +# library(googledrive) +# library(arrow) +# +# library(jsonlite) +# library(tidyjson) +# +# +# # boyd libraries +# # library(btools) +# # library(bdata) +# # library(bggtools) +# # library(bmaps) +# +# # graphics +# library(scales) +# library(ggbeeswarm) +# library(patchwork) +# library(gridExtra) +# library(ggrepel) +# library(ggbreak) +# +# # tables +# library(knitr) +# library(kableExtra) +# library(DT) +# library(gt) +# library(gtExtras) +# library(janitor) +# library(skimr) +# library(vtable) +# +# # maps +# library(maps) +# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html +# library(usmap) + diff --git a/tmd/areas/weights/examine/_quarto.yml b/tmd/areas/weights/examine/_quarto.yml new file mode 100644 index 0000000..8ef2635 --- /dev/null +++ b/tmd/areas/weights/examine/_quarto.yml @@ -0,0 +1,63 @@ +project: + type: book + output-dir: _examine + +# https://prerelease.quarto.org/ # quarto documentation at this link + +# publishing with the Netlify CLI: +# open terminal in examine +# quarto render && netlify deploy --prod --dir=_examine + +# quarto render # inspect to be sure it is as desired +# netlify deploy --prod --dir=_examine + +# or step by step +# netlify deploy # to test it, give _examine as publish directory +# netlify deploy --prod # to deploy, give _examine as publish directory + +execute: + eval: true + echo: true + output: true + freeze: auto # auto: during global project
renders, re-render only when source changes + +book: + title: "Examine area weights creation results" + subtitle: "Create csv file" + # author: "Don Boyd" + date: today + date-format: long + chapters: + - index.qmd + - part: "IRS Congressional District data" + chapters: + # - cd_overall_documentation.qmd + - cd_prepare_data.qmd + - cd_simple_tables.qmd + - cd_results_vs_targets_tables.qmd + +format: + html: + theme: cosmo + code-fold: true + +editor_options: + chunk_output_type: console + +# R packages using old 209 libxml +# gt, + + +# rendering commands +# quarto render +# quarto publish netlify --no-prompt --no-render --no-browser + +# possibly use this at start of each doc +# --- +# output: html_document +# editor_options: +# chunk_output_type: console +# --- + + + \ No newline at end of file diff --git a/tmd/areas/weights/examine/cd_prepare_data.qmd b/tmd/areas/weights/examine/cd_prepare_data.qmd new file mode 100644 index 0000000..eb9560f --- /dev/null +++ b/tmd/areas/weights/examine/cd_prepare_data.qmd @@ -0,0 +1,180 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Read tmd 2021, area targets, area weights and prepare data + + +## Setup + +```{r} +#| label: setup +#| output: false + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) + +phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") + +``` + + +```{r} +#| label: functions +#| output: false + +ns <- function(obj){ + sort(names(obj)) +} + +``` + +## Download files from google drive + +Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder. 
+ +```{r} +#| label: hookup-googledrive +#| eval: false + +library(googledrive) +drive_auth() # authenticate + +``` + + +```{r} +#| label: download-files +#| eval: false + +# /AFPI_2024/Phase 4 +# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder +folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder + +files <- drive_ls(as_id(folder_id)) +files + +f <- function(gdfname){ + fpath <- here::here("temp_data", gdfname) + print(fpath) + drive_download(gdfname, path = fpath, overwrite = TRUE) +} +# f(files$name[[1]]) + +files |> + pull(name) |> + walk(\(gdfname) f(gdfname)) + +``` + +## Prepare target files + +Get all targets prepared + +```{r} +#| label: targets-all +#| eval: false +#| output: false + +# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine # project dir +# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate # cdbasefile +HERE <- here::here() +CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate") +# list.files(CDTARGETSDIR) + +targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv")) +glimpse(targets_data) + +saveRDS(targets_data, here::here("temp_data", "targets_data.rds")) + +``` + + +Get targets used in the optimization + +```{r} +#| label: targets-used +#| eval: false +#| output: false + +targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets.csv") + +targets_used <- vroom(targetfiles, id="src") |> + mutate(src=path_file(src) |> str_sub(1, 4)) |> + mutate(active=!(str_sub(varname, 1, 1) == "#"), + varname = ifelse(!active, + varname |> str_remove("#") |> str_trim(), + varname)) +saveRDS(targets_used, here::here("temp_data", "targets_used.rds")) + +glimpse(targets_used) +count(targets_used, src) +count(targets_used, active) +count(targets_used, varname) +count(targets_used, varname, active) + +targets_used |> filter(src == "ak00") +targets_used |> filter(src == "de00") + +``` + + +## Get and 
prepare tmd data and area weights + +```{r} +#| label: get-tmd-2021 +#| eval: false +#| output: false + +# fpath <- fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv +fpath <- here::here("temp_data", "djbout.csv") +tmd2021 <- read_csv(fpath) +ns(tmd2021) + +# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py +saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds")) + +sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units + +# con <- unz(zpath, "21incd.csv") +# data <- read_csv(con) + +us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz")) +sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100 +saveRDS(us_weights, here::here("temp_data", "us_weights.rds")) + +tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021 +ns(tmd_base) +saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds")) + + +``` + + +```{r} +#| label: prep-weights +#| eval: false +#| output: false + +# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz") +wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") # |> path_file() + +df <- read_csv(wtfiles[1]) +sum(df$WT2021) + +area_weights <- vroom(wtfiles, id="src") |> + mutate(src = str_sub(path_file(src), 1, 4), + across(-src, \(x) x / 100.)) +glimpse(area_weights) +count(area_weights, src) + +area_weights |> + select(src, WT2021) |> + summarise(wtdn=sum(WT2021), .by=src) + +saveRDS(area_weights, here::here("temp_data", "area_weights.rds")) + +``` + diff --git a/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd b/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd new file mode 100644 index 0000000..43b0f81 --- /dev/null +++ b/tmd/areas/weights/examine/cd_results_vs_targets_tables.qmd @@ -0,0 +1,338 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Compare results to targets + +## Setup + +```{r} +#| label: 
setup +#| output: false + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) + +phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") + +``` + + +## Get saved data + +```{r} +#| label: get-data +#| output: false + +# pick one or the other +# OLD tmd2021 <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from + +tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds")) +targets_data <- readRDS(here::here("temp_data", "targets_data.rds")) +targets_used <- readRDS(here::here("temp_data", "targets_used.rds")) +us_weights <- readRDS(here::here("temp_data", "us_weights.rds")) +area_weights <- readRDS(here::here("temp_data", "area_weights.rds")) + +``` + + + +## Prepare data + +## Prepare weighted and summarized microdata + +Add weights and make long file + +```{r} +#| label: prepdata-microdata-for-tables +#| eval: false +#| output: false + +# ns(tmd2021) + +# 0 = Total +# 1 = Under $1 +# 2 = $1 under $10,000 +# 3 = $10,000 under $25,000 +# 4 = $25,000 under $50,000 +# 5 = $50,000 under $75,000 +# 6 = $75,000 under $100,000 +# 7 = $100,000 under $200,000 +# 8 = $200,000 under $500,000 +# 9 = $500,000 or more + +icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) +areas <- c("us", unique(area_weights$src)) +keepvars <- c("XTOT", "c00100", "e00200", "e26270") +targvars <- c("wtdn", keepvars) + +# prep weights +wts2021 <- area_weights |> + select(src, WT2021) |> + mutate(row = row_number(), .by=src) |> + pivot_wider(names_from = src, values_from = WT2021) + +wts2021 |> summarise(across(-row, \(x) sum(x))) + +# prep tmd +tmd2 <- tmd2021 |> + select(RECID, data_source, fstatus=MARS, s006, all_of(keepvars)) |> + mutate(scope=ifelse(data_source==1, 1, 2)) |> + select(-data_source) |> + # define irange and extend it to allow (later) for totals + mutate(irange=cut(c00100, icuts, right = FALSE, ordered_result = TRUE), + irange = factor(irange, + levels = 
levels(irange), # Ensure ordering is maintained + labels = str_replace(levels(irange), ",", ", ")), + irange = fct_expand(irange, "total"), + irange = fct_relevel(irange, "total")) + +glimpse(tmd2) +count(tmd2, irange) +sum(tmd2$s006) + +check <- tmd2 |> + select(RECID, scope, fstatus, c00100, s006, irange) |> + mutate(agistub = as.integer(irange) - 1, + topagi = c00100 >= 500e3, + topbin = agistub == 9, + both = topagi & topbin) |> + filter(topagi | topbin) +check |> filter(!both) + + +tmd3 <- tmd2 |> + bind_cols(wts2021) |> + select(-row) |> + rename(us = s006) + +``` + +Make long file + +```{r} +#| label: long-file +#| eval: false +#| output: false + +tmd_long1 <- tmd3 |> + # flip the areas + pivot_longer(cols=all_of(areas), + names_to = "area", + values_to = "weight") |> + # flip the variables + mutate(wtdn = 1) |> + pivot_longer(cols=all_of(targvars), + names_to = "varname", + values_to = "amount") |> + # flip the variable type (count or amount) + # mutate(count=ifelse(amount != 0, 1, 0)) |> # djb fix this later!!! 
+ mutate(count=1) |> # djb TEMPORARY count ALWAYS is ALL returns + pivot_longer(cols=c(amount, count), + names_to = "vartype", + values_to = "value") |> + mutate(wtdvalue=weight * value) + +# now we are ready to summarize + +``` + +Summarize microdata, then construct and concatenate totals + +```{r} +#| label: summarise-save +#| eval: false +#| output: false + +details <- tmd_long1 |> + summarise(wtdvalue = sum(wtdvalue), + .by=c(area, scope, fstatus, varname, vartype, irange)) |> + arrange(area, scope, fstatus, varname, irange) + +glimpse(details) +count(details, irange) +count(details, area) +count(details, scope) +count(details, fstatus) +count(details, scope, fstatus) + +details |> filter(area=="ak00", scope==1) +details |> filter(area=="ak00", scope==0) + +# calculate a series of subtotals records that drop one or more of the other variables + +# totals over all income ranges +irangesums <- details |> + summarise(wtdvalue = sum(wtdvalue), + .by=c(area, scope, fstatus, varname, vartype)) |> + mutate(irange="total", + irange = factor(irange, levels = levels(details$irange), ordered = TRUE)) + +details2 <- bind_rows(details, irangesums) +glimpse(details2) +count(details2, irange) + +# totals over all scopes +scopesums <- details2 |> + summarise(wtdvalue = sum(wtdvalue), + .by=c(area, fstatus, varname, vartype, irange)) |> + mutate(scope=0) + +details3 <- bind_rows(details2, scopesums) +glimpse(details3) +count(details3, scope) + +# totals over all filing statuses +fstatussums <- details3 |> + summarise(wtdvalue = sum(wtdvalue), + .by=c(area, scope, varname, vartype, irange)) |> + mutate(fstatus=0) + +details4 <- bind_rows(details3, fstatussums) |> + mutate(agistub=as.integer(irange) - 1) +glimpse(details4) +count(details4, fstatus) +count(details4, agistub, irange) + +saveRDS(details4, here::here("temp_data", "area_details.rds")) +# rm(tmd_long, tmd_long1, tmd2, tmd3, tmd4, details, details2, details3, details4) + +``` + + +### Prepare targets for tables + 
+```{r} +#| label: prepdata-targets-for-tables +#| output: false + +agibins <- read_delim( +delim=";", +trim_ws = TRUE, +file="agistub; agirange; agilo; agihi +0; Total; -9e99; 9e99 +1; Under $1; -9e99; 1 +2; $1 under $10,000; 1; 10e3 +3; $10,000 under $25,000; 10e3; 25e3 +4; $25,000 under $50,000; 25e3; 50e3 +5; $50,000 under $75,000; 50e3; 75e3 +6; $75,000 under $100,000; 75e3; 100e3 +7; $100,000 under $200,000; 100e3; 200e3 +8; $200,000 under $500,000; 200e3; 500e3 +9; $500,000 or more; 500e3; 9e99 +") + +# area scope fstatus varname vartype irange wtdvalue + +targ1 <- targets_used |> + rename(area=src) |> + mutate(vartype=ifelse(count==0, "amount", "count")) |> + select(-count) |> + left_join(agibins, + by = join_by(agilo, agihi)) +count(targ1, agistub, agirange, agilo, agihi) + +targ1 |> filter(area=="ak00", agistub==9, varname=="e00200") + +targ2 <- targ1 |> + select(area, scope, fstatus, varname, vartype, active, agistub, agirange, target) +glimpse(targ2) + +targ2 |> filter(area=="ak00", agistub==9, varname=="e00200") + +``` + + +Comparison file + +```{r} +#| label: comp-file +#| output: false + +area_details <- readRDS(here::here("temp_data", "area_details.rds")) + +compfile <- targ2 |> + left_join(area_details, + by = join_by(area, scope, fstatus, varname, vartype, agistub)) |> + select(-irange) |> + mutate(diff = wtdvalue - target, + pdiff = diff / target) |> + mutate(sort = row_number(), .by=area) |> + mutate(across(c(scope, fstatus, agistub, sort), + as.factor)) + +summary(compfile) +summary(compfile |> filter(active)) + +errors <- compfile |> + mutate(apdiff = abs(pdiff)) |> + filter(apdiff > 0.04) + +summary(errors) +count(errors, agistub) +errors |> filter(agistub != 9) |> arrange(desc(apdiff)) +errors |> filter(agistub == 9) |> arrange(desc(apdiff)) +errors |> filter(agistub == 9) |> arrange(varname, area) + +``` + + +## Show results vs. 
targets (VERY PRELIMINARY) + +Units: + +- Dollar amounts are in $ millions (vartype==amount for target, wtdvalue, and diff) +- Counts (including XTOT) are actual numbers + +scope: + +- 0 = total population +- 1 = filers +- 2 = nonfilers (none currently in the table) + +fstatus: + +- 0 = sum of all statuses +- 1 = single +- 2 = married filing jointly +- 3 = married filing separately (not targeted) +- 4 = head of household + +active: + +- true = item was targeted +- false = item was in target file but was commented out (e26270 for DE-00) + +Dropdown boxes and search fields allow narrowing down the records that are displayed. + +**NOTE**: Weighted values for agistub 9 are not within our tolerances although the optimization solver reported that they are for the transformed problem it solved. We will investigate this and resolve it in Phase 5. + + +```{r} +#| label: show-comps +#| eval: true +#| column: page + + +compfile |> + # select(-type) |> + mutate(across(c(target, wtdvalue, diff), + \(x) ifelse(vartype=="amount" & varname != "XTOT", x / 1e6, x))) |> + mutate(varname = as.factor(varname)) |> + DT::datatable(rownames = FALSE, + options = list(order = list(0, "asc"), # use 1st column (0) for sorting + scrollX = TRUE, scrollY = TRUE, paging = TRUE, pageLength = 20, + autoWidth = TRUE), + filter="top", + escape = FALSE) |> + formatCurrency(columns = c("target", "wtdvalue", "diff"), currency="", digits=1) |> + formatPercentage(columns = c("pdiff"), digits = 1) + +``` + + + + + diff --git a/tmd/areas/weights/examine/cd_simple_tables.qmd b/tmd/areas/weights/examine/cd_simple_tables.qmd new file mode 100644 index 0000000..073afdc --- /dev/null +++ b/tmd/areas/weights/examine/cd_simple_tables.qmd @@ -0,0 +1,273 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Simple summary tables + + +## Setup + +```{r} +#| label: setup +#| output: false + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) + +phase4_statecds <-
c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") + +``` + + +## Get prepared data + +```{r} +#| label: get-data +#| output: false + +tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds")) +targets_data <- readRDS(here::here("temp_data", "targets_data.rds")) +targets_used <- readRDS(here::here("temp_data", "targets_used.rds")) +us_weights <- readRDS(here::here("temp_data", "us_weights.rds")) +area_weights <- readRDS(here::here("temp_data", "area_weights.rds")) + +``` + + +## Construct weighted totals + +```{r} +#| label: prepdata-for-tables +#| output: false + +# ns(tmd2021) + +# 0 = Total +# 1 = Under $1 +# 2 = $1 under $10,000 +# 3 = $10,000 under $25,000 +# 4 = $25,000 under $50,000 +# 5 = $50,000 under $75,000 +# 6 = $75,000 under $100,000 +# 7 = $100,000 under $200,000 +# 8 = $200,000 under $500,000 +# 9 = $500,000 or more + + +icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) + +# prep tmd +tmd2 <- tmd2021 |> + select(RECID, data_source, s006, c00100, e00200, e26270, iitax) |> + mutate(irange=cut(c00100, icuts, right = FALSE, ordered_result = TRUE), + irange = factor(irange, + levels = levels(irange), # Ensure ordering is maintained + labels = str_replace(levels(irange), ",", ", "))) +count(tmd2, irange) +glimpse(tmd2) +sum(tmd2$s006) + +# prep weights +wts2 <- area_weights |> + select(src, WT2021) |> + mutate(row = row_number(), .by=src) |> + pivot_wider(names_from = src, values_from = WT2021) +wts2 |> summarise(across(-row, \(x) sum(x))) + +areas <- c("us", unique(area_weights$src)) +tmd3 <- tmd2 |> + bind_cols(wts2) |> + select(-row) |> + rename(us = s006) |> + pivot_longer(cols=all_of(areas), + names_to = "area", + values_to = "weight") + +details <- tmd3 |> + summarise(n=n(), wtdn = sum(weight), + agi=sum(weight * c00100), + wages = sum(weight * e00200), + scorppartner = sum(weight * e26270), + iitax = sum(weight * iitax), + .by=c(irange, area)) |> + mutate(irange = fct_expand(irange, "total")) |> + 
mutate(irange = fct_relevel(irange, "total")) |> + arrange(area, irange) + +count(details, irange) + +areasums <- details |> + summarise(across(c(wtdn, agi, wages, scorppartner, iitax), + \(x) sum(x)), + .by=area) |> + mutate(irange="total", + irange = factor(irange, levels = levels(details$irange), ordered = TRUE)) + +tmd4 <- bind_rows(details, areasums) |> + arrange(area, irange) +count(tmd4, irange) +tmd4 + +``` + + +## Selected tables + +### Number of tax units + +```{r} +#| label: tables-wtdn +#| output: true + +tmd4 |> + select(irange, area, wtdn) |> + pivot_wider(names_from = area, + values_from = wtdn) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Number of tax units, thousands, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_number(columns = -c(irange, us), + scale=1e-3, + decimals = 1) |> + fmt_number(columns = us, + scale=1e-3, + decimals = 0) + +# tmd4 |> +# filter(data_source==1) |> +# select(irange, area, wtdn) |> +# pivot_wider(names_from = area, +# values_from = wtdn) |> +# relocate(us, .after = irange) |> +# gt() |> +# tab_header("Number of tax filers, thousands, 2021 tax year", +# subtitle = "data_source==1") |> +# fmt_number(columns = -c(irange, us), +# scale=1e-3, +# decimals = 1) |> +# fmt_number(columns = us, +# scale=1e-3, +# decimals = 0) + +``` + + +### Percentage distribution of tax units + +```{r} +#| label: tables-pctdist +#| output: true + +tmd4 |> + mutate(pct=wtdn / wtdn[irange=="total"], + .by=area) |> + select(irange, area, pct) |> + pivot_wider(names_from = area, + values_from = pct) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Number of tax units as % of area total, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_percent(columns = -irange, + decimals = 1) + +``` + + +### Average adjusted gross income + +```{r} +#| label: tables-avgagi +#| output: true + +tmd4 |> + select(-n) |> + mutate(across(c(agi, wages, iitax), + \(x) x / wtdn)) |> + select(irange, area, value=agi) 
|> + pivot_wider(names_from = area, + values_from = value) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Average AGI in $, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_number(columns = -irange, + decimals = 0) + +``` + +### Average wages + +```{r} +#| label: tables-avgwages +#| output: true + +tmd4 |> + select(-n) |> + mutate(across(c(agi, wages, iitax), + \(x) x / wtdn)) |> + select(irange, area, value=wages) |> + pivot_wider(names_from = area, + values_from = value) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Average wages in $, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_number(columns = -irange, + decimals = 0) + + +``` + + +### Average S Corporation and partnership income (net) + +```{r} +#| label: tables-scorppartner +#| output: true + +tmd4 |> + select(-n) |> + mutate(across(c(agi, wages, scorppartner, iitax), + \(x) x / wtdn)) |> + select(irange, area, value=scorppartner) |> + pivot_wider(names_from = area, + values_from = value) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Average S Corporation and partnership income in $, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_number(columns = -irange, + decimals = 0) + + +``` + + +### Average iitax + +```{r} +#| label: tables-avgiitax +#| output: true + +tmd4 |> + select(-n) |> + mutate(across(c(agi, wages, iitax), + \(x) x / wtdn)) |> + select(irange, area, value=iitax) |> + pivot_wider(names_from = area, + values_from = value) |> + relocate(us, .after = irange) |> + gt() |> + tab_header("Average iitax, 2021 tax year", + subtitle = "Filers and nonfilers") |> + fmt_number(columns = -irange, + decimals = 0) + + +``` + + diff --git a/tmd/areas/weights/examine/examine.Rproj b/tmd/areas/weights/examine/examine.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/tmd/areas/weights/examine/examine.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default 
+AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/tmd/areas/weights/examine/index.qmd b/tmd/areas/weights/examine/index.qmd new file mode 100644 index 0000000..d117723 --- /dev/null +++ b/tmd/areas/weights/examine/index.qmd @@ -0,0 +1,10 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Introduction + +TO COME. + diff --git a/tmd/areas/weights/examine/play.R b/tmd/areas/weights/examine/play.R new file mode 100644 index 0000000..45f442d --- /dev/null +++ b/tmd/areas/weights/examine/play.R @@ -0,0 +1,268 @@ + + +# libraries --------------------------------------------------------------- + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) + +phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") + + +# get data ---------------------------------------------------------------- + +# OLD: djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py +tmd2021 <- readRDS(here::here("temp_data", "tmd2021.rds")) # now based on djbout.csv +sum(tmd2021$s006) # is what we want to see +sort(names(tmd2021)) # sorted column names; the ns() helper is not defined by the files sourced above +tmdbase <- readRDS(here::here("temp_data", "tmd_base.rds")) +usweights <- readRDS(here::here("temp_data", "us_weights.rds")) +sort(names(usweights)) # sorted column names +area_weights <- readRDS(here::here("temp_data", "area_weights.rds")) + + +# debug ak00 agistub 9 c00100 amount (target 10, row 11) -------------------------------------------------------- + +#..
get data ---- +# djbout 225,256 data rows +# same for weights +# same for masks + +# ...national_population 334283385.27000004 +# ...scale 2.0948260728756473e-10 + +# target 10 +national_population <- 334283385.27000004 +cdpopulation <- 732673 # row 2 xtot +unscaled_target <- 4773666000 # unscaled_target 4773666000 good +(initial_weights_scale = cdpopulation / national_population) +# 1 / cdpopulation +1 / unscaled_target +scale <- 2.0948260728756473e-10 # good +(scaled_target <- unscaled_target * scale) + +iweights <- read_csv(here::here("temp_data", "unmasked_varray.csv"), + col_names = "umv") # this should be c00100 + +xvalues <- read_csv(here::here("temp_data", "xvalues.csv"), + col_names = "x") # this should be c00100 + +umv <- read_csv(here::here("temp_data", "unmasked_varray.csv"), + col_names = "umv") # this should be c00100 + +mask <- read_csv(here::here("temp_data", "mask.csv"), + col_names = "mask") + +smv <- read_csv(here::here("temp_data", "scaled_masked_varray.csv"), + col_names = "smv") + + +# ..check some of the data ------------------------------------------------ +class(umv); class(mask); class(smv) + +akweights <- area_weights |> filter(src=="ak00") + +combo <- cbind(tmd2021 |> + select(data_source, s006, XTOT, c00100), + xvalues, umv, mask, smv, + akweights |> select(fweight=WT2021) + ) |> + mutate(row=row_number(), + iweight = s006 * initial_weights_scale) |> + relocate(row) + +# population check - good +combo |> + summarise(pop=sum(s006 * XTOT)) # 334,283,385 vs expected 334,283,385.27000004 + +# c00100 sums - good +combo |> + mutate(diff = umv - c00100, + pdiff = diff / c00100) |> + filter(pdiff != 0) |> + arrange(desc(abs(pdiff))) # super minor diffs + +sum(umv) # 638131482635 +sum(tmd2021$c00100) # 638131482635 + +# mask check +combo |> + mutate(maskcheck = c00100 ) + +# final weights check +check <- combo |> + mutate(fweight_check = iweight * x, + diff=fweight_check - fweight, + pdiff=diff / fweight) +check |> arrange(desc(abs(pdiff))) 
+check |> + summarise(fweight=sum(fweight), + fweight_check=sum(fweight_check), # sum the recomputed weights so the two totals can actually differ + .by=data_source) + + +# smv looks good +smvcheck <- combo |> + mutate(smvcheck = c00100 * mask * scale, + diff = smv - smvcheck, + pdiff = diff / smvcheck) |> + filter(pdiff != 0) +sum(abs(smvcheck$pdiff)) # 1.118748e-12 + +smvcheck <- tmd2021$c00100 * mask * scale +class(smvcheck) +sum(smv) # 130.6638 +sum(smvcheck) # 130.6638 + +tibble(smv=as.numeric(smv$smv), check=as.numeric(smvcheck$mask)) |> # extract the single columns; as.numeric() cannot coerce a data frame + mutate(diff=smv - check) |> + filter(diff != 0) + + + +sum(mask$mask) +smv |> + mutate(nz=(smv != 0) * 1) |> + summarise(n=n(), smv=sum(smv), .by=nz) + +nrow(mask) +nrow(smv) + +combo |> + filter(mask == 1) |> + mutate(fweight_round=round(iweight * x, 2)) |> + summarise(totmod=sum(umv * fweight), + totcalc=sum(c00100 * iweight * x), + totcalc_round=sum(c00100 * fweight_round)) + +# 4,773,666,000 + + + + +# weights analysis -------------------------------------------------------- + +check <- combo |> + mutate(fweight_calc = iweight * x, + fweight_calcround = round(fweight_calc, 2), # round the calculated weight, not the final weight itself + rnddiff=fweight - fweight_calcround, + rndpdiff=rnddiff / fweight_calcround, + calcdiff=fweight - fweight_calc, + calcpdiff=calcdiff / fweight_calc) + +check |> + summarise(mdns006=median(s006), + mdnfwcalc=median(fweight_calc), + mdnfweight=median(fweight), + .by=mask) + + +# analysis ---------------------------------------------------------------- + + + +glimpse(tmd3) + +tmp <- tmd3 |> + mutate(agistub=as.integer(irange) - 1) |> + filter(scope==1, agistub==1) + +tmp |> + summarise(c00100a = sum(de00 * c00100), + c00100n = sum(de00 * (c00100 != 0)), + nrets = sum(de00), + e00200a = sum(de00 * e00200)) + +# c00100a agistub 1 amount +# -248171000 target +# -248057066.
result + +# c00100n agistub 1 count +# 8760 target +# 6420 result +# 8757 if we just summarize returns + +# e00200a agistub 1 amount +# 40998000 target +# 40805367 result + +tmp |> + filter(fstatus==1) |> + summarise(mars1 = sum(de00)) + +# mars1 agistub 1 count +# 6040 target +# 6038 result + + +tmp |> + filter(fstatus==1, c00100 != 0) |> + summarise(mars1 = sum(de00)) + +# mars1 agistub 1 count +# 6040 target +# 4300 result + + +# compare tmd2021 to djbout ----------------------------------------------- + +setdiff(names(tmd2021), names(djbout)) + +# simple checks on tmd2021 vs. tmdbase +sum(tmd2021$s006) # 184,024,650 why not identical to other files? +sum(tmdbase$s006) # 184,024,657 same as in US weights +sum(usweights$WT2021) / 100. # 184,024,656.95 +sum(round(usweights$WT2021 / 100)) # 184,023,729 +sum(djbout$s006) # 184,024,657 + +sum(tmd2021$e00200) # 126,004,562,344 +sum(tmdbase$e00200) # 126,004,434,333 +sum(djbout$e00200) # 126,004,434,333 + +checkstub9 <- bind_rows(tmd2021 |> filter(c00100 >= 500e3, c00100 < 9e99) |> mutate(src="tmd"), + djbout |> filter(c00100 >= 500e3, c00100 < 9e99) |> mutate(src="djb")) |> + summarise(n=n(), agisum=sum(c00100), wtsum=sum(s006), wtdagisum=sum(s006 * c00100), .by=src) + +checkstub9 |> gt() +checkstub9 |> + pivot_longer(-src) |> + pivot_wider(names_from = src) |> + mutate(diff = djb - tmd, + pdiff = diff / tmd) + +baddjb <- djbout |> + mutate(n=n(), .by=RECID) |> + filter(n!=1) |> + arrange(RECID) |> + relocate(RECID, n, data_source, c00100) +count(baddjb, data_source) + +baddjbds0 <- djbout |> + filter(data_source==0) |> # CPS records + mutate(n=n(), .by=RECID) |> + filter(n!=1) |> + arrange(RECID) |> + relocate(RECID, n, data_source, c00100) + +baddjbds1 <- djbout |> + filter(data_source==1) |> # PUF records + mutate(n=n(), .by=RECID) |> + filter(n!=1) |> + arrange(RECID) |> + relocate(RECID, n, data_source, c00100) + +comp <- bind_rows( + tmd2021 |> select(RECID, data_source, c00100, s006) |> mutate(src="tmd"), + 
djbout |> select(RECID, data_source, c00100, s006) |> mutate(src="djb")) |> + arrange(RECID, desc(src)) + +count(comp, src) + +bad <- comp |> + mutate(n=n(), .by=RECID) |> + filter(n != 2) + + +comp |> + pivot_longer(-c(RECID, src)) |> + pivot_wider(names_from = src, values_from = value) + From 05520bffb59dee3a3aafcb23d5d5bcdb133913ab Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Tue, 29 Oct 2024 19:03:21 -0400 Subject: [PATCH 2/2] don't gitignore temp_data but do gitignore its contents --- tmd/areas/weights/examine/.gitignore | 1 - tmd/areas/weights/examine/temp_data/.gitignore | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 tmd/areas/weights/examine/temp_data/.gitignore diff --git a/tmd/areas/weights/examine/.gitignore b/tmd/areas/weights/examine/.gitignore index 58c3fe9..6dfc158 100644 --- a/tmd/areas/weights/examine/.gitignore +++ b/tmd/areas/weights/examine/.gitignore @@ -6,7 +6,6 @@ _examine/ _freeze/ site_libs/ -temp_data/ # files to ignore .Rhistory diff --git a/tmd/areas/weights/examine/temp_data/.gitignore b/tmd/areas/weights/examine/temp_data/.gitignore new file mode 100644 index 0000000..86d3d6f --- /dev/null +++ b/tmd/areas/weights/examine/temp_data/.gitignore @@ -0,0 +1,5 @@ +# Ignore everything in this directory +* + +# Allow the .gitignore file itself +!.gitignore