Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial pr for examine results for areas #270

Merged
merged 2 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions tmd/areas/weights/examine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# examine

# folders to ignore
.Rproj.user/
.quarto/
_examine/
_freeze/
site_libs/

# files to ignore
.Rhistory
*.html

# Local Netlify folder
.netlify


# NOTE(review): redundant -- `.quarto/` above already ignores this directory
# at any depth; confirm and remove this root-anchored duplicate.
/.quarto/
26 changes: 26 additions & 0 deletions tmd/areas/weights/examine/R/constants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output
TMDDIR <- here::here("..", "..", "..", "storage", "output")
# list.files(TMDDIR)

TARGETSDIR <- here::here("..", "..", "targets")
WEIGHTSDIR <- here::here("..")
# list.files(TARGETSDIR)
# list.files(WEIGHTSDIR)

# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip"
# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx"

# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare
# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare")
# print(TARGETSPREPDIR) # Should print the absolute path to the folder
# list.files(TARGETSPREPDIR)

# CDDIR <- here::here("cds")
# CDDIR <- fs::path(TARGETSPREPDIR, "cds")
# CDRAW <- fs::path(CDDIR, "raw_data")
# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate")
# CDFINAL <- fs::path(CDDIR, "final")
# list.files(CDFINAL)
# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx"
79 changes: 79 additions & 0 deletions tmd/areas/weights/examine/R/libraries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# libraries ---------------------------------------------------------------
# Packages loaded for every examine document. NOTE: load order matters for
# function masking (later attachments win), so do not reorder casually.

library(DT)
library(fs)
library(gt)
library(knitr)
library(readxl)
library(skimr)
library(stringr)
library(tidyverse)
# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr

tprint <- 75 # default tibble print
options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows

library(janitor)
# census_api_key("b27cb41e46ffe3488af186dd80c64dce66bd5e87", install = TRUE) # stored in .Renviron
# libraries needed for census population
library(sf)
library(tidycensus)
library(tigris)
options(tigris_use_cache = TRUE) # cache downloaded shapefiles across sessions
library(vroom)


# possible libraries ------------------------------------------------------
# Commented-out packages kept as a menu of candidates; uncomment as needed.

# library(rlang)
# library(tidyverse)
# tprint <- 75 # default tibble print
# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
#
# library(fs)

# tools
# library(vroom)
# library(readxl)
# library(openxlsx) # for writing xlsx files
# library(lubridate)
# library(RColorBrewer)
# library(RcppRoll)
# library(fredr)
# library(tidycensus)
# library(googledrive)
# library(arrow)
#
# library(jsonlite)
# library(tidyjson)
#
#
# # boyd libraries
# # library(btools)
# # library(bdata)
# # library(bggtools)
# # library(bmaps)
#
# # graphics
# library(scales)
# library(ggbeeswarm)
# library(patchwork)
# library(gridExtra)
# library(ggrepel)
# library(ggbreak)
#
# # tables
# library(knitr)
# library(kableExtra)
# library(DT)
# library(gt)
# library(gtExtras)
# library(janitor)
# library(skimr)
# library(vtable)
#
# # maps
# library(maps)
# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html
# library(usmap)

63 changes: 63 additions & 0 deletions tmd/areas/weights/examine/_quarto.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Quarto book project configuration for the examine site.
project:
  type: book
  output-dir: _examine

# https://prerelease.quarto.org/ # quarto documentation at this link

# publishing with netlify cli:
# open terminal in examine
# quarto render && netlify deploy --prod --dir=_examine

# quarto render # inspect to be sure it is as desired
# netlify deploy --prod --dir=_examine

# or step by step
# netlify deploy # to test it, give _examine as publish directory
# netlify deploy --prod # to deploy, give _examine as publish directory

execute:
  eval: true
  echo: true
  output: true
  freeze: auto # auto: during global project renders, re-render only when source changes

book:
  title: "Examine area weights creation results"
  subtitle: "Create csv file"
  # author: "Don Boyd"
  date: today
  date-format: long
  chapters:
    - index.qmd
    - part: "IRS Congressional District data"
      chapters:
        # - cd_overall_documentation.qmd
        - cd_prepare_data.qmd
        - cd_simple_tables.qmd
        - cd_results_vs_targets_tables.qmd

format:
  html:
    theme: cosmo
    code-fold: true

editor_options:
  chunk_output_type: console

# R packages using old 209 libxml
# gt,


# rendering commands
# quarto render
# quarto publish netlify --no-prompt --no-render --no-browser

# possibly use this at start of each doc
# ---
# output: html_document
# editor_options:
#   chunk_output_type: console
# ---



180 changes: 180 additions & 0 deletions tmd/areas/weights/examine/cd_prepare_data.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Read tmd 2021, area targets, area weights and prepare data


## Setup

```{r}
#| label: setup
#| output: false

# Load shared packages and the path constants (TMDDIR, TARGETSDIR, WEIGHTSDIR).
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))

# Phase-4 area codes examined below: 4-character state/congressional-district
# identifiers ("00" suffix = whole state, "NN" = district number).
phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")

```


```{r}
#| label: functions
#| output: false

# Return the element names of `obj`, sorted alphabetically.
# Handy for quickly inspecting the columns of a wide data frame.
ns <- function(obj) {
  obj |>
    names() |>
    sort()
}

```

## Download files from google drive

Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder.

```{r}
#| label: hookup-googledrive
#| eval: false

# Interactive-only: authenticate with Google Drive before downloading files.
# eval: false so the render never triggers an OAuth prompt.
library(googledrive)
drive_auth() # authenticate

```


```{r}
#| label: download-files
#| eval: false

# Download every file in the Phase 4 Google Drive folder into temp_data/.
# Run manually (eval: false) only when the Drive contents have changed.
# /AFPI_2024/Phase 4
# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder
folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder

files <- drive_ls(as_id(folder_id))
files

# Download one Drive file (by name) into temp_data/, overwriting any copy.
f <- function(gdfname){
fpath <- here::here("temp_data", gdfname)
print(fpath)
drive_download(gdfname, path = fpath, overwrite = TRUE)
}
# f(files$name[[1]])

# Side effects only: download each listed file in turn.
files |>
pull(name) |>
walk(\(gdfname) f(gdfname))

```

## Prepare target files

Get all targets prepared

```{r}
#| label: targets-all
#| eval: false
#| output: false

# Read the full set of prepared CD targets (cdbasefile.csv) from the targets
# preparation area and cache it locally as an rds file.
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine # project dir
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate # cdbasefile
HERE <- here::here()
CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate")
# list.files(CDTARGETSDIR)

targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv"))
glimpse(targets_data)

saveRDS(targets_data, here::here("temp_data", "targets_data.rds"))

```


Get targets used in the optimization

```{r}
#| label: targets-used
#| eval: false
#| output: false

# Gather the per-area targets files (e.g., ak00_targets.csv) that were fed to
# the optimization. Escape the dot: as a regex, "targets.csv" would also match
# names like "targetsXcsv" because an unescaped "." matches any character.
targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets\\.csv")

# Stack all files; `src` holds the 4-character area code taken from the file
# name (e.g., "ak00"). Rows whose varname begins with "#" were commented out
# in the targets file, i.e. NOT used in the optimization; mark them inactive
# and strip the "#" so varname is comparable across active/inactive rows.
targets_used <- vroom(targetfiles, id="src") |>
mutate(src=path_file(src) |> str_sub(1, 4)) |>
mutate(active=!(str_sub(varname, 1, 1) == "#"),
varname = ifelse(!active,
varname |> str_remove("#") |> str_trim(),
varname))
saveRDS(targets_used, here::here("temp_data", "targets_used.rds"))

# Quick exploratory checks.
glimpse(targets_used)
count(targets_used, src)
count(targets_used, active)
count(targets_used, varname)
count(targets_used, varname, active)

targets_used |> filter(src == "ak00")
targets_used |> filter(src == "de00")

```


## Get and prepare tmd data and area weights

```{r}
#| label: get-tmd-2021
#| eval: false
#| output: false

# Read the 2021 Tax-Calculator output (djbout.csv, produced by
# create_area_weights.py), the national weights file, and the base tmd file;
# cache each as rds in temp_data/ for the analysis documents.
# fpath <- fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv
fpath <- here::here("temp_data", "djbout.csv")
tmd2021 <- read_csv(fpath)
ns(tmd2021)

# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py
saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds"))

sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units

# con <- unz(zpath, "21incd.csv")
# data <- read_csv(con)

# National weights: WT2021 is stored in hundreds of units, unlike s006 above.
us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz"))
sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100
saveRDS(us_weights, here::here("temp_data", "us_weights.rds"))

tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021
ns(tmd_base)
saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds"))


```


```{r}
#| label: prep-weights
#| eval: false
#| output: false

# Stack the per-area weights files (*.gz in WEIGHTSDIR) into one long frame,
# tagging each row with its 4-character area code and rescaling the weights
# (stored in hundreds of units) to numbers of units.
# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz")
wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") # |> path_file()

# Spot-check one file before stacking.
df <- read_csv(wtfiles[1])
sum(df$WT2021)

# `src` = first 4 chars of the file name (area code); divide all weight
# columns by 100 to convert hundreds of units to units.
area_weights <- vroom(wtfiles, id="src") |>
mutate(src = str_sub(path_file(src), 1, 4),
across(-src, \(x) x / 100.))
glimpse(area_weights)
count(area_weights, src)

# Weighted number of units by area -- sanity check against expectations.
area_weights |>
select(src, WT2021) |>
summarise(wtdn=sum(WT2021), .by=src)

saveRDS(area_weights, here::here("temp_data", "area_weights.rds"))

```

Loading
Loading