Skip to content

Commit

Permalink
Merge pull request #245 from PSLmodels/revise-caching
Browse files Browse the repository at this point in the history
Add tmd/create_taxcalc_cached_files.py and make related changes
  • Loading branch information
martinholmer authored Oct 22, 2024
2 parents 3425789 + 0cf6c86 commit f04f12c
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 30 deletions.
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
**/*.h5
**/*.pyc
**/*.npy
**/*.csv
**/*.csv.zip
**/*.csv.gz
**/*.csv
**/*.pyc
**/*.egg-info
**/_build/
**/*tfevents*
tmd/storage/output/cached_files
tmd/storage/output/tax_expenditures
!tmd/storage/input/*.csv
!tmd/areas/targets/*.csv
tmd/areas/weights/*.log
**demographics_2015.csv
**puf_2015.csv
*.DS_STORE
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ install:

.PHONY=clean
clean:
rm -f tmd/storage/output/tmd*
rm -f tmd/storage/output/tmd* tmd/storage/output/cached_files

tmd/storage/output/tmd.csv.gz: \
setup.py \
Expand Down Expand Up @@ -36,10 +36,17 @@ tmd/storage/output/tmd_weights.csv.gz: \
tmd/create_taxcalc_sampling_weights.py
python tmd/create_taxcalc_sampling_weights.py

tmd/storage/output/cached_files: \
tmd/storage/output/tmd.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/output/tmd_weights.csv.gz
python tmd/create_taxcalc_cached_files.py

.PHONY=tmd_files
tmd_files: tmd/storage/output/tmd.csv.gz \
tmd/storage/output/tmd_growfactors.csv \
tmd/storage/output/tmd_weights.csv.gz
tmd/storage/output/tmd_weights.csv.gz \
tmd/storage/output/cached_files

.PHONY=test
test: tmd_files
Expand Down
26 changes: 5 additions & 21 deletions tmd/areas/create_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
GFFILE_PATH = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"
POPFILE_PATH = STORAGE_FOLDER / "input" / "cbo_population_forecast.yaml"

# taxcalc calculated variable cache files:
TAXCALC_AGI_CACHE = AREAS_FOLDER / "cache_agi.npy"
# Tax-Calculator calculated variable cache files:
TAXCALC_AGI_CACHE = STORAGE_FOLDER / "output" / "cached_c00100.npy"

PARAMS = {}

Expand Down Expand Up @@ -166,26 +166,12 @@ def valid_area(area: str):
return all_ok


def all_taxcalc_variables(write_cache):
def all_taxcalc_variables():
"""
Return all read and needed calc Tax-Calculator variables in pd.DataFrame.
"""
vdf = pd.read_csv(INFILE_PATH)
if TAXCALC_AGI_CACHE.exists():
vdf["c00100"] = np.load(TAXCALC_AGI_CACHE)
else:
input_data = tc.Records.tmd_constructor(
data_path=INFILE_PATH,
weights_path=WTFILE_PATH,
growfactors_path=GFFILE_PATH,
exact_calculations=True,
)
sim = tc.Calculator(records=input_data, policy=tc.Policy())
sim.calc_all()
agi = sim.array("c00100")
vdf["c00100"] = agi
if write_cache:
np.save(TAXCALC_AGI_CACHE, agi, allow_pickle=False)
vdf["c00100"] = np.load(TAXCALC_AGI_CACHE)
assert np.all(vdf.s006 > 0), "Not all weights are positive"
return vdf

Expand Down Expand Up @@ -432,7 +418,6 @@ def create_area_weights_file(
area: str,
write_log: bool = True,
write_file: bool = True,
write_cache: bool = True,
):
"""
Create Tax-Calculator-style weights file for FIRST_YEAR through LAST_YEAR
Expand Down Expand Up @@ -501,7 +486,7 @@ def create_area_weights_file(
out.write(f"USING CUSTOMIZED PARAMETERS IN {pfile}\n")

# construct variable matrix and target array and weights_scale
vdf = all_taxcalc_variables(write_cache)
vdf = all_taxcalc_variables()
target_matrix, target_array, weights_scale = prepared_data(area, vdf)
wght_us = np.array(vdf.s006 * weights_scale)
out.write("INITIAL WEIGHTS STATISTICS:\n")
Expand Down Expand Up @@ -672,6 +657,5 @@ def create_area_weights_file(
area_code,
write_log=False,
write_file=True,
write_cache=False,
)
sys.exit(RCODE)
4 changes: 0 additions & 4 deletions tmd/areas/make_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from tmd.areas.create_area_weights import (
valid_area,
create_area_weights_file,
TAXCALC_AGI_CACHE,
)
from tmd.areas import AREAS_FOLDER
from tmd.storage import STORAGE_FOLDER
Expand Down Expand Up @@ -87,7 +86,6 @@ def create_area_weights(area: str):
area,
write_log=True,
write_file=True,
write_cache=True,
)
time1 = time.time()
print(f"... {area} exectime(secs)= {(time1 - time0):.1f}")
Expand All @@ -101,7 +99,6 @@ def make_all_areas(num_workers):
Call create_area_weights(area) for each out-of-date or non-existent
area weights file for which there is an area targets file.
"""
TAXCALC_AGI_CACHE.unlink(missing_ok=True)
todo_areas = to_do_areas()
# show processing plan
if todo_areas:
Expand All @@ -119,7 +116,6 @@ def make_all_areas(num_workers):
# process each target file for which the weights file is not up-to-date
with Pool(num_workers) as pool:
pool.map(create_area_weights, todo_areas)
TAXCALC_AGI_CACHE.unlink(missing_ok=True)
return 0


Expand Down
46 changes: 46 additions & 0 deletions tmd/create_taxcalc_cached_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Generate tmd/storage/output/cached_*.npy files for TAX_YEAR.
"""

import numpy as np
import taxcalc as tc
from tmd.storage import STORAGE_FOLDER, CACHED_TAXCALC_VARIABLES

# Year to which the Calculator is advanced before variables are cached.
TAX_YEAR = 2021

# Files passed to tc.Records.tmd_constructor (data, weights, growfactors).
INFILE_PATH = STORAGE_FOLDER / "output" / "tmd.csv.gz"
WTFILE_PATH = STORAGE_FOLDER / "output" / "tmd_weights.csv.gz"
GFFILE_PATH = STORAGE_FOLDER / "output" / "tmd_growfactors.csv"


def create_cached_files():
    """
    Create a Numpy binary file containing TAX_YEAR values
    for each variable in the CACHED_TAXCALC_VARIABLES list.

    Builds a Calculator from a default tc.Policy() object and the
    INFILE_PATH, WTFILE_PATH and GFFILE_PATH files, advances it to
    TAX_YEAR, calculates all variables, and saves each listed variable
    to tmd/storage/output/cached_<vname>.npy.  After all arrays have
    been saved, the tmd/storage/output/cached_files marker file is
    (re)written; it is the Makefile target for this script.

    Always returns 0.
    """
    output_folder = STORAGE_FOLDER / "output"

    # calculate all Tax-Calculator variables for TAX_YEAR
    pol = tc.Policy()
    rec = tc.Records.tmd_constructor(
        data_path=INFILE_PATH,
        weights_path=WTFILE_PATH,
        growfactors_path=GFFILE_PATH,
        exact_calculations=True,
    )
    calc = tc.Calculator(policy=pol, records=rec)
    calc.advance_to_year(TAX_YEAR)
    calc.calc_all()

    # write each variable in CACHED_TAXCALC_VARIABLES list to a binary file
    for vname in CACHED_TAXCALC_VARIABLES:
        np.save(
            output_folder / f"cached_{vname}.npy",
            calc.array(vname),
            allow_pickle=False,
        )

    # write the marker file last, so it is only refreshed after every
    # cached_*.npy file has been saved
    with open(
        output_folder / "cached_files", "w", encoding="utf-8"
    ) as marker:
        marker.write(" ")  # provides timestamp for Makefile

    return 0


if __name__ == "__main__":
    # Script entry point: the Makefile rule for
    # tmd/storage/output/cached_files runs this file with python.
    create_cached_files()
5 changes: 5 additions & 0 deletions tmd/storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from pathlib import Path

# Directory that contains this __init__.py file.
STORAGE_FOLDER = Path(__file__).parent

# Tax-Calculator variable names for which
# tmd/create_taxcalc_cached_files.py writes output/cached_<name>.npy files.
CACHED_TAXCALC_VARIABLES = [
    "c00100",  # AGI
    "iitax",  # individual income tax liability (including refundable credits)
]

0 comments on commit f04f12c

Please sign in to comment.