Skip to content

Commit

Permalink
Merge pull request #47 from PSLmodels/dataset-update-3
Browse files Browse the repository at this point in the history
Add tests on significant variables
  • Loading branch information
nikhilwoodruff authored Apr 23, 2024
2 parents f034ef4 + 8f07a61 commit 5fa304c
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 32 deletions.
17 changes: 16 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
with:
options: ". -l 79 --check"
test:
name: Test
name: Main tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -30,4 +30,19 @@ jobs:
run: make test-lite
env:
POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
test_reweight:
name: Full tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install dependencies
run: make install
- name: Run tests
run: make test
env:
POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}

15 changes: 15 additions & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,18 @@ jobs:
run: make test-lite
env:
POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
test_reweight:
name: Full tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install dependencies
run: make install
- name: Run tests
run: make test
env:
POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
**/*.pyc
**/*.csv.gz
**/*.egg-info

# Tensorboard
tax_microdata_benchmarking/calibration
tax_microdata_benchmarking/calibration

*.ipynb
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@
"pytest",
"black",
],
extras_require={"reweight": ["torch"]},
extras_require={"reweight": ["torch", "tensorboard"]},
)
68 changes: 42 additions & 26 deletions tax_microdata_benchmarking/create_flat_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# This file should create tax_microdata.csv.gz in the root of the repo.
import warnings

warnings.filterwarnings("ignore")
import taxcalc as tc
from policyengine_us import Microsimulation
from policyengine_us.model_api import *
Expand Down Expand Up @@ -458,6 +459,16 @@ class tc_wic_ben(TaxCalcVariableAlias):
adds = ["wic"]


class tc_e18500(TaxCalcVariableAlias):
    # Alias variable for the Tax-Calculator input named e18500.
    # create_flat_file exports every variable whose name starts with "tc_"
    # under the name with that prefix removed, so this one becomes the
    # flat-file column "e18500".
    # NOTE(review): `adds` presumably makes the value the sum of the listed
    # PolicyEngine variables -- confirm against TaxCalcVariableAlias.
    label = "real-estate taxes paid"
    adds = ["real_estate_taxes"]


class tc_e19200(TaxCalcVariableAlias):
    # Alias variable for the Tax-Calculator input named e19200.
    # create_flat_file exports every variable whose name starts with "tc_"
    # under the name with that prefix removed, so this one becomes the
    # flat-file column "e19200".
    # NOTE(review): `adds` presumably makes the value the sum of the listed
    # PolicyEngine variables -- confirm against TaxCalcVariableAlias.
    label = "interest expense"
    adds = ["interest_expense"]


class is_tax_filer(Variable):
label = "tax filer"
value_type = bool
Expand Down Expand Up @@ -633,6 +644,8 @@ def apply(self):
tc_p23250,
tc_wic_ben,
is_tax_filer,
tc_e18500,
tc_e19200,
)

for variable in EXTRA_PUF_VARIABLES:
Expand All @@ -644,31 +657,34 @@ def create_flat_file(
target_year: int = 2024,
) -> pd.DataFrame:
sim = Microsimulation(reform=taxcalc_extension, dataset=source_dataset)
original_year = sim.dataset.time_period

for variable in UPRATING_VARIABLES:
original_value = sim.calculate(variable, 2024)
original_value = sim.calculate(variable, original_year)
uprating_factor = get_variable_uprating(
variable,
source_time_period=2024,
source_time_period=original_year,
target_time_period=target_year,
)
try:
sim.set_input(variable, 2024, original_value * uprating_factor)
except:
pass
sim.set_input(
variable, original_year, original_value * uprating_factor
)
except Exception as e:
print(f"Error uprating {variable}: {e}")

df = pd.DataFrame()

for variable in sim.tax_benefit_system.variables:
if variable.startswith("tc_"):
df[variable[3:]] = sim.calculate(variable, 2024).values.astype(
np.float64
)
df[variable[3:]] = sim.calculate(
variable, original_year
).values.astype(np.float64)

if variable == "is_tax_filer":
df[variable] = sim.calculate(variable, 2024).values.astype(
np.float64
)
df[variable] = sim.calculate(
variable, original_year
).values.astype(np.float64)

# Extra quality-control checks to do with different data types, nothing major
FILER_SUM_COLUMNS = [
Expand Down Expand Up @@ -752,7 +768,20 @@ def create_stacked_flat_file(
print(
f"Adding Tax-Calculator outputs to the flat file for {target_year}"
)
input_data = tc.Records(data=stacked_file)
print(
f"Adding pass-through W2 wages to the flat file for {target_year}"
)
qbi = np.maximum(
0,
stacked_file.e00900
+ stacked_file.e26270
+ stacked_file.e02100
+ stacked_file.e27200,
)
stacked_file["PT_binc_w2_wages"] = (
qbi * 0.357
) # Solved in 2021 using adjust_qbi.py
input_data = tc.Records(data=stacked_file, start_year=target_year)
policy = tc.Policy()
simulation = tc.Calculator(records=input_data, policy=policy)
simulation.calc_all()
Expand All @@ -774,19 +803,6 @@ def create_stacked_flat_file(
except ValueError as e:
print(e)
print("Skipping reweighting.")
print(
f"Adding pass-through W2 wages to the flat file for {target_year}"
)
qbi = np.maximum(
0,
combined_file.e00900
+ combined_file.e26270
+ combined_file.e02100
+ combined_file.e27200,
)
combined_file["PT_binc_w2_wages"] = (
qbi * 0.357
) # Solved in 2021 using adjust_qbi.py
return combined_file

return stacked_file
Expand Down
106 changes: 106 additions & 0 deletions tests/tc_variable_totals.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
DSI: 10691123.8
EIC: 42779552.39
FLPDYR: 421632002232.4199
MARS: 353242147.99
MIDR: 732548.9000000003
PT_SSTB_income: 0.0
PT_binc_w2_wages: 0.0
PT_ubia_property: 0.0
RECID: 30013006204532.71
XTOT: 369094112.43
a_lineno: 322311625.18
age_head: 9853008503.07
age_spouse: 3490884180.649999
agi_bin: 1074293553.8700001
blind_head: 4228589.040000001
blind_spouse: 729846.03
cmbtp: 114433481478.58578
data_source: 164759286.17999995
e00200: 10310664684015.027
e00200p: 7246367847990.525
e00200s: 3064296834644.767
e00300: 137183678413.40382
e00400: 94743506150.49107
e00600: 405990141519.5385
e00650: 285099022083.43726
e00700: 48771545399.6015
e00800: 19386714549.7095
e00900: 429539440385.2141
e00900p: 361522073167.05817
e00900s: 68017367420.70369
e01100: 7285729712.1093
e01200: -60216853854.91481
e01400: 552468020849.5068
e01500: 1641191913466.6348
e01700: 1000550811057.0841
e02000: 927149787725.4258
e02100: -6522807775.893997
e02100p: -6333314477.3977
e02100s: -189493275.0666
e02300: 23258519971.6033
e02400: 1491419721114.5906
e03150: 18515980459.5357
e03210: 15544627964.346104
e03220: 1637964491.1388001
e03230: 6926332334.3152
e03240: 14309653954.092802
e03270: 40914871477.34481
e03290: 4905336721.6764
e03300: 30203493523.927105
e03400: 930321104.7312
e03500: 11944679455.189001
e07240: 1632108958.2028
e07260: 2791734813.2793
e07300: 20917948400.8361
e07400: 3527062529.777301
e07600: 963701335.9661999
e09700: 37001598.5893
e09800: 21355072.916999996
e09900: 10709816534.181198
e11200: 3186861701.7149005
e17500: 235042375652.4932
e18400: 508102620626.47064
e18500: 364777444041.7581
e19200: 521982783400.69037
e19800: 272428333065.87024
e20100: 64814165943.684494
e20400: 213136239172.01752
e24515: 46065816558.2304
e24518: 12744333564.099298
e26270: 733636357457.6952
e27200: 11361348668.954098
e32800: 29847384812.3065
e58990: 4155217553.2594
e62900: 23309665135.4052
e87521: 49410862467.992905
e87530: 25810835097.957104
elderly_dependents: 40361.42
f2441: 15395770.6
f6251: 12829284.17
ffpos: 236413412.58999997
fips: 5720027978.5199995
g20500: 8881208151.5932
h_seq: 9532981967967.932
housing_ben: 0.0
k1bx14p: -30532461972.4528
k1bx14s: 2364706719.2499
mcaid_ben: 0.0
mcare_ben: 0.0
n1820: 18744271.849999998
n21: 266997611.39000002
n24: 77430155.88999997
nu06: 28512728.69
nu13: 65413305.05000002
nu18: 94116118.29
other_ben: 0.0
p08000: 209646233.93069997
p22250: -72875477887.86668
p23250: 1188459035726.539
pencon_p: 256173893908.90164
pencon_s: 115418682766.19753
s006: 385270311428.83453
snap_ben: 0.0
ssi_ben: 0.0
tanf_ben: 0.0
vet_ben: 0.0
wic_ben: 0.0
78 changes: 76 additions & 2 deletions tests/test_basic_flat_file.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,85 @@
import os
import pytest
import yaml
from pathlib import Path

test_mode = os.environ.get("TEST_MODE", "lite")

FOLDER = Path(__file__).parent
with open(FOLDER / "tc_variable_totals.yaml") as f:
tc_variable_totals = yaml.safe_load(f)

def test_flat_file_runs():
with open(
FOLDER.parent
/ "tax_microdata_benchmarking"
/ "taxcalc_variable_metadata.yaml"
) as f:
taxcalc_variable_metadata = yaml.safe_load(f)

# Variables left out of the total-comparison test, each with its reason.
EXEMPTED_VARIABLES = [
    "DSI",  # Issue here but deprioritized.
    "EIC",  # PUF-PE file almost certainly more correct by including CPS data
    "MIDR",  # Issue here but deprioritized.
    "RECID",  # No reason to compare.
    "a_lineno",  # No reason to compare.
    "agi_bin",  # No reason to compare.
    "blind_spouse",  # Issue here but deprioritized.
    "cmbtp",  # No reason to compare.
    "data_source",  # No reason to compare.
    "s006",  # No reason to compare.
    "h_seq",  # No reason to compare.
    "fips",  # No reason to compare.
    "ffpos",  # No reason to compare.
    "p23250",  # PE-PUF likely closer to truth than taxdata (needs triple check).
    "e01200",  # Unknown but deprioritized for now.
    "e17500",  # Unknown but deprioritized for now.
    "e18500",  # Unknown but deprioritized for now.
    "e02100",  # Farm income, unsure who's closer.
]

# Exempt any variable split between filer and spouse for now: every name in
# the metadata's "read" section that ends in "p" or "s" is appended.
EXEMPTED_VARIABLES.extend(
    name
    for name in taxcalc_variable_metadata["read"]
    if name.endswith(("p", "s"))
)


def pytest_namespace():
    # Returns a mapping that declares a `flat_file` name, initially None.
    # test_flat_file_builds assigns the built frame to `pytest.flat_file`
    # directly, and test_tc_variable_totals reads it from there.
    # NOTE(review): `pytest_namespace` is a legacy pytest hook name that has
    # been removed from modern pytest releases, so this function is probably
    # never called and the sharing relies only on the direct attribute
    # assignment -- confirm, and consider a module-scoped fixture instead.
    return {"flat_file": None}


@pytest.mark.dependency()
def test_flat_file_builds():
    """Build the 2021 stacked flat file once and share it with later tests.

    Reweighting is requested only when the TEST_MODE environment variable
    (read into ``test_mode`` at module level) is "full". The resulting frame
    is stored on ``pytest.flat_file`` so the tests that declare a dependency
    on this one can reuse it instead of rebuilding it.
    """
    # Imported inside the test so an import problem is reported as a failure
    # of this test rather than as a collection error for the whole module.
    from tax_microdata_benchmarking.create_flat_file import (
        create_stacked_flat_file,
    )

    # Build exactly once: a second, discarded call would only repeat the
    # same work without affecting what the dependent tests see.
    flat_file = create_stacked_flat_file(2021, reweight=test_mode == "full")

    pytest.flat_file = flat_file


# Every variable that has a stored taxdata total and is not exempted, kept in
# the order the totals file lists them. Used to parametrize the totals test.
variables_to_test = [
    name for name in tc_variable_totals if name not in EXEMPTED_VARIABLES
]


@pytest.mark.dependency(depends=["test_flat_file_builds"])
@pytest.mark.parametrize("variable", variables_to_test)
def test_tc_variable_totals(variable):
    """Check one weighted flat-file total against the stored taxdata total.

    The weighted total is ``sum(flat_file[variable] * flat_file.s006)``.
    The test fails only when that total is off by at least 45% in relative
    terms AND by at least 30bn in absolute terms; being within either
    tolerance is enough to pass. Variables whose stored total is zero are
    not compared.
    """
    # Tolerances actually enforced by the assertion below.
    max_relative_gap = 0.45
    max_absolute_gap_bn = 30

    meta = taxcalc_variable_metadata["read"][variable]
    name = meta.get("desc")  # human-readable description for the message
    flat_file = pytest.flat_file
    weight = flat_file.s006
    total = (flat_file[variable] * weight).sum()
    expected = tc_variable_totals[variable]
    if expected == 0:
        # If the taxdata file has a zero total, we'll assume the PE file is
        # still correct (this also avoids dividing by zero below).
        return

    relative_gap = total / expected - 1
    absolute_gap_bn = total / 1e9 - expected / 1e9
    # At least 45% and at least 30bn off taxdata is a failure.
    assert (
        abs(relative_gap) < max_relative_gap
        or abs(absolute_gap_bn) < max_absolute_gap_bn
    ), f"{variable} ({name}) differs to tax-data by {relative_gap:.1%} ({total/1e9:.1f}bn vs {expected/1e9:.1f}bn)"

0 comments on commit 5fa304c

Please sign in to comment.