diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 81937265..d96a3a92 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -16,7 +16,7 @@ jobs: with: options: ". -l 79 --check" test: - name: Test + name: Main tests runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -30,4 +30,19 @@ jobs: run: make test-lite env: POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} + test_reweight: + name: Full tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + run: make install + - name: Run tests + run: make test + env: + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6cebe6bb..e7c6c550 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -33,3 +33,18 @@ jobs: run: make test-lite env: POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} + test_reweight: + name: Full tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + run: make install + - name: Run tests + run: make test + env: + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} diff --git a/.gitignore b/.gitignore index 0c89e70b..9e85f611 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ **/*.pyc **/*.csv.gz **/*.egg-info - # Tensorboard -tax_microdata_benchmarking/calibration \ No newline at end of file +tax_microdata_benchmarking/calibration + +*.ipynb diff --git a/setup.py b/setup.py index eb6c6b27..f7f868ad 100644 --- a/setup.py +++ b/setup.py @@ -11,5 +11,5 @@ "pytest", "black", ], - extras_require={"reweight": ["torch"]}, 
+ extras_require={"reweight": ["torch", "tensorboard"]}, ) diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py index 1865b9c3..373616c8 100644 --- a/tax_microdata_benchmarking/create_flat_file.py +++ b/tax_microdata_benchmarking/create_flat_file.py @@ -1,5 +1,6 @@ -# This file should create tax_microdata.csv.gz in the root of the repo. +import warnings +warnings.filterwarnings("ignore") import taxcalc as tc from policyengine_us import Microsimulation from policyengine_us.model_api import * @@ -458,6 +459,16 @@ class tc_wic_ben(TaxCalcVariableAlias): adds = ["wic"] +class tc_e18500(TaxCalcVariableAlias): + label = "real-estate taxes paid" + adds = ["real_estate_taxes"] + + +class tc_e19200(TaxCalcVariableAlias): + label = "interest expense" + adds = ["interest_expense"] + + class is_tax_filer(Variable): label = "tax filer" value_type = bool @@ -633,6 +644,8 @@ def apply(self): tc_p23250, tc_wic_ben, is_tax_filer, + tc_e18500, + tc_e19200, ) for variable in EXTRA_PUF_VARIABLES: @@ -644,31 +657,34 @@ def create_flat_file( target_year: int = 2024, ) -> pd.DataFrame: sim = Microsimulation(reform=taxcalc_extension, dataset=source_dataset) + original_year = sim.dataset.time_period for variable in UPRATING_VARIABLES: - original_value = sim.calculate(variable, 2024) + original_value = sim.calculate(variable, original_year) uprating_factor = get_variable_uprating( variable, - source_time_period=2024, + source_time_period=original_year, target_time_period=target_year, ) try: - sim.set_input(variable, 2024, original_value * uprating_factor) - except: - pass + sim.set_input( + variable, original_year, original_value * uprating_factor + ) + except Exception as e: + print(f"Error uprating {variable}: {e}") df = pd.DataFrame() for variable in sim.tax_benefit_system.variables: if variable.startswith("tc_"): - df[variable[3:]] = sim.calculate(variable, 2024).values.astype( - np.float64 - ) + df[variable[3:]] = sim.calculate( + 
variable, original_year + ).values.astype(np.float64) if variable == "is_tax_filer": - df[variable] = sim.calculate(variable, 2024).values.astype( - np.float64 - ) + df[variable] = sim.calculate( + variable, original_year + ).values.astype(np.float64) # Extra quality-control checks to do with different data types, nothing major FILER_SUM_COLUMNS = [ @@ -752,7 +768,20 @@ def create_stacked_flat_file( print( f"Adding Tax-Calculator outputs to the flat file for {target_year}" ) - input_data = tc.Records(data=stacked_file) + print( + f"Adding pass-through W2 wages to the flat file for {target_year}" + ) + qbi = np.maximum( + 0, + stacked_file.e00900 + + stacked_file.e26270 + + stacked_file.e02100 + + stacked_file.e27200, + ) + stacked_file["PT_binc_w2_wages"] = ( + qbi * 0.357 + ) # Solved in 2021 using adjust_qbi.py + input_data = tc.Records(data=stacked_file, start_year=target_year) policy = tc.Policy() simulation = tc.Calculator(records=input_data, policy=policy) simulation.calc_all() @@ -774,19 +803,6 @@ def create_stacked_flat_file( except ValueError as e: print(e) print("Skipping reweighting.") - print( - f"Adding pass-through W2 wages to the flat file for {target_year}" - ) - qbi = np.maximum( - 0, - combined_file.e00900 - + combined_file.e26270 - + combined_file.e02100 - + combined_file.e27200, - ) - combined_file["PT_binc_w2_wages"] = ( - qbi * 0.357 - ) # Solved in 2021 using adjust_qbi.py return combined_file return stacked_file diff --git a/tests/tc_variable_totals.yaml b/tests/tc_variable_totals.yaml new file mode 100644 index 00000000..736ad34a --- /dev/null +++ b/tests/tc_variable_totals.yaml @@ -0,0 +1,106 @@ +DSI: 10691123.8 +EIC: 42779552.39 +FLPDYR: 421632002232.4199 +MARS: 353242147.99 +MIDR: 732548.9000000003 +PT_SSTB_income: 0.0 +PT_binc_w2_wages: 0.0 +PT_ubia_property: 0.0 +RECID: 30013006204532.71 +XTOT: 369094112.43 +a_lineno: 322311625.18 +age_head: 9853008503.07 +age_spouse: 3490884180.649999 +agi_bin: 1074293553.8700001 +blind_head: 
4228589.040000001 +blind_spouse: 729846.03 +cmbtp: 114433481478.58578 +data_source: 164759286.17999995 +e00200: 10310664684015.027 +e00200p: 7246367847990.525 +e00200s: 3064296834644.767 +e00300: 137183678413.40382 +e00400: 94743506150.49107 +e00600: 405990141519.5385 +e00650: 285099022083.43726 +e00700: 48771545399.6015 +e00800: 19386714549.7095 +e00900: 429539440385.2141 +e00900p: 361522073167.05817 +e00900s: 68017367420.70369 +e01100: 7285729712.1093 +e01200: -60216853854.91481 +e01400: 552468020849.5068 +e01500: 1641191913466.6348 +e01700: 1000550811057.0841 +e02000: 927149787725.4258 +e02100: -6522807775.893997 +e02100p: -6333314477.3977 +e02100s: -189493275.0666 +e02300: 23258519971.6033 +e02400: 1491419721114.5906 +e03150: 18515980459.5357 +e03210: 15544627964.346104 +e03220: 1637964491.1388001 +e03230: 6926332334.3152 +e03240: 14309653954.092802 +e03270: 40914871477.34481 +e03290: 4905336721.6764 +e03300: 30203493523.927105 +e03400: 930321104.7312 +e03500: 11944679455.189001 +e07240: 1632108958.2028 +e07260: 2791734813.2793 +e07300: 20917948400.8361 +e07400: 3527062529.777301 +e07600: 963701335.9661999 +e09700: 37001598.5893 +e09800: 21355072.916999996 +e09900: 10709816534.181198 +e11200: 3186861701.7149005 +e17500: 235042375652.4932 +e18400: 508102620626.47064 +e18500: 364777444041.7581 +e19200: 521982783400.69037 +e19800: 272428333065.87024 +e20100: 64814165943.684494 +e20400: 213136239172.01752 +e24515: 46065816558.2304 +e24518: 12744333564.099298 +e26270: 733636357457.6952 +e27200: 11361348668.954098 +e32800: 29847384812.3065 +e58990: 4155217553.2594 +e62900: 23309665135.4052 +e87521: 49410862467.992905 +e87530: 25810835097.957104 +elderly_dependents: 40361.42 +f2441: 15395770.6 +f6251: 12829284.17 +ffpos: 236413412.58999997 +fips: 5720027978.5199995 +g20500: 8881208151.5932 +h_seq: 9532981967967.932 +housing_ben: 0.0 +k1bx14p: -30532461972.4528 +k1bx14s: 2364706719.2499 +mcaid_ben: 0.0 +mcare_ben: 0.0 +n1820: 18744271.849999998 +n21: 266997611.39000002 
+n24: 77430155.88999997 +nu06: 28512728.69 +nu13: 65413305.05000002 +nu18: 94116118.29 +other_ben: 0.0 +p08000: 209646233.93069997 +p22250: -72875477887.86668 +p23250: 1188459035726.539 +pencon_p: 256173893908.90164 +pencon_s: 115418682766.19753 +s006: 385270311428.83453 +snap_ben: 0.0 +ssi_ben: 0.0 +tanf_ben: 0.0 +vet_ben: 0.0 +wic_ben: 0.0 diff --git a/tests/test_basic_flat_file.py b/tests/test_basic_flat_file.py index 1dbddefc..99486696 100644 --- a/tests/test_basic_flat_file.py +++ b/tests/test_basic_flat_file.py @@ -1,11 +1,85 @@ import os +import pytest +import yaml +from pathlib import Path test_mode = os.environ.get("TEST_MODE", "lite") +FOLDER = Path(__file__).parent +with open(FOLDER / "tc_variable_totals.yaml") as f: + tc_variable_totals = yaml.safe_load(f) -def test_flat_file_runs(): +with open( + FOLDER.parent + / "tax_microdata_benchmarking" + / "taxcalc_variable_metadata.yaml" +) as f: + taxcalc_variable_metadata = yaml.safe_load(f) + +EXEMPTED_VARIABLES = [ + "DSI", # Issue here but deprioritized. + "EIC", # PUF-PE file almost certainly more correct by including CPS data + "MIDR", # Issue here but deprioritized. + "RECID", # No reason to compare. + "a_lineno", # No reason to compare. + "agi_bin", # No reason to compare. + "blind_spouse", # Issue here but deprioritized. + "cmbtp", # No reason to compare. + "data_source", # No reason to compare. + "s006", # No reason to compare. + "h_seq", # No reason to compare. + "fips", # No reason to compare. + "ffpos", # No reason to compare. + "p23250", # PE-PUF likely closer to truth than taxdata (needs triple check). + "e01200", # Unknown but deprioritized for now. + "e17500", # Unknown but deprioritized for now. + "e18500", # Unknown but deprioritized for now. + "e02100", # Farm income, unsure who's closer. +] + +# Exempt any variable split between filer and spouse for now (matched by a trailing "p" or "s" in the name; NOTE(review): may also catch unrelated names, e.g. elderly_dependents - confirm).
+EXEMPTED_VARIABLES += [ + variable + for variable in taxcalc_variable_metadata["read"] + if variable.endswith("p") or variable.endswith("s") +] + + +def pytest_namespace(): + return {"flat_file": None} + + +@pytest.mark.dependency() +def test_flat_file_builds(): from tax_microdata_benchmarking.create_flat_file import ( create_stacked_flat_file, ) - create_stacked_flat_file(2021, reweight=test_mode == "full") + flat_file = create_stacked_flat_file(2021, reweight=test_mode == "full") + + pytest.flat_file = flat_file + + +variables_to_test = [ + variable + for variable in tc_variable_totals.keys() + if variable not in EXEMPTED_VARIABLES +] + + +@pytest.mark.dependency(depends=["test_flat_file_builds"]) +@pytest.mark.parametrize("variable", variables_to_test) +def test_tc_variable_totals(variable): + meta = taxcalc_variable_metadata["read"][variable] + name = meta.get("desc") + flat_file = pytest.flat_file + weight = flat_file.s006 + total = (flat_file[variable] * weight).sum() + if tc_variable_totals[variable] == 0: + # If the taxdata file has a zero total, we'll assume the PE file is still correct (this also avoids dividing by zero below). + return + # Being at least 45% and at least 30bn off taxdata is a failure. + assert ( + abs(total / tc_variable_totals[variable] - 1) < 0.45 + or abs(total / 1e9 - tc_variable_totals[variable] / 1e9) < 30 + ), f"{variable} ({name}) differs to tax-data by {total / tc_variable_totals[variable] - 1:.1%} ({total/1e9:.1f}bn vs {tc_variable_totals[variable]/1e9:.1f}bn)"