Merge pull request #47 from PSLmodels/dataset-update-3

Add tests on significant variables
PSLmodels · Apr 23, 2024 · 5fa304c · 5fa304c
2 parents f034ef4 + 8f07a61
commit 5fa304c
Show file tree

Hide file tree

Showing 7 changed files with 259 additions and 32 deletions.
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
@@ -16,7 +16,7 @@ jobs:
         with:
           options: ". -l 79 --check"
   test:
-    name: Test
+    name: Main tests
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -30,4 +30,19 @@ jobs:
         run: make test-lite
         env:
           POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
+  test_reweight:
+    name: Full tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install dependencies
+        run: make install
+      - name: Run tests
+        run: make test
+        env:
+          POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
 
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
@@ -33,3 +33,18 @@ jobs:
         run: make test-lite
         env:
           POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
+  test_reweight:
+    name: Full tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install dependencies
+        run: make install
+      - name: Run tests
+        run: make test
+        env:
+          POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}}
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 **/*.pyc
 **/*.csv.gz
 **/*.egg-info
-
 # Tensorboard
-tax_microdata_benchmarking/calibration
+tax_microdata_benchmarking/calibration
+
+*.ipynb
diff --git a/setup.py b/setup.py
@@ -11,5 +11,5 @@
         "pytest",
         "black",
     ],
-    extras_require={"reweight": ["torch"]},
+    extras_require={"reweight": ["torch", "tensorboard"]},
 )
diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py
@@ -1,5 +1,6 @@
-# This file should create tax_microdata.csv.gz in the root of the repo.
+import warnings
 
+warnings.filterwarnings("ignore")
 import taxcalc as tc
 from policyengine_us import Microsimulation
 from policyengine_us.model_api import *
@@ -458,6 +459,16 @@ class tc_wic_ben(TaxCalcVariableAlias):
     adds = ["wic"]
 
 
+class tc_e18500(TaxCalcVariableAlias):
+    label = "real-estate taxes paid"
+    adds = ["real_estate_taxes"]
+
+
+class tc_e19200(TaxCalcVariableAlias):
+    label = "interest expense"
+    adds = ["interest_expense"]
+
+
 class is_tax_filer(Variable):
     label = "tax filer"
     value_type = bool
@@ -633,6 +644,8 @@ def apply(self):
             tc_p23250,
             tc_wic_ben,
             is_tax_filer,
+            tc_e18500,
+            tc_e19200,
         )
 
         for variable in EXTRA_PUF_VARIABLES:
@@ -644,31 +657,34 @@ def create_flat_file(
     target_year: int = 2024,
 ) -> pd.DataFrame:
     sim = Microsimulation(reform=taxcalc_extension, dataset=source_dataset)
+    original_year = sim.dataset.time_period
 
     for variable in UPRATING_VARIABLES:
-        original_value = sim.calculate(variable, 2024)
+        original_value = sim.calculate(variable, original_year)
         uprating_factor = get_variable_uprating(
             variable,
-            source_time_period=2024,
+            source_time_period=original_year,
             target_time_period=target_year,
         )
         try:
-            sim.set_input(variable, 2024, original_value * uprating_factor)
-        except:
-            pass
+            sim.set_input(
+                variable, original_year, original_value * uprating_factor
+            )
+        except Exception as e:
+            print(f"Error uprating {variable}: {e}")
 
     df = pd.DataFrame()
 
     for variable in sim.tax_benefit_system.variables:
         if variable.startswith("tc_"):
-            df[variable[3:]] = sim.calculate(variable, 2024).values.astype(
-                np.float64
-            )
+            df[variable[3:]] = sim.calculate(
+                variable, original_year
+            ).values.astype(np.float64)
 
         if variable == "is_tax_filer":
-            df[variable] = sim.calculate(variable, 2024).values.astype(
-                np.float64
-            )
+            df[variable] = sim.calculate(
+                variable, original_year
+            ).values.astype(np.float64)
 
     # Extra quality-control checks to do with different data types, nothing major
     FILER_SUM_COLUMNS = [
@@ -752,7 +768,20 @@ def create_stacked_flat_file(
         print(
             f"Adding Tax-Calculator outputs to the flat file for {target_year}"
         )
-        input_data = tc.Records(data=stacked_file)
+        print(
+            f"Adding pass-through W2 wages to the flat file for {target_year}"
+        )
+        qbi = np.maximum(
+            0,
+            stacked_file.e00900
+            + stacked_file.e26270
+            + stacked_file.e02100
+            + stacked_file.e27200,
+        )
+        stacked_file["PT_binc_w2_wages"] = (
+            qbi * 0.357
+        )  # Solved in 2021 using adjust_qbi.py
+        input_data = tc.Records(data=stacked_file, start_year=target_year)
         policy = tc.Policy()
         simulation = tc.Calculator(records=input_data, policy=policy)
         simulation.calc_all()
@@ -774,19 +803,6 @@ def create_stacked_flat_file(
             except ValueError as e:
                 print(e)
                 print("Skipping reweighting.")
-        print(
-            f"Adding pass-through W2 wages to the flat file for {target_year}"
-        )
-        qbi = np.maximum(
-            0,
-            combined_file.e00900
-            + combined_file.e26270
-            + combined_file.e02100
-            + combined_file.e27200,
-        )
-        combined_file["PT_binc_w2_wages"] = (
-            qbi * 0.357
-        )  # Solved in 2021 using adjust_qbi.py
         return combined_file
 
     return stacked_file

diff --git a/tests/tc_variable_totals.yaml b/tests/tc_variable_totals.yaml
@@ -0,0 +1,106 @@
+DSI: 10691123.8
+EIC: 42779552.39
+FLPDYR: 421632002232.4199
+MARS: 353242147.99
+MIDR: 732548.9000000003
+PT_SSTB_income: 0.0
+PT_binc_w2_wages: 0.0
+PT_ubia_property: 0.0
+RECID: 30013006204532.71
+XTOT: 369094112.43
+a_lineno: 322311625.18
+age_head: 9853008503.07
+age_spouse: 3490884180.649999
+agi_bin: 1074293553.8700001
+blind_head: 4228589.040000001
+blind_spouse: 729846.03
+cmbtp: 114433481478.58578
+data_source: 164759286.17999995
+e00200: 10310664684015.027
+e00200p: 7246367847990.525
+e00200s: 3064296834644.767
+e00300: 137183678413.40382
+e00400: 94743506150.49107
+e00600: 405990141519.5385
+e00650: 285099022083.43726
+e00700: 48771545399.6015
+e00800: 19386714549.7095
+e00900: 429539440385.2141
+e00900p: 361522073167.05817
+e00900s: 68017367420.70369
+e01100: 7285729712.1093
+e01200: -60216853854.91481
+e01400: 552468020849.5068
+e01500: 1641191913466.6348
+e01700: 1000550811057.0841
+e02000: 927149787725.4258
+e02100: -6522807775.893997
+e02100p: -6333314477.3977
+e02100s: -189493275.0666
+e02300: 23258519971.6033
+e02400: 1491419721114.5906
+e03150: 18515980459.5357
+e03210: 15544627964.346104
+e03220: 1637964491.1388001
+e03230: 6926332334.3152
+e03240: 14309653954.092802
+e03270: 40914871477.34481
+e03290: 4905336721.6764
+e03300: 30203493523.927105
+e03400: 930321104.7312
+e03500: 11944679455.189001
+e07240: 1632108958.2028
+e07260: 2791734813.2793
+e07300: 20917948400.8361
+e07400: 3527062529.777301
+e07600: 963701335.9661999
+e09700: 37001598.5893
+e09800: 21355072.916999996
+e09900: 10709816534.181198
+e11200: 3186861701.7149005
+e17500: 235042375652.4932
+e18400: 508102620626.47064
+e18500: 364777444041.7581
+e19200: 521982783400.69037
+e19800: 272428333065.87024
+e20100: 64814165943.684494
+e20400: 213136239172.01752
+e24515: 46065816558.2304
+e24518: 12744333564.099298
+e26270: 733636357457.6952
+e27200: 11361348668.954098
+e32800: 29847384812.3065
+e58990: 4155217553.2594
+e62900: 23309665135.4052
+e87521: 49410862467.992905
+e87530: 25810835097.957104
+elderly_dependents: 40361.42
+f2441: 15395770.6
+f6251: 12829284.17
+ffpos: 236413412.58999997
+fips: 5720027978.5199995
+g20500: 8881208151.5932
+h_seq: 9532981967967.932
+housing_ben: 0.0
+k1bx14p: -30532461972.4528
+k1bx14s: 2364706719.2499
+mcaid_ben: 0.0
+mcare_ben: 0.0
+n1820: 18744271.849999998
+n21: 266997611.39000002
+n24: 77430155.88999997
+nu06: 28512728.69
+nu13: 65413305.05000002
+nu18: 94116118.29
+other_ben: 0.0
+p08000: 209646233.93069997
+p22250: -72875477887.86668
+p23250: 1188459035726.539
+pencon_p: 256173893908.90164
+pencon_s: 115418682766.19753
+s006: 385270311428.83453
+snap_ben: 0.0
+ssi_ben: 0.0
+tanf_ben: 0.0
+vet_ben: 0.0
+wic_ben: 0.0
diff --git a/tests/test_basic_flat_file.py b/tests/test_basic_flat_file.py
@@ -1,11 +1,85 @@
 import os
+import pytest
+import yaml
+from pathlib import Path
 
 test_mode = os.environ.get("TEST_MODE", "lite")
 
+FOLDER = Path(__file__).parent
+with open(FOLDER / "tc_variable_totals.yaml") as f:
+    tc_variable_totals = yaml.safe_load(f)
 
-def test_flat_file_runs():
+with open(
+    FOLDER.parent
+    / "tax_microdata_benchmarking"
+    / "taxcalc_variable_metadata.yaml"
+) as f:
+    taxcalc_variable_metadata = yaml.safe_load(f)
+
+EXEMPTED_VARIABLES = [
+    "DSI",  # Issue here but deprioritized.
+    "EIC",  # PUF-PE file almost certainly more correct by including CPS data
+    "MIDR",  # Issue here but deprioritized.
+    "RECID",  # No reason to compare.
+    "a_lineno",  # No reason to compare.
+    "agi_bin",  # No reason to compare.
+    "blind_spouse",  # Issue here but deprioritized.
+    "cmbtp",  # No reason to compare.
+    "data_source",  # No reason to compare.
+    "s006",  # No reason to compare.
+    "h_seq",  # No reason to compare.
+    "fips",  # No reason to compare.
+    "ffpos",  # No reason to compare.
+    "p23250",  # PE-PUF likely closer to truth than taxdata (needs triple check).
+    "e01200",  # Unknown but deprioritized for now.
+    "e17500",  # Unknown but deprioritized for now.
+    "e18500",  # Unknown but deprioritized for now.
+    "e02100",  # Farm income, unsure who's closer.
+]
+
+# For now, exempt names ending in "p"/"s" (the filer/spouse splits).
+EXEMPTED_VARIABLES += [
+    variable
+    for variable in taxcalc_variable_metadata["read"]
+    if variable.endswith("p") or variable.endswith("s")
+]
+
+
+def pytest_namespace():
+    return {"flat_file": None}
+
+
+@pytest.mark.dependency()
+def test_flat_file_builds():
     from tax_microdata_benchmarking.create_flat_file import (
         create_stacked_flat_file,
     )
 
-    create_stacked_flat_file(2021, reweight=test_mode == "full")
+    flat_file = create_stacked_flat_file(2021, reweight=test_mode == "full")
+
+    pytest.flat_file = flat_file
+
+
+variables_to_test = [
+    variable
+    for variable in tc_variable_totals.keys()
+    if variable not in EXEMPTED_VARIABLES
+]
+
+
+@pytest.mark.dependency(depends=["test_flat_file_builds"])
+@pytest.mark.parametrize("variable", variables_to_test)
+def test_tc_variable_totals(variable):
+    meta = taxcalc_variable_metadata["read"][variable]
+    name = meta.get("desc")
+    flat_file = pytest.flat_file
+    weight = flat_file.s006
+    total = (flat_file[variable] * weight).sum()
+    if tc_variable_totals[variable] == 0:
+        # If the taxdata file has a zero total, we'll assume the PE file is still correct.
+        return
+    # At least 45% and at least 30bn off taxdata is a failure.
+    assert (
+        abs(total / tc_variable_totals[variable] - 1) < 0.45
+        or abs(total / 1e9 - tc_variable_totals[variable] / 1e9) < 30
+    ), f"{variable} ({name}) differs to tax-data by {total / tc_variable_totals[variable] - 1:.1%} ({total/1e9:.1f}bn vs {tc_variable_totals[variable]/1e9:.1f}bn)"