From b76c3a3c3f75aa1632838771504206690a1f9f1d Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:13:53 +0100 Subject: [PATCH] Add reweighting into main module --- Makefile | 1 - .../create_flat_file.py | 31 +- .../create_summary_file.py | 57 --- tax_microdata_benchmarking/summary.md | 471 ------------------ test.ipynb | 225 ++++++--- 5 files changed, 179 insertions(+), 606 deletions(-) delete mode 100644 tax_microdata_benchmarking/create_summary_file.py delete mode 100644 tax_microdata_benchmarking/summary.md diff --git a/Makefile b/Makefile index 0cc4e6b6..8a1aa544 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,5 @@ format: flat-file: python tax_microdata_benchmarking/create_flat_file.py - python tax_microdata_benchmarking/create_summary_file.py all: format test \ No newline at end of file diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py index 99375125..2ebe51b2 100644 --- a/tax_microdata_benchmarking/create_flat_file.py +++ b/tax_microdata_benchmarking/create_flat_file.py @@ -10,6 +10,8 @@ from scipy.optimize import minimize from tax_microdata_benchmarking.adjust_qbi import add_pt_w2_wages from tax_microdata_benchmarking.reweight import reweight +from microdf import MicroDataFrame +import numpy as np UPRATING_VARIABLES = [ "employment_income", @@ -780,8 +782,31 @@ def create_stacked_flat_file( return stacked_file +def summary_analytics(df): + df = MicroDataFrame(df.copy(), weights="s006") + + variables = [] + sums = [] + nonzero_counts = [] + + for variable in df.columns: + variables.append(variable) + sums.append((df[variable].sum() / 1e9).round(1)) + nonzero_counts.append(((df[variable] > 0).sum() / 1e6).round(1)) + + summary_df = pd.DataFrame( + { + "Variable": variables, + "Sum (bn)": sums, + "Nonzero count (m)": nonzero_counts, + } + ) + + return summary_df + + if __name__ == "__main__": - for target_year in [2021]: + for target_year in [2015, 2021, 2026]: stacked_file = create_stacked_flat_file( target_year=target_year, use_puf=True ) @@ -790,3 +815,7 @@ def create_stacked_flat_file( index=False, compression="gzip", ) + analytics_df = summary_analytics(stacked_file) + analytics_df.to_csv( + f"tax_microdata_{target_year}_analytics.csv", index=False + ) diff --git a/tax_microdata_benchmarking/create_summary_file.py b/tax_microdata_benchmarking/create_summary_file.py deleted file mode 100644 index d0b740d3..00000000 --- a/tax_microdata_benchmarking/create_summary_file.py +++ /dev/null @@ -1,57 +0,0 @@ -from tax_microdata_benchmarking.create_flat_file import taxcalc_extension -from policyengine_us import Simulation -import pandas as pd -from pathlib import Path -import yaml - -FOLDER = Path(__file__).parent -sim = Simulation(reform=taxcalc_extension, situation={"person_id": 1}) - -with open(FOLDER / "taxcalc_variable_metadata.yaml") as file: - taxcalc_variable_metadata = yaml.safe_load(file) - -tc_variables = taxcalc_variable_metadata["read"] -tc_puf_variables = [ - key - for key, data in tc_variables.items() - if "taxdata_puf" in data["availability"] -] - -summary_file = """# PolicyEngine US Tax-Calculator flat file - -This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file. -""" - -added_columns = [] -added_unnecessary_columns = [] - -variables = sim.tax_benefit_system.variables -for variable in variables.values(): - if variable.name.startswith("tc_"): - tc_name = variable.name[3:] - if tc_name in tc_puf_variables: - added_columns.append(tc_name) - else: - added_unnecessary_columns.append(tc_name) - - -summary_file += f"\nThe flat file currently has {len(added_columns)} out of 68 ({len(added_columns) / len(tc_puf_variables):.0%}) columns in the Tax-Calculator PUF microdata file.\n\n" -missing_columns = [ - column for column in tc_puf_variables if column not in added_columns -] -summary_file += f"Missing columns: \n- " + "\n- ".join(missing_columns) + "\n" - -summary_file += ( - f"\nExtra, non-taxdata-PUF columns: \n- " - + "\n- ".join(added_unnecessary_columns) - + "\n" -) - -for variable in variables.values(): - if variable.name.startswith("tc_"): - summary_file += f"\n## {variable.name[3:]}\n\n{variable.label}\n\n" - -FOLDER = Path(__file__).parent - -with open(FOLDER / "summary.md", "w") as file: - file.write(summary_file) diff --git a/tax_microdata_benchmarking/summary.md b/tax_microdata_benchmarking/summary.md deleted file mode 100644 index ba5e079a..00000000 --- a/tax_microdata_benchmarking/summary.md +++ /dev/null @@ -1,471 +0,0 @@ -# PolicyEngine US Tax-Calculator flat file - -This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file. - -The flat file currently has 83 out of 68 (88%) columns in the Tax-Calculator PUF microdata file. - -Missing columns: -- MIDR -- a_lineno -- agi_bin -- cmbtp -- data_source -- f6251 -- ffpos -- g20500 -- h_seq -- k1bx14p -- k1bx14s - -Extra, non-taxdata-PUF columns: -- ssi_ben -- mcaid_ben -- tanf_ben -- snap_ben -- housing_ben -- wic_ben - -## RECID - -record ID - - -## MARS - -filing status - - -## e00200p - -wages less pension contributions (filer) - - -## e00200s - -wages less pension contributions (spouse) - - -## e00200 - -wages less pension contributions - - -## age_head - -age of head of tax unit - - -## age_spouse - -age of spouse of head of tax unit - - -## blind_head - -blindness of head of tax unit - - -## blind_spouse - -blindness of spouse of head of tax unit - - -## fips - -FIPS state code - - -## s006 - -tax unit weight - - -## FLPDYR - -tax year to calculate for - - -## EIC - -EITC-qualifying children - - -## nu18 - -number of people under 18 - - -## n1820 - -number of people 18-20 - - -## nu13 - -number of people under 13 - - -## nu06 - -number of people under 6 - - -## n24 - -number of people eligible for the CTC - - -## elderly_dependents - -number of elderly dependents - - -## f2441 - -CDCC-qualifying children - - -## e00900p - -self-employment income - - -## e00900s - -self-employment income (spouse) - - -## e00900 - -self-employment income - - -## e02100p - -farm income - - -## e02100s - -farm income (spouse) - - -## e02100 - -farm income - - -## e01500 - -pension income - - -## e00800 - -alimony income - - -## e02400 - -social security income - - -## e02300 - -unemployment compensation - - -## XTOT - -total exemptions - - -## ssi_ben - -SSI - - -## mcaid_ben - -Medicaid - - -## tanf_ben - -TANF - - -## snap_ben - -SNAP - - -## housing_ben - -housing subsidy - - -## DSI - -dependent filer - - -## n21 - -number of people 21 or over - - -## e00600 - -ordinary dividends included in AGI - - -## e18400 - -State income tax - - -## e00650 - -qualified dividends - - -## e00300 - -taxable interest income - - -## e00400 - -tax-exempt interest income - - -## e01700 - -taxable pension income - - -## e01100 - -e01100 - - -## e01400 - -taxable IRA distributions - - -## e03270 - -e03270 - - -## e32800 - -e32800 - - -## e17500 - -medical and dental expenses - - -## pencon_p - -pension contributions (filer) - - -## pencon_s - -pension contributions (spouse) - - -## e03150 - -deductible IRA contributions - - -## e03210 - -student loan interest - - -## p22250 - -net short-term capital gains - - -## p23250 - -net long-term capital gains - - -## wic_ben - -WIC - - -## e02000 - -e02000 - - -## e26270 - -e26270 - - -## e19200 - -e19200 - - -## e18500 - -e18500 - - -## e19800 - -e19800 - - -## e20400 - -e20400 - - -## e20100 - -e20100 - - -## e00700 - -e00700 - - -## e24515 - -e24515 - - -## e03300 - -e03300 - - -## e07300 - -e07300 - - -## e62900 - -e62900 - - -## e87530 - -e87530 - - -## e03240 - -e03240 - - -## e01200 - -e01200 - - -## e24518 - -e24518 - - -## e09900 - -e09900 - - -## e27200 - -e27200 - - -## e03290 - -e03290 - - -## e58990 - -e58990 - - -## e03230 - -e03230 - - -## e07400 - -e07400 - - -## e11200 - -e11200 - - -## e07260 - -e07260 - - -## e07240 - -e07240 - - -## e07600 - -e07600 - - -## e03220 - -e03220 - - -## p08000 - -p08000 - - -## e03400 - -e03400 - - -## e09800 - -e09800 - - -## e09700 - -e09700 - - -## e03500 - -e03500 - - -## e87521 - -e87521 - diff --git a/test.ipynb b/test.ipynb index d6570e42..d3db2180 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -13,100 +13,173 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scale: 1.0, total: 1235.6420345077538, deviation: 1029.8420345077539\n", - "Scale: 1.0000000149011612, total: 1235.6420529202553, deviation: 1029.8420529202554\n", - "Scale: -0.009999999999999787, total: -12.356420345077279, deviation: -218.1564203450773\n", - "Scale: -0.009999985098838593, total: -12.356401932576146, deviation: -218.15640193257616\n", - "Scale: 0.16655307989653162, total: 205.79998649688287, deviation: -1.3503117145319266e-05\n", - "Scale: 0.1665530947976928, total: 205.80000490938406, deviation: 4.9093840459590865e-06\n" - ] - }, + "data": { + "text/plain": [ + "173.07107828276648" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df.c00100 > 0).sum() / 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VariableSum (bn)Nonzero count (m)
0index12152.3214.0
1RECID828073.8214.0
2MARS0.4214.0
3e00200p7492.2140.0
4e00200s2440.551.3
............
206othertaxes97.513.2
207odc0.00.4
208c5966048.927.1
209f62510.00.0
210codtc_limited39.429.0
\n", + "

211 rows × 3 columns

\n", + "
" + ], "text/plain": [ - "0 0.0\n", - "1 0.0\n", - "2 0.0\n", - "3 0.0\n", - "4 0.0\n", - " ... \n", - "233384 0.0\n", - "233385 0.0\n", - "233386 0.0\n", - "233387 0.0\n", - "233388 0.0\n", - "Name: PT_binc_w2_wages, Length: 233389, dtype: float64" + " Variable Sum (bn) Nonzero count (m)\n", + "0 index 12152.3 214.0\n", + "1 RECID 828073.8 214.0\n", + "2 MARS 0.4 214.0\n", + "3 e00200p 7492.2 140.0\n", + "4 e00200s 2440.5 51.3\n", + ".. ... ... ...\n", + "206 othertaxes 97.5 13.2\n", + "207 odc 0.0 0.4\n", + "208 c59660 48.9 27.1\n", + "209 f6251 0.0 0.0\n", + "210 codtc_limited 39.4 29.0\n", + "\n", + "[211 rows x 3 columns]" ] }, - "execution_count": 27, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from microdf import MicroDataFrame\n", "import numpy as np\n", - "import pandas as pd\n", - "from scipy.optimize import minimize\n", - "\n", - "def add_pt_w2_wages(df, time_period: int, verbose: bool = True):\n", - " \"\"\"\n", - " Add pass-through W2 wages to the flat file.\n", "\n", - " Args:\n", - " df (pd.DataFrame): The DataFrame to add W2 wages to.\n", + "df = MicroDataFrame(df, weights=\"s006\")\n", "\n", - " Returns:\n", - " pd.DataFrame: The DataFrame with W2 wages added.\n", - " \"\"\"\n", - " qbid_tax_expenditures = { # From JCT TE reports 2018- and 2023-\n", - " 2015: 0,\n", - " 2016: 0,\n", - " 2017: 0,\n", - " 2018: 33.2,\n", - " 2019: 48.6,\n", - " 2020: 56.3,\n", - " 2021: 59.0,\n", - " 2022: 61.9,\n", - " 2023: 55.7,\n", - " 2024: 57.6,\n", - " 2025: 60.9,\n", - " 2026: 24.9,\n", - " }\n", + "variables = []\n", + "sums = []\n", + "nonzero_counts = []\n", "\n", - " QBID_TOTAL_21 = 205.8\n", + "for variable in df.columns:\n", + " variables.append(variable)\n", + " sums.append((df[variable].sum() / 1e9).round(1))\n", + " nonzero_counts.append(((df[variable] > 0).sum() / 1e6).round(1))\n", "\n", - " target = QBID_TOTAL_21 * qbid_tax_expenditures[time_period] / qbid_tax_expenditures[2021]\n", - "\n", - " qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)\n", - "\n", - " # Solve for scale to match the tax expenditure\n", - "\n", - " def expenditure_loss(scale):\n", - " res = (qbi * df.s006 * scale[0]).sum()/1e9\n", - " deviation = (res - target)\n", - " if verbose:\n", - " print(f\"Scale: {scale[0]}, total: {res}, deviation: {deviation}\")\n", - " return deviation ** 2\n", - " \n", - " \n", - " scale = minimize(\n", - " expenditure_loss,\n", - " 1,\n", - " tol=1,\n", - " ).x[0]\n", - "\n", - " df[\"PT_binc_w2_wages\"] = qbi * scale\n", - " \n", - " return df\n", + "summary_df = pd.DataFrame(\n", + " {\n", + " \"Variable\": variables,\n", + " \"Sum (bn)\": sums,\n", + " \"Nonzero count (m)\": nonzero_counts,\n", + " }\n", + ")\n", "\n", - "add_pt_w2_wages(df, 2021).PT_binc_w2_wages" + "summary_df" ] } ],