From b76c3a3c3f75aa1632838771504206690a1f9f1d Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:13:53 +0100 Subject: [PATCH] Add reweighting into main module --- Makefile | 1 - .../create_flat_file.py | 31 +- .../create_summary_file.py | 57 --- tax_microdata_benchmarking/summary.md | 471 ------------------ test.ipynb | 225 ++++++--- 5 files changed, 179 insertions(+), 606 deletions(-) delete mode 100644 tax_microdata_benchmarking/create_summary_file.py delete mode 100644 tax_microdata_benchmarking/summary.md diff --git a/Makefile b/Makefile index 0cc4e6b6..8a1aa544 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,5 @@ format: flat-file: python tax_microdata_benchmarking/create_flat_file.py - python tax_microdata_benchmarking/create_summary_file.py all: format test \ No newline at end of file diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py index 99375125..2ebe51b2 100644 --- a/tax_microdata_benchmarking/create_flat_file.py +++ b/tax_microdata_benchmarking/create_flat_file.py @@ -10,6 +10,8 @@ from scipy.optimize import minimize from tax_microdata_benchmarking.adjust_qbi import add_pt_w2_wages from tax_microdata_benchmarking.reweight import reweight +from microdf import MicroDataFrame +import numpy as np UPRATING_VARIABLES = [ "employment_income", @@ -780,8 +782,31 @@ def create_stacked_flat_file( return stacked_file +def summary_analytics(df): + df = MicroDataFrame(df.copy(), weights="s006") + + variables = [] + sums = [] + nonzero_counts = [] + + for variable in df.columns: + variables.append(variable) + sums.append((df[variable].sum() / 1e9).round(1)) + nonzero_counts.append(((df[variable] > 0).sum() / 1e6).round(1)) + + summary_df = pd.DataFrame( + { + "Variable": variables, + "Sum (bn)": sums, + "Nonzero count (m)": nonzero_counts, + } + ) + + return summary_df + + if __name__ == "__main__": - for target_year in [2021]: + for target_year in [2015, 2021, 2026]: stacked_file = create_stacked_flat_file( target_year=target_year, use_puf=True ) @@ -790,3 +815,7 @@ def create_stacked_flat_file( index=False, compression="gzip", ) + analytics_df = summary_analytics(stacked_file) + analytics_df.to_csv( + f"tax_microdata_{target_year}_analytics.csv", index=False + ) diff --git a/tax_microdata_benchmarking/create_summary_file.py b/tax_microdata_benchmarking/create_summary_file.py deleted file mode 100644 index d0b740d3..00000000 --- a/tax_microdata_benchmarking/create_summary_file.py +++ /dev/null @@ -1,57 +0,0 @@ -from tax_microdata_benchmarking.create_flat_file import taxcalc_extension -from policyengine_us import Simulation -import pandas as pd -from pathlib import Path -import yaml - -FOLDER = Path(__file__).parent -sim = Simulation(reform=taxcalc_extension, situation={"person_id": 1}) - -with open(FOLDER / "taxcalc_variable_metadata.yaml") as file: - taxcalc_variable_metadata = yaml.safe_load(file) - -tc_variables = taxcalc_variable_metadata["read"] -tc_puf_variables = [ - key - for key, data in tc_variables.items() - if "taxdata_puf" in data["availability"] -] - -summary_file = """# PolicyEngine US Tax-Calculator flat file - -This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file. -""" - -added_columns = [] -added_unnecessary_columns = [] - -variables = sim.tax_benefit_system.variables -for variable in variables.values(): - if variable.name.startswith("tc_"): - tc_name = variable.name[3:] - if tc_name in tc_puf_variables: - added_columns.append(tc_name) - else: - added_unnecessary_columns.append(tc_name) - - -summary_file += f"\nThe flat file currently has {len(added_columns)} out of 68 ({len(added_columns) / len(tc_puf_variables):.0%}) columns in the Tax-Calculator PUF microdata file.\n\n" -missing_columns = [ - column for column in tc_puf_variables if column not in added_columns -] -summary_file += f"Missing columns: \n- " + "\n- ".join(missing_columns) + "\n" - -summary_file += ( - f"\nExtra, non-taxdata-PUF columns: \n- " - + "\n- ".join(added_unnecessary_columns) - + "\n" -) - -for variable in variables.values(): - if variable.name.startswith("tc_"): - summary_file += f"\n## {variable.name[3:]}\n\n{variable.label}\n\n" - -FOLDER = Path(__file__).parent - -with open(FOLDER / "summary.md", "w") as file: - file.write(summary_file) diff --git a/tax_microdata_benchmarking/summary.md b/tax_microdata_benchmarking/summary.md deleted file mode 100644 index ba5e079a..00000000 --- a/tax_microdata_benchmarking/summary.md +++ /dev/null @@ -1,471 +0,0 @@ -# PolicyEngine US Tax-Calculator flat file - -This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file. - -The flat file currently has 83 out of 68 (88%) columns in the Tax-Calculator PUF microdata file. - -Missing columns: -- MIDR -- a_lineno -- agi_bin -- cmbtp -- data_source -- f6251 -- ffpos -- g20500 -- h_seq -- k1bx14p -- k1bx14s - -Extra, non-taxdata-PUF columns: -- ssi_ben -- mcaid_ben -- tanf_ben -- snap_ben -- housing_ben -- wic_ben - -## RECID - -record ID - - -## MARS - -filing status - - -## e00200p - -wages less pension contributions (filer) - - -## e00200s - -wages less pension contributions (spouse) - - -## e00200 - -wages less pension contributions - - -## age_head - -age of head of tax unit - - -## age_spouse - -age of spouse of head of tax unit - - -## blind_head - -blindness of head of tax unit - - -## blind_spouse - -blindness of spouse of head of tax unit - - -## fips - -FIPS state code - - -## s006 - -tax unit weight - - -## FLPDYR - -tax year to calculate for - - -## EIC - -EITC-qualifying children - - -## nu18 - -number of people under 18 - - -## n1820 - -number of people 18-20 - - -## nu13 - -number of people under 13 - - -## nu06 - -number of people under 6 - - -## n24 - -number of people eligible for the CTC - - -## elderly_dependents - -number of elderly dependents - - -## f2441 - -CDCC-qualifying children - - -## e00900p - -self-employment income - - -## e00900s - -self-employment income (spouse) - - -## e00900 - -self-employment income - - -## e02100p - -farm income - - -## e02100s - -farm income (spouse) - - -## e02100 - -farm income - - -## e01500 - -pension income - - -## e00800 - -alimony income - - -## e02400 - -social security income - - -## e02300 - -unemployment compensation - - -## XTOT - -total exemptions - - -## ssi_ben - -SSI - - -## mcaid_ben - -Medicaid - - -## tanf_ben - -TANF - - -## snap_ben - -SNAP - - -## housing_ben - -housing subsidy - - -## DSI - -dependent filer - - -## n21 - -number of people 21 or over - - -## e00600 - -ordinary dividends included in AGI - - -## e18400 - -State income tax - - -## e00650 - -qualified dividends - - -## e00300 - -taxable interest income - - -## e00400 - -tax-exempt interest income - - -## e01700 - -taxable pension income - - -## e01100 - -e01100 - - -## e01400 - -taxable IRA distributions - - -## e03270 - -e03270 - - -## e32800 - -e32800 - - -## e17500 - -medical and dental expenses - - -## pencon_p - -pension contributions (filer) - - -## pencon_s - -pension contributions (spouse) - - -## e03150 - -deductible IRA contributions - - -## e03210 - -student loan interest - - -## p22250 - -net short-term capital gains - - -## p23250 - -net long-term capital gains - - -## wic_ben - -WIC - - -## e02000 - -e02000 - - -## e26270 - -e26270 - - -## e19200 - -e19200 - - -## e18500 - -e18500 - - -## e19800 - -e19800 - - -## e20400 - -e20400 - - -## e20100 - -e20100 - - -## e00700 - -e00700 - - -## e24515 - -e24515 - - -## e03300 - -e03300 - - -## e07300 - -e07300 - - -## e62900 - -e62900 - - -## e87530 - -e87530 - - -## e03240 - -e03240 - - -## e01200 - -e01200 - - -## e24518 - -e24518 - - -## e09900 - -e09900 - - -## e27200 - -e27200 - - -## e03290 - -e03290 - - -## e58990 - -e58990 - - -## e03230 - -e03230 - - -## e07400 - -e07400 - - -## e11200 - -e11200 - - -## e07260 - -e07260 - - -## e07240 - -e07240 - - -## e07600 - -e07600 - - -## e03220 - -e03220 - - -## p08000 - -p08000 - - -## e03400 - -e03400 - - -## e09800 - -e09800 - - -## e09700 - -e09700 - - -## e03500 - -e03500 - - -## e87521 - -e87521 - diff --git a/test.ipynb b/test.ipynb index d6570e42..d3db2180 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 17, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -13,100 +13,173 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scale: 1.0, total: 1235.6420345077538, deviation: 1029.8420345077539\n", - "Scale: 1.0000000149011612, total: 1235.6420529202553, deviation: 1029.8420529202554\n", - "Scale: -0.009999999999999787, total: -12.356420345077279, deviation: -218.1564203450773\n", - "Scale: -0.009999985098838593, total: -12.356401932576146, deviation: -218.15640193257616\n", - "Scale: 0.16655307989653162, total: 205.79998649688287, deviation: -1.3503117145319266e-05\n", - "Scale: 0.1665530947976928, total: 205.80000490938406, deviation: 4.9093840459590865e-06\n" - ] - }, + "data": { + "text/plain": [ + "173.07107828276648" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df.c00100 > 0).sum() / 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ { "data": { + "text/html": [ + "
\n", + " | Variable | \n", + "Sum (bn) | \n", + "Nonzero count (m) | \n", + "
---|---|---|---|
0 | \n", + "index | \n", + "12152.3 | \n", + "214.0 | \n", + "
1 | \n", + "RECID | \n", + "828073.8 | \n", + "214.0 | \n", + "
2 | \n", + "MARS | \n", + "0.4 | \n", + "214.0 | \n", + "
3 | \n", + "e00200p | \n", + "7492.2 | \n", + "140.0 | \n", + "
4 | \n", + "e00200s | \n", + "2440.5 | \n", + "51.3 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
206 | \n", + "othertaxes | \n", + "97.5 | \n", + "13.2 | \n", + "
207 | \n", + "odc | \n", + "0.0 | \n", + "0.4 | \n", + "
208 | \n", + "c59660 | \n", + "48.9 | \n", + "27.1 | \n", + "
209 | \n", + "f6251 | \n", + "0.0 | \n", + "0.0 | \n", + "
210 | \n", + "codtc_limited | \n", + "39.4 | \n", + "29.0 | \n", + "
211 rows × 3 columns
\n", + "