From b76c3a3c3f75aa1632838771504206690a1f9f1d Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com>
Date: Wed, 17 Apr 2024 14:13:53 +0100
Subject: [PATCH] Add reweighting into main module

---
 Makefile                                      |   1 -
 .../create_flat_file.py                       |  31 +-
 .../create_summary_file.py                    |  57 ---
 tax_microdata_benchmarking/summary.md         | 471 ------------------
 test.ipynb                                    | 225 ++++++---
 5 files changed, 179 insertions(+), 606 deletions(-)
 delete mode 100644 tax_microdata_benchmarking/create_summary_file.py
 delete mode 100644 tax_microdata_benchmarking/summary.md

diff --git a/Makefile b/Makefile
index 0cc4e6b6..8a1aa544 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,5 @@ format:
 
 flat-file:
 	python tax_microdata_benchmarking/create_flat_file.py
-	python tax_microdata_benchmarking/create_summary_file.py
 
 all: format test
\ No newline at end of file
diff --git a/tax_microdata_benchmarking/create_flat_file.py b/tax_microdata_benchmarking/create_flat_file.py
index 99375125..2ebe51b2 100644
--- a/tax_microdata_benchmarking/create_flat_file.py
+++ b/tax_microdata_benchmarking/create_flat_file.py
@@ -10,6 +10,8 @@
 from scipy.optimize import minimize
 from tax_microdata_benchmarking.adjust_qbi import add_pt_w2_wages
 from tax_microdata_benchmarking.reweight import reweight
+from microdf import MicroDataFrame
+import numpy as np
 
 UPRATING_VARIABLES = [
     "employment_income",
@@ -780,8 +782,31 @@ def create_stacked_flat_file(
     return stacked_file
 
 
+def summary_analytics(df):
+    """Summarise every column of a flat file, weighted by ``s006``.
+
+    Args:
+        df (pd.DataFrame): The flat file; must contain an ``s006`` column.
+
+    Returns:
+        pd.DataFrame: One row per column with its weighted sum in billions
+        and the weighted number of nonzero records in millions.
+    """
+    mdf = MicroDataFrame(df.copy(), weights="s006")
+    cols = list(mdf.columns)
+    # `!= 0` rather than `> 0`: records holding losses are nonzero too.
+    counts = [((mdf[c] != 0).sum() / 1e6).round(1) for c in cols]
+    return pd.DataFrame(
+        {
+            "Variable": cols,
+            "Sum (bn)": [(mdf[c].sum() / 1e9).round(1) for c in cols],
+            "Nonzero count (m)": counts,
+        }
+    )
+
+
 if __name__ == "__main__":
-    for target_year in [2021]:
+    for target_year in [2015, 2021, 2026]:
         stacked_file = create_stacked_flat_file(
             target_year=target_year, use_puf=True
         )
@@ -790,3 +815,7 @@ def create_stacked_flat_file(
             index=False,
             compression="gzip",
         )
+        analytics_df = summary_analytics(stacked_file)
+        analytics_df.to_csv(
+            f"tax_microdata_{target_year}_analytics.csv", index=False
+        )
diff --git a/tax_microdata_benchmarking/create_summary_file.py b/tax_microdata_benchmarking/create_summary_file.py
deleted file mode 100644
index d0b740d3..00000000
--- a/tax_microdata_benchmarking/create_summary_file.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from tax_microdata_benchmarking.create_flat_file import taxcalc_extension
-from policyengine_us import Simulation
-import pandas as pd
-from pathlib import Path
-import yaml
-
-FOLDER = Path(__file__).parent
-sim = Simulation(reform=taxcalc_extension, situation={"person_id": 1})
-
-with open(FOLDER / "taxcalc_variable_metadata.yaml") as file:
-    taxcalc_variable_metadata = yaml.safe_load(file)
-
-tc_variables = taxcalc_variable_metadata["read"]
-tc_puf_variables = [
-    key
-    for key, data in tc_variables.items()
-    if "taxdata_puf" in data["availability"]
-]
-
-summary_file = """# PolicyEngine US Tax-Calculator flat file
-
-This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file.
-"""
-
-added_columns = []
-added_unnecessary_columns = []
-
-variables = sim.tax_benefit_system.variables
-for variable in variables.values():
-    if variable.name.startswith("tc_"):
-        tc_name = variable.name[3:]
-        if tc_name in tc_puf_variables:
-            added_columns.append(tc_name)
-        else:
-            added_unnecessary_columns.append(tc_name)
-
-
-summary_file += f"\nThe flat file currently has {len(added_columns)} out of 68 ({len(added_columns) / len(tc_puf_variables):.0%}) columns in the Tax-Calculator PUF microdata file.\n\n"
-missing_columns = [
-    column for column in tc_puf_variables if column not in added_columns
-]
-summary_file += f"Missing columns: \n- " + "\n- ".join(missing_columns) + "\n"
-
-summary_file += (
-    f"\nExtra, non-taxdata-PUF columns: \n- "
-    + "\n- ".join(added_unnecessary_columns)
-    + "\n"
-)
-
-for variable in variables.values():
-    if variable.name.startswith("tc_"):
-        summary_file += f"\n## {variable.name[3:]}\n\n{variable.label}\n\n"
-
-FOLDER = Path(__file__).parent
-
-with open(FOLDER / "summary.md", "w") as file:
-    file.write(summary_file)
diff --git a/tax_microdata_benchmarking/summary.md b/tax_microdata_benchmarking/summary.md
deleted file mode 100644
index ba5e079a..00000000
--- a/tax_microdata_benchmarking/summary.md
+++ /dev/null
@@ -1,471 +0,0 @@
-# PolicyEngine US Tax-Calculator flat file
-
-This file contains a summary of the Tax-Calculator microdata file. It is intended to be used as a reference for the Tax-Calculator microdata file.
-
-The flat file currently has 83 out of 68 (88%) columns in the Tax-Calculator PUF microdata file.
-
-Missing columns: 
-- MIDR
-- a_lineno
-- agi_bin
-- cmbtp
-- data_source
-- f6251
-- ffpos
-- g20500
-- h_seq
-- k1bx14p
-- k1bx14s
-
-Extra, non-taxdata-PUF columns: 
-- ssi_ben
-- mcaid_ben
-- tanf_ben
-- snap_ben
-- housing_ben
-- wic_ben
-
-## RECID
-
-record ID
-
-
-## MARS
-
-filing status
-
-
-## e00200p
-
-wages less pension contributions (filer)
-
-
-## e00200s
-
-wages less pension contributions (spouse)
-
-
-## e00200
-
-wages less pension contributions
-
-
-## age_head
-
-age of head of tax unit
-
-
-## age_spouse
-
-age of spouse of head of tax unit
-
-
-## blind_head
-
-blindness of head of tax unit
-
-
-## blind_spouse
-
-blindness of spouse of head of tax unit
-
-
-## fips
-
-FIPS state code
-
-
-## s006
-
-tax unit weight
-
-
-## FLPDYR
-
-tax year to calculate for
-
-
-## EIC
-
-EITC-qualifying children
-
-
-## nu18
-
-number of people under 18
-
-
-## n1820
-
-number of people 18-20
-
-
-## nu13
-
-number of people under 13
-
-
-## nu06
-
-number of people under 6
-
-
-## n24
-
-number of people eligible for the CTC
-
-
-## elderly_dependents
-
-number of elderly dependents
-
-
-## f2441
-
-CDCC-qualifying children
-
-
-## e00900p
-
-self-employment income
-
-
-## e00900s
-
-self-employment income (spouse)
-
-
-## e00900
-
-self-employment income
-
-
-## e02100p
-
-farm income
-
-
-## e02100s
-
-farm income (spouse)
-
-
-## e02100
-
-farm income
-
-
-## e01500
-
-pension income
-
-
-## e00800
-
-alimony income
-
-
-## e02400
-
-social security income
-
-
-## e02300
-
-unemployment compensation
-
-
-## XTOT
-
-total exemptions
-
-
-## ssi_ben
-
-SSI
-
-
-## mcaid_ben
-
-Medicaid
-
-
-## tanf_ben
-
-TANF
-
-
-## snap_ben
-
-SNAP
-
-
-## housing_ben
-
-housing subsidy
-
-
-## DSI
-
-dependent filer
-
-
-## n21
-
-number of people 21 or over
-
-
-## e00600
-
-ordinary dividends included in AGI
-
-
-## e18400
-
-State income tax
-
-
-## e00650
-
-qualified dividends
-
-
-## e00300
-
-taxable interest income
-
-
-## e00400
-
-tax-exempt interest income
-
-
-## e01700
-
-taxable pension income
-
-
-## e01100
-
-e01100
-
-
-## e01400
-
-taxable IRA distributions
-
-
-## e03270
-
-e03270
-
-
-## e32800
-
-e32800
-
-
-## e17500
-
-medical and dental expenses
-
-
-## pencon_p
-
-pension contributions (filer)
-
-
-## pencon_s
-
-pension contributions (spouse)
-
-
-## e03150
-
-deductible IRA contributions
-
-
-## e03210
-
-student loan interest
-
-
-## p22250
-
-net short-term capital gains
-
-
-## p23250
-
-net long-term capital gains
-
-
-## wic_ben
-
-WIC
-
-
-## e02000
-
-e02000
-
-
-## e26270
-
-e26270
-
-
-## e19200
-
-e19200
-
-
-## e18500
-
-e18500
-
-
-## e19800
-
-e19800
-
-
-## e20400
-
-e20400
-
-
-## e20100
-
-e20100
-
-
-## e00700
-
-e00700
-
-
-## e24515
-
-e24515
-
-
-## e03300
-
-e03300
-
-
-## e07300
-
-e07300
-
-
-## e62900
-
-e62900
-
-
-## e87530
-
-e87530
-
-
-## e03240
-
-e03240
-
-
-## e01200
-
-e01200
-
-
-## e24518
-
-e24518
-
-
-## e09900
-
-e09900
-
-
-## e27200
-
-e27200
-
-
-## e03290
-
-e03290
-
-
-## e58990
-
-e58990
-
-
-## e03230
-
-e03230
-
-
-## e07400
-
-e07400
-
-
-## e11200
-
-e11200
-
-
-## e07260
-
-e07260
-
-
-## e07240
-
-e07240
-
-
-## e07600
-
-e07600
-
-
-## e03220
-
-e03220
-
-
-## p08000
-
-p08000
-
-
-## e03400
-
-e03400
-
-
-## e09800
-
-e09800
-
-
-## e09700
-
-e09700
-
-
-## e03500
-
-e03500
-
-
-## e87521
-
-e87521
-
diff --git a/test.ipynb b/test.ipynb
index d6570e42..d3db2180 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,100 +13,173 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Scale: 1.0, total: 1235.6420345077538, deviation: 1029.8420345077539\n",
-      "Scale: 1.0000000149011612, total: 1235.6420529202553, deviation: 1029.8420529202554\n",
-      "Scale: -0.009999999999999787, total: -12.356420345077279, deviation: -218.1564203450773\n",
-      "Scale: -0.009999985098838593, total: -12.356401932576146, deviation: -218.15640193257616\n",
-      "Scale: 0.16655307989653162, total: 205.79998649688287, deviation: -1.3503117145319266e-05\n",
-      "Scale: 0.1665530947976928, total: 205.80000490938406, deviation: 4.9093840459590865e-06\n"
-     ]
-    },
+     "data": {
+      "text/plain": [
+       "173.07107828276648"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(df.c00100 > 0).sum() / 1e6"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Variable</th>\n",
+       "      <th>Sum (bn)</th>\n",
+       "      <th>Nonzero count (m)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>index</td>\n",
+       "      <td>12152.3</td>\n",
+       "      <td>214.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>RECID</td>\n",
+       "      <td>828073.8</td>\n",
+       "      <td>214.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>MARS</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>214.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>e00200p</td>\n",
+       "      <td>7492.2</td>\n",
+       "      <td>140.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>e00200s</td>\n",
+       "      <td>2440.5</td>\n",
+       "      <td>51.3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>206</th>\n",
+       "      <td>othertaxes</td>\n",
+       "      <td>97.5</td>\n",
+       "      <td>13.2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>207</th>\n",
+       "      <td>odc</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>208</th>\n",
+       "      <td>c59660</td>\n",
+       "      <td>48.9</td>\n",
+       "      <td>27.1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>209</th>\n",
+       "      <td>f6251</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>210</th>\n",
+       "      <td>codtc_limited</td>\n",
+       "      <td>39.4</td>\n",
+       "      <td>29.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>211 rows × 3 columns</p>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "0         0.0\n",
-       "1         0.0\n",
-       "2         0.0\n",
-       "3         0.0\n",
-       "4         0.0\n",
-       "         ... \n",
-       "233384    0.0\n",
-       "233385    0.0\n",
-       "233386    0.0\n",
-       "233387    0.0\n",
-       "233388    0.0\n",
-       "Name: PT_binc_w2_wages, Length: 233389, dtype: float64"
+       "          Variable  Sum (bn)  Nonzero count (m)\n",
+       "0            index   12152.3              214.0\n",
+       "1            RECID  828073.8              214.0\n",
+       "2             MARS       0.4              214.0\n",
+       "3          e00200p    7492.2              140.0\n",
+       "4          e00200s    2440.5               51.3\n",
+       "..             ...       ...                ...\n",
+       "206     othertaxes      97.5               13.2\n",
+       "207            odc       0.0                0.4\n",
+       "208         c59660      48.9               27.1\n",
+       "209          f6251       0.0                0.0\n",
+       "210  codtc_limited      39.4               29.0\n",
+       "\n",
+       "[211 rows x 3 columns]"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 35,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "from microdf import MicroDataFrame\n",
     "import numpy as np\n",
-    "import pandas as pd\n",
-    "from scipy.optimize import minimize\n",
-    "\n",
-    "def add_pt_w2_wages(df, time_period: int, verbose: bool = True):\n",
-    "    \"\"\"\n",
-    "    Add pass-through W2 wages to the flat file.\n",
     "\n",
-    "    Args:\n",
-    "        df (pd.DataFrame): The DataFrame to add W2 wages to.\n",
+    "df = MicroDataFrame(df, weights=\"s006\")\n",
     "\n",
-    "    Returns:\n",
-    "        pd.DataFrame: The DataFrame with W2 wages added.\n",
-    "    \"\"\"\n",
-    "    qbid_tax_expenditures = { # From JCT TE reports 2018- and 2023-\n",
-    "        2015: 0,\n",
-    "        2016: 0,\n",
-    "        2017: 0,\n",
-    "        2018: 33.2,\n",
-    "        2019: 48.6,\n",
-    "        2020: 56.3,\n",
-    "        2021: 59.0,\n",
-    "        2022: 61.9,\n",
-    "        2023: 55.7,\n",
-    "        2024: 57.6,\n",
-    "        2025: 60.9,\n",
-    "        2026: 24.9,\n",
-    "    }\n",
+    "variables = []\n",
+    "sums = []\n",
+    "nonzero_counts = []\n",
     "\n",
-    "    QBID_TOTAL_21 = 205.8\n",
+    "for variable in df.columns:\n",
+    "    variables.append(variable)\n",
+    "    sums.append((df[variable].sum() / 1e9).round(1))\n",
+    "    nonzero_counts.append(((df[variable] > 0).sum() / 1e6).round(1))\n",
     "\n",
-    "    target = QBID_TOTAL_21 * qbid_tax_expenditures[time_period] / qbid_tax_expenditures[2021]\n",
-    "\n",
-    "    qbi = np.maximum(0, df.e00900 + df.e26270 + df.e02100 + df.e27200)\n",
-    "\n",
-    "    # Solve for scale to match the tax expenditure\n",
-    "\n",
-    "    def expenditure_loss(scale):\n",
-    "        res = (qbi * df.s006 * scale[0]).sum()/1e9\n",
-    "        deviation = (res - target)\n",
-    "        if verbose:\n",
-    "            print(f\"Scale: {scale[0]}, total: {res}, deviation: {deviation}\")\n",
-    "        return deviation ** 2\n",
-    "    \n",
-    "    \n",
-    "    scale = minimize(\n",
-    "        expenditure_loss,\n",
-    "        1,\n",
-    "        tol=1,\n",
-    "    ).x[0]\n",
-    "\n",
-    "    df[\"PT_binc_w2_wages\"] = qbi * scale\n",
-    "    \n",
-    "    return df\n",
+    "summary_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"Variable\": variables,\n",
+    "        \"Sum (bn)\": sums,\n",
+    "        \"Nonzero count (m)\": nonzero_counts,\n",
+    "    }\n",
+    ")\n",
     "\n",
-    "add_pt_w2_wages(df, 2021).PT_binc_w2_wages"
+    "summary_df"
    ]
   }
  ],