Skip to content

Commit

Permalink
Merge branch 'master' into cbo_covid_update
Browse files Browse the repository at this point in the history
  • Loading branch information
andersonfrailey authored Aug 10, 2020
2 parents ca23f1a + 98fc990 commit 8f57e4e
Show file tree
Hide file tree
Showing 19 changed files with 363 additions and 2,832 deletions.
6 changes: 2 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,5 @@ cpsmar*.sas
cpsmar*.csv
*.dat

# Pickle files (GitHub says they're too large)
cps_data/pycps/data/cpsmar2013.pkl
cps_data/pycps/data/cpsmar2014.pkl
cps_data/pycps/data/cpsmar2015.pkl
# pickle
cps*.pkl
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ cps_stage1/stage_2_targets.csv: cps_stage1/stage1.py \

cps_stage2/cps_weights.csv.gz: cps_stage2/stage2.py \
cps_stage2/solve_lp_for_year.py \
cps_data/cps_raw.csv.gz \
cps_data/pycps/cps_raw.csv.gz \
puf_stage1/Stage_I_factors.csv \
cps_stage1/stage_2_targets.csv
cd cps_stage2 ; python stage2.py && \
Expand Down
91 changes: 83 additions & 8 deletions cps_data/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,92 @@
About cps_data
==============

This directory contains the following script:
This directory contains the Python scripts used to create `cps.csv.gz`. You
can run all of the scripts with the command `python create.py`. By default,
you will get a CPS file composed of the 2013, 2014, and 2015 March CPS
Supplement files. If you would like to use another combination of the 2013,
2014, 2015, 2016, 2017, and 2018 files, there are two ways to do so.

* Python script **finalprep.py**, which reads/writes:
1. You can modify `create.py` by adding the `cps_files` argument to the `create()`
function call at the bottom of the file to specify which files you would like to
use. For example, to use the 2016, 2017, and 2018 files, the function call would
now be
```python
if __name__ == "__main__":
create(
exportcsv=False, exportpkl=True, exportraw=False, validate=False,
benefits=True, verbose=True, cps_files=[2016, 2017, 2018]
)
```

Input files:
- cps_raw.csv.gz
- adjustment_targets.csv
- benefitprograms.csv
2. You can write a separate Python file that imports the `create()` function
and calls it in the same way as above.

Output files:
- cps.csv
## Input Files
With the exception of the CPS March Supplements, all input files can be found
in the `pycps/data` directory.

### CPS March Supplements
* asec2013_pubuse.dat
* asec2014_pubuse_tax_fix_5x8_2017.dat
* asec2015_pubuse.dat
* asec2016_pubuse.dat
* asec2017_pubuse.dat
* asec2018_pubuse.dat

### C-TAM Benefit Imputations

Note that we only have C-TAM imputations for the 2013, 2014, and 2015 files.
For other years, we just use the benefit program information in the CPS.
* Housing_Imputation_logreg_2013.csv
* Housing_Imputation_logreg_2014.csv
* Housing_Imputation_logreg_2015.csv
* medicaid2013.csv
* medicaid2014.csv
* medicaid2015.csv
* medicare2013.csv
* medicare2014.csv
* medicare2015.csv
* otherbenefitprograms.csv
* SNAP_Imputation_2013.csv
* SNAP_Imputation_2014.csv
* SNAP_Imputation_2015.csv
* SS_augmentation_2013.csv
* SS_augmentation_2014.csv
* SS_augmentation_2015.csv
* SSI_Imputation2013.csv
* SSI_Imputation2014.csv
* SSI_Imputation2015.csv
* TANF_Imputation_2013.csv
* TANF_Imputation_2014.csv
* TANF_Imputation_2015.csv
* UI_imputation_logreg_2013.csv
* UI_imputation_logreg_2014.csv
* UI_imputation_logreg_2015.csv
* VB_Imputation2013.csv
* VB_Imputation2014.csv
* VB_Imputation2015.csv
* WIC_imputation_children_logreg_2013.csv
* WIC_imputation_children_logreg_2014.csv
* WIC_imputation_children_logreg_2015.csv
* WIC_imputation_infants_logreg_2013.csv
* WIC_imputation_infants_logreg_2014.csv
* WIC_imputation_infants_logreg_2015.csv
* WIC_imputation_women_logreg_2013.csv
* WIC_imputation_women_logreg_2014.csv
* WIC_imputation_women_logreg_2015.csv

### Imputation Parameters

These parameters are used in the imputations found in `pycps/impute.py`
* logit_beta.csv
* ols_betas.csv

## Output Files

Only `cps.csv.gz` is included in the repository due to the size of `cps_raw.csv.gz`.
* cps.csv.gz
* cps_raw.csv.gz


Documentation
Expand Down
33 changes: 23 additions & 10 deletions cps_data/pycps/benefits.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,26 +95,39 @@ def distribute_benefits(data, other_ben):
other_ben["2014_cost"] *= 1e6

# adjust medicare and medicaid totals
weighted_mcare_count = (data["mcare_count"] * data["s006"]).sum()
weighted_mcaid_count = (data["mcaid_count"] * data["s006"]).sum()
weighted_mcare = (data["mcare_ben"] * data["s006"]).sum()
weighted_mcaid = (data["mcaid_ben"] * data["s006"]).sum()
mcare_amt = weighted_mcare / weighted_mcare_count
mcaid_amt = weighted_mcaid / weighted_mcaid_count
data["mcaid_ben"] = data["mcaid_count"] * mcaid_amt
data["mcare_ben"] = data["mcare_count"] * mcare_amt
try:
weighted_mcare_count = (data["mcare_count"] * data["s006"]).sum()
weighted_mcaid_count = (data["mcaid_count"] * data["s006"]).sum()
weighted_mcare = (data["mcare_ben"] * data["s006"]).sum()
weighted_mcaid = (data["mcaid_ben"] * data["s006"]).sum()
mcare_amt = weighted_mcare / weighted_mcare_count
mcaid_amt = weighted_mcaid / weighted_mcaid_count
data["mcaid_ben"] = data["mcaid_count"] * mcaid_amt
data["mcare_ben"] = data["mcare_count"] * mcare_amt
except KeyError:
# skip over adjusting medicare and medicaid if we don't impute them
# set to zero to avoid errors later
data["mcaid_ben"] = 0.
data["mcare_ben"] = 0

# Distribute other benefits
data["dist_ben"] = data[["mcaid_ben", "ssi_ben", "snap_ben"]].sum(axis=1)
data["ratio"] = (data["dist_ben"] * data["s006"] /
(data["dist_ben"] * data["s006"]).sum())
# ... remove TANF and WIC from other_ben total
tanf_total = (data["tanf_ben"] * data["s006"]).sum()
wic_total = (data["wic_ben"] * data["s006"]).sum()
try:
wic_total = (data["wic_ben"] * data["s006"]).sum()
except KeyError:
# Same as medicare and medicaid
wic_total = 0.
other_ben_total = other_ben["2014_cost"].sum() - tanf_total - wic_total
# ... divide by the weight to account for weighting in Tax-Calculator
data["other_ben"] = (data["ratio"] * other_ben_total / data["s006"])

data["housing_ben"] *= 12
try:
data["housing_ben"] *= 12
except KeyError:
pass

return data
31 changes: 31 additions & 0 deletions cps_data/pycps/cps_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Holds all the CPS file metadata we need. Created to keep create.py clean
"""
C_TAM_YEARS = [2013, 2014, 2015] # years we have C-TAM imputations for

CPS_META_DATA = {
2013: {
"dat_file": "asec2013_pubuse.dat",
"sas_file": "cpsmar2013.sas"
},
2014: {
"dat_file": "asec2014_pubuse_tax_fix_5x8_2017.dat",
"sas_file": "cpsmar2014t.sas"
},
2015: {
"dat_file": "asec2015_pubuse.dat",
"sas_file": "cpsmar2015.sas"
},
2016: {
"dat_file": "asec2016_pubuse_v3.dat",
"sas_file": "cpsmar2016.sas"
},
2017: {
"dat_file": "asec2017_pubuse.dat",
"sas_file": "cpsmar2017.sas"
},
2018: {
"dat_file": "asec2018_pubuse.dat",
"sas_file": "cpsmar2018.sas"
}
}
92 changes: 42 additions & 50 deletions cps_data/pycps/template.txt → cps_data/pycps/cpsmar.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,46 +10,16 @@
DATA_PATH = Path(CUR_PATH, "data")


def h_rec(rec):

record = OrderedDict()

{% for item in household %}
{{ item }}{% endfor %}

return record


def f_rec(rec):
"""
Process family record in CPS
"""

record = OrderedDict()

{% for item in family %}
{{ item }}{% endfor %}

return record


def p_rec(rec, benefits, h_seq, fhseq, ffpos):
def person_details(record, benefits, h_seq, fhseq, ffpos, year):
"""
Process person record in CPS
Add additonal details for person records
"""
record = OrderedDict()

{% for item in person %}
{{ item }}{% endfor %}

{# This might need to be updated to year >= 2015 #}
{% if year == 2015 %}
record["alimony"] = 0.
if record["oi_off"] == 20:
record["alimony"] = record["oi_val"]
{% else %}
record["alimony"] = record["alm_val"]
{% endif %}
if year >= 2015:
record["alimony"] = 0.
if record["oi_off"] == 20:
record["alimony"] = record["oi_val"]
else:
record["alimony"] = record["alm_val"]
# Calculate pensions and annuities
pensions_annuities = (
((record["oi_off"] == 2) * record["oi_val"]) +
Expand Down Expand Up @@ -109,10 +79,36 @@ def p_rec(rec, benefits, h_seq, fhseq, ffpos):
record["tot_inc"] -= record["uc_val"]
record["tot_inc"] += record["UI_impute"]
record["tot_inc"] += record["ss_impute"]
else:
# calculate benefits in CPS where possible
record["tanf_val"] = 0.
if record["paw_yn"] == 1:
record["tanf_val"] = record["paw_val"]
if year >= 2016:
record["housing_val"] = 0.
else:
record["housing_val"] = record["fhoussub"]
return record


def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
def parse(rec, parse_dict):
    """
    Parse one fixed-width line of the raw CPS .DAT file.

    Parameters
    ----------
    rec: a single record line from the .DAT CPS file
    parse_dict: maps variable name -> (start, end, decimals), where
        rec[start:end] is the character slice holding the value and
        decimals is the number of implied decimal places in the raw file

    Returns
    -------
    OrderedDict mapping each variable name to its numeric value: an int,
    or a float when an implied decimal point is applied
    """
    record = OrderedDict()

    # iterate items() to avoid a second dict lookup per variable
    for var, (start, end, decimals) in parse_dict.items():
        value = int(rec[start: end])
        if decimals != 0:
            # the raw file stores an implied decimal point; 10 ** decimals
            # replaces the original int("1" + "0" * decimals) construction
            value /= 10 ** decimals
        record[var] = value

    return record


def create_cps(dat_file, year, parsing_dict, benefits=True, exportpkl=True,
exportcsv=True):
"""
Read the .DAT CPS file and convert it to a list of dictionaries that
to later be converted to tax units. Optionally export that list as a
Expand All @@ -121,11 +117,11 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
----------
dat_file: Path to the .DAT version of the CPS downloaded from NBER
year: year of the CPS being converted
parsing_dict: dictionary with information
benefits: Set to true to include C-TAM imputed benefits in the CPS
exportpkl: Set to true to export a pickled list of households in the CPS
exportcsv: Set to true to export a CSV version of the CPS
"""

# read in file
print("Reading DAT file")
with Path(dat_file).open("r") as f:
Expand All @@ -148,15 +144,16 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
if household:
cps_list.append(household)
household = []
house = h_rec(record)
house = parse(record, parsing_dict["household"])
# family record
elif rec_type == "2":
family = f_rec(record)
family = parse(record, parsing_dict["family"])
# person record
elif rec_type == "3":
person = p_rec(
record, benefits, house["h_seq"], family["fh_seq"],
family["ffpos"]
person = parse(record, parsing_dict["person"])
person = person_details(
person, benefits, house["h_seq"], family["fh_seq"],
family["ffpos"], year
)
full_rec = {**house, **family, **person}
household.append(full_rec)
Expand All @@ -179,8 +176,3 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
pickle.dump(cps_list, f)

return cps_list


if __name__ == "__main__":
create_cps(Path(CUR_PATH, "data", "{{ file_name }}"), {{ year }})

Loading

0 comments on commit 8f57e4e

Please sign in to comment.