Skip to content

Commit

Permalink
Merge branch 'master' into cbo_covid_update
Browse files Browse the repository at this point in the history
  • Loading branch information
andersonfrailey authored Aug 10, 2020
2 parents ca23f1a + 98fc990 commit 8f57e4e
Show file tree
Hide file tree
Showing 19 changed files with 363 additions and 2,832 deletions.
6 changes: 2 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,5 @@ cpsmar*.sas
cpsmar*.csv
*.dat

# Pickle files (GitHub says they're too large)
cps_data/pycps/data/cpsmar2013.pkl
cps_data/pycps/data/cpsmar2014.pkl
cps_data/pycps/data/cpsmar2015.pkl
# pickle
cps*.pkl
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ cps_stage1/stage_2_targets.csv: cps_stage1/stage1.py \

cps_stage2/cps_weights.csv.gz: cps_stage2/stage2.py \
cps_stage2/solve_lp_for_year.py \
cps_data/cps_raw.csv.gz \
cps_data/pycps/cps_raw.csv.gz \
puf_stage1/Stage_I_factors.csv \
cps_stage1/stage_2_targets.csv
cd cps_stage2 ; python stage2.py && \
Expand Down
91 changes: 83 additions & 8 deletions cps_data/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,92 @@
About cps_data
==============

This directory contains the following script:
This directory contains the Python scripts used to create `cps.csv.gz`. You
can run all of the scripts with the command `python create.py`. By default,
you will get a CPS file composed of the 2013, 2014, and 2015 March CPS
Supplement files. If you would like to use another combination of the 2013,
2014, 2015, 2016, 2017, and 2018 files, there are two ways to do so.

* Python script **finalprep.py**, which reads/writes:
1. You can modify `create.py` by adding the `cps_files` argument to the `create()`
function call at the bottom of the file to specify which files you would like to
use. For example, to use the 2016, 2017, and 2018 files, the function call would
now be
```python
if __name__ == "__main__":
create(
exportcsv=False, exportpkl=True, exportraw=False, validate=False,
benefits=True, verbose=True, cps_files=[2016, 2017, 2018]
)
```

Input files:
- cps_raw.csv.gz
- adjustment_targets.csv
- benefitprograms.csv
2. You can write a separate Python file that imports the `create()` function
and calls it in the same way as above.

Output files:
- cps.csv
## Input Files
With the exception of the CPS March Supplements, all input files can be found
in the `pycps/data` directory.

### CPS March Supplements
* asec2013_pubuse.dat
* asec2014_pubuse_tax_fix_5x8_2017.dat
* asec2015_pubuse.dat
* asec2016_pubuse.dat
* asec2017_pubuse.dat
* asec2018_pubuse.dat

### C-TAM Benefit Imputations

Note that we only have C-TAM imputations for the 2013, 2014, and 2015 files.
For other years, we just use the benefit program information in the CPS.
* Housing_Imputation_logreg_2013.csv
* Housing_Imputation_logreg_2014.csv
* Housing_Imputation_logreg_2015.csv
* medicaid2013.csv
* medicaid2014.csv
* medicaid2015.csv
* medicare2013.csv
* medicare2014.csv
* medicare2015.csv
* otherbenefitprograms.csv
* SNAP_Imputation_2013.csv
* SNAP_Imputation_2014.csv
* SNAP_Imputation_2015.csv
* SS_augmentation_2013.csv
* SS_augmentation_2014.csv
* SS_augmentation_2015.csv
* SSI_Imputation2013.csv
* SSI_Imputation2014.csv
* SSI_Imputation2015.csv
* TANF_Imputation_2013.csv
* TANF_Imputation_2014.csv
* TANF_Imputation_2015.csv
* UI_imputation_logreg_2013.csv
* UI_imputation_logreg_2014.csv
* UI_imputation_logreg_2015.csv
* VB_Imputation2013.csv
* VB_Imputation2014.csv
* VB_Imputation2015.csv
* WIC_imputation_children_logreg_2013.csv
* WIC_imputation_children_logreg_2014.csv
* WIC_imputation_children_logreg_2015.csv
* WIC_imputation_infants_logreg_2013.csv
* WIC_imputation_infants_logreg_2014.csv
* WIC_imputation_infants_logreg_2015.csv
* WIC_imputation_women_logreg_2013.csv
* WIC_imputation_women_logreg_2014.csv
* WIC_imputation_women_logreg_2015.csv

### Imputation Parameters

These parameters are used in the imputations found in `pycps/impute.py`
* logit_beta.csv
* ols_betas.csv

## Output Files

Only `cps.csv.gz` is included in the repository due to the size of `cps_raw.csv.gz`.
* cps.csv.gz
* cps_raw.csv.gz


Documentation
Expand Down
33 changes: 23 additions & 10 deletions cps_data/pycps/benefits.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,26 +95,39 @@ def distribute_benefits(data, other_ben):
other_ben["2014_cost"] *= 1e6

# adjust medicare and medicaid totals
weighted_mcare_count = (data["mcare_count"] * data["s006"]).sum()
weighted_mcaid_count = (data["mcaid_count"] * data["s006"]).sum()
weighted_mcare = (data["mcare_ben"] * data["s006"]).sum()
weighted_mcaid = (data["mcaid_ben"] * data["s006"]).sum()
mcare_amt = weighted_mcare / weighted_mcare_count
mcaid_amt = weighted_mcaid / weighted_mcaid_count
data["mcaid_ben"] = data["mcaid_count"] * mcaid_amt
data["mcare_ben"] = data["mcare_count"] * mcare_amt
try:
weighted_mcare_count = (data["mcare_count"] * data["s006"]).sum()
weighted_mcaid_count = (data["mcaid_count"] * data["s006"]).sum()
weighted_mcare = (data["mcare_ben"] * data["s006"]).sum()
weighted_mcaid = (data["mcaid_ben"] * data["s006"]).sum()
mcare_amt = weighted_mcare / weighted_mcare_count
mcaid_amt = weighted_mcaid / weighted_mcaid_count
data["mcaid_ben"] = data["mcaid_count"] * mcaid_amt
data["mcare_ben"] = data["mcare_count"] * mcare_amt
except KeyError:
# skip over adjusting medicare and medicaid if we don't impute them
# set to zero to avoid errors later
data["mcaid_ben"] = 0.
data["mcare_ben"] = 0

# Distribute other benefits
data["dist_ben"] = data[["mcaid_ben", "ssi_ben", "snap_ben"]].sum(axis=1)
data["ratio"] = (data["dist_ben"] * data["s006"] /
(data["dist_ben"] * data["s006"]).sum())
# ... remove TANF and WIC from other_ben total
tanf_total = (data["tanf_ben"] * data["s006"]).sum()
wic_total = (data["wic_ben"] * data["s006"]).sum()
try:
wic_total = (data["wic_ben"] * data["s006"]).sum()
except KeyError:
# Same as medicare and medicaid
wic_total = 0.
other_ben_total = other_ben["2014_cost"].sum() - tanf_total - wic_total
# ... divide by the weight to account for weighting in Tax-Calculator
data["other_ben"] = (data["ratio"] * other_ben_total / data["s006"])

data["housing_ben"] *= 12
try:
data["housing_ben"] *= 12
except KeyError:
pass

return data
31 changes: 31 additions & 0 deletions cps_data/pycps/cps_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Holds all the CPS file metadata we need. Created to keep create.py clean
"""
C_TAM_YEARS = [2013, 2014, 2015] # years we have C-TAM imputations for

CPS_META_DATA = {
2013: {
"dat_file": "asec2013_pubuse.dat",
"sas_file": "cpsmar2013.sas"
},
2014: {
"dat_file": "asec2014_pubuse_tax_fix_5x8_2017.dat",
"sas_file": "cpsmar2014t.sas"
},
2015: {
"dat_file": "asec2015_pubuse.dat",
"sas_file": "cpsmar2015.sas"
},
2016: {
"dat_file": "asec2016_pubuse_v3.dat",
"sas_file": "cpsmar2016.sas"
},
2017: {
"dat_file": "asec2017_pubuse.dat",
"sas_file": "cpsmar2017.sas"
},
2018: {
"dat_file": "asec2018_pubuse.dat",
"sas_file": "cpsmar2018.sas"
}
}
92 changes: 42 additions & 50 deletions cps_data/pycps/template.txt → cps_data/pycps/cpsmar.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,46 +10,16 @@
DATA_PATH = Path(CUR_PATH, "data")


def h_rec(rec):

record = OrderedDict()

{% for item in household %}
{{ item }}{% endfor %}

return record


def f_rec(rec):
"""
Process family record in CPS
"""

record = OrderedDict()

{% for item in family %}
{{ item }}{% endfor %}

return record


def p_rec(rec, benefits, h_seq, fhseq, ffpos):
def person_details(record, benefits, h_seq, fhseq, ffpos, year):
"""
Process person record in CPS
Add additonal details for person records
"""
record = OrderedDict()

{% for item in person %}
{{ item }}{% endfor %}

{# This might need to be updated to year >= 2015 #}
{% if year == 2015 %}
record["alimony"] = 0.
if record["oi_off"] == 20:
record["alimony"] = record["oi_val"]
{% else %}
record["alimony"] = record["alm_val"]
{% endif %}
if year >= 2015:
record["alimony"] = 0.
if record["oi_off"] == 20:
record["alimony"] = record["oi_val"]
else:
record["alimony"] = record["alm_val"]
# Calculate pensions and annuities
pensions_annuities = (
((record["oi_off"] == 2) * record["oi_val"]) +
Expand Down Expand Up @@ -109,10 +79,36 @@ def p_rec(rec, benefits, h_seq, fhseq, ffpos):
record["tot_inc"] -= record["uc_val"]
record["tot_inc"] += record["UI_impute"]
record["tot_inc"] += record["ss_impute"]
else:
# calculate benefits in CPS where possible
record["tanf_val"] = 0.
if record["paw_yn"] == 1:
record["tanf_val"] = record["paw_val"]
if year >= 2016:
record["housing_val"] = 0.
else:
record["housing_val"] = record["fhoussub"]
return record


def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
def parse(rec, parse_dict):
    """
    Parse one fixed-width line of the raw CPS .DAT file.

    Parameters
    ----------
    rec: a single record line from the .DAT CPS file
    parse_dict: maps variable name -> (start, end, decimals), where
        rec[start:end] is the character slice holding the value and
        decimals is the number of implied decimal places in the raw file

    Returns
    -------
    OrderedDict mapping each variable name to its numeric value: an int,
    or a float when an implied decimal point is applied
    """
    record = OrderedDict()

    # iterate items() to avoid a second dict lookup per variable
    for var, (start, end, decimals) in parse_dict.items():
        value = int(rec[start: end])
        if decimals != 0:
            # the raw file stores an implied decimal point; 10 ** decimals
            # replaces the original int("1" + "0" * decimals) construction
            value /= 10 ** decimals
        record[var] = value

    return record


def create_cps(dat_file, year, parsing_dict, benefits=True, exportpkl=True,
exportcsv=True):
"""
Read the .DAT CPS file and convert it to a list of dictionaries that
to later be converted to tax units. Optionally export that list as a
Expand All @@ -121,11 +117,11 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
----------
dat_file: Path to the .DAT version of the CPS downloaded from NBER
year: year of the CPS being converted
parsing_dict: dictionary with information
benefits: Set to true to include C-TAM imputed benefits in the CPS
exportpkl: Set to true to export a pickled list of households in the CPS
exportcsv: Set to true to export a CSV version of the CPS
"""

# read in file
print("Reading DAT file")
with Path(dat_file).open("r") as f:
Expand All @@ -148,15 +144,16 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
if household:
cps_list.append(household)
household = []
house = h_rec(record)
house = parse(record, parsing_dict["household"])
# family record
elif rec_type == "2":
family = f_rec(record)
family = parse(record, parsing_dict["family"])
# person record
elif rec_type == "3":
person = p_rec(
record, benefits, house["h_seq"], family["fh_seq"],
family["ffpos"]
person = parse(record, parsing_dict["person"])
person = person_details(
person, benefits, house["h_seq"], family["fh_seq"],
family["ffpos"], year
)
full_rec = {**house, **family, **person}
household.append(full_rec)
Expand All @@ -179,8 +176,3 @@ def create_cps(dat_file, year, benefits=True, exportpkl=True, exportcsv=True):
pickle.dump(cps_list, f)

return cps_list


if __name__ == "__main__":
create_cps(Path(CUR_PATH, "data", "{{ file_name }}"), {{ year }})

Loading

0 comments on commit 8f57e4e

Please sign in to comment.