-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdodo.py
154 lines (133 loc) · 4.75 KB
/
dodo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# standard library imports
import datetime
import getpass
import os
from pathlib import Path
from string import Template
import time
# third-party imports
from doit.tools import config_changed, create_folder, run_once
from loguru import logger
import pandas as pd
import requests
# local imports
from settings import (
DIRS,
RAW_DIR,
INTERIM_DIR,
PROCESSED_DIR,
MODELS_DIR,
LOG_PATH,
ROOT_DIR,
ACS_SPAN,
ACS_YEAR,
RAW_ACS_DATA_DIR,
RAW_SHAPEFILES_DIR,
LOOKUPS_SRC,
RANDOM_STATE,
CE_CUTOFF,
N_HIDDEN,
N_SAMPLES,
N_TRIALS,
MAX_COMPONENTS
)
from src.acs import ACS
# Register LOG_PATH (imported from settings, presumably a log-file path) as a
# loguru sink so messages emitted through `logger` are also written there.
logger.add(LOG_PATH)
@logger.catch
def task_makedirs():
    """Make directories if they don't exist

    Yields one doit sub-task per entry of ``DIRS``. Each sub-task is named
    after the entry, calls ``create_folder`` on it, and uses ``run_once`` as
    its up-to-date check.

    To run, cd into root dir and type `doit makedirs`.
    """
    for folder in DIRS:
        yield {
            "name": folder,
            "actions": [(create_folder, [folder])],
            "uptodate": [run_once],
        }
@logger.catch
def task_get_tiger_files():
    """Download and save TIGER shapefiles.

    Yields one doit sub-task per FIPS code in the list below. Each sub-task
    downloads the tract-level archive for ``ACS_YEAR`` from www2.census.gov
    and stores it in ``RAW_SHAPEFILES_DIR`` under the archive's own file name.

    To run, cd into root dir and type `doit get_tiger_files`.
    """

    def get_zips(url, dst):
        """Download ``url`` and write the response body to ``dst``.

        Args:
            url: Address of the archive to fetch.
            dst: Path the downloaded bytes are written to.

        Returns:
            True once the file has been written.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server stops responding.
        """
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=60)
        # Fail before touching the disk; otherwise an HTML error page would be
        # saved under a .zip name and the task would be reported as successful.
        response.raise_for_status()
        with open(dst, "wb") as f:
            f.write(response.content)
        return True

    fips = """1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 60, 66, 69, 72, 78"""
    # The URL scheme uses two-digit, zero-padded codes ("1" -> "01").
    fips = [code.strip().zfill(2) for code in fips.split(",") if code.strip()]
    url_template = Template(
        "https://www2.census.gov/geo/tiger/TIGER${year}/TRACT/tl_${year}_${fip}_tract.zip"
    )
    # NOTE(review): there is no task_dep on `makedirs` here; this assumes
    # RAW_SHAPEFILES_DIR already exists when the download runs -- confirm.
    for fip in fips:
        url = url_template.substitute(year=ACS_YEAR, fip=fip)
        fn = url.split("/")[-1]  # archive file name, e.g. tl_<year>_<fip>_tract.zip
        dst = RAW_SHAPEFILES_DIR / fn
        yield dict(
            name=dst.stem,
            actions=[(get_zips, [url, dst])],
            # Presumably so an archive is only fetched while its target file
            # is missing -- confirm against doit's `uptodate` semantics.
            uptodate=[True],
            targets=[dst],
            clean=True,
        )
@logger.catch
def task_download_acs():
    """Download American Community Survey (ACS) data.

    Builds an ``ACS`` object from the year, span and directory settings and
    returns a task that calls its ``get_acs_metadata`` and then its
    ``get_acs_data`` method. The task depends on ``makedirs``.

    This underlying code is from
    https://gist.githubusercontent.com/erikbern/89c5f44bd1354854a8954fa2df04453d/raw/efd7b7c31d781a5cae9849be60ab86967bf7d2ed/american_community_survey_example.py
    Author of that underlying code is Erik Bernhardsson | erikbern | https://gist.github.com/erikbern

    To run, cd into root dir and type `doit download_acs`.
    """
    downloader = ACS(
        ACS_YEAR,
        ACS_SPAN,
        RAW_ACS_DATA_DIR,
        INTERIM_DIR,
        LOOKUPS_SRC,
        overwrite=False,
    )
    task = {
        # Order matters: metadata is fetched before the data itself.
        "actions": [downloader.get_acs_metadata, downloader.get_acs_data],
        "task_dep": ["makedirs"],
        "verbosity": 2,
        "clean": True,
    }
    return task
@logger.catch
def task_parse_acs():
    """Parse downloaded ACS data.

    Returns a task that runs the ``parse_acs.py`` script as a shell command.
    No file dependencies or targets are declared for it.

    To run, cd into root dir and type `doit parse_acs`.
    """
    # TODO: Refactor so that parse_acs.py uses pydoit dependency management framework
    # file_dep = f"{ACS_YEAR}_{ACS_SPAN}y_lookup.txt"
    return {
        "actions": ["python parse_acs.py"],
        "verbosity": 2,
        "clean": True,
    }
@logger.catch
def task_scale_and_impute_data():
    """Scale and impute missing data

    Returns a task that runs ``scale_impute.py`` with the input (``-i``),
    model destination (``-m``), output destination (``-o``) and random state
    (``-r``) taken from the project settings. The input file is declared as
    the file dependency and the output file as the target.
    """
    input_src = INTERIM_DIR / "acs__preprocessed_tables.pkl"
    models_dst = MODELS_DIR / "scaler_imputer.pkl"
    output_dst = PROCESSED_DIR / "scaled_imputed_data.pkl"
    cli_args = f"-i {input_src} -m {models_dst} -o {output_dst} -r {RANDOM_STATE}"
    return {
        "actions": [f"python scale_impute.py {cli_args}"],
        "file_dep": [input_src],
        "targets": [output_dst],
        "verbosity": 2,
        "clean": True,
    }
@logger.catch
def task_select_n_components():
    """Select number of components to use for dimensionality reduction

    Returns a task that runs ``select_n_components.py`` with these flags:

    * ``-c`` ``CE_CUTOFF``
    * ``-d`` ``N_HIDDEN``
    * ``-i`` the scaled and imputed data file (the task's file dependency)
    * ``-n`` ``N_SAMPLES``
    * ``-o`` the JSON file the selection is written to (the task's target)
    * ``-t`` ``N_TRIALS``
    """
    src = PROCESSED_DIR / "scaled_imputed_data.pkl"
    dst = PROCESSED_DIR / "selected_n_components.json"
    cmd = (
        "python select_n_components.py"
        f" -c {CE_CUTOFF} -d {N_HIDDEN} -i {src}"
        f" -n {N_SAMPLES} -o {dst} -t {N_TRIALS}"
    )
    return dict(actions=[cmd], file_dep=[src], targets=[dst], verbosity=2, clean=True)
# TODO: #4 Add report generation task that creates a report of model summary statistics
@logger.catch
def task_cluster():
    """Train set of Gaussian Mixture models, select best one, and cluster tracts

    Returns a task that runs ``cluster.py`` without arguments. Three files are
    declared as dependencies (scaled/imputed data, the preprocessed tables and
    the selected-components file) and four as targets (two model pickles and
    two labeled data sets).
    """
    src = PROCESSED_DIR / "scaled_imputed_data.pkl"
    orig_src = INTERIM_DIR / "acs__preprocessed_tables.pkl"
    # NOTE(review): task_select_n_components declares
    # "selected_n_components.json" as its target, while this dependency is a
    # ".pkl" file -- confirm that something actually produces the pickle.
    corex_obj_src = PROCESSED_DIR / "selected_n_components.pkl"
    gm_dst = MODELS_DIR / "gaussian_mixture.pkl"
    ce_dst = MODELS_DIR / "corex.pkl"
    labeled_dst = PROCESSED_DIR / "labeled.pkl"
    labeled_orig_dst = PROCESSED_DIR / "labeled_orig.pkl"
    cmd = "python cluster.py"
    return dict(
        actions=[cmd],
        file_dep=[src, orig_src, corex_obj_src],
        targets=[gm_dst, ce_dst, labeled_dst, labeled_orig_dst],
        verbosity=2,
        clean=True,
    )