-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscale_impute.py
127 lines (119 loc) · 4.25 KB
/
scale_impute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# standard library imports
import argparse
import os
from pathlib import Path
import pickle
# third-party imports
from loguru import logger
import pandas as pd
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler
# local imports
from settings import INTERIM_DIR, PROCESSED_DIR, RANDOM_STATE, MODELS_DIR
if __name__ == "__main__":
    """Scale and impute parsed, preprocessed ACS data.

    Reads the preprocessed ACS table, appends missing-value indicator
    columns, fits a quantile-transform -> median-impute -> standard-scale
    pipeline, and saves both the transformed data and the fitted pipeline.
    Last data processing step prior to modeling.
    """
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print("Configure and instantiate logger")
    # Log file is named after this script with ".py" stripped,
    # e.g. "log_scale_impute.log"
    logger.add(
        f"log_{__file__}.log".replace(".py", ""), backtrace=False, diagnose=False
    )
    logger.debug(f"Begin {__file__}")
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print("Parse arguments")
    try:
        default_src = INTERIM_DIR / "acs__preprocessed_tables.pkl"
        default_dst = PROCESSED_DIR / "scaled_imputed_data.pkl"
        default_model_dst = MODELS_DIR / "scaler_imputer.pkl"
        description = "Scale and impute parsed, preprocessed ACS data"
        parser = argparse.ArgumentParser(description=description)
        parser.add_argument(
            "-i",
            "--input_src",
            default=default_src,
            help="Path to parsed, preprocessed ACS data",
            type=Path,
        )
        parser.add_argument(
            "-m",
            "--model_dst",
            default=default_model_dst,
            help="Path to trained scale-impute model",
            type=Path,
        )
        parser.add_argument(
            "-o",
            "--output_dst",
            default=default_dst,
            help="Path to scaled, imputed data",
            type=Path,
        )
        parser.add_argument(
            "-r",
            "--random_state",
            default=RANDOM_STATE,
            # BUG FIX: help text was a copy-paste from an unrelated flag
            # ("Directory to save parsed ACS files")
            help="Random state (seed) for the quantile transformer",
            type=int,
        )
        args = parser.parse_args()
        input_src = args.input_src
        model_dst = args.model_dst
        models_dir = model_dst.parent  # idiomatic form of parents[0]
        cache_dir = models_dir / "cache"
        # parents=True so a missing models directory does not crash the run
        cache_dir.mkdir(parents=True, exist_ok=True)
        output_dst = args.output_dst
        random_state = args.random_state
        logger.debug("Finish parsing arguments")
    except Exception:
        # loguru ignores logging's `exc_info` kwarg; logger.exception()
        # is the loguru API that actually records the traceback
        logger.exception("Failed to parse arguments")
        raise
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print("Scale and impute data")
    try:
        df = pd.read_pickle(input_src)
        # Flag missingness per original column, then keep only the
        # indicator columns that actually fired at least once
        mi = MissingIndicator(features="all")
        columns = [f"mi__{x}" for x in df]
        df_mi = pd.DataFrame(mi.fit_transform(df), columns=columns, index=df.index)
        indicator_counts = df_mi.sum()
        df_mi = df_mi[indicator_counts[indicator_counts > 0].index.values]
        df = pd.concat([df, df_mi], axis=1)
        # Quantile fit uses a 20% row subsample; n_quantiles must be
        # strictly less than subsample (sklearn requirement), and is
        # clamped to >= 1 so tiny inputs don't produce a nonpositive value
        subsample = int(len(df) / 5)
        n_quantiles = max(1, min(1000, subsample - 1))
        qt = QuantileTransformer(
            n_quantiles=n_quantiles,
            output_distribution="normal",
            subsample=subsample,
            random_state=random_state,
        )
        imputer = SimpleImputer(strategy="median")
        pipe = Pipeline(
            steps=[
                ("quantile_transformer", qt),
                ("imputer", imputer),
                ("standard_scaler", StandardScaler()),
            ],
            memory=str(cache_dir),  # cache fitted transformers between runs
            verbose=True,
        )
        # Pipeline returns a bare ndarray; restore index and column labels
        df_transformed = pd.DataFrame(
            pipe.fit_transform(df), index=df.index, columns=df.columns
        )
        logger.debug("Finish scaling and imputing")
    except Exception:
        logger.exception("Failed to scale / impute data")
        raise
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print("Save outputs")
    try:
        df_transformed.to_pickle(output_dst)
        with open(model_dst, "wb") as f:
            pickle.dump(pipe, f)
        logger.debug("Finish saving outputs")
    except Exception:
        logger.exception("Failed to save output(s)")
        raise