forked from chrisconlan/algorithmic-trading-with-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
116 lines (87 loc) · 3.24 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.base import clone
from joblib import Parallel, delayed
# Number of jobs to run in parallel
# Set to number of computer cores to use
N_JOBS = 10
# Number of folds per repetition (passed to RepeatedKFold as n_splits)
N_SPLITS = 5
# Number of times the k-fold split is repeated (passed to RepeatedKFold as
# n_repeats); the total number of model fits is N_SPLITS * N_REPEATS
N_REPEATS = 4
def _fit_and_score(classifier, X, y, w, train_index, test_index, i) -> float:
    """
    Worker run by joblib for one cross-validation fold.

    Slices ``X``, ``y`` and ``w`` positionally (``.iloc``) into training and
    held-out parts, fits ``classifier`` on the training part, scores it on
    the held-out part, prints a one-line progress message, and returns the
    score.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y, w)`` and ``score(X, y, w)``
    X, y, w : pandas objects holding predictors, labels, and per-row weights
    train_index, test_index : positional row indexers for the two sides of
        the fold
    i : label identifying this fold in the progress message

    Returns
    -------
    Whatever ``classifier.score`` returns for the held-out rows.
    """
    # Same (predictors, labels, weights) ordering that fit/score expect
    pieces = (X, y, w)
    train_parts = [piece.iloc[train_index] for piece in pieces]
    test_parts = [piece.iloc[test_index] for piece in pieces]

    classifier.fit(*train_parts)
    fold_score = classifier.score(*test_parts)

    print(f'Finished {i} ({100*fold_score:.1f}%)')
    return fold_score
def repeated_k_fold(
    classifier,
    X,
    y,
    w,
    *,
    n_jobs=None,
    n_splits=None,
    n_repeats=None,
    random_state=None,
) -> np.ndarray:
    """
    Perform repeated k-fold cross-validation on a classifier. Spread fitting
    job over multiple computer cores.

    Parameters
    ----------
    classifier : estimator to evaluate; a fresh ``clone`` is fitted for every
        fold, so the instance passed in is never fitted here
    X, y, w : predictors, labels, and per-row weights (pandas objects,
        sliced positionally by the worker)
    n_jobs : number of fits to run at once; ``None`` (default) uses the
        module-level ``N_JOBS``
    n_splits : folds per repetition; ``None`` (default) uses ``N_SPLITS``
    n_repeats : number of repetitions; ``None`` (default) uses ``N_REPEATS``
    random_state : forwarded to ``RepeatedKFold``; pass an int to make the
        fold assignment reproducible, ``None`` (default) keeps it unseeded

    Returns
    -------
    1-D array holding one score per fitted model
    (``n_splits * n_repeats`` entries).
    """
    # Resolve defaults at call time so edits to the module-level constants
    # are still honoured, exactly as before these parameters existed.
    n_jobs = N_JOBS if n_jobs is None else n_jobs
    n_splits = N_SPLITS if n_splits is None else n_splits
    n_repeats = N_REPEATS if n_repeats is None else n_repeats

    total_fits = n_splits * n_repeats

    _k_fold = RepeatedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=random_state,
    )

    print(f'Fitting {total_fits} models {n_jobs} at a time ...')
    print()

    parallel = Parallel(n_jobs=n_jobs)
    scores = parallel(
        delayed(_fit_and_score)(
            # Each job gets its own unfitted copy of the estimator
            clone(classifier), X, y, w, train_index, test_index, i
        ) for i, (train_index, test_index) in enumerate(_k_fold.split(X))
    )

    return np.array(scores)
def calculate_model(df: pd.DataFrame) -> RandomForestClassifier:
    """
    Given a dataframe with a y column, weights column, and predictor columns
    with arbitrary names, cross-validate and fit a classifier. Print
    diagnostics.

    Parameters
    ----------
    df : dataframe containing a ``'y'`` label column, a ``'weights'`` column
        of per-row sample weights, and any number of other columns, all of
        which are used as predictors

    Returns
    -------
    A ``RandomForestClassifier`` (100 trees) fitted on the full dataset with
    the given sample weights.

    Side effects
    ------------
    Prints feature importances, the individual cross-validation scores, a
    constant-guess baseline, the mean out-of-sample score with a +/- two
    standard deviation band, and the implied improvement over the baseline.
    """
    classifier = RandomForestClassifier(n_estimators=100)

    # Separate data
    predictor_columns = [
        c for c in df.columns.values if c not in ('y', 'weights')
    ]
    X = df[predictor_columns]
    y = df['y']
    w = df['weights']

    # Fit cross-validation (works on clones; `classifier` stays unfitted)
    scores = repeated_k_fold(classifier, X, y, w)

    # Get a full dataset fit for importance scores
    classifier.fit(X, y, sample_weight=w)

    # Compute diagnostics
    _imp = classifier.feature_importances_
    importance_series = pd.Series(_imp, index=predictor_columns)
    importance_series = importance_series.sort_values(ascending=False)

    # Baseline accuracy is the best value achievable with a constant guess.
    # The cross-validation scores are sample-weighted, so the baseline must
    # be weighted the same way: the largest share of total weight held by a
    # single class. With uniform weights this equals the plain majority-class
    # frequency.
    class_weight = df.groupby('y')['weights'].sum()
    baseline = class_weight.max() / class_weight.sum()

    # Compute a rough confidence interval for the improvement
    mean_score = scores.mean()
    std_score = scores.std()

    upper_bound = mean_score + 2 * std_score
    lower_bound = mean_score - 2 * std_score
    ibounds = (lower_bound - baseline, upper_bound - baseline)

    print()
    print('Feature importances')
    for col, imp in importance_series.items():
        print(f'{col:24} {imp:>.3f}')

    print()
    print('Cross validation scores')
    print(np.round(100 * scores, 1))

    print()
    print(f'Baseline accuracy {100*baseline:.1f}%')
    print(f'OOS accuracy {100*mean_score:.1f}% +/- {200 * std_score:.1f}%')
    print(f'Improvement {100*(ibounds[0]):.1f} to {100*(ibounds[1]):.1f}%')
    print()

    return classifier