preval.py
# Angus Dempster, Geoffrey I Webb, Daniel F Schmidt
# Prevalidated ridge regression is a highly-efficient drop-in replacement
# for logistic regression for high-dimensional data
# https://arxiv.org/abs/2401.15610
import numpy as np
from scipy.optimize import minimize
from sklearn.preprocessing import LabelBinarizer
# == constants =================================================================
EPS = np.finfo(np.float32).eps
LOG_EPS = np.log(EPS)
# == softmax function ==========================================================
def _softmax(X):
    exp_X = np.exp(X.clip(LOG_EPS, -LOG_EPS))
    return exp_X / np.sum(exp_X, axis = -1, keepdims = True)
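# Added note: logits are clipped to [LOG_EPS, -LOG_EPS], i.e. [log(eps), -log(eps)],
# so np.exp stays within a safe float32 range; each row of the result sums to 1.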
# == log loss function for scipy.optimize.minimize =============================
def _log_loss(c, *args):
    Y, Y_loocv, B0 = args
    P = _softmax(c * Y_loocv + B0)
    return -np.log((Y * P).max(1)).sum()
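# Added note: with Y the centred {-1, +1} targets, the only positive entry in
# each row of Y is the true-class entry, so (Y * P).max(1) picks out a positive
# multiple of the true-class probability.  Up to an additive constant that does
# not depend on c, _log_loss is therefore the multinomial negative
# log-likelihood of the scaled predictions softmax(c * Y_loocv + B0).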
# == PreVal ====================================================================
class PreVal():

    def __init__(self, lambdas = np.logspace(-3, 3, 10)):
        self.lambdas = lambdas.astype(np.float32)
        self._is_fitted = False

    def fit(self, X, Y):

        X = X.astype(np.float32, copy = False)

        # drop low-variance columns
        self._mask = X.std(0) < 1e-6
        X = X[:, ~self._mask]

        # prepend a column of ones for the intercept
        X = np.hstack((np.ones((X.shape[0], 1), dtype = np.float32), X))

        n, p = X.shape

        # encode class as regression target, Y in {-1, +1}
        self._lb = LabelBinarizer(neg_label = -1)
        Y = self._lb.fit_transform(Y).astype(np.float32)

        # fix for binary classes
        if Y.shape[-1] == 1:
            Y = np.hstack((-Y, Y))

        # centre Y
        self.B0 = Y.mean(0)
        Y = Y - self.B0
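
        # Added illustration: for three classes, a sample of class 1 is encoded
        # as [-1, +1, -1]; subtracting the column means B0 makes each target
        # column zero-mean, and B0 is reused as the intercept inside the
        # softmax at prediction time (see predict_proba).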

        # svd via eigendecomposition
        # on X^T X (for n >= p)
        # on X X^T (for n < p)
        if n >= p:
            batch_size = 4_096
            G = np.zeros((p, p), dtype = np.float32)
            for i in range(0, X.shape[0], batch_size):
                G = G + (X[i:i + batch_size].T @ X[i:i + batch_size])
            S2, V = np.linalg.eigh(G)
            S2 = S2.clip(EPS)
            S = np.sqrt(S2)
            U = (X @ V) * (1 / S)
        else:
            G = X @ X.T
            S2, U = np.linalg.eigh(G)
            S2 = S2.clip(EPS)
            S = np.sqrt(S2)
            V = (X.T @ U) * (1 / S)

        R = U * S
        R2 = R ** 2

        RTY = R.T @ Y # "Q" in paper
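
        # Added comment: with the thin SVD X = U S V^T, both branches above
        # recover U, S and V from the cheaper Gram matrix (X^T X when n >= p,
        # X X^T when n < p).  Writing R = U S, the ridge solution for a given
        # lambda is beta(lambda) = V (S^2 + lambda I)^{-1} R^T Y, so alpha_hat
        # in the loop below is that solution expressed in the rotated basis V,
        # and Y_hat = R @ alpha_hat gives the in-sample ridge predictions.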

        best_loss = np.inf
        self.c = np.float32(1.0)
        self.lambda_ = None

        for lambda_ in self.lambdas:

            alpha_hat = (1 / (S2[:, None] + lambda_)) * RTY
            Y_hat = R @ alpha_hat

            # "full fit" residuals for given alpha
            E = Y - Y_hat

            # diagonal of hat matrix
            diag_H = (R2 / (S2 + lambda_)).sum(1)

            # loocv residuals
            E_loocv = E / (1 - diag_H[:, None]).clip(EPS)

            # difference between overall residuals and loocv residuals
            delta = E_loocv - E

            # loocv predictions
            Y_loocv = Y_hat - delta
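
            # Added comment: this is the standard leave-one-out shortcut for a
            # linear smoother: with h_ii the i-th diagonal entry of the hat
            # matrix, e_loocv_i = e_i / (1 - h_ii) and y_loocv_i = y_i - e_loocv_i,
            # so LOOCV predictions for all n samples come from a single fit,
            # without refitting the ridge model n times.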

            result = minimize(
                fun = _log_loss,
                x0 = 1.0,
                args = (Y, Y_loocv, self.B0),
                method = "BFGS",
                jac = "2-point",
            )
            # use of Y_hat in place of Y_loocv in minimize gives "naive scaling"

            nll = result.fun

            if nll < best_loss:
                best_loss = nll
                self.c = np.float32(result.x.item())
                self.lambda_ = lambda_
                alpha_hat_best = alpha_hat

        self.B = self.c * (V @ alpha_hat_best)

        self._is_fitted = True
    def predict_proba(self, X):

        assert self._is_fitted, "not fitted"

        X = X.astype(np.float32, copy = False)
        X = X[:, ~self._mask]
        X = np.hstack((np.ones((X.shape[0], 1), dtype = np.float32), X))

        return _softmax(X @ self.B + self.B0)

    def predict(self, X):

        assert self._is_fitted, "not fitted"

        return self._lb.classes_[self.predict_proba(X).argmax(1)]
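
# == usage example (added, not part of the original file) ======================
# Minimal usage sketch: it assumes scikit-learn is available and uses
# make_classification only to produce synthetic data; any float feature matrix
# with integer class labels would do.
if __name__ == "__main__":

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    # synthetic high-dimensional problem: far more features than informative ones
    X, y = make_classification(n_samples = 500, n_features = 1_000,
                               n_informative = 20, n_classes = 3,
                               random_state = 0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

    clf = PreVal()                     # default lambda grid: logspace(-3, 3, 10)
    clf.fit(X_train, y_train)

    proba = clf.predict_proba(X_test)  # softmax probabilities, one row per sample
    labels = clf.predict(X_test)       # argmax class labels

    print("selected lambda:", clf.lambda_, "scale c:", clf.c)
    print("test accuracy:", (labels == y_test).mean())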