-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
593fea4
commit fa494e8
Showing
11 changed files
with
372 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
from tseries.tseries.double_learner import DoubleLearner | ||
import pandas as pd | ||
import numpy as np | ||
from sklearn.linear_model.base import LinearRegression | ||
from sklearn.linear_model import Lasso | ||
import matplotlib.pyplot as plt | ||
from sklearn.cross_validation import train_test_split | ||
from sklearn.ensemble import RandomForestRegressor | ||
from tseries.tseries.alternating_learner import AlternatingLearner | ||
|
||
dataset_size = 1000 | ||
|
||
treatment_dim = 4 | ||
confounder_dim = 30 | ||
result_dim = 1 | ||
iters = 20 | ||
theta0 = np.array([1, -1, 1, 5]) | ||
|
||
|
||
def gen_x_y(dataset_size=dataset_size, | ||
result_dim=result_dim, | ||
treatment_dim=treatment_dim, | ||
confounder_dim=confounder_dim, | ||
theta0=theta0): | ||
Z = np.random.randn(dataset_size, confounder_dim) | ||
|
||
m0_m = np.random.randn(confounder_dim, treatment_dim) | ||
m0 = lambda x: np.tanh(np.dot(x, m0_m)) | ||
|
||
g0_m = np.random.randn(confounder_dim, result_dim) | ||
g0_b = np.random.randn(result_dim) | ||
g0 = lambda x: np.tanh(np.dot(x, g0_m) + g0_b) | ||
|
||
V = np.random.randn(dataset_size, treatment_dim) | ||
U = np.random.randn(dataset_size, result_dim) | ||
|
||
D = m0(Z) + V | ||
y = np.dot(D, np.expand_dims(theta0, 1)) + g0(Z) + U | ||
X = np.hstack([D, Z]) | ||
|
||
return X, y | ||
|
||
|
||
X, y = gen_x_y() | ||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3) | ||
|
||
|
||
def mse(y1, y2): | ||
return np.mean((y1 - y2) ** 2) | ||
|
||
|
||
treatment_cols = range(0, treatment_dim) | ||
confounder_cols = range(treatment_dim, treatment_dim + confounder_dim) | ||
|
||
# lr = LinearRegression(fit_intercept=False).fit(X_train, y_train) | ||
# dr = DoubleLearner(treatment_cols, confounder_cols).fit(X_train, y_train) | ||
al = AlternatingLearner(treatment_cols, confounder_cols).fit(X_train, y_train) | ||
dl = DoubleLearner(treatment_cols, | ||
confounder_cols, | ||
treatment_estimator=RandomForestRegressor(), | ||
y_estimator=RandomForestRegressor()).fit(X_train, y_train) | ||
|
||
al_pred = al.predict(X_test) | ||
dl_pred = dl.predict(X_test) | ||
print(mse(al_pred, y_test)) | ||
print(mse(dl_pred, y_test)) | ||
print(al.estimator_1.coef_[treatment_cols]) | ||
print(dl.effect_estimator.coef_) | ||
|
||
''' | ||
plt.subplot(2, 2, 1) | ||
plt.plot(lr_pred, y_test, 'r.', alpha=.2) | ||
plt.plot(np.linspace(-20, 20), np.linspace(-20, 20)) | ||
plt.subplot(2, 2, 2) | ||
plt.plot(dr_pred, y_test, 'r.', alpha=.2) | ||
plt.plot(np.linspace(-20, 20), np.linspace(-20, 20)) | ||
plt.subplot(2, 2, 3) | ||
plt.hist(lr_pred - y_test) | ||
plt.subplot(2, 2, 4) | ||
plt.hist(dr_pred - y_test) | ||
plt.show() | ||
lr_coeffs = [] | ||
dl_lr_coeffs = [] | ||
for i in range(iters): | ||
X, y = gen_x_y(dataset_size) | ||
lr_coeffs.append(LinearRegression(fit_intercept=False).fit(X, y).coef_) | ||
print(DoubleLearner([0], [1]).fit(X, y).effect_estimator.coef_) | ||
print(lr_coeffs[i]) | ||
x_lr, n_lr = zip(*lr_coeffs) | ||
plt.plot(x_lr, n_lr, 'ro', label='LR') | ||
plt.plot([15], [0], 'b*', ms=20, label='truth') | ||
plt.xlabel('x1 coeff') | ||
plt.ylabel('y1 coeff') | ||
plt.show() | ||
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from sklearn.linear_model import LinearRegression | ||
from tseries.tseries.models import LinearRegressionWithUncertainty | ||
from tseries.tseries.utils import mse | ||
import pandas as pd | ||
import numpy as np | ||
|
||
n = 300 | ||
dx = 5 | ||
dy = 1 | ||
X = np.random.randn(n, dx) | ||
M = np.random.randn(dx, dy) | ||
Y = np.dot(X, M) + .1 * np.random.randn(n, dy) | ||
|
||
|
||
def test_linear_regression_with_uncertainty(): | ||
fitLR = LinearRegression().fit(X, Y) | ||
fitLRU = LinearRegressionWithUncertainty().fit(X, Y) | ||
|
||
predLR = fitLR.predict(X) | ||
predLRU = fitLRU.predict(X) | ||
|
||
assert ((fitLRU.confidence_intervals()[1] - fitLRU.coef_) > 0).all() | ||
assert ((fitLRU.confidence_intervals()[0] - fitLRU.coef_) < 0).all() | ||
assert (np.isclose(fitLR.coef_, fitLRU.coef_).all) | ||
assert (np.isclose(mse(predLR, Y), mse(predLRU, Y), rtol=1e-2)) | ||
# assert () | ||
# assert (np.isclose(fit_model.x_pipe_.steps[-1][1].coef_, [1.0, 0.0, 0.0]).all()) | ||
|
||
|
||
if __name__ == '__main__': | ||
test_linear_regression_with_uncertainty() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from sklearn.base import BaseEstimator, RegressorMixin | ||
import numpy as np | ||
from sklearn.linear_model.base import LinearRegression | ||
from sklearn.ensemble.forest import RandomForestRegressor | ||
|
||
|
||
class AlternatingLearner(BaseEstimator, RegressorMixin): | ||
def __init__(self, | ||
cols_1, | ||
cols_2, | ||
estimator_1=LinearRegression(fit_intercept=False), | ||
estimator_2=RandomForestRegressor(), | ||
iters=2): | ||
self.cols_1 = cols_1 | ||
self.cols_2 = cols_2 | ||
self.estimator_1 = estimator_1 | ||
self.estimator_2 = estimator_2 | ||
self.iters = iters | ||
|
||
def _normalize_x(self, X): | ||
if len(X.shape) < 2: | ||
X = np.expand_dims(X, 1) | ||
return np.array(X) | ||
|
||
def _normalize_y(self, y): | ||
return np.squeeze(y) | ||
|
||
|
||
def fit(self, X, y): | ||
X = self._normalize_x(X) | ||
y = self._normalize_y(y) | ||
|
||
for i in range(self.iters): | ||
if i == 0: | ||
target_1 = y | ||
else: | ||
target_1 = y - self.estimator_2.predict(X[:, self.cols_2]) | ||
self.estimator_1 = self.estimator_1.fit(X[:, self.cols_1], target_1) | ||
|
||
target_2 = y - self.estimator_1.predict(X[:, self.cols_1]) | ||
self.estimator_2 = self.estimator_2.fit(X[:, self.cols_2], target_2) | ||
|
||
return self | ||
|
||
def predict(self, X): | ||
X = self._normalize_x(X) | ||
return self.estimator_1.predict(X[:, self.cols_1]) + \ | ||
self.estimator_2.predict(X[:, self.cols_2]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from sklearn.base import BaseEstimator, RegressorMixin | ||
import numpy as np | ||
from sklearn.linear_model.base import LinearRegression | ||
|
||
|
||
class DoubleLearner(BaseEstimator, RegressorMixin): | ||
def __init__(self, | ||
treatment_cols, | ||
nusiance_cols, | ||
effect_estimator=LinearRegression(fit_intercept=False), | ||
treatment_estimator=LinearRegression(fit_intercept=False), | ||
y_estimator=LinearRegression(fit_intercept=False)): | ||
self.nusiance_cols = nusiance_cols | ||
self.treatment_cols = treatment_cols | ||
self.effect_estimator = effect_estimator | ||
self.treatment_estimator = treatment_estimator | ||
self.y_estimator = y_estimator | ||
|
||
def _normalize_x(self, X): | ||
if len(X.shape) < 2: | ||
X = np.expand_dims(X, 1) | ||
return np.array(X) | ||
|
||
def _normalize_y(self, y): | ||
return np.squeeze(y) | ||
|
||
def fit(self, X, y): | ||
X = self._normalize_x(X) | ||
y = self._normalize_y(y) | ||
|
||
self.y_estimator = self.y_estimator \ | ||
.fit(X[:, self.nusiance_cols], y) | ||
y_resid = y - self.y_estimator.predict(X[:, self.nusiance_cols]) | ||
|
||
self.treatment_estimator = self.treatment_estimator \ | ||
.fit(X[:, self.nusiance_cols], X[:, self.treatment_cols]) | ||
treatment_resid = X[:, self.treatment_cols] - self.treatment_estimator.predict(X[:, self.nusiance_cols]) | ||
|
||
self.effect_estimator = self.effect_estimator.fit(treatment_resid, y_resid) | ||
|
||
return self | ||
|
||
def predict(self, X): | ||
X = self._normalize_x(X) | ||
treatment_resid = X[:, self.treatment_cols] - self.treatment_estimator.predict(X[:, self.nusiance_cols]) | ||
return self.effect_estimator.predict(treatment_resid) + \ | ||
self.y_estimator.predict(X[:, self.nusiance_cols]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from bayesian_lasso import * | ||
from linear_regression_with_uncertainty import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from sklearn.utils.validation import check_X_y | ||
from sklearn.base import BaseEstimator, RegressorMixin | ||
import pymc | ||
import numpy as np | ||
|
||
|
||
class BayesianLasso(BaseEstimator, RegressorMixin): | ||
def __init__(self, b=1.0, use_mcmc=False, | ||
mcmc_burn=5000, mcmc_trials=10000, | ||
feature_map=None, feature_weights=None): | ||
self.b = b | ||
self.use_mcmc = use_mcmc | ||
self._map = None | ||
self.coef_ = None | ||
self._confidence = None | ||
self.mcmc = None | ||
self.num_betas = None | ||
self.mcmc_burn = mcmc_burn | ||
self.mcmc_trials = mcmc_trials | ||
self.feature_map = feature_map | ||
self.feature_weights = feature_weights | ||
|
||
def get_coefficients(self): | ||
return [{str(variable): variable.value} for variable in self._map.variables | ||
if str(variable).startswith('beta')] | ||
|
||
def lasso_model(self, X, y): | ||
|
||
if self.feature_map is None: | ||
self.feature_map = {i: i for i in range(X.shape[1])} | ||
if self.feature_weights is None: | ||
self.feature_weights = {i: 1.0 for i in range(X.shape[1])} | ||
|
||
# Priors for unknown model parameters | ||
betas = [] | ||
for i in range(max(self.feature_map.values()) + 1): | ||
betas.append(pymc.Laplace('beta_{}'.format(i), mu=0, tau=1.0 / self.b)) | ||
|
||
@pymc.deterministic | ||
def y_hat_lasso(betas=betas, X=X): | ||
return sum(betas[self.feature_map[i]] * | ||
self.feature_weights[i] * X[:, i] | ||
for i in range(X.shape[1])) | ||
|
||
tau = pymc.Uniform('tau', lower=0.001, upper=100.0) | ||
Y_lasso = pymc.Normal('Y', mu=y_hat_lasso, tau=tau, value=y, observed=True) | ||
return locals() | ||
|
||
def fit(self, X, y): | ||
X, y = check_X_y(X, y, multi_output=True, y_numeric=True, force_all_finite=False) | ||
|
||
if self.use_mcmc: | ||
self.mcmc = pymc.MCMC(self.lasso_model(X, y)) | ||
self.mcmc.sample(self.mcmc_trials, self.mcmc_burn, 2) | ||
if self.feature_map is not None: | ||
self.num_betas = max(self.feature_map.values()) + 1 | ||
else: | ||
self.num_betas = X.shape[1] | ||
|
||
traces = [] | ||
for i in range(self.num_betas): | ||
traces.append(self.mcmc.trace('beta_{}'.format(i))[:]) | ||
|
||
self.coef_ = np.array([np.mean(trace) for trace in traces]) | ||
else: | ||
self._map = pymc.MAP(self.lasso_model(X, y)) | ||
self._map.fit() | ||
self.coef_ = np.array([beta.value for beta in self._map.betas]) | ||
|
||
def confidence_intervals(self, confidence=95.): | ||
if self.mcmc is None: | ||
raise ValueError('Need to fit the mcmc first') | ||
|
||
traces = [] | ||
uppers = [] | ||
lowers = [] | ||
cutoff = (100 - confidence) / 2 | ||
for i in range(self.num_betas): | ||
traces.append(self.mcmc.trace('beta_{}'.format(i))[:]) | ||
uppers.append(np.percentile(traces[i], 100. - cutoff)) | ||
lowers.append(np.percentile(traces[i], cutoff)) | ||
|
||
return np.array(lowers), np.array(uppers) | ||
|
||
def predict(self, X): | ||
mapping = np.array( | ||
[self.coef_[self.feature_map[i]] * | ||
self.feature_weights[i] | ||
for i in range(len(self.feature_map))]) | ||
return np.dot(X, mapping) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import scipy | ||
import scipy.stats as sp | ||
import statsmodels.api as sm | ||
from sklearn.base import BaseEstimator, RegressorMixin | ||
import numpy as np | ||
|
||
|
||
class LinearRegressionWithUncertainty(BaseEstimator, RegressorMixin): | ||
def __init__(self): | ||
self.model = None | ||
self.coef_ = None | ||
|
||
def fit(self, X, y): | ||
self.model = sm.OLS(y, X).fit() | ||
if len(self.model.params.shape)==2: | ||
self.coef_ = np.transpose(self.model.params) | ||
else: | ||
self.coef_ = np.expand_dims(self.model.params,0) | ||
|
||
return self | ||
|
||
def confidence_intervals(self, confidence=95.): | ||
confidence = confidence * .01 | ||
|
||
unc = self.model.conf_int(1-confidence) | ||
lows, highs = np.expand_dims(np.transpose(unc[:,0]),0), np.expand_dims(np.transpose(unc[:,1]),0) | ||
return lows, highs | ||
|
||
|
||
def predict(self, X): | ||
return np.dot(X, np.transpose(self.coef_)) |
Oops, something went wrong.