Commit fa494e8: major update
mhamilton723 committed Jul 14, 2017 (1 parent: 593fea4)
Showing 11 changed files with 372 additions and 3 deletions.
3 changes: 2 additions & 1 deletion tests/test_delta_transformer.py
@@ -1,11 +1,12 @@
-from TimeSeriesRegressor.tseries.delta_transformer import *
+from tseries.tseries.delta_transformer import *
import pandas as pd

X = pd.DataFrame([[1.0, 1.0, 1.0],
[2.0, 0.0, 4.0],
[3.0, -1.0, 25.0],
[4.0, -1.0, 25.0],
[5.0, -1.0, 25.0]], columns=["foo", 'bar', 'baz'])

X_trans = pd.DataFrame([[1.0, -1.0, 4.0],
[1.0, -1.0, 25.0],
[1.0, 0.0, 25.0],
…
98 changes: 98 additions & 0 deletions tests/test_double_learner.py
@@ -0,0 +1,98 @@
from tseries.tseries.double_learner import DoubleLearner
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from tseries.tseries.alternating_learner import AlternatingLearner

dataset_size = 1000

treatment_dim = 4
confounder_dim = 30
result_dim = 1
iters = 20
theta0 = np.array([1, -1, 1, 5])


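# Data-generating process (a reading inferred from the code below, not stated in
# the commit): Z are confounders, the treatment D = m0(Z) + V depends on them
# nonlinearly, and y = D @ theta0 + g0(Z) + U is linear in the treatment. In this
# partially linear setup, a learner that removes the confounding should recover theta0.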
def gen_x_y(dataset_size=dataset_size,
result_dim=result_dim,
treatment_dim=treatment_dim,
confounder_dim=confounder_dim,
theta0=theta0):
Z = np.random.randn(dataset_size, confounder_dim)

m0_m = np.random.randn(confounder_dim, treatment_dim)
m0 = lambda x: np.tanh(np.dot(x, m0_m))

g0_m = np.random.randn(confounder_dim, result_dim)
g0_b = np.random.randn(result_dim)
g0 = lambda x: np.tanh(np.dot(x, g0_m) + g0_b)

V = np.random.randn(dataset_size, treatment_dim)
U = np.random.randn(dataset_size, result_dim)

D = m0(Z) + V
y = np.dot(D, np.expand_dims(theta0, 1)) + g0(Z) + U
X = np.hstack([D, Z])

return X, y


X, y = gen_x_y()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)


def mse(y1, y2):
return np.mean((y1 - y2) ** 2)


treatment_cols = range(0, treatment_dim)
confounder_cols = range(treatment_dim, treatment_dim + confounder_dim)

# lr = LinearRegression(fit_intercept=False).fit(X_train, y_train)
# dr = DoubleLearner(treatment_cols, confounder_cols).fit(X_train, y_train)
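# Fit both learners and compare held-out MSE plus the recovered treatment
# coefficients (printed below) against the true theta0 = [1, -1, 1, 5].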
al = AlternatingLearner(treatment_cols, confounder_cols).fit(X_train, y_train)
dl = DoubleLearner(treatment_cols,
confounder_cols,
treatment_estimator=RandomForestRegressor(),
y_estimator=RandomForestRegressor()).fit(X_train, y_train)

al_pred = al.predict(X_test)
dl_pred = dl.predict(X_test)
y_test_flat = np.squeeze(y_test)  # predictions are 1-D; avoid (n, n) broadcasting inside mse
print(mse(al_pred, y_test_flat))
print(mse(dl_pred, y_test_flat))
print(al.estimator_1.coef_[treatment_cols])
print(dl.effect_estimator.coef_)

'''
plt.subplot(2, 2, 1)
plt.plot(lr_pred, y_test, 'r.', alpha=.2)
plt.plot(np.linspace(-20, 20), np.linspace(-20, 20))
plt.subplot(2, 2, 2)
plt.plot(dr_pred, y_test, 'r.', alpha=.2)
plt.plot(np.linspace(-20, 20), np.linspace(-20, 20))
plt.subplot(2, 2, 3)
plt.hist(lr_pred - y_test)
plt.subplot(2, 2, 4)
plt.hist(dr_pred - y_test)
plt.show()
lr_coeffs = []
dl_lr_coeffs = []
for i in range(iters):
X, y = gen_x_y(dataset_size)
lr_coeffs.append(LinearRegression(fit_intercept=False).fit(X, y).coef_)
print(DoubleLearner([0], [1]).fit(X, y).effect_estimator.coef_)
print(lr_coeffs[i])
x_lr, n_lr = zip(*lr_coeffs)
plt.plot(x_lr, n_lr, 'ro', label='LR')
plt.plot([15], [0], 'b*', ms=20, label='truth')
plt.xlabel('x1 coeff')
plt.ylabel('y1 coeff')
plt.show()
'''
2 changes: 1 addition & 1 deletion tests/test_double_pipeline.py
@@ -1,5 +1,5 @@
from sklearn.linear_model.base import LinearRegression
-from TimeSeriesRegressor.tseries import DoublePipeline, DeltaTransformer
+from tseries.tseries import DoublePipeline, DeltaTransformer
import pandas as pd
import numpy as np

…
31 changes: 31 additions & 0 deletions tests/test_linear_regression_with_uncertainty.py
@@ -0,0 +1,31 @@
from sklearn.linear_model import LinearRegression
from tseries.tseries.models import LinearRegressionWithUncertainty
from tseries.tseries.utils import mse
import numpy as np

n = 300
dx = 5
dy = 1
X = np.random.randn(n, dx)
M = np.random.randn(dx, dy)
Y = np.dot(X, M) + .1 * np.random.randn(n, dy)


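# Sanity checks, per the asserts below: the uncertainty-aware model should match
# plain OLS point estimates, and its confidence intervals should bracket coef_.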
def test_linear_regression_with_uncertainty():
fitLR = LinearRegression().fit(X, Y)
fitLRU = LinearRegressionWithUncertainty().fit(X, Y)

predLR = fitLR.predict(X)
predLRU = fitLRU.predict(X)

assert ((fitLRU.confidence_intervals()[1] - fitLRU.coef_) > 0).all()
assert ((fitLRU.confidence_intervals()[0] - fitLRU.coef_) < 0).all()
    assert (np.isclose(fitLR.coef_, fitLRU.coef_).all())
assert (np.isclose(mse(predLR, Y), mse(predLRU, Y), rtol=1e-2))
# assert ()
# assert (np.isclose(fit_model.x_pipe_.steps[-1][1].coef_, [1.0, 0.0, 0.0]).all())


if __name__ == '__main__':
test_linear_regression_with_uncertainty()
2 changes: 1 addition & 1 deletion tests/test_tse.py
@@ -1,6 +1,6 @@
from numpy.testing import assert_equal
from sklearn.grid_search import GridSearchCV
-from tseries.time_series_estimator import *
+from tseries.tseries.time_series_estimator import *
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

…
48 changes: 48 additions & 0 deletions tseries/alternating_learner.py
@@ -0,0 +1,48 @@
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


class AlternatingLearner(BaseEstimator, RegressorMixin):
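    """Additive model fit by simple backfitting (a description inferred from the
    fit loop below): estimator_1 is fit on cols_1 against y minus estimator_2's
    current prediction, then estimator_2 on cols_2 against the new residual,
    alternating for `iters` rounds. predict sums the two sub-predictions."""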
def __init__(self,
cols_1,
cols_2,
estimator_1=LinearRegression(fit_intercept=False),
estimator_2=RandomForestRegressor(),
iters=2):
self.cols_1 = cols_1
self.cols_2 = cols_2
self.estimator_1 = estimator_1
self.estimator_2 = estimator_2
self.iters = iters

def _normalize_x(self, X):
if len(X.shape) < 2:
X = np.expand_dims(X, 1)
return np.array(X)

    def _normalize_y(self, y):
        return np.squeeze(y)

    def fit(self, X, y):
X = self._normalize_x(X)
y = self._normalize_y(y)

for i in range(self.iters):
if i == 0:
target_1 = y
else:
target_1 = y - self.estimator_2.predict(X[:, self.cols_2])
self.estimator_1 = self.estimator_1.fit(X[:, self.cols_1], target_1)

target_2 = y - self.estimator_1.predict(X[:, self.cols_1])
self.estimator_2 = self.estimator_2.fit(X[:, self.cols_2], target_2)

return self

def predict(self, X):
X = self._normalize_x(X)
return self.estimator_1.predict(X[:, self.cols_1]) + \
self.estimator_2.predict(X[:, self.cols_2])
47 changes: 47 additions & 0 deletions tseries/double_learner.py
@@ -0,0 +1,47 @@
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np
from sklearn.linear_model import LinearRegression


class DoubleLearner(BaseEstimator, RegressorMixin):
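    """Residual-on-residual regression in the spirit of double/orthogonalized
    machine learning: y and the treatment columns are each regressed on the
    nuisance columns, then effect_estimator is fit on the two residuals so the
    estimated effect is purged of the nuisance signal."""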
def __init__(self,
treatment_cols,
                 nuisance_cols,
effect_estimator=LinearRegression(fit_intercept=False),
treatment_estimator=LinearRegression(fit_intercept=False),
y_estimator=LinearRegression(fit_intercept=False)):
        self.nuisance_cols = nuisance_cols
self.treatment_cols = treatment_cols
self.effect_estimator = effect_estimator
self.treatment_estimator = treatment_estimator
self.y_estimator = y_estimator

def _normalize_x(self, X):
if len(X.shape) < 2:
X = np.expand_dims(X, 1)
return np.array(X)

def _normalize_y(self, y):
return np.squeeze(y)

def fit(self, X, y):
X = self._normalize_x(X)
y = self._normalize_y(y)

        self.y_estimator = self.y_estimator \
            .fit(X[:, self.nuisance_cols], y)
        y_resid = y - self.y_estimator.predict(X[:, self.nuisance_cols])

        self.treatment_estimator = self.treatment_estimator \
            .fit(X[:, self.nuisance_cols], X[:, self.treatment_cols])
        treatment_resid = X[:, self.treatment_cols] - self.treatment_estimator.predict(X[:, self.nuisance_cols])

self.effect_estimator = self.effect_estimator.fit(treatment_resid, y_resid)

return self

    def predict(self, X):
        X = self._normalize_x(X)
        treatment_resid = X[:, self.treatment_cols] - self.treatment_estimator.predict(X[:, self.nuisance_cols])
        return self.effect_estimator.predict(treatment_resid) + \
            self.y_estimator.predict(X[:, self.nuisance_cols])
2 changes: 2 additions & 0 deletions tseries/models/__init__.py
@@ -0,0 +1,2 @@
from .bayesian_lasso import *
from .linear_regression_with_uncertainty import *
90 changes: 90 additions & 0 deletions tseries/models/bayesian_lasso.py
@@ -0,0 +1,90 @@
from sklearn.utils.validation import check_X_y
from sklearn.base import BaseEstimator, RegressorMixin
import pymc
import numpy as np


class BayesianLasso(BaseEstimator, RegressorMixin):
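    """Linear regression with independent Laplace(0, b) priors on the
    coefficients (tau = 1/b), the Bayesian analogue of the lasso's L1 penalty.
    Fit is by pymc MAP by default, or by MCMC when use_mcmc=True, which also
    enables confidence_intervals(). feature_map lets several input columns
    share one beta; feature_weights rescales each column's contribution."""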
def __init__(self, b=1.0, use_mcmc=False,
mcmc_burn=5000, mcmc_trials=10000,
feature_map=None, feature_weights=None):
self.b = b
self.use_mcmc = use_mcmc
self._map = None
self.coef_ = None
self._confidence = None
self.mcmc = None
self.num_betas = None
self.mcmc_burn = mcmc_burn
self.mcmc_trials = mcmc_trials
self.feature_map = feature_map
self.feature_weights = feature_weights

def get_coefficients(self):
return [{str(variable): variable.value} for variable in self._map.variables
if str(variable).startswith('beta')]

def lasso_model(self, X, y):

if self.feature_map is None:
self.feature_map = {i: i for i in range(X.shape[1])}
if self.feature_weights is None:
self.feature_weights = {i: 1.0 for i in range(X.shape[1])}

# Priors for unknown model parameters
betas = []
for i in range(max(self.feature_map.values()) + 1):
betas.append(pymc.Laplace('beta_{}'.format(i), mu=0, tau=1.0 / self.b))

@pymc.deterministic
def y_hat_lasso(betas=betas, X=X):
return sum(betas[self.feature_map[i]] *
self.feature_weights[i] * X[:, i]
for i in range(X.shape[1]))

tau = pymc.Uniform('tau', lower=0.001, upper=100.0)
Y_lasso = pymc.Normal('Y', mu=y_hat_lasso, tau=tau, value=y, observed=True)
return locals()

def fit(self, X, y):
X, y = check_X_y(X, y, multi_output=True, y_numeric=True, force_all_finite=False)

if self.use_mcmc:
self.mcmc = pymc.MCMC(self.lasso_model(X, y))
self.mcmc.sample(self.mcmc_trials, self.mcmc_burn, 2)
if self.feature_map is not None:
self.num_betas = max(self.feature_map.values()) + 1
else:
self.num_betas = X.shape[1]

traces = []
for i in range(self.num_betas):
traces.append(self.mcmc.trace('beta_{}'.format(i))[:])

self.coef_ = np.array([np.mean(trace) for trace in traces])
        else:
            self._map = pymc.MAP(self.lasso_model(X, y))
            self._map.fit()
            self.coef_ = np.array([beta.value for beta in self._map.betas])

        return self  # return the fitted estimator so fit() chains, per sklearn convention

def confidence_intervals(self, confidence=95.):
if self.mcmc is None:
raise ValueError('Need to fit the mcmc first')

traces = []
uppers = []
lowers = []
cutoff = (100 - confidence) / 2
for i in range(self.num_betas):
traces.append(self.mcmc.trace('beta_{}'.format(i))[:])
uppers.append(np.percentile(traces[i], 100. - cutoff))
lowers.append(np.percentile(traces[i], cutoff))

return np.array(lowers), np.array(uppers)

def predict(self, X):
mapping = np.array(
[self.coef_[self.feature_map[i]] *
self.feature_weights[i]
for i in range(len(self.feature_map))])
return np.dot(X, mapping)
31 changes: 31 additions & 0 deletions tseries/models/linear_regression_with_uncertainty.py
@@ -0,0 +1,31 @@
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np


class LinearRegressionWithUncertainty(BaseEstimator, RegressorMixin):
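    """Thin wrapper around statsmodels OLS that exposes sklearn-style coef_ and
    predict(), plus confidence_intervals() taken from the underlying OLS fit."""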
def __init__(self):
self.model = None
self.coef_ = None

    def fit(self, X, y):
        self.model = sm.OLS(y, X).fit()
        if len(self.model.params.shape) == 2:
            self.coef_ = np.transpose(self.model.params)
        else:
            self.coef_ = np.expand_dims(self.model.params, 0)

        return self

    def confidence_intervals(self, confidence=95.):
        alpha = 1 - confidence * .01
        unc = self.model.conf_int(alpha)
        lows = np.expand_dims(np.transpose(unc[:, 0]), 0)
        highs = np.expand_dims(np.transpose(unc[:, 1]), 0)
        return lows, highs

    def predict(self, X):
        return np.dot(X, np.transpose(self.coef_))