Swap out patsy for formulaic #463

Open · wants to merge 3 commits into base: main
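
The pattern repeated in every file below replaces patsy's dmatrices / build_design_matrices pair with formulaic's model_matrix plus a stored ModelSpec. As orientation, here is a minimal before/after sketch on a toy DataFrame (the data and formula are illustrative, not taken from CausalPy):

import pandas as pd
from formulaic import model_matrix

# Illustrative data; any columns referenced by the formula behave the same way.
df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [0.1, 0.4, 0.9]})

# Previously: y, X = dmatrices("y ~ 1 + x", df); labels = X.design_info.column_names
dm = model_matrix("y ~ 1 + x", df)       # structured result with .lhs and .rhs
y = dm.lhs.to_numpy()                    # outcome as a column vector
X = dm.rhs.to_numpy()                    # design matrix including the intercept
labels = list(dm.rhs.columns)            # e.g. ["Intercept", "x"]

# Previously: (new_X,) = build_design_matrices([X.design_info], new_df)
rhs_spec = dm.rhs.model_spec             # stores the state needed to rebuild X
new_df = pd.DataFrame({"x": [0.2, 0.7]})
new_X = model_matrix(spec=rhs_spec, data=new_df).to_numpy()

Re-applying the stored spec, rather than re-parsing the formula string, is what keeps categorical encodings and other stateful transforms consistent between the training data and any new data.
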
35 changes: 19 additions & 16 deletions causalpy/experiments/diff_in_diff.py
@@ -19,8 +19,8 @@
import numpy as np
import pandas as pd
import seaborn as sns
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
@@ -95,12 +95,11 @@ def __init__(
self.group_variable_name = group_variable_name
self.input_validation()

y, X = dmatrices(formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

# fit model
if isinstance(self.model, PyMCModel):
@@ -125,8 +124,10 @@
)
if self.x_pred_control.empty:
raise ValueError("x_pred_control is empty")
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred_control)
self.y_pred_control = self.model.predict(np.asarray(new_x))
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_control
).to_numpy()
self.y_pred_control = self.model.predict(new_x)

# predicted outcome for treatment group
self.x_pred_treatment = (
@@ -142,8 +143,10 @@
)
if self.x_pred_treatment.empty:
raise ValueError("x_pred_treatment is empty")
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment)
self.y_pred_treatment = self.model.predict(np.asarray(new_x))
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_treatment
).to_numpy()
self.y_pred_treatment = self.model.predict(new_x)

# predicted outcome for counterfactual. This is given by removing the influence
# of the interaction term between the group and the post_treatment variable
@@ -162,15 +165,15 @@
)
if self.x_pred_counterfactual.empty:
raise ValueError("x_pred_counterfactual is empty")
(new_x,) = build_design_matrices(
[self._x_design_info], self.x_pred_counterfactual, return_type="dataframe"
)
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_counterfactual
).to_numpy()
# INTERVENTION: set the interaction term between the group and the
# post_treatment variable to zero. This is the counterfactual.
for i, label in enumerate(self.labels):
if "post_treatment" in label and self.group_variable_name in label:
new_x.iloc[:, i] = 0
self.y_pred_counterfactual = self.model.predict(np.asarray(new_x))
new_x[:, i] = 0
self.y_pred_counterfactual = self.model.predict(new_x)

# calculate causal impact
if isinstance(self.model, PyMCModel):
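
With the prediction matrix now a plain numpy array instead of a patsy design DataFrame, the counterfactual intervention zeroes the interaction column by position. A small sketch of that indexing logic, with assumed (not actual) column labels:

import numpy as np

# Column labels of the kind a DiD formula like "y ~ 1 + group*post_treatment"
# would produce; exact names depend on the data and formula.
labels = ["Intercept", "group", "post_treatment", "group:post_treatment"]
group_variable_name = "group"

new_x = np.array([
    [1.0, 1.0, 1.0, 1.0],
    [1.0, 0.0, 1.0, 0.0],
])

# Remove the treatment effect by zeroing any column that involves both the
# group variable and post_treatment, i.e. the interaction term.
for i, label in enumerate(labels):
    if "post_treatment" in label and group_variable_name in label:
        new_x[:, i] = 0

This mirrors the loop in the diff above, with the .iloc indexing dropped because new_x is no longer a DataFrame.
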
28 changes: 13 additions & 15 deletions causalpy/experiments/instrumental_variable.py
@@ -19,7 +19,7 @@

import numpy as np
import pandas as pd
from patsy import dmatrices
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression as sk_lin_reg

from causalpy.custom_exceptions import DataException
@@ -110,19 +110,17 @@ def __init__(
self.model = model
self.input_validation()

y, X = dmatrices(formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

t, Z = dmatrices(instruments_formula, self.instruments_data)
self._t_design_info = t.design_info
self._z_design_info = Z.design_info
self.labels_instruments = Z.design_info.column_names
self.t, self.Z = np.asarray(t), np.asarray(Z)
self.instrument_variable_name = t.design_info.column_names[0]
dm = model_matrix(self.instruments_formula, self.instruments_data)
self.labels_instruments = list(dm.rhs.columns)
self.t, self.Z = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.instrument_rhs_matrix_spec = dm.rhs.model_spec
self.instrument_variable_name = dm.lhs.columns[0]

self.get_naive_OLS_fit()
self.get_2SLS_fit()
@@ -176,7 +174,7 @@ def get_2SLS_fit(self):
fitted_Z_values = first_stage_reg.predict(self.Z)
X2 = self.data.copy(deep=True)
X2[self.instrument_variable_name] = fitted_Z_values
_, X2 = dmatrices(self.formula, X2)
X2 = model_matrix(self.formula, X2).rhs.to_numpy()
second_stage_reg = sk_lin_reg().fit(X=X2, y=self.y)
betas_first = list(first_stage_reg.coef_[0][1:])
betas_first.insert(0, first_stage_reg.intercept_[0])
@@ -196,7 +194,7 @@ def get_naive_OLS_fit(self):
ols_reg = sk_lin_reg().fit(self.X, self.y)
beta_params = list(ols_reg.coef_[0][1:])
beta_params.insert(0, ols_reg.intercept_[0])
self.ols_beta_params = dict(zip(self._x_design_info.column_names, beta_params))
self.ols_beta_params = dict(zip(self.labels, beta_params))
self.ols_reg = ols_reg

def plot(self, round_to=None):
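
In the 2SLS step only the rebuild of the second-stage design matrix changes: the fitted instrument values are written into a copy of the data and model_matrix(...).rhs regenerates X. A self-contained sketch of that two-stage flow with simulated data (variable names are illustrative):

import numpy as np
import pandas as pd
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n = 200
z = rng.normal(size=n)                      # instrument
t = 0.8 * z + rng.normal(size=n)            # endogenous treatment
y = 1.5 * t + rng.normal(size=n)            # outcome
df = pd.DataFrame({"y": y, "t": t, "z": z})

# First stage: regress the treatment on the instrument(s).
fs = model_matrix("t ~ 1 + z", df)
first = LinearRegression().fit(fs.rhs.to_numpy(), fs.lhs.to_numpy().ravel())

# Second stage: swap in the fitted treatment values, then rebuild X via formulaic.
df2 = df.copy()
df2["t"] = first.predict(fs.rhs.to_numpy())
X2 = model_matrix("y ~ 1 + t", df2).rhs.to_numpy()
second = LinearRegression().fit(X2, df["y"].to_numpy())
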
21 changes: 9 additions & 12 deletions causalpy/experiments/interrupted_time_series.py
@@ -20,8 +20,8 @@
import arviz as az
import numpy as np
import pandas as pd
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import BadIndexException
@@ -95,18 +95,15 @@ def __init__(
self.formula = formula

# set things up with pre-intervention data
y, X = dmatrices(formula, self.datapre)
self.outcome_variable_name = y.design_info.column_names[0]
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
dm = model_matrix(self.formula, self.datapre)
self.labels = list(dm.rhs.columns)
self.matrix_spec = dm.model_spec
self.outcome_variable_name = dm.lhs.columns[0]
self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
# process post-intervention data
(new_y, new_x) = build_design_matrices(
[self._y_design_info, self._x_design_info], self.datapost
)
self.post_X = np.asarray(new_x)
self.post_y = np.asarray(new_y)
new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
self.post_X = new_dm.rhs.to_numpy()
self.post_y = new_dm.lhs.to_numpy()

# fit the model to the observed (pre-intervention) data
if isinstance(self.model, PyMCModel):
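
The interrupted time series keeps the spec for both sides of the formula and re-applies it to the post-intervention data in a single call, as the diff above does with dm.model_spec. A sketch of that round trip on made-up pre/post frames:

import pandas as pd
from formulaic import model_matrix

datapre = pd.DataFrame({"y": [1.0, 1.2, 1.1], "t": [0, 1, 2]})
datapost = pd.DataFrame({"y": [2.0, 2.3], "t": [3, 4]})

dm = model_matrix("y ~ 1 + t", datapre)
matrix_spec = dm.model_spec                 # spec covering lhs and rhs together

new_dm = model_matrix(spec=matrix_spec, data=datapost)
post_X = new_dm.rhs.to_numpy()              # post-period design matrix
post_y = new_dm.lhs.to_numpy()              # post-period outcome
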
12 changes: 6 additions & 6 deletions causalpy/experiments/inverse_propensity_weighting.py
@@ -21,8 +21,8 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from formulaic import model_matrix
from matplotlib.lines import Line2D
from patsy import dmatrices
from sklearn.linear_model import LinearRegression as sk_lin_reg

from causalpy.custom_exceptions import DataException
@@ -89,11 +89,11 @@ def __init__(
self.weighting_scheme = weighting_scheme
self.input_validation()

t, X = dmatrices(formula, self.data)
self._t_design_info = t.design_info
self._t_design_info = X.design_info
self.labels = X.design_info.column_names
self.t, self.X = np.asarray(t), np.asarray(X)
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.t, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]
self.y = self.data[self.outcome_variable]

COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels}
27 changes: 14 additions & 13 deletions causalpy/experiments/prepostnegd.py
@@ -21,8 +21,8 @@
import numpy as np
import pandas as pd
import seaborn as sns
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
@@ -104,12 +104,11 @@ def __init__(
self.pretreatment_variable_name = pretreatment_variable_name
self.input_validation()

y, X = dmatrices(formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

# fit the model to the observed (pre-intervention) data
if isinstance(self.model, PyMCModel):
@@ -135,19 +134,21 @@ def __init__(
self.group_variable_name: np.zeros(self.pred_xi.shape),
}
)
(new_x_untreated,) = build_design_matrices(
[self._x_design_info], x_pred_untreated
)
self.pred_untreated = self.model.predict(X=np.asarray(new_x_untreated))
new_x_untreated = model_matrix(
spec=self.rhs_matrix_spec, data=x_pred_untreated
).to_numpy()
self.pred_untreated = self.model.predict(X=new_x_untreated)
# treated
x_pred_treated = pd.DataFrame(
{
self.pretreatment_variable_name: self.pred_xi,
self.group_variable_name: np.ones(self.pred_xi.shape),
}
)
(new_x_treated,) = build_design_matrices([self._x_design_info], x_pred_treated)
self.pred_treated = self.model.predict(X=np.asarray(new_x_treated))
new_x_treated = model_matrix(
spec=self.rhs_matrix_spec, data=x_pred_treated
).to_numpy()
self.pred_treated = self.model.predict(X=new_x_treated)

# Evaluate causal impact as equal to the treatment effect
self.causal_impact = self.model.idata.posterior["beta"].sel(
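
The prediction grids for the untreated and treated groups are now pushed through the stored right-hand-side spec, so their column layout matches the training design matrix. A sketch of that idea using an sklearn regression as a stand-in for the CausalPy model (names and data are illustrative):

import numpy as np
import pandas as pd
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression

df = pd.DataFrame({
    "post": [2.0, 3.0, 4.0, 5.0],
    "pre": [1.0, 2.0, 3.0, 4.0],
    "group": [0, 1, 0, 1],
})

dm = model_matrix("post ~ 1 + pre + group", df)
rhs_spec = dm.rhs.model_spec
model = LinearRegression().fit(dm.rhs.to_numpy(), dm.lhs.to_numpy().ravel())

# Prediction grid over the pretreatment variable, once per group.
pred_xi = np.linspace(1.0, 4.0, 5)
x_pred_untreated = pd.DataFrame({"pre": pred_xi, "group": np.zeros(pred_xi.shape)})
x_pred_treated = pd.DataFrame({"pre": pred_xi, "group": np.ones(pred_xi.shape)})

pred_untreated = model.predict(
    model_matrix(spec=rhs_spec, data=x_pred_untreated).to_numpy()
)
pred_treated = model.predict(
    model_matrix(spec=rhs_spec, data=x_pred_treated).to_numpy()
)
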
23 changes: 11 additions & 12 deletions causalpy/experiments/regression_discontinuity.py
@@ -21,7 +21,7 @@
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from formulaic import model_matrix
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import (
@@ -111,15 +111,14 @@ def __init__(
f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501
UserWarning,
)
y, X = dmatrices(formula, filtered_data)
dm = model_matrix(formula, filtered_data)
else:
y, X = dmatrices(formula, self.data)
dm = model_matrix(formula, self.data)

self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

# fit model
if isinstance(self.model, PyMCModel):
@@ -146,8 +145,8 @@ def __init__(
self.x_pred = pd.DataFrame(
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
self.pred = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
self.pred = self.model.predict(X=new_x)

# calculate discontinuity by evaluating the difference in model expectation on
# either side of the discontinuity
@@ -164,8 +163,8 @@ def __init__(
"treated": np.array([0, 1]),
}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
self.pred_discon = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_discon).to_numpy()
self.pred_discon = self.model.predict(X=new_x)

# ******** THIS IS SUBOPTIMAL AT THE MOMENT ************************************
if isinstance(self.model, PyMCModel):
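
The regression discontinuity reuses the stored spec both for a plotting grid over the running variable and for a two-row frame straddling the threshold, from which the discontinuity is read off. A sketch with simulated data and an sklearn stand-in for the CausalPy model:

import numpy as np
import pandas as pd
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
x = rng.uniform(-1, 1, 300)
treated = (x > 0).astype(float)
y = 1.0 + 0.5 * x + 2.0 * treated + rng.normal(scale=0.1, size=x.size)
df = pd.DataFrame({"y": y, "x": x, "treated": treated})

dm = model_matrix("y ~ 1 + x + treated", df)
rhs_spec = dm.rhs.model_spec
model = LinearRegression().fit(dm.rhs.to_numpy(), dm.lhs.to_numpy().ravel())

# Evaluate just below and just above the threshold (here 0), untreated vs treated.
epsilon = 0.001
x_discon = pd.DataFrame({
    "x": np.array([0 - epsilon, 0 + epsilon]),
    "treated": np.array([0.0, 1.0]),
})
new_x = model_matrix(spec=rhs_spec, data=x_discon).to_numpy()
pred_discon = model.predict(new_x)
discontinuity = pred_discon[1] - pred_discon[0]   # close to the simulated effect of 2
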
23 changes: 11 additions & 12 deletions causalpy/experiments/regression_kink.py
@@ -22,7 +22,7 @@
import numpy as np
import pandas as pd
import seaborn as sns
from patsy import build_design_matrices, dmatrices
from formulaic import model_matrix

from causalpy.plot_utils import plot_xY

@@ -74,15 +74,14 @@ def __init__(
f"Choice of bandwidth parameter has lead to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.", # noqa: E501
UserWarning,
)
y, X = dmatrices(formula, filtered_data)
dm = model_matrix(formula, filtered_data)
else:
y, X = dmatrices(formula, self.data)
dm = model_matrix(formula, self.data)

self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
self.model.fit(X=self.X, y=self.y, coords=COORDS)
@@ -102,8 +101,8 @@ def __init__(
self.x_pred = pd.DataFrame(
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
self.pred = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
self.pred = self.model.predict(X=new_x)

# evaluate gradient change around kink point
mu_kink_left, mu_kink, mu_kink_right = self._probe_kink_point()
@@ -158,8 +157,8 @@ def _probe_kink_point(self):
"treated": np.array([0, 1, 1]),
}
)
(new_x,) = build_design_matrices([self._x_design_info], x_predict)
predicted = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=x_predict).to_numpy()
predicted = self.model.predict(X=new_x)
# extract predicted mu values
mu_kink_left = predicted["posterior_predictive"].sel(obs_ind=0)["mu"]
mu_kink = predicted["posterior_predictive"].sel(obs_ind=1)["mu"]
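
The kink probe follows the same spec-reuse pattern, predicting at three points around the kink and comparing gradients on either side. A sketch of just that probe (kink point, epsilon, formula, and the sklearn model are illustrative stand-ins):

import numpy as np
import pandas as pd
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(2)
x = rng.uniform(-1, 1, 300)
treated = (x > 0).astype(float)
y = 1.0 + 0.2 * x + 1.0 * treated * x + rng.normal(scale=0.05, size=x.size)
df = pd.DataFrame({"y": y, "x": x, "treated": treated})

dm = model_matrix("y ~ 1 + x + treated:x", df)
rhs_spec = dm.rhs.model_spec
model = LinearRegression().fit(dm.rhs.to_numpy(), dm.lhs.to_numpy().ravel())

# Probe the expected outcome just left of, at, and just right of the kink.
kink_point, epsilon = 0.0, 0.01
x_predict = pd.DataFrame({
    "x": np.array([kink_point - epsilon, kink_point, kink_point + epsilon]),
    "treated": np.array([0.0, 1.0, 1.0]),
})
mu_left, mu_kink, mu_right = model.predict(
    model_matrix(spec=rhs_spec, data=x_predict).to_numpy()
)
gradient_change = (mu_right - mu_kink) / epsilon - (mu_kink - mu_left) / epsilon
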
21 changes: 9 additions & 12 deletions causalpy/experiments/synthetic_control.py
@@ -20,8 +20,8 @@
import arviz as az
import numpy as np
import pandas as pd
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.custom_exceptions import BadIndexException
@@ -90,18 +90,15 @@ def __init__(
self.formula = formula

# set things up with pre-intervention data
y, X = dmatrices(formula, self.datapre)
self.outcome_variable_name = y.design_info.column_names[0]
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
dm = model_matrix(self.formula, self.datapre)
self.labels = list(dm.rhs.columns)
self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.matrix_spec = dm.model_spec
self.outcome_variable_name = dm.lhs.columns[0]
# process post-intervention data
(new_y, new_x) = build_design_matrices(
[self._y_design_info, self._x_design_info], self.datapost
)
self.post_X = np.asarray(new_x)
self.post_y = np.asarray(new_y)
new_dm = model_matrix(spec=self.matrix_spec, data=self.datapost)
self.post_X = new_dm.rhs.to_numpy()
self.post_y = new_dm.lhs.to_numpy()

# fit the model to the observed (pre-intervention) data
if isinstance(self.model, PyMCModel):