From e2862a5c3c1904e870a1727a36e3e89860275c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 18 Apr 2024 00:11:30 +0100 Subject: [PATCH] [ENH] adapter to `lifelines`, most distributional survival regressors interfaced (#247) Adds an adapter to `lifelines`, exposing array-like survival functions as `Empirical` distributions in `predict_proba`. Adds all models from `lifelines` which are capable of full distributional predictions: * `AalenAdditiveFitter` * `CoxPHFitter` * `WeibullAFTFitter` Remaining AFT fitters require distributions not yet merged. --- docs/source/api_reference/survival.rst | 23 ++ pyproject.toml | 1 + skpro/survival/adapters/_common.py | 120 ++++++++++ skpro/survival/adapters/lifelines.py | 196 +++++++++++++++++ skpro/survival/adapters/sksurv.py | 55 +---- skpro/survival/additive/__init__.py | 5 + skpro/survival/additive/_aalen_lifelines.py | 103 +++++++++ skpro/survival/aft/__init__.py | 6 + skpro/survival/aft/_aft_lifelines_weibull.py | 219 +++++++++++++++++++ skpro/survival/coxph/__init__.py | 3 +- skpro/survival/coxph/_coxph_lifelines.py | 214 ++++++++++++++++++ 11 files changed, 897 insertions(+), 48 deletions(-) create mode 100644 skpro/survival/adapters/_common.py create mode 100644 skpro/survival/adapters/lifelines.py create mode 100644 skpro/survival/additive/__init__.py create mode 100644 skpro/survival/additive/_aalen_lifelines.py create mode 100644 skpro/survival/aft/__init__.py create mode 100644 skpro/survival/aft/_aft_lifelines_weibull.py create mode 100644 skpro/survival/coxph/_coxph_lifelines.py diff --git a/docs/source/api_reference/survival.rst b/docs/source/api_reference/survival.rst index d85298cd0..a03ee9c40 100644 --- a/docs/source/api_reference/survival.rst +++ b/docs/source/api_reference/survival.rst @@ -86,9 +86,32 @@ Proportional hazards models :template: class.rst CoxPH + CoxPHlifelines CoxPHSkSurv CoxNet +Accelerated failure time models +------------------------------- + +.. currentmodule:: skpro.survival.aft + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + AFTWeibull + +Generalized additive survival models +------------------------------------ + +.. currentmodule:: skpro.survival.additive + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + AalenAdditiveLifelines + Tree models ----------- diff --git a/pyproject.toml b/pyproject.toml index c0dab3e46..bd384ca3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies = [ all_extras = [ "attrs", "distfit", + "lifelines<0.29.0", "mapie", "matplotlib>=3.3.2", "ngboost", diff --git a/skpro/survival/adapters/_common.py b/skpro/survival/adapters/_common.py new file mode 100644 index 000000000..468eecda9 --- /dev/null +++ b/skpro/survival/adapters/_common.py @@ -0,0 +1,120 @@ +"""Common utilities for adapters.""" + +import numpy as np + + +def _clip_surv(surv_arr): + """Clips improper survival function values to proper range. + + Enforces: values are in [0, 1] and are monotonically decreasing. + + First clips to [0, 1], then enforces monotonicity, by replacing + any value with minimum of itself and any previous values. + + Parameters + ---------- + surv_arr : 2D np.ndarray + Survival function values. + index 0 is instance index. + index 1 is time index, increasing. + + Returns + ------- + surv_arr_clipped : 2D np.ndarray + Clipped survival function values. + surv_arr_diff : 2D np.ndarray, same shape as surv_arr_clipped. + Difference of clipped survival function values. + Same as np.diff(surv_arr, axis=1, prepend=1, append=0), + then summing the last two columns to become one column. + Returned to avoid recomputation, if needed later in context. + clipped : boolean + Whether clipping was needed. + """ + too_large = surv_arr > 1 + too_small = surv_arr < 0 + + surv_arr[too_large] = 1 + surv_arr[too_small] = 0 + + surv_arr_diff = _surv_diff(surv_arr) + + # avoid iterative minimization if no further clipping is needed + if not (surv_arr_diff > 0).any(): + clipped = too_large.any() or too_small.any() + return surv_arr, surv_arr_diff, clipped + + # enforce monotonicity + # iterating from left to right ensures values are replaced + # with minimum of itself and all values to the left + for i in range(1, surv_arr.shape[1]): + surv_arr[:, i] = np.minimum(surv_arr[:, i], surv_arr[:, i - 1]) + + surv_arr_diff = _surv_diff(surv_arr) + + return surv_arr, surv_arr_diff, True + + +def _surv_diff(surv_arr): + """Compute difference of survival function values. + + Parameters + ---------- + surv_arr : 2D np.ndarray + Survival function values. + index 0 is instance index. + index 1 is time index, increasing. + + Returns + ------- + surv_arr_diff : 2D np.ndarray, same shape as surv_arr + Difference of survival function values. + Same as np.diff(surv_arr, axis=1, prepend=1, append=0), + then summing the last two columns to become one column + """ + surv_arr_diff = np.diff(surv_arr, axis=1, prepend=1, append=0) + + surv_arr_diff[:, -2] = surv_arr_diff[:, -2] + surv_arr_diff[:, -1] + surv_arr_diff = surv_arr_diff[:, :-1] + + return surv_arr_diff + + +def _get_fitted_params_default_safe(obj=None): + """Obtain fitted params of object, per sklearn convention. + + Same as _get_fitted_params_default, but with exception handling. + + This is since in sksurv, feature_importances_ is a property + and may raise an exception if the estimator does not have it. + + Parameters + ---------- + obj : any object + + Returns + ------- + fitted_params : dict with str keys + fitted parameters, keyed by names of fitted parameter + """ + # default retrieves all self attributes ending in "_" + # and returns them with keys that have the "_" removed + # + # get all attributes ending in "_", exclude any that start with "_" (private) + fitted_params = [ + attr for attr in dir(obj) if attr.endswith("_") and not attr.startswith("_") + ] + + def hasattr_safe(obj, attr): + try: + if hasattr(obj, attr): + getattr(obj, attr) + return True + except Exception: + return False + + # remove the "_" at the end + fitted_param_dict = { + p[:-1]: getattr(obj, p) for p in fitted_params if hasattr_safe(obj, p) + } + + return fitted_param_dict diff --git a/skpro/survival/adapters/lifelines.py b/skpro/survival/adapters/lifelines.py new file mode 100644 index 000000000..f3c0b49f7 --- /dev/null +++ b/skpro/survival/adapters/lifelines.py @@ -0,0 +1,196 @@ +# copyright: sktime developers, BSD-3-Clause License (see LICENSE file) +"""Implements adapter for lifelines models.""" + +__all__ = ["_LifelinesAdapter"] +__author__ = ["fkiraly"] + +from warnings import warn + +import numpy as np +import pandas as pd + +from skpro.distributions.empirical import Empirical +from skpro.survival.adapters._common import _clip_surv, _get_fitted_params_default_safe +from skpro.utils.sklearn import prep_skl_df + + +class _LifelinesAdapter: + """Mixin adapter class for lifelines models.""" + + _tags = { + # packaging info + # -------------- + "authors": ["fkiraly"], + "python_dependencies": ["lifelines"], + "license_type": "permissive", + # capability tags + # --------------- + "X_inner_mtype": "pd_DataFrame_Table", + "y_inner_mtype": "pd_DataFrame_Table", + "C_inner_mtype": "pd_DataFrame_Table", + "capability:multioutput": False, + } + + # defines the name of the attribute containing the lifelines estimator + _estimator_attr = "_estimator" + + def _get_lifelines_class(self): + """Abstract method to get lifelines class. + + should import and return lifelines class + """ + # from lifelines import LifelinesClass + # + # return LifelinesClass + raise NotImplementedError("abstract method") + + def _get_lifelines_object(self): + """Abstract method to initialize lifelines object. + + The default initializes result of _get_lifelines_class + with self.get_params. + """ + cls = self._get_lifelines_class() + return cls(**self.get_params()) + + def _init_lifelines_object(self): + """Abstract method to initialize lifelines object and set to _estimator_attr. + + The default writes the return of _get_lifelines_object to + the attribute of self with name _estimator_attr + """ + cls = self._get_lifelines_object() + setattr(self, self._estimator_attr, cls) + return getattr(self, self._estimator_attr) + + def _get_extra_fit_args(self, X, y, C=None): + """Get extra arguments for the fit method. + + Parameters + ---------- + X : pd.DataFrame + Training features + y: pd.DataFrame + Training labels + C: pd.DataFrame, optional (default=None) + Censoring information for survival analysis. + + Returns + ------- + dict + Extra arguments for the fit method. + """ + return {} + + def _fit(self, X, y, C=None): + """Fit estimator training data. + + Parameters + ---------- + X : pd.DataFrame + Training features + y: pd.DataFrame + Training labels + C: pd.DataFrame, optional (default=None) + Censoring information for survival analysis. + + Returns + ------- + self: reference to self + Fitted estimator. + """ + lifelines_est = self._init_lifelines_object() + + # input conversion + X = X.astype("float") # lifelines insists on float dtype + X = prep_skl_df(X) + + if hasattr(self, "X_col_subset"): + X = X[self.X_col_subset] + + to_concat = [X, y] + + if C is not None: + C_col = 1 - C.copy() # lifelines uses 1 for uncensored, 0 for censored + C_col.columns = ["__C"] + to_concat.append(C_col) + + df = pd.concat(to_concat, axis=1) + + self._y_cols = y.columns # remember column names for later + y_name = y.columns[0] + + fit_args = { + "df": df, + "duration_col": y_name, + } + if C is not None: + fit_args["event_col"] = "__C" + + fit_args.update(self._get_extra_fit_args(X, y, C)) + + # fit lifelines estimator + lifelines_est.fit(**fit_args) + + # write fitted params to self + # some fitted parameters are properties and may raise exceptions + # for example, AIC_ and AIC_partial_ of CoxPHFitter + # to avoid this, we use a safe getter + lifelines_fitted_params = _get_fitted_params_default_safe(lifelines_est) + for k, v in lifelines_fitted_params.items(): + setattr(self, f"{k}_", v) + + return self + + def _predict_proba(self, X): + """Predict_proba method adapter. + + Parameters + ---------- + X : pd.DataFrame + Features to predict on. + + Returns + ------- + skpro Empirical distribution + """ + lifelines_est = getattr(self, self._estimator_attr) + + # input conversion + X = X.astype("float") # lifelines insists on float dtype + X = prep_skl_df(X) + + # predict on X + lifelines_survf = lifelines_est.predict_survival_function(X) + + times = lifelines_survf.index + + nt = len(times) + mi = pd.MultiIndex.from_product([X.index, range(nt)]).swaplevel() + + times_val = np.repeat(times, repeats=len(X)) + times_df = pd.DataFrame(times_val, index=mi, columns=self._y_cols) + + lifelines_survf_t = np.transpose(lifelines_survf.values) + _, lifelines_survf_t_diff, clipped = _clip_surv(lifelines_survf_t) + + if clipped: + warn( + f"Warning from {self.__class__.__name__}: " + f"Interfaced lifelines class {lifelines_est.__class__.__name__} " + "produced improper survival function predictions, i.e., " + "not monotonically decreasing or not in [0, 1]. " + "skpro has clipped the predictions to enforce proper range and " + "valid predictive distributions. " + "However, predictions may still be unreliable.", + stacklevel=2, + ) + + weights = -lifelines_survf_t_diff.flatten() + weights_df = pd.Series(weights, index=mi) + + dist = Empirical( + spl=times_df, weights=weights_df, index=X.index, columns=self._y_cols + ) + + return dist diff --git a/skpro/survival/adapters/sksurv.py b/skpro/survival/adapters/sksurv.py index 2c26aec58..2716270c8 100644 --- a/skpro/survival/adapters/sksurv.py +++ b/skpro/survival/adapters/sksurv.py @@ -8,6 +8,7 @@ import pandas as pd from skpro.distributions.empirical import Empirical +from skpro.survival.adapters._common import _get_fitted_params_default_safe from skpro.utils.sklearn import prep_skl_df @@ -37,9 +38,9 @@ def _get_sksurv_class(self): should import and return sksurv class """ - # from sksurv import sksurvClass + # from sksurv import SksurvClass # - # return sksurv + # return SksurvClass raise NotImplementedError("abstract method") def _get_sksurv_object(self): @@ -100,56 +101,17 @@ def _fit(self, X, y, C=None): sksurv_est.fit(X, y_sksurv) # write fitted params to self + # some fitted parameters are properties and may raise exceptions + # for example, AIC_ and AIC_partial_ of CoxPHFitter + # to avoid this, we use a safe getter EXCEPTED_FITTED_PARAMS = ["n_features_in", "feature_names_in"] - sksurv_fitted_params = self._get_fitted_params_default_safe(sksurv_est) + sksurv_fitted_params = _get_fitted_params_default_safe(sksurv_est) for k, v in sksurv_fitted_params.items(): if k not in EXCEPTED_FITTED_PARAMS: setattr(self, f"{k}_", v) return self - def _get_fitted_params_default_safe(self, obj=None): - """Obtain fitted params of object, per sklearn convention. - - Same as _get_fitted_params_default, but with exception handling. - - This is since in sksurv, feature_importances_ is a property - and may raise an exception if the estimator does not have it. - - Parameters - ---------- - obj : any object, optional, default=self - - Returns - ------- - fitted_params : dict with str keys - fitted parameters, keyed by names of fitted parameter - """ - obj = obj if obj else self - - # default retrieves all self attributes ending in "_" - # and returns them with keys that have the "_" removed - # - # get all attributes ending in "_", exclude any that start with "_" (private) - fitted_params = [ - attr for attr in dir(obj) if attr.endswith("_") and not attr.startswith("_") - ] - - def hasattr_safe(obj, attr): - try: - if hasattr(obj, attr): - getattr(obj, attr) - return True - except Exception: - return False - - # remove the "_" at the end - fitted_param_dict = { - p[:-1]: getattr(obj, p) for p in fitted_params if hasattr_safe(obj, p) - } - - return fitted_param_dict - def _predict_proba(self, X): """Predict_proba method adapter. @@ -160,8 +122,7 @@ def _predict_proba(self, X): Returns ------- - np.ndarray (1d array of shape (n_instances,)) - Index of the cluster each time series in X belongs to. + skpro Empirical distribution """ sksurv_est = getattr(self, self._estimator_attr) diff --git a/skpro/survival/additive/__init__.py b/skpro/survival/additive/__init__.py new file mode 100644 index 000000000..bc5f7f526 --- /dev/null +++ b/skpro/survival/additive/__init__.py @@ -0,0 +1,5 @@ +"""Generalized additive survival models.""" + +__all__ = ["AalenAdditive"] + +from skpro.survival.additive._aalen_lifelines import AalenAdditive diff --git a/skpro/survival/additive/_aalen_lifelines.py b/skpro/survival/additive/_aalen_lifelines.py new file mode 100644 index 000000000..210d17126 --- /dev/null +++ b/skpro/survival/additive/_aalen_lifelines.py @@ -0,0 +1,103 @@ +"""Interface adapter to lifelines Aalen additive surival model.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] + +from skpro.survival.adapters.lifelines import _LifelinesAdapter +from skpro.survival.base import BaseSurvReg + + +class AalenAdditive(_LifelinesAdapter, BaseSurvReg): + r"""Aalen additive hazards model, from lifelines. + + Direct interface to ``lifelines.fitters.AalenAdditiveFitter``, + by ``CamDavidsonPilon``. + + This class fits the regression model: + + .. math:: h(t|x) = b_0(t) + b_1(t) x_1 + ... + b_N(t) x_N + + that is, the hazard rate is a linear function of the covariates + with time-varying coefficients. + This implementation assumes non-time-varying covariates. + + Parameters + ---------- + fit_intercept: bool, optional (default: True) + If False, do not attach an intercept (column of ones) to the covariate matrix. + The intercept, :math:`b_0(t)` acts as a baseline hazard. + alpha: float, optional (default=0.05) + the level in the confidence intervals around the estimated survival function, + for computation of ``confidence_intervals_`` fitted parameter. + coef_penalizer: float, optional (default: 0) + Attach a L2 penalizer to the size of the coefficients during regression. + This improves + stability of the estimates and controls for high correlation between covariates. + For example, this shrinks the magnitude of :math:`c_{i,t}`. + smoothing_penalizer: float, optional (default: 0) + Attach a L2 penalizer to difference between adjacent (over time) coefficients. + For example, this shrinks the magnitude of :math:`c_{i,t} - c_{i,t+1}`. + + Attributes + ---------- + cumulative_hazards_ : DataFrame + The estimated cumulative hazard + hazards_ : DataFrame + The estimated hazards + confidence_intervals_ : DataFrame + The lower and upper confidence intervals for the cumulative hazard + durations: array + The durations provided + """ + + _tags = {"authors": ["CamDavidsonPilon", "rocreguant", "fkiraly"]} + # CamDavidsonPilon, rocreguant credit for interfaced estimator + + def __init__( + self, + fit_intercept=True, + alpha=0.05, + coef_penalizer=0.0, + smoothing_penalizer=0.0, + ): + self.fit_intercept = fit_intercept + self.alpha = alpha + self.coef_penalizer = coef_penalizer + self.smoothing_penalizer = smoothing_penalizer + + super().__init__() + + def _get_lifelines_class(self): + """Getter of the lifelines class to be used for the adapter.""" + from lifelines.fitters.aalen_additive_fitter import AalenAdditiveFitter + + return AalenAdditiveFitter + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + `create_test_instance` uses the first (or only) dictionary in `params` + """ + params1 = {} + + params2 = { + "fit_intercept": False, + "alpha": 0.1, + "coef_penalizer": 0.1, + "smoothing_penalizer": 0.1, + } + + return [params1, params2] diff --git a/skpro/survival/aft/__init__.py b/skpro/survival/aft/__init__.py new file mode 100644 index 000000000..3b591ab91 --- /dev/null +++ b/skpro/survival/aft/__init__.py @@ -0,0 +1,6 @@ +"""Module containing accelerated failure time models.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__all__ = ["AFTWeibull"] + +from skpro.survival.aft._aft_lifelines_weibull import AFTWeibull diff --git a/skpro/survival/aft/_aft_lifelines_weibull.py b/skpro/survival/aft/_aft_lifelines_weibull.py new file mode 100644 index 000000000..1b9b5f976 --- /dev/null +++ b/skpro/survival/aft/_aft_lifelines_weibull.py @@ -0,0 +1,219 @@ +"""Interface adapter to lifelines Weibull AFT model.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] + +import numpy as np + +from skpro.distributions.weibull import Weibull +from skpro.survival.adapters.lifelines import _LifelinesAdapter +from skpro.survival.base import BaseSurvReg + + +class AFTWeibull(_LifelinesAdapter, BaseSurvReg): + r"""Weibull AFT model, from lifelines. + + Direct interface to ``lifelines.fitters.WeibullAFTFitter``, + by ``CamDavidsonPilon``. + + This class implements a Weibull AFT model. The model has parametric form, with + :math:`\lambda(x) = \exp\left(\beta_0 + \beta_1x_1 + ... + \beta_n x_n \right)`, + and optionally, + :math:`\rho(y) = \exp\left(\alpha_0 + \alpha_1 y_1 + ... + \alpha_m y_m \right)`, + + with predictive distribution being Weibull, with + scale parameter :math:`\lambda(x)` and shape parameter (exponent) :math:`\rho(y)`. + + The :math:`\lambda` (scale) parameter is a decay or half-like like parameter, + more specifically, the time by which the survival probability is 37%. + The :math:`\rho` (shape) parameter controls curvature of the the cumulative hazard, + e.g., whether it is convex or concave, representing accelerating or decelerating + hazards. + + The cumulative hazard rate is + + .. math:: H(t; x, y) = \left(\frac{t}{\lambda(x)} \right)^{\rho(y)}, + + Parameters + ---------- + scale_cols: pd.Index or coercible, optional, default=None + Columns of the input data frame to be used as covariates for + the scale parameter :math:`\lambda`. + If None, all columns are used. + + shape_cols: string "all", pd.Index or coercible, optional, default=None + Columns of the input data frame to be used as covariates for + the shape parameter :math:`\rho`. + If None, no covariates are used, the shape parameter is estimated as a constant. + If "all", all columns are used. + + fit_intercept: boolean, optional (default=True) + Whether to fit an intercept term in the model. + + alpha: float, optional (default=0.05) + the level in the confidence intervals around the estimated survival function, + for computation of ``confidence_intervals_`` fitted parameter. + + penalizer: float or array, optional (default=0.0) + the penalizer coefficient to the size of the coefficients. + See ``l1_ratio``. Must be equal to or greater than 0. + Alternatively, penalizer is an array equal in size to the number of parameters, + with penalty coefficients for specific variables. For + example, ``penalizer=0.01 * np.ones(p)`` is the same as ``penalizer=0.01`` + + l1_ratio: float, optional (default=0.0) + how much of the penalizer should be attributed to an l1 penalty + (otherwise an l2 penalty). The penalty function looks like + ``penalizer * l1_ratio * ||w||_1 + 0.5 * penalizer * (1 - l1_ratio) * ||w||^2_2`` # noqa E501 + + Attributes + ---------- + params_ : DataFrame + The estimated coefficients + confidence_intervals_ : DataFrame + The lower and upper confidence intervals for the coefficients + durations: Series + The event_observed variable provided + event_observed: Series + The event_observed variable provided + weights: Series + The event_observed variable provided + variance_matrix_ : DataFrame + The variance matrix of the coefficients + standard_errors_: Series + the standard errors of the estimates + score_: float + the concordance index of the model. + """ + + _tags = {"authors": ["CamDavidsonPilon", "fkiraly"]} + # CamDavidsonPilon, credit for interfaced estimator + + def __init__( + self, + scale_cols=None, + shape_cols=None, + fit_intercept: bool = True, + alpha: float = 0.05, + penalizer: float = 0.0, + l1_ratio: float = 0.0, + ): + self.scale_cols = scale_cols + self.shape_cols = shape_cols + self.alpha = alpha + self.penalizer = penalizer + self.l1_ratio = l1_ratio + self.fit_intercept = fit_intercept + + super().__init__() + + if scale_cols is not None: + self.X_col_subset = scale_cols + + def _get_lifelines_class(self): + """Getter of the lifelines class to be used for the adapter.""" + from lifelines.fitters.weibull_aft_fitter import WeibullAFTFitter + + return WeibullAFTFitter + + def _get_lifelines_object(self): + """Abstract method to initialize lifelines object. + + The default initializes result of _get_lifelines_class + with self.get_params. + """ + cls = self._get_lifelines_class() + params = self.get_params() + params.pop("scale_cols", None) + params.pop("shape_cols", None) + return cls(**params) + + def _add_extra_fit_args(self, X, y, C=None): + """Get extra arguments for the fit method. + + Parameters + ---------- + X : pd.DataFrame + Training features + y: pd.DataFrame + Training labels + C: pd.DataFrame, optional (default=None) + Censoring information for survival analysis. + fit_args: dict, optional (default=None) + Existing arguments for the fit method, from the adapter. + + Returns + ------- + dict + Extra arguments for the fit method. + """ + if self.scale_cols is not None: + if self.scale_cols == "all": + return {"ancillary": True} + else: + return {"ancillary": X[self.scale_cols]} + else: + return {} + + def _predict_proba(self, X): + """Predict_proba method adapter. + + Parameters + ---------- + X : pd.DataFrame + Features to predict on. + + Returns + ------- + skpro Empirical distribution + """ + if self.shape_cols == "all": + ancillary = X + elif self.shape_cols is not None: + ancillary = X[self.shape_cols] + else: + ancillary = None + + if self.scale_cols is not None: + df = X[self.scale_cols] + else: + df = X + + lifelines_est = getattr(self, self._estimator_attr) + ll_pred_proba = lifelines_est._prep_inputs_for_prediction_and_return_scores + + scale, k = ll_pred_proba(df, ancillary) + scale = np.expand_dims(scale, axis=1) + k = np.expand_dims(k, axis=1) + + dist = Weibull(scale=scale, k=k, index=X.index, columns=self._y_cols) + return dist + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + `create_test_instance` uses the first (or only) dictionary in `params` + """ + params1 = {} + + params2 = { + "shape_cols": "all", + "fit_intercept": False, + "alpha": 0.1, + "penalizer": 0.1, + "l1_ratio": 0.1, + } + return [params1, params2] diff --git a/skpro/survival/coxph/__init__.py b/skpro/survival/coxph/__init__.py index 45d88ae44..d0a6dfbdf 100644 --- a/skpro/survival/coxph/__init__.py +++ b/skpro/survival/coxph/__init__.py @@ -1,7 +1,8 @@ """Cox proportional hazards models.""" from skpro.survival.coxph._coxnet_sksurv import CoxNet +from skpro.survival.coxph._coxph_lifelines import CoxPHlifelines from skpro.survival.coxph._coxph_sksurv import CoxPHSkSurv from skpro.survival.coxph._coxph_statsmodels import CoxPH -__all__ = ["CoxNet", "CoxPH", "CoxPHSkSurv"] +__all__ = ["CoxNet", "CoxPH", "CoxPHlifelines", "CoxPHSkSurv"] diff --git a/skpro/survival/coxph/_coxph_lifelines.py b/skpro/survival/coxph/_coxph_lifelines.py new file mode 100644 index 000000000..3f4ed158d --- /dev/null +++ b/skpro/survival/coxph/_coxph_lifelines.py @@ -0,0 +1,214 @@ +"""Interface adapter to lifelines Cox PH surival model.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["fkiraly"] + +from skpro.survival.adapters.lifelines import _LifelinesAdapter +from skpro.survival.base import BaseSurvReg + + +class CoxPHlifelines(_LifelinesAdapter, BaseSurvReg): + r"""Cox proportional hazards models, from lifelines. + + Direct interface to ``lifelines.fitters.CoxPHFitter``, + by ``CamDavidsonPilon``. + + This class implements Cox proportional hazard model, + + .. math:: h(t|x) = h_0(t) \exp((x - \overline{x})' \beta) + + with different options to fit the baseline hazard, :math:`h_0(t)`. + + The class offers multiple options via the ``baseline_estimation_method`` parameter: + + ``"breslow"`` (default): non-parametric estimate via Breslow's method. + In this case, the entire model is the traditional semi-parametric Cox model. + Ties are handled using Efron's method. + + ``"spline"``: parametric spline fit of baseline hazard, + via Royston-Parmar's method [1]_. The parametric form is + + .. math:: H_0(t) = \exp{\left( \phi_0 + \phi_1\log{t} + \sum_{j=2}^N \phi_j v_j(\log{t})\right)} # noqa E501 + + where :math:`v_j` are our cubic basis functions at predetermined knots, + and :math:`H_0` is the cumulative baseline hazard. See [1]_ for exact definition. + + ``"piecewise"``: non-parametric, piecewise constant empirical baseline hazard. + The explicit form of the baseline hazard is + + .. math:: h_0(t) = \begin{cases} + exp{\beta \cdot \text{center}(x)} & \text{if $t \le \tau_0$} \\ + exp{\beta \cdot \text{center}(x)} \cdot lambda_1 & \text{if $\tau_0 < t \le \tau_1$} \\ # noqa E501 + exp{\beta \cdot \text{center}(x)} \cdot lambda_2 & \text{if $\tau_1 < t \le \tau_2$} \\ # noqa E501 + ... + \end{cases} + + Parameters + ---------- + alpha: float, optional (default=0.05) + the level in the confidence intervals around the estimated survival function, + for computation of ``confidence_intervals_`` fitted parameter. + + baseline_estimation_method: string, default="breslow", + one of: ``"breslow"``, ``"spline"``, or ``"piecewise"``. + Specifies algorithm for estimation of baseline hazard, see above. + If ``"piecewise"``, the ``breakpoints`` parameter must be set. + + penalizer: float or array, optional (default=0.0) + Penalty to the size of the coefficients during regression. + This improves stability of the estimates and controls for high correlation + between covariates. + For example, this shrinks the magnitude value of :math:`\beta_i`. + See ``l1_ratio`` below. + The penalty term is :math:`\text{penalizer} \left( \frac{1-\text{l1_ratio}}{2} ||\beta||_2^2 + \text{l1_ratio}||\beta||_1\right)`. # noqa E501 + + If an array, must be equal in size to the number of parameters, + with penalty coefficients for specific variables. For + example, ``penalizer=0.01 * np.ones(p)`` is the same as ``penalizer=0.01``. + + l1_ratio: float, optional (default=0.0) + Specify what ratio to assign to a L1 vs L2 penalty. + Same as in scikit-learn. See ``penalizer`` above. + + strata: list, optional + specify a list of columns to use in stratification. This is useful if a + categorical covariate does not obey the proportional hazard assumption. This + is used similar to the ``strata`` expression in R. + See http://courses.washington.edu/b515/l17.pdf. + + n_baseline_knots: int, optional, default=4 + Used only when ``baseline_estimation_method="spline"``. + Set the number of knots (interior & exterior) in the baseline hazard, + which will be placed evenly along the time axis. + Should be at least 2. + Royston et. al, the authors of this model, suggest 4 to start, + but any values between 2 and 8 are reasonable. + If you need to customize the timestamps used to calculate the curve, + use the ``knots`` parameter instead. + + knots: list, optional + Used only when ``baseline_estimation_method="spline"``. + Specifies custom points in the time axis for the baseline hazard curve. + To use evenly-spaced points in time, the ``n_baseline_knots`` + parameter can be employed instead. + + breakpoints: list, optional + Used only when ``baseline_estimation_method="piecewise"``, + must be passed in this case. + Set the positions of the baseline hazard breakpoints. + + Attributes + ---------- + params_ : Series + The estimated coefficients. + hazard_ratios_ : Series + The exp(coefficients) + confidence_intervals_ : DataFrame + The lower and upper confidence intervals for the hazard coefficients + durations: Series + The durations provided + event_observed: Series + The event_observed variable provided + weights: Series + The event_observed variable provided + variance_matrix_ : DataFrame + The variance matrix of the coefficients + strata: list + the strata provided + standard_errors_: Series + the standard errors of the estimates + log_likelihood_: float + the log-likelihood at the fitted coefficients + AIC_: float + the AIC at the fitted coefficients (if using splines for baseline hazard) + partial_AIC_: float + the AIC at the fitted coefficients + (if using non-parametric inference for baseline hazard) + baseline_hazard_: DataFrame + the baseline hazard evaluated at the observed times. + Estimated using Breslow's method. + baseline_cumulative_hazard_: DataFrame + the baseline cumulative hazard evaluated at the observed times. + Estimated using Breslow's method. + baseline_survival_: DataFrame + the baseline survival evaluated at the observed times. + Estimated using Breslow's method. + summary: Dataframe + a Dataframe of the coefficients, p-values, CIs, etc. + + References + ---------- + .. [1] Royston, P., Parmar, M. K. B. (2002). + Flexible parametric proportional-hazards and proportional-odds + models for censored survival data, with application to prognostic + modelling and estimation of treatment effects. + Statistics in Medicine, 21(15), 2175–2197. doi:10.1002/sim.1203 + """ + + _tags = {"authors": ["CamDavidsonPilon", "JoseLlanes", "mathurinm", "fkiraly"]} + # CamDavidsonPilon, JoseLlanes, mathurinm credit for interfaced estimator + + def __init__( + self, + baseline_estimation_method: str = "breslow", + penalizer=0.0, + strata=None, + l1_ratio=0.0, + n_baseline_knots=None, + knots=None, + breakpoints=None, + ): + self.baseline_estimation_method = baseline_estimation_method + self.penalizer = penalizer + self.strata = strata + self.l1_ratio = l1_ratio + self.n_baseline_knots = n_baseline_knots + self.knots = knots + self.breakpoints = breakpoints + + super().__init__() + + def _get_lifelines_class(self): + """Getter of the lifelines class to be used for the adapter.""" + from lifelines.fitters.coxph_fitter import CoxPHFitter + + return CoxPHFitter + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + `create_test_instance` uses the first (or only) dictionary in `params` + """ + params1 = {} + + params2 = { + "baseline_estimation_method": "spline", + "penalizer": 0.1, + "l1_ratio": 0.1, + "n_baseline_knots": 3, + } + + # breakpoints are specific to data ranges, + # but tests loop over various data sets, so this would break + # + # params3 = { + # "baseline_estimation_method": "piecewise", + # "penalizer": 0.15, + # "l1_ratio": 0.05, + # "breakpoints": [10, 20, 30, 100], + # } + + return [params1, params2]