diff --git a/README.rst b/README.rst index 4810fdb2..25e43570 100644 --- a/README.rst +++ b/README.rst @@ -56,6 +56,10 @@ Available Estimators +------------------------------+------------------+-------------------------+-----------------------+-----------------------+ | `CaseOutstanding`_ | | | | | +------------------------------+------------------+-------------------------+-----------------------+-----------------------+ +| `TweedieGLM`_ | | | | | ++------------------------------+------------------+-------------------------+-----------------------+-----------------------+ +| `DevelopmentML`_ | | | | | ++------------------------------+------------------+-------------------------+-----------------------+-----------------------+ Documentation ------------- @@ -85,6 +89,8 @@ code documentation. .. _VotingChainladder: https://chainladder-python.readthedocs.io/en/latest/modules/workflow.html#votingchainladder .. _Trend: https://chainladder-python.readthedocs.io/en/latest/modules/adjustments.html#trend .. _CaseOutstanding: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#caseoutstanding +.. _TweedieGLM: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#tweedieglm +.. _DevelopmentML: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#developmentml .. _Documentation: https://chainladder-python.readthedocs.io/en/latest/ Getting Started Tutorials diff --git a/chainladder/development/clark.py b/chainladder/development/clark.py index 15d84224..3aa85b24 100644 --- a/chainladder/development/clark.py +++ b/chainladder/development/clark.py @@ -195,8 +195,6 @@ def solver(x): obj._set_slicers() self.ldf_ = obj self.ldf_.valuation_date = pd.to_datetime(ULT_VAL) - self.sigma_ = self.ldf_ * 0 + 1 - self.std_err_ = self.ldf_ * 0 + 1 rows = X.index.set_index(X.key_labels).index self.omega_ = pd.DataFrame(params[..., 0, 0], index=rows, columns=X.vdims) self.theta_ = pd.DataFrame(params[..., 0, 1], index=rows, columns=X.vdims) @@ -237,8 +235,6 @@ def transform(self, X): X_new = X.copy() triangles = [ "ldf_", - "sigma_", - "std_err_", "omega_", "theta_", "incremental_fits_", diff --git a/chainladder/development/constant.py b/chainladder/development/constant.py index 32c90d2b..f34b0fca 100644 --- a/chainladder/development/constant.py +++ b/chainladder/development/constant.py @@ -79,8 +79,6 @@ def fit(self, X, y=None, sample_weight=None): self.ldf_.is_pattern = True self.ldf_.is_cumulative = False self.ldf_.valuation_date = pd.to_datetime(ULT_VAL) - self.sigma_ = self.ldf_ * 0 + 1 - self.std_err_ = self.ldf_ * 0 + 1 return self def transform(self, X): @@ -97,7 +95,7 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - triangles = ["ldf_", "sigma_", "std_err_"] + triangles = ["ldf_"] for item in triangles: setattr(X_new, item, getattr(self, item)) X_new._set_slicers() diff --git a/chainladder/development/glm.py b/chainladder/development/glm.py index 605b4298..8b53ed7f 100644 --- a/chainladder/development/glm.py +++ b/chainladder/development/glm.py @@ -1,45 +1,35 @@ # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. - import pandas as pd import numpy as np -from patsy import dmatrix -from sklearn.base import BaseEstimator, TransformerMixin from chainladder.development.base import DevelopmentBase from chainladder.development.learning import DevelopmentML from sklearn.linear_model import TweedieRegressor from sklearn.pipeline import Pipeline - - -class PatsyFormula(BaseEstimator, TransformerMixin): - """ A sklearn-style wrapper for patsy formulas """ - def __init__(self, formula=None): - self.formula = formula - - def fit(self, X, y=None, sample_weight=None): - self.design_info_ = dmatrix(self.formula, X).design_info - return self - - def transform(self, X): - return dmatrix(self.design_info_, X) +from chainladder.utils.utility_functions import PatsyFormula class TweedieGLM(DevelopmentBase): """ This estimator creates development patterns with a GLM using a Tweedie distribution. - The Tweedie family includes several of the more popular distributions including the normal, ODP poisson, and gamma distributions. This class is a special case of `DevleopmentML`. It restricts to just GLM using a TweedieRegressor and provides an R-like formulation of the design matrix. + .. versionadded:: 0.8.1 + Parameters ----------- design_matrix : formula-like A patsy formula describing the independent variables, X of the GLM - response : str, default None - Name of the response column. + response : str + Column name for the reponse variable of the GLM. If ommitted, then the + first column of the Triangle will be used. + weight : str + Column name of any weight to use in the GLM. If none specified, then an + unweighted regression will be performed. power : float, default=0 The power determines the underlying target distribution according to the following table: @@ -84,9 +74,10 @@ class TweedieGLM(DevelopmentBase): """ def __init__(self, design_matrix='C(development) + C(origin)', - response=None, power=1.0, alpha=1.0, link='log', + response=None, weight=None, power=1.0, alpha=1.0, link='log', max_iter=100, tol=0.0001, warm_start=False, verbose=0): self.response=response + self.weight=weight self.design_matrix = design_matrix self.power=power self.alpha=alpha @@ -104,7 +95,7 @@ def fit(self, X, y=None, sample_weight=None): link=self.link, power=self.power, max_iter=self.max_iter, tol=self.tol, warm_start=self.warm_start, verbose=self.verbose, fit_intercept=False))]), - y_ml=response).fit(X) + y_ml=response, weight_ml=self.weight).fit(X) return self @property @@ -119,7 +110,8 @@ def triangle_glm_(self): def coef_(self): return pd.Series( self.model.estimator_ml.named_steps.model.coef_, name='coef_', - index=list(self.model.estimator_ml.named_steps.design_matrix.design_info_.column_name_indexes.keys()) + index=list(self.model.estimator_ml.named_steps.design_matrix. + design_info_.column_name_indexes.keys()) ).to_frame() def transform(self, X): diff --git a/chainladder/development/incremental.py b/chainladder/development/incremental.py index e2ec815a..3cabebfe 100644 --- a/chainladder/development/incremental.py +++ b/chainladder/development/incremental.py @@ -91,6 +91,8 @@ def fit(self, X, y=None, sample_weight=None): X = X.copy() if sample_weight.array_backend == "sparse": sample_weight = sample_weight.set_backend("numpy") + else: + sample_weight = sample_weight.copy() xp = X.get_array_module() sample_weight.is_cumulative = False obj = X.cum_to_incr() / sample_weight.values @@ -141,7 +143,6 @@ def fit(self, X, y=None, sample_weight=None): 1/(1+future_trend)-1, axis='valuation', start=X.valuation_date, end=self.incremental_.valuation_date) self.ldf_ = obj.incr_to_cum().link_ratio - self.sigma_ = self.std_err_ = 0 * self.ldf_ return self def transform(self, X): @@ -158,6 +159,6 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - for item in ["incremental_", "ldf_", "sigma_", "std_err_"]: + for item in ["ldf_"]: X_new.__dict__[item] = self.__dict__[item] return X_new diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 0b5edba5..0fa9eaca 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -8,14 +8,19 @@ from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures from sklearn.compose import ColumnTransformer from chainladder.development.base import DevelopmentBase +from chainladder import ULT_VAL class DevelopmentML(DevelopmentBase): """ A Estimator that interfaces with machine learning (ML) tools that implement the scikit-learn API. + The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from + the data. + .. versionadded:: 0.8.1 + Parameters ---------- estimator_ml : skearn Estimator @@ -23,10 +28,11 @@ class DevelopmentML(DevelopmentBase): y_ml : list or str or sklearn_transformer The response column(s) for the machine learning algorithm. It must be present within the Triangle. - y_features : + autoregressive : tuple, (autoregressive_col_name, lag, source_col_name) The subset of response column(s) to use as lagged features for the Time Series aspects of the model. Predictions from one development period - get used as featues in the next development period. + get used as featues in the next development period. Lags should be negative + integers. fit_incrementals : Whether the response variable should be converted to an incremental basis for fitting. @@ -34,16 +40,19 @@ class DevelopmentML(DevelopmentBase): Attributes ---------- + estimator_ml : Estimator + An sklearn-style estimator to predict development patterns ldf_ : Triangle The estimated loss development patterns. cdf_ : Triangle The estimated cumulative development patterns. """ - def __init__(self, estimator_ml=None, - y_ml=None, y_features=False, fit_incrementals=True): + def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, + weight_ml=None, fit_incrementals=True): self.estimator_ml=estimator_ml self.y_ml=y_ml - self.y_features=y_features + self.weight_ml = weight_ml + self.autoregressive=autoregressive self.fit_incrementals=fit_incrementals def _get_y_names(self): @@ -77,14 +86,16 @@ def y_ml_(self): else: return transformer - def _get_triangle_ml(self, df): + def _get_triangle_ml(self, df, preds=None): """ Create fitted Triangle """ from chainladder.core import Triangle - preds = self.estimator_ml.predict(df) + if preds is None: + preds = self.estimator_ml.predict(df) X_r = [df] y_r = [preds] dgrain = {'Y':12, 'Q':3, 'M': 1}[self.development_grain_] - latest_filter = df['origin']+(df['development']-dgrain)/dgrain + ograin = {'Y':1, 'Q':4, 'M': 12}[self.origin_grain_] + latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain latest_filter = latest_filter == latest_filter.max() preds=pd.DataFrame(preds.copy())[latest_filter].values out = df.loc[latest_filter].copy() @@ -93,10 +104,12 @@ def _get_triangle_ml(self, df): out['development'] = out['development'] + dgrain if len(preds.shape) == 1: preds = preds[:, None] - if self.y_features: - for num, col in enumerate(self.y_features): + if self.autoregressive: + for num, col in enumerate(self.autoregressive): out[col[0]]=preds[:, num] out = out[out['development']<=dev_lags.max()] + if len(out) == 0: + continue X_r.append(out.copy()) preds = self.estimator_ml.predict(out) y_r.append(preds.copy()) @@ -108,8 +121,26 @@ def _get_triangle_ml(self, df): out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()}) out = out.merge(self.valuation_vector_, how='left', on=['origin', 'development']) return Triangle( - out, origin='origin', development='valuation', index=self._key_labels, columns=self._get_y_names()).dropna() + out, origin='origin', development='valuation', + index=self._key_labels, columns=self._get_y_names(), + cumulative=not self.fit_incrementals).dropna() + def _prep_X_ml(self, X): + """ Preps Triangle data ahead of the pipeline """ + if self.fit_incrementals: + X_ = X.cum_to_incr() + else: + X_ = X.copy() + if self.autoregressive: + for i in self.autoregressive: + lag = X[i[2]].shift(i[1]) + X_[i[0]] = lag[lag.valuation<=X.valuation_date] + df_base = X.incr_to_cum().to_frame(keepdims=True).reset_index().iloc[:, :-1] + df = df_base.merge( + X.cum_to_incr().to_frame(keepdims=True).reset_index(), how='left', + on=list(df_base.columns)).fillna(0) + df['origin'] = df['origin'].map(self.origin_encoder_) + return df def fit(self, X, y=None, sample_weight=None): """Fit the model with X. @@ -129,10 +160,6 @@ def fit(self, X, y=None, sample_weight=None): Returns the instance itself. """ - if self.fit_incrementals: - X_ = X.cum_to_incr() - else: - X_ = X.copy() self._columns = list(X.columns) self._key_labels = X.key_labels self.origin_grain_ = X.origin_grain @@ -144,24 +171,16 @@ def fit(self, X, y=None, sample_weight=None): X.valuation.values.reshape(X.shape[-2:], order='F'), index=X.odims, columns=X.ddims).unstack().reset_index() self.valuation_vector_.columns=['development', 'origin', 'valuation'] - # response as a feature - if self.y_features: - for i in self.y_features: - lag = X[i[2]].shift(i[1]) - X_[i[0]] = lag[lag.valuation<=X.valuation_date] - - df = X_.to_frame(keepdims=True).reset_index().fillna(0) - df['origin'] = df['origin'].map(self.origin_encoder_) - self.df_ = df # Unncecessary, used for debugging - + df = self._prep_X_ml(X) + self.df_ = df # Fit model self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) + #return self self.triangle_ml_ = self._get_triangle_ml(df) return self @property def ldf_(self): - from chainladder import ULT_VAL ldf = self.triangle_ml_.incr_to_cum().link_ratio ldf.valuation_date = pd.to_datetime(ULT_VAL) return ldf @@ -179,13 +198,11 @@ def transform(self, X): ------- X_new : New triangle with transformed attributes. """ - X_new = X.copy() - triangles = [ - "ldf_", - ] - for item in triangles: - setattr(X_new, item, getattr(self, item)) - X_new.sigma_ = X_new.std_err_ = X_new.ldf_ * 0 + 1 + X_ml = self._prep_X_ml(X) + y_ml=self.estimator_ml.predict(X_ml) + triangle_ml = self._get_triangle_ml(X_ml, y_ml) + X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio + X_new.ldf_.valuation_date = pd.to_datetime(ULT_VAL) X_new._set_slicers() return X_new diff --git a/chainladder/development/outstanding.py b/chainladder/development/outstanding.py index bd799d6b..0c4b43f2 100644 --- a/chainladder/development/outstanding.py +++ b/chainladder/development/outstanding.py @@ -105,7 +105,6 @@ def fit(self, X, y=None, sample_weight=None): dev.is_pattern=True dev.is_cumulative=True self.ldf_ = dev.cum_to_incr() - self.std_err_ = self.sigma_ = self.ldf_ * 0 + 1 return self @property @@ -142,7 +141,7 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - triangles = ["ldf_", "sigma_", "std_err_"] + triangles = ["ldf_"] for item in triangles: setattr(X_new, item, getattr(self, item)) X_new._set_slicers() diff --git a/chainladder/development/tests/test_incremental.py b/chainladder/development/tests/test_incremental.py index c010e7b2..5005ecdc 100644 --- a/chainladder/development/tests/test_incremental.py +++ b/chainladder/development/tests/test_incremental.py @@ -9,7 +9,7 @@ def test_schmidt(): answer = ia.fit_transform( tri.iloc[0, 0], sample_weight=tri.iloc[0, 1].latest_diagonal ) - answer = answer.incremental_.incr_to_cum().values[0, 0, :, -1] + answer = ia.incremental_.incr_to_cum().values[0, 0, :, -1] check = xp.array( [ 3483.0, diff --git a/chainladder/methods/mack.py b/chainladder/methods/mack.py index 8dcaacbb..09ea7ab7 100644 --- a/chainladder/methods/mack.py +++ b/chainladder/methods/mack.py @@ -57,7 +57,7 @@ def fit(self, X, y=None, sample_weight=None): Returns the instance itself. """ super().fit(X, y, sample_weight) - if not ("average_" in self.X_ and "w_" in self.X_): + if "sigma_" not in self.X_: raise ValueError("Triangle not compatible with MackChainladder") # Caching full_triangle_ for fit as it is called a lot self.X_._full_triangle_ = self.full_triangle_ diff --git a/chainladder/tails/base.py b/chainladder/tails/base.py index 578ac591..1cd6ca99 100644 --- a/chainladder/tails/base.py +++ b/chainladder/tails/base.py @@ -36,24 +36,19 @@ def fit(self, X, y=None, sample_weight=None): self.ldf_.values = xp.concatenate((self.ldf_.values, tail), -1) self.ldf_.ddims = ddims if hasattr(obj, "sigma_"): + zeros = tail[..., 0:obj.ldf_.shape[2], -1:] * 0 self.sigma_ = getattr(obj, "sigma_").copy() - else: - self.sigma_ = obj.ldf_ - obj.ldf_ - if hasattr(obj, "std_err_"): + self.sigma_.values = xp.concatenate((self.sigma_.values, zeros), -1) self.std_err_ = getattr(obj, "std_err_").copy() - else: - self.std_err_ = obj.ldf_ - obj.ldf_ - zeros = tail[..., 0:obj.ldf_.shape[2], -1:] * 0 - self.sigma_.values = xp.concatenate((self.sigma_.values, zeros), -1) - self.std_err_.values = xp.concatenate((self.std_err_.values, zeros), -1) - self.sigma_.ddims = self.std_err_.ddims = self.ldf_.ddims + self.std_err_.values = xp.concatenate((self.std_err_.values, zeros), -1) + self.sigma_.ddims = self.std_err_.ddims = self.ldf_.ddims + self.sigma_._set_slicers() + self.std_err_._set_slicers() if hasattr(obj, "average_"): self.average_ = obj.average_ else: self.average_ = None self.ldf_._set_slicers() - self.sigma_._set_slicers() - self.std_err_._set_slicers() return self def transform(self, X): @@ -70,7 +65,7 @@ def transform(self, X): X_new : New triangle with transformed attributes. """ X_new = X.copy() - triangles = ["std_err_", "ldf_", "sigma_"] + triangles = ["ldf_", "std_err_", "sigma_"] for item in triangles + ["tail_", "_ave_period", "average_"]: setattr(X_new, item, getattr(self, item)) X_new._set_slicers() @@ -120,16 +115,25 @@ def _get_tail_stats(self, X): log-linear extrapolation applied to tail average period """ from chainladder.utils.utility_functions import num_to_nan + if not hasattr(X, 'sigma_'): + self.sigma_ = None + self.std_err_ = None + else: + time_pd = self._get_tail_weighted_time_period(X) + xp = X.sigma_.get_array_module() + reg = WeightedRegression(axis=3, xp=xp).fit(None, xp.log(X.sigma_.values), None) + sigma_ = xp.exp(time_pd * reg.slope_ + reg.intercept_) + y = X.std_err_.values + y = num_to_nan(y) + reg = WeightedRegression(axis=3, xp=xp).fit(None, xp.log(y), None) + std_err_ = xp.exp(time_pd * reg.slope_ + reg.intercept_) - time_pd = self._get_tail_weighted_time_period(X) - xp = X.sigma_.get_array_module() - reg = WeightedRegression(axis=3, xp=xp).fit(None, xp.log(X.sigma_.values), None) - sigma_ = xp.exp(time_pd * reg.slope_ + reg.intercept_) - y = X.std_err_.values - y = num_to_nan(y) - reg = WeightedRegression(axis=3, xp=xp).fit(None, xp.log(y), None) - std_err_ = xp.exp(time_pd * reg.slope_ + reg.intercept_) - return sigma_, std_err_ + self.sigma_.values = xp.concatenate( + (self.sigma_.values[..., :-1], sigma_[..., -1:]), axis=-1 + ) + self.std_err_.values = xp.concatenate( + (self.std_err_.values[..., :-1], std_err_[..., -1:]), axis=-1 + ) def _get_tail_weighted_time_period(self, X): """ Method to approximate the weighted-average development age of tail @@ -145,6 +149,7 @@ def _get_tail_weighted_time_period(self, X): self.ldf_.values[..., -self._ave_period[0] - 1 :], -1, keepdims=True ) reg = WeightedRegression(axis=3, xp=xp).fit(None, xp.log(y - 1), None) + tail = tail if tail.max() > 1 else 1.001 time_pd = (xp.log(tail - 1) - reg.intercept_) / reg.slope_ return time_pd diff --git a/chainladder/tails/bondy.py b/chainladder/tails/bondy.py index 8784c8d6..8fc5b181 100644 --- a/chainladder/tails/bondy.py +++ b/chainladder/tails/bondy.py @@ -134,13 +134,7 @@ def fit(self, X, y=None, sample_weight=None): ), axis=-1, ) - sigma, std_err = self._get_tail_stats(obj) - self.sigma_.values = xp.concatenate( - (self.sigma_.values[..., :-1], sigma[..., -1:]), axis=-1 - ) - self.std_err_.values = xp.concatenate( - (self.std_err_.values[..., :-1], std_err[..., -1:]), axis=-1 - ) + self._get_tail_stats(obj) if backend == "cupy": self = self.set_backend(backend, inplace=True, deep=True) return self diff --git a/chainladder/tails/clark.py b/chainladder/tails/clark.py index 2e1a1927..b736f72b 100644 --- a/chainladder/tails/clark.py +++ b/chainladder/tails/clark.py @@ -110,6 +110,7 @@ def fit(self, X, y=None, sample_weight=None): self.norm_resid_ = model.norm_resid_ if self.truncation_age: self.ldf_.values[..., -1:] = self.ldf_.values[..., -1:] * self.G_(self.truncation_age).values + # self._get_tail_stats(self) if backend == "cupy": self = self.set_backend("cupy", inplace=True) return self diff --git a/chainladder/tails/constant.py b/chainladder/tails/constant.py index dd1ba2a5..4c9540fe 100644 --- a/chainladder/tails/constant.py +++ b/chainladder/tails/constant.py @@ -75,12 +75,5 @@ def fit(self, X, y=None, sample_weight=None): attach_idx = len(X.ddims) - 1 self = self._apply_decay(X, tail, attach_idx) obj = Development().fit_transform(X) if "ldf_" not in X else X - if xp.max(xp.array(self.tail)) != 1.0: - sigma, std_err = self._get_tail_stats(obj) - self.sigma_.values = xp.concatenate( - (self.sigma_.values[..., :-1], sigma[..., -1:]), axis=-1 - ) - self.std_err_.values = xp.concatenate( - (self.std_err_.values[..., :-1], std_err[..., -1:]), axis=-1 - ) + self._get_tail_stats(obj) return self diff --git a/chainladder/tails/curve.py b/chainladder/tails/curve.py index 95e0485f..7f65a2b8 100644 --- a/chainladder/tails/curve.py +++ b/chainladder/tails/curve.py @@ -162,13 +162,7 @@ def fit(self, X, y=None, sample_weight=None): (self.ldf_.values[..., :attach_idx], tail[..., attach_idx:]), -1 ) obj = Development().fit_transform(X) if "ldf_" not in X else X - sigma, std_err = self._get_tail_stats(obj) - self.sigma_.values = xp.concatenate( - (self.sigma_.values[..., :-1], sigma[..., -1:]), axis=-1 - ) - self.std_err_.values = xp.concatenate( - (self.std_err_.values[..., :-1], std_err[..., -1:]), axis=-1 - ) + self._get_tail_stats(obj) return self def _get_x(self, w, y): diff --git a/chainladder/utils/__init__.py b/chainladder/utils/__init__.py index 05148b60..ca4ca4d6 100644 --- a/chainladder/utils/__init__.py +++ b/chainladder/utils/__init__.py @@ -54,6 +54,7 @@ def load_template(template, env=None, **kwargs): load_sample, minimum, maximum, + PatsyFormula, ) from chainladder.utils.cupy import cp from chainladder.utils.sparse import sp diff --git a/chainladder/utils/utility_functions.py b/chainladder/utils/utility_functions.py index bb7c2924..2dad443b 100644 --- a/chainladder/utils/utility_functions.py +++ b/chainladder/utils/utility_functions.py @@ -12,6 +12,8 @@ import os import copy from sklearn.utils import deprecated +from patsy import dmatrix +from sklearn.base import BaseEstimator, TransformerMixin def load_sample(key, *args, **kwargs): @@ -269,3 +271,39 @@ def minimum(x1, x2): def maximum(x1, x2): return x1.maximum(x2) + +class PatsyFormula(BaseEstimator, TransformerMixin): + """ A sklearn-style Transformer for patsy formulas. + + PatsyFormula allows for R-style formula preprocessing of the ``design_matrix`` + of a machine learning algorithm. It's particularly useful with the `DevelopmentML` + and `TweedieGLM` estimators. + + Parameters + ----------- + + formula : str + A string representation of the regression model X features. + + Attributes + ------------ + design_info_ : + The patsy instructions for generating the design_matrix, X. + + """ + def __init__(self, formula=None): + self.formula = formula + + def _check_X(self, X): + from chainladder.core import Triangle + if isinstance(X, Triangle): + raise AttributeError("X must be a pandas dataframe, not a Triangle") + + def fit(self, X, y=None, sample_weight=None): + self._check_X(X) + self.design_info_ = dmatrix(self.formula, X).design_info + return self + + def transform(self, X): + self._check_X(X) + return dmatrix(self.design_info_, X) diff --git a/chainladder/workflow/tests/test_workflow.py b/chainladder/workflow/tests/test_workflow.py index be9b852e..e4f003a3 100644 --- a/chainladder/workflow/tests/test_workflow.py +++ b/chainladder/workflow/tests/test_workflow.py @@ -41,6 +41,8 @@ def test_pipeline(): X = tri[['CumPaidLoss', 'CaseIncurredLoss']] sample_weight = tri['EarnedPremDIR'].latest_diagonal + X_gt = X.copy() + sample_weight_gt = sample_weight.copy() dev = [cl.Development(), cl.ClarkLDF(), cl.Trend(), cl.IncrementalAdditive(), cl.MunichAdjustment(paid_to_incurred=('CumPaidLoss', 'CaseIncurredLoss')), @@ -49,7 +51,8 @@ def test_pipeline(): ibnr = [cl.Chainladder(), cl.BornhuetterFerguson(), cl.Benktander(n_iters=2), cl.CapeCod()] for model in list(itertools.product(dev, tail, ibnr)): - print(model) + assert X == X_gt + assert sample_weight == sample_weight_gt cl.Pipeline( steps=[('dev', model[0]), ('tail', model[1]), diff --git a/docs/modules/classes.rst b/docs/modules/classes.rst index 2bbe098d..fd47f14d 100644 --- a/docs/modules/classes.rst +++ b/docs/modules/classes.rst @@ -174,3 +174,13 @@ Functions load_sample minimum maximum + +Classes +------- +.. currentmodule:: chainladder + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + PatsyFormula diff --git a/docs/modules/development.rst b/docs/modules/development.rst index 6e5b3b9e..65c6e1d7 100644 --- a/docs/modules/development.rst +++ b/docs/modules/development.rst @@ -617,14 +617,14 @@ The `TweedieGLM` implements the GLM reserving structure discussed by Taylor and A nice property of the GLM framework is that it is highly flexible in terms of including covariates that may be predictive of loss reserves while maintaining a close relationship to traditional methods. Additionally, the framework can be extended in a straightforward -way to incorporate various approaches to measuring prediction errors. - +way to incorporate various approaches to measuring prediction errors. Behind the +scenes, `TweedieGLM` is using scikit-learn's ``TweedieRegressor`` estimator. Long Format ------------- GLMs are fit to triangles in "Long Format". That is, they are converted to pandas DataFrames behind the scenes. Each axis of the `Triangle` is included in the -dataframe. The ``origin`` and ```development`` axes are in columns of the same name. +dataframe. The ``origin`` and ``development`` axes are in columns of the same name. You can inspect what your `Triangle` looks like in long format by calling `to_frame` with ``keepdims=True`` @@ -643,10 +643,12 @@ with ``keepdims=True`` 'origin', 'development', and 'valuation' are reserved keywords for the dataframe. Declaring columns with these names separately will result in error. While you can inspect the `Triangle` in long format, you will not directly convert -to long format yourself. The `TweedieGLM` does this for you. Additionally, it +to long format yourself. The `TweedieGLM` does this for you. Additionally, the `origin` of the design matrix is restated in years from the earliest origin -period. Finally, the `TweedieGLM` will automatically convert the response to an -incremental basis. +period. That is, is if the earliest origin is '1995-01-01' then it gets replaced with +0. Consequently, '1996-04-01' would be replaced with 1.25. This is done because +datetimes have limited support in scikit-learn. Finally, the `TweedieGLM` +will automatically convert the response to an incremental basis. R-style formulas ----------------- @@ -691,7 +693,8 @@ Parsimonious modeling ----------------------- Having full access to all axes of the `Triangle` along with the powerful formulation of ``patsy`` allows for substantial customization of the model fit. For example, -we can include 'LOB' interactions, and +we can include 'LOB' interactions with piecewise linear coefficients to reduce +model complexity. >>> clrd = cl.load_sample('clrd')['CumPaidLoss'].groupby('LOB').sum() >>> clrd=clrd[clrd['LOB'].isin(['ppauto', 'comauto'])] @@ -732,3 +735,88 @@ estimation. .. topic:: References .. [T2016] Taylor, G. and McGuire G., "Stochastic Loss Reserving Using Generalized Linear Models", CAS Monograph #3 + +DevelopmentML +============== +`DevelopmentML` is a general development estimator that works as an interface to +scikit-learn compliant machine learning (ML) estimators. The `TweedieGLM` is +a special case of `DevelopmentML` with the ML algorithm limited to scikit-learn's +``TweedieRegressor`` estimator. + +The Interface +-------------- +ML algorithms are designed to be fit against tabular data like a pandas DataFrame +or a 2D numpy array. A `Triangle` does not meet the definition and so ``DevelopmentML`` +is provided to incorporate ML into a broader reserving workflow. This includes: + + 1. Automatic conversion of Triangle to a dataframe for fitting + 2. Flexibility in expressing any preprocessing as part of a scikit-learn ``Pipeline`` + 3. Predictions through the terminal development age of a `Triangle` to fill in the + lower half + 4. Predictions converted to ``ldf_`` patterns so that the results of the estimator + are compliant with the rest of ``chainladder``, like tail selection and IBNR modeling. + +Features +--------- +Data from any axis of a `Triangle` is available to be used in the `DevelopmentML` +estimator. For example, we can use many of the scikit-learn components to +generate development patterns from both the time axes as well as the ``index`` of +the `Triangle`. + + >>> import chainladder as cl + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.pipeline import Pipeline + >>> from sklearn.preprocessing import OneHotEncoder + >>> from sklearn.compose import ColumnTransformer + >>> clrd = cl.load_sample('clrd').groupby('LOB').sum()['CumPaidLoss'] + >>> # Decide how to preprocess the X (ML) dataset using sklearn + >>> design_matrix = ColumnTransformer(transformers=[ + ... ('dummy', OneHotEncoder(drop='first'), ['LOB', 'development']), + ... ('passthrough', 'passthrough', ['origin']) + ... ]) + >>> # Wrap preprocessing and model in a larger sklearn Pipeline + >>> estimator_ml = Pipeline(steps=[ + ... ('design_matrix', design_matrix), + ... ('model', RandomForestRegressor()) + ... ]) + >>> # Fitting DevelopmentML fits the underlying ML model and gives access to ldf_ + >>> cl.DevelopmentML(estimator_ml=estimator_ml, y_ml='CumPaidLoss').fit(clrd).ldf_ + Triangle Summary + Valuation: 2261-12 + Grain: OYDY + Shape: (6, 1, 10, 9) + Index: [LOB] + Columns: [CumPaidLoss] + + +Autoregressive +--------------- +The time-series nature of loss development naturally lends to an urge for autoregressive +features. That is, features that are based on predictions, albeit on a lagged basis. +`DevelopmentML` includes an ``autoregressive`` parameter that can be used to +express the response as a lagged feature as well. + +.. note:: + When using ``autoregressive`` features, you must also declare it as a column + in your ``estimator_ml`` Pipeline. + +PatsyFormula +------------- +While the sklearn preprocessing API is powerful, it can be tedious work with in +some instances. In particular, modeling complex interactions is much easier to do +with Patsy. The ``chainladder`` package includes a custom sklearn estimator +to gain access to the patsy API. This is done through the `PatsyFormula` estimator. + + >>> estimator_ml = Pipeline(steps=[ + ... ('design_matrix', cl.PatsyFormula('LOB:C(origin)+LOB:C(development)+development')), + ... ('model', RandomForestRegressor()) + ... ]) + >>> cl.DevelopmentML(estimator_ml=estimator_ml, y_ml='CumPaidLoss').fit(clrd).ldf_.iloc[0, 0, 0].round(2) + 12-24 24-36 36-48 48-60 60-72 72-84 84-96 96-108 108-120 + (All) 2.64 1.42 1.19 1.09 1.04 1.02 1.01 1.01 1.01 + + + .. note:: + `PatsyFormula` is not an estimator designed to work with triangles. It is an sklearn + transformer designed to work with pandas DataFrames allowing it to work directly + in an sklearn Pipeline.