resolves #129

casact · Mar 1, 2021 · 79ea9a4 · 79ea9a4
1 parent 3dfb7d6
commit 79ea9a4
Show file tree

Hide file tree

Showing 19 changed files with 255 additions and 119 deletions.
diff --git a/README.rst b/README.rst
@@ -56,6 +56,10 @@ Available Estimators
 +------------------------------+------------------+-------------------------+-----------------------+-----------------------+
 | `CaseOutstanding`_           |                  |                         |                       |                       |
 +------------------------------+------------------+-------------------------+-----------------------+-----------------------+
+| `TweedieGLM`_                |                  |                         |                       |                       |
++------------------------------+------------------+-------------------------+-----------------------+-----------------------+
+| `DevelopmentML`_             |                  |                         |                       |                       |
++------------------------------+------------------+-------------------------+-----------------------+-----------------------+
 
 Documentation
 -------------
@@ -85,6 +89,8 @@ code documentation.
 .. _VotingChainladder: https://chainladder-python.readthedocs.io/en/latest/modules/workflow.html#votingchainladder
 .. _Trend: https://chainladder-python.readthedocs.io/en/latest/modules/adjustments.html#trend
 .. _CaseOutstanding: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#caseoutstanding
+.. _TweedieGLM: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#tweedieglm
+.. _DevelopmentML: https://chainladder-python.readthedocs.io/en/latest/modules/development.html#developmentml
 .. _Documentation: https://chainladder-python.readthedocs.io/en/latest/
 
 Getting Started Tutorials

diff --git a/chainladder/development/clark.py b/chainladder/development/clark.py
@@ -195,8 +195,6 @@ def solver(x):
         obj._set_slicers()
         self.ldf_ = obj
         self.ldf_.valuation_date = pd.to_datetime(ULT_VAL)
-        self.sigma_ = self.ldf_ * 0 + 1
-        self.std_err_ = self.ldf_ * 0 + 1
         rows = X.index.set_index(X.key_labels).index
         self.omega_ = pd.DataFrame(params[..., 0, 0], index=rows, columns=X.vdims)
         self.theta_ = pd.DataFrame(params[..., 0, 1], index=rows, columns=X.vdims)
@@ -237,8 +235,6 @@ def transform(self, X):
         X_new = X.copy()
         triangles = [
             "ldf_",
-            "sigma_",
-            "std_err_",
             "omega_",
             "theta_",
             "incremental_fits_",

diff --git a/chainladder/development/constant.py b/chainladder/development/constant.py
@@ -79,8 +79,6 @@ def fit(self, X, y=None, sample_weight=None):
         self.ldf_.is_pattern = True
         self.ldf_.is_cumulative = False
         self.ldf_.valuation_date = pd.to_datetime(ULT_VAL)
-        self.sigma_ = self.ldf_ * 0 + 1
-        self.std_err_ = self.ldf_ * 0 + 1
         return self
 
     def transform(self, X):
@@ -97,7 +95,7 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        triangles = ["ldf_", "sigma_", "std_err_"]
+        triangles = ["ldf_"]
         for item in triangles:
             setattr(X_new, item, getattr(self, item))
         X_new._set_slicers()

diff --git a/chainladder/development/glm.py b/chainladder/development/glm.py
@@ -1,45 +1,35 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at https://mozilla.org/MPL/2.0/.
-
 import pandas as pd
 import numpy as np
-from patsy import dmatrix
-from sklearn.base import BaseEstimator, TransformerMixin
 from chainladder.development.base import DevelopmentBase
 from chainladder.development.learning import DevelopmentML
 from sklearn.linear_model import TweedieRegressor
 from sklearn.pipeline import Pipeline
-
-
-class PatsyFormula(BaseEstimator, TransformerMixin):
-    """ A sklearn-style wrapper for patsy formulas """
-    def __init__(self, formula=None):
-        self.formula = formula
-
-    def fit(self, X, y=None, sample_weight=None):
-        self.design_info_ = dmatrix(self.formula, X).design_info
-        return self
-
-    def transform(self, X):
-        return dmatrix(self.design_info_, X)
+from chainladder.utils.utility_functions import PatsyFormula
 
 
 class TweedieGLM(DevelopmentBase):
     """ This estimator creates development patterns with a GLM using a Tweedie distribution.
 
-
     The Tweedie family includes several of the more popular distributions including
     the normal, ODP poisson, and gamma distributions.  This class is a special case
     of `DevleopmentML`.  It restricts to just GLM using a TweedieRegressor and
     provides an R-like formulation of the design matrix.
 
+    .. versionadded:: 0.8.1
+
     Parameters
     -----------
     design_matrix : formula-like
         A patsy formula describing the independent variables, X of the GLM
-    response : str, default None
-        Name of the response column.
+    response :  str
+        Column name for the response variable of the GLM.  If omitted, then the
+        first column of the Triangle will be used.
+    weight : str
+        Column name of any weight to use in the GLM. If none specified, then an
+        unweighted regression will be performed.
     power : float, default=0
             The power determines the underlying target distribution according
             to the following table:
@@ -84,9 +74,10 @@ class TweedieGLM(DevelopmentBase):
     """
 
     def __init__(self, design_matrix='C(development) + C(origin)',
-                 response=None, power=1.0, alpha=1.0, link='log',
+                 response=None, weight=None, power=1.0, alpha=1.0, link='log',
                  max_iter=100, tol=0.0001, warm_start=False, verbose=0):
         self.response=response
+        self.weight=weight
         self.design_matrix = design_matrix
         self.power=power
         self.alpha=alpha
@@ -104,7 +95,7 @@ def fit(self, X, y=None, sample_weight=None):
                     link=self.link, power=self.power, max_iter=self.max_iter,
                     tol=self.tol, warm_start=self.warm_start,
                     verbose=self.verbose, fit_intercept=False))]),
-                    y_ml=response).fit(X)
+                    y_ml=response, weight_ml=self.weight).fit(X)
         return self
 
     @property
@@ -119,7 +110,8 @@ def triangle_glm_(self):
     def coef_(self):
         return pd.Series(
             self.model.estimator_ml.named_steps.model.coef_, name='coef_',
-            index=list(self.model.estimator_ml.named_steps.design_matrix.design_info_.column_name_indexes.keys())
+            index=list(self.model.estimator_ml.named_steps.design_matrix.
+                            design_info_.column_name_indexes.keys())
         ).to_frame()
 
     def transform(self, X):

diff --git a/chainladder/development/incremental.py b/chainladder/development/incremental.py
@@ -91,6 +91,8 @@ def fit(self, X, y=None, sample_weight=None):
             X = X.copy()
         if sample_weight.array_backend == "sparse":
             sample_weight = sample_weight.set_backend("numpy")
+        else:
+            sample_weight = sample_weight.copy()
         xp = X.get_array_module()
         sample_weight.is_cumulative = False
         obj = X.cum_to_incr() / sample_weight.values
@@ -141,7 +143,6 @@ def fit(self, X, y=None, sample_weight=None):
             1/(1+future_trend)-1, axis='valuation', start=X.valuation_date,
             end=self.incremental_.valuation_date)
         self.ldf_ = obj.incr_to_cum().link_ratio
-        self.sigma_ = self.std_err_ = 0 * self.ldf_
         return self
 
     def transform(self, X):
@@ -158,6 +159,6 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        for item in ["incremental_", "ldf_", "sigma_", "std_err_"]:
+        for item in ["ldf_"]:
             X_new.__dict__[item] = self.__dict__[item]
         return X_new
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
@@ -8,42 +8,51 @@
 from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
 from sklearn.compose import ColumnTransformer
 from chainladder.development.base import DevelopmentBase
+from chainladder import ULT_VAL
 
 
 class DevelopmentML(DevelopmentBase):
     """ A Estimator that interfaces with machine learning (ML) tools that implement
     the scikit-learn API.
 
+    The `DevelopmentML` estimator is used to generate ``ldf_`` patterns from
+    the data.
+
     .. versionadded:: 0.8.1
 
+
     Parameters
     ----------
     estimator_ml : skearn Estimator
         Any sklearn compatible regression estimator, including Pipelines and
     y_ml : list or str or sklearn_transformer
         The response column(s) for the machine learning algorithm. It must be
         present within the Triangle.
-    y_features :
+    autoregressive : list of tuples, [(autoregressive_col_name, lag, source_col_name)]
         The subset of response column(s) to use as lagged features for the
         Time Series aspects of the model. Predictions from one development period
-        get used as featues in the next development period.
+        get used as features in the next development period. Lags should be negative
+        integers.
     fit_incrementals :
         Whether the response variable should be converted to an incremental basis
         for fitting.
 
 
     Attributes
     ----------
+    estimator_ml : Estimator
+        An sklearn-style estimator to predict development patterns
     ldf_ : Triangle
         The estimated loss development patterns.
     cdf_ : Triangle
         The estimated cumulative development patterns.
     """
-    def __init__(self, estimator_ml=None,
-                 y_ml=None, y_features=False, fit_incrementals=True):
+    def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
+                 weight_ml=None, fit_incrementals=True):
         self.estimator_ml=estimator_ml
         self.y_ml=y_ml
-        self.y_features=y_features
+        self.weight_ml = weight_ml
+        self.autoregressive=autoregressive
         self.fit_incrementals=fit_incrementals
 
     def _get_y_names(self):
@@ -77,14 +86,16 @@ def y_ml_(self):
         else:
             return transformer
 
-    def _get_triangle_ml(self, df):
+    def _get_triangle_ml(self, df, preds=None):
         """ Create fitted Triangle """
         from chainladder.core import Triangle
-        preds = self.estimator_ml.predict(df)
+        if preds is None:
+            preds = self.estimator_ml.predict(df)
         X_r = [df]
         y_r = [preds]
         dgrain = {'Y':12, 'Q':3, 'M': 1}[self.development_grain_]
-        latest_filter = df['origin']+(df['development']-dgrain)/dgrain
+        ograin = {'Y':1, 'Q':4, 'M': 12}[self.origin_grain_]
+        latest_filter = (df['origin']+1)*ograin+(df['development']-dgrain)/dgrain
         latest_filter = latest_filter == latest_filter.max()
         preds=pd.DataFrame(preds.copy())[latest_filter].values
         out = df.loc[latest_filter].copy()
@@ -93,10 +104,12 @@ def _get_triangle_ml(self, df):
             out['development'] = out['development'] + dgrain
             if len(preds.shape) == 1:
                 preds = preds[:, None]
-            if self.y_features:
-                for num, col in enumerate(self.y_features):
+            if self.autoregressive:
+                for num, col in enumerate(self.autoregressive):
                     out[col[0]]=preds[:, num]
             out = out[out['development']<=dev_lags.max()]
+            if len(out) == 0:
+                continue
             X_r.append(out.copy())
             preds = self.estimator_ml.predict(out)
             y_r.append(preds.copy())
@@ -108,8 +121,26 @@ def _get_triangle_ml(self, df):
         out['origin'] = out['origin'].map({v: k for k, v in self.origin_encoder_.items()})
         out = out.merge(self.valuation_vector_, how='left', on=['origin', 'development'])
         return Triangle(
-            out, origin='origin', development='valuation', index=self._key_labels, columns=self._get_y_names()).dropna()
+            out, origin='origin', development='valuation',
+            index=self._key_labels, columns=self._get_y_names(),
+            cumulative=not self.fit_incrementals).dropna()
 
+    def _prep_X_ml(self, X):
+        """ Preps Triangle data ahead of the pipeline """
+        if self.fit_incrementals:
+            X_ = X.cum_to_incr()
+        else:
+            X_ = X.copy()
+        if self.autoregressive:
+            for i in self.autoregressive:
+                lag = X[i[2]].shift(i[1])
+                X_[i[0]] = lag[lag.valuation<=X.valuation_date]
+        df_base = X.incr_to_cum().to_frame(keepdims=True).reset_index().iloc[:, :-1]
+        df = df_base.merge(
+            X.cum_to_incr().to_frame(keepdims=True).reset_index(), how='left',
+            on=list(df_base.columns)).fillna(0)
+        df['origin'] = df['origin'].map(self.origin_encoder_)
+        return df
 
     def fit(self, X, y=None, sample_weight=None):
         """Fit the model with X.
@@ -129,10 +160,6 @@ def fit(self, X, y=None, sample_weight=None):
             Returns the instance itself.
         """
 
-        if self.fit_incrementals:
-            X_ = X.cum_to_incr()
-        else:
-            X_ = X.copy()
         self._columns = list(X.columns)
         self._key_labels = X.key_labels
         self.origin_grain_ = X.origin_grain
@@ -144,24 +171,16 @@ def fit(self, X, y=None, sample_weight=None):
             X.valuation.values.reshape(X.shape[-2:], order='F'),
             index=X.odims, columns=X.ddims).unstack().reset_index()
         self.valuation_vector_.columns=['development', 'origin', 'valuation']
-        # response as a feature
-        if self.y_features:
-            for i in self.y_features:
-                lag = X[i[2]].shift(i[1])
-                X_[i[0]] = lag[lag.valuation<=X.valuation_date]
-
-        df = X_.to_frame(keepdims=True).reset_index().fillna(0)
-        df['origin'] = df['origin'].map(self.origin_encoder_)
-        self.df_ = df # Unncecessary, used for debugging
-
+        df = self._prep_X_ml(X)
+        self.df_ = df
         # Fit model
         self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
+        #return self
         self.triangle_ml_ = self._get_triangle_ml(df)
         return self
 
     @property
     def ldf_(self):
-        from chainladder import ULT_VAL
         ldf = self.triangle_ml_.incr_to_cum().link_ratio
         ldf.valuation_date = pd.to_datetime(ULT_VAL)
         return ldf
@@ -179,13 +198,11 @@ def transform(self, X):
         -------
             X_new : New triangle with transformed attributes.
         """
-
         X_new = X.copy()
-        triangles = [
-            "ldf_",
-        ]
-        for item in triangles:
-            setattr(X_new, item, getattr(self, item))
-        X_new.sigma_ = X_new.std_err_ = X_new.ldf_ * 0 + 1
+        X_ml = self._prep_X_ml(X)
+        y_ml=self.estimator_ml.predict(X_ml)
+        triangle_ml = self._get_triangle_ml(X_ml, y_ml)
+        X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio
+        X_new.ldf_.valuation_date = pd.to_datetime(ULT_VAL)
         X_new._set_slicers()
         return X_new
diff --git a/chainladder/development/outstanding.py b/chainladder/development/outstanding.py
@@ -105,7 +105,6 @@ def fit(self, X, y=None, sample_weight=None):
         dev.is_pattern=True
         dev.is_cumulative=True
         self.ldf_ = dev.cum_to_incr()
-        self.std_err_ = self.sigma_ = self.ldf_ * 0 + 1
         return self
 
     @property
@@ -142,7 +141,7 @@ def transform(self, X):
             X_new : New triangle with transformed attributes.
         """
         X_new = X.copy()
-        triangles = ["ldf_", "sigma_", "std_err_"]
+        triangles = ["ldf_"]
         for item in triangles:
             setattr(X_new, item, getattr(self, item))
         X_new._set_slicers()

diff --git a/chainladder/development/tests/test_incremental.py b/chainladder/development/tests/test_incremental.py
@@ -9,7 +9,7 @@ def test_schmidt():
     answer = ia.fit_transform(
         tri.iloc[0, 0], sample_weight=tri.iloc[0, 1].latest_diagonal
     )
-    answer = answer.incremental_.incr_to_cum().values[0, 0, :, -1]
+    answer = ia.incremental_.incr_to_cum().values[0, 0, :, -1]
     check = xp.array(
         [
             3483.0,

diff --git a/chainladder/methods/mack.py b/chainladder/methods/mack.py
@@ -57,7 +57,7 @@ def fit(self, X, y=None, sample_weight=None):
             Returns the instance itself.
         """
         super().fit(X, y, sample_weight)
-        if not ("average_" in self.X_ and "w_" in self.X_):
+        if "sigma_" not in self.X_:
             raise ValueError("Triangle not compatible with MackChainladder")
         # Caching full_triangle_ for fit as it is called a lot
         self.X_._full_triangle_ = self.full_triangle_