From 88a2f1c03be24046de8fee20b8ebbc0f1a83d08e Mon Sep 17 00:00:00 2001
From: "Kenneth S. Hsu"
Date: Wed, 3 Apr 2024 15:24:49 -0700
Subject: [PATCH] 497 (#501)

* Added labels
* Tracing
* Investigating the bug
* Added approximation_grain
* Clean up debugger
* Added test cases
* Added assert in some test cases
* Able to print an empty triangle
* Added test cases
* Fixed display issues
* Update base.py
* Addressed a calculation bug
* Added debugger
* Modified test
* Removed debugger

---------

Co-authored-by: John S Bogaardt
---
 chainladder/adjustments/parallelogram.py  |  11 +-
 chainladder/core/base.py                  |   2 +-
 chainladder/core/display.py               |  19 +-
 chainladder/core/tests/test_display.py    |  24 +-
 chainladder/core/triangle.py              |  22 +-
 chainladder/development/base.py           | 341 +++++++++++++---------
 chainladder/methods/tests/test_predict.py |  84 ++++--
 chainladder/utils/tests/test_utilities.py | 105 ++++++-
 chainladder/utils/utility_functions.py    | 124 +++++++-
 9 files changed, 512 insertions(+), 220 deletions(-)

diff --git a/chainladder/adjustments/parallelogram.py b/chainladder/adjustments/parallelogram.py
index bdb53b47..4bf6bed7 100644
--- a/chainladder/adjustments/parallelogram.py
+++ b/chainladder/adjustments/parallelogram.py
@@ -36,11 +36,17 @@ class ParallelogramOLF(BaseEstimator, TransformerMixin, EstimatorIO):
     """

     def __init__(
-        self, rate_history=None, change_col="", date_col="", vertical_line=False
+        self,
+        rate_history=None,
+        change_col="",
+        date_col="",
+        approximation_grain="M",
+        vertical_line=False,
     ):
         self.rate_history = rate_history
         self.change_col = change_col
         self.date_col = date_col
+        self.approximation_grain = approximation_grain
         self.vertical_line = vertical_line

     def fit(self, X, y=None, sample_weight=None):
@@ -77,6 +83,7 @@ def fit(self, X, y=None, sample_weight=None):
             end_date=X.origin[-1].to_timestamp(how="e"),
             grain=X.origin_grain,
             vertical_line=self.vertical_line,
+            approximation_grain=self.approximation_grain,
         )

         if len(groups) > 0:
@@ -105,7 +112,7 @@ def fit(self, X, y=None, sample_weight=None):
         return self

     def transform(self, X, y=None, sample_weight=None):
-        """ If X and self are of different shapes, align self to X, else
+        """If X and self are of different shapes, align self to X, else
         return self.

         Parameters
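Note: the new `approximation_grain` argument selects a monthly ("M", the default) or daily ("D") grain for the parallelogram approximation. A minimal sketch of the transformer in use, mirroring the new tests further down in this patch (the rate history and premium figures are illustrative only):

    import pandas as pd
    import chainladder as cl

    # Hypothetical rate change history: effective dates and rate changes
    rate_history = pd.DataFrame(
        {
            "EffDate": ["2010-07-01", "2011-01-01", "2012-07-01", "2013-04-01"],
            "RateChange": [0.035, 0.05, 0.10, -0.01],
        }
    )
    data = pd.DataFrame({"Year": list(range(2006, 2016)), "EarnedPremium": [10_000] * 10})
    prem_tri = cl.Triangle(data, origin="Year", columns="EarnedPremium", cumulative=True)

    # Daily approximation; approximation_grain="M" gives the coarser monthly version
    olf = cl.ParallelogramOLF(
        rate_history,
        change_col="RateChange",
        date_col="EffDate",
        approximation_grain="D",
    ).fit_transform(prem_tri)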
diff --git a/chainladder/core/base.py b/chainladder/core/base.py
index 9d8c3109..51648611 100644
--- a/chainladder/core/base.py
+++ b/chainladder/core/base.py
@@ -258,7 +258,7 @@ def _to_datetime(data, fields, period_end=False, format=None):
     def _development_lag(origin, valuation):
         """For tabular format, this will convert the origin/valuation difference
         to a development lag"""
-        return ((valuation - origin) / (365.25/12)).round('1d').dt.days
+        return ((valuation - origin) / (365.25/12)).dt.round('1d').dt.days


     @staticmethod
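The one-line fix above switches to the Series `.dt` accessor: the timedelta division yields a pandas Series, so rounding has to go through `.dt.round`. A quick sketch of the computation with illustrative dates:

    import pandas as pd

    origin = pd.Series(pd.to_datetime(["2000-01-01", "2000-01-01"]))
    valuation = pd.Series(pd.to_datetime(["2000-02-01", "2001-01-01"]))
    # divide the elapsed timedelta by an average month, round to whole days, count days
    lag = ((valuation - origin) / (365.25 / 12)).dt.round("1d").dt.days
    print(lag.tolist())  # [1, 12], i.e. development lags in months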
diff --git a/chainladder/core/display.py b/chainladder/core/display.py
index e7fe4e87..1cf34cd6 100644
--- a/chainladder/core/display.py
+++ b/chainladder/core/display.py
@@ -13,6 +13,12 @@ class TriangleDisplay:
     def __repr__(self):
+        try:
+            self.values
+        except AttributeError:
+            # an empty Triangle has no values; __repr__ must return a string
+            return "Triangle is empty"
+
         if (self.values.shape[0], self.values.shape[1]) == (1, 1):
             data = self._repr_format()
             return data.to_string()
@@ -33,7 +39,7 @@ def _summary_frame(self):
         ).to_frame()

     def _repr_html_(self):
-        """ Jupyter/Ipython HTML representation """
+        """Jupyter/Ipython HTML representation"""
         if (self.values.shape[0], self.values.shape[1]) == (1, 1):
             data = self._repr_format()
             fmt_str = self._get_format_str(data)
@@ -66,7 +72,7 @@ def _get_format_str(self, data):
     def _repr_format(self, origin_as_datetime=False):
         out = self.compute().set_backend("numpy").values[0, 0]
         if origin_as_datetime and not self.is_pattern:
-            origin = self.origin.to_timestamp(how='s')
+            origin = self.origin.to_timestamp(how="s")
         else:
             origin = self.origin.copy()
         origin.name = None
@@ -85,7 +91,7 @@ def _repr_format(self, origin_as_datetime=False):
         return pd.DataFrame(out, index=origin, columns=development)

     def heatmap(self, cmap="coolwarm", low=0, high=0, axis=0, subset=None):
-        """ Color the background in a gradient according to the data in each
+        """Color the background in a gradient according to the data in each
         column (optionally row).

         Requires matplotlib

         Parameters
@@ -134,7 +140,12 @@ def heatmap(self, cmap="coolwarm", low=0, high=0, axis=0, subset=None):
         else:
             default_output = (
                 data.style.format(fmt_str)
-                .background_gradient(cmap=cmap, low=low, high=high, axis=axis,)
+                .background_gradient(
+                    cmap=cmap,
+                    low=low,
+                    high=high,
+                    axis=axis,
+                )
                 .render()
             )
         output_xnan = re.sub("<td>nan", "<td>", default_output)
diff --git a/chainladder/core/tests/test_display.py b/chainladder/core/tests/test_display.py
index 0de9ab8d..981e566c 100644
--- a/chainladder/core/tests/test_display.py
+++ b/chainladder/core/tests/test_display.py
@@ -3,18 +3,18 @@

 def test_heatmap_render(raa):
-    """ The heatmap method should render correctly given the sample."""
-    return raa.heatmap()
+    """The heatmap method should render correctly given the sample."""
+    assert raa.heatmap()


-def test_to_frame(raa):
-    try:
-        cl.Chainladder().fit(raa).cdf_.to_frame()
-        cl.Chainladder().fit(raa).cdf_.to_frame(origin_as_datetime=False)
-        cl.Chainladder().fit(raa).cdf_.to_frame(origin_as_datetime=True)
-        cl.Chainladder().fit(raa).ultimate_.to_frame()
-        cl.Chainladder().fit(raa).ultimate_.to_frame(origin_as_datetime=False)
-        cl.Chainladder().fit(raa).ultimate_.to_frame(origin_as_datetime=True)
+def test_empty_triangle():
+    assert cl.Triangle()
+

-    except:
-        assert False
+def test_to_frame(raa):
+    assert cl.Chainladder().fit(raa).cdf_.to_frame()
+    assert cl.Chainladder().fit(raa).cdf_.to_frame(origin_as_datetime=False)
+    assert cl.Chainladder().fit(raa).cdf_.to_frame(origin_as_datetime=True)
+    assert cl.Chainladder().fit(raa).ultimate_.to_frame()
+    assert cl.Chainladder().fit(raa).ultimate_.to_frame(origin_as_datetime=False)
+    assert cl.Chainladder().fit(raa).ultimate_.to_frame(origin_as_datetime=True)
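With the guard in `__repr__` above, an empty triangle now displays instead of raising, which is what the new `test_empty_triangle` exercises. A small sketch of the behavior these tests cover:

    import chainladder as cl

    print(repr(cl.Triangle()))  # renders the empty-triangle message

    raa = cl.load_sample("raa")
    frame = cl.Chainladder().fit(raa).cdf_.to_frame(origin_as_datetime=True)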
diff --git a/chainladder/core/triangle.py b/chainladder/core/triangle.py
index c0a76464..a54a22df 100644
--- a/chainladder/core/triangle.py
+++ b/chainladder/core/triangle.py
@@ -130,6 +130,9 @@ def __init__(
             data, index, columns, origin, development
         )

+        self.columns_label = columns
+        self.origin_label = origin
+
         # Handle any ultimate vectors in triangles separately
         data, ult = self._split_ult(data, index, columns, origin, development)
         # Conform origins and developments to datetimes and determine lowest grains
@@ -172,6 +175,7 @@ def __init__(
         # Deal with labels
         if not index:
             index = ["Total"]
+            self.index_label = index
             data_agg[index[0]] = "Total"

         self.kdims, key_idx = self._set_kdims(data_agg, index)
@@ -672,8 +676,8 @@ def grain(self, grain="", trailing=False, inplace=False):
         obj = self.dev_to_val()
         if ograin_new != ograin_old:
             freq = {"Y": "A", "S": "2Q"}.get(ograin_new, ograin_new)
-            if trailing or (obj.origin.freqstr[-3:] != "DEC" and ograin_old != 'M'):
-                origin_period_end = self.origin[-1].strftime("%b").upper()
+            if trailing or (obj.origin.freqstr[-3:] != "DEC" and ograin_old != "M"):
+                origin_period_end = self.origin[-1].strftime("%b").upper()
             else:
                 origin_period_end = "DEC"
             indices = (
@@ -687,12 +691,16 @@ def grain(self, grain="", trailing=False, inplace=False):
             obj = obj.groupby(groups, axis=2).sum()
             obj.origin_close = origin_period_end
         d_start = pd.Period(
-            obj.valuation[0],
-            freq=dgrain_old if dgrain_old == 'M' else dgrain_old + obj.origin.freqstr[-4:]
-        ).to_timestamp(how='s')
-        if (len(obj.ddims) > 1 and obj.origin.to_timestamp(how='s')[0] != d_start):
+            obj.valuation[0],
+            freq=dgrain_old
+            if dgrain_old == "M"
+            else dgrain_old + obj.origin.freqstr[-4:],
+        ).to_timestamp(how="s")
+        if len(obj.ddims) > 1 and obj.origin.to_timestamp(how="s")[0] != d_start:
             addl_ts = (
-                pd.period_range(obj.odims[0], obj.valuation[0], freq=dgrain_old)[:-1]
+                pd.period_range(obj.odims[0], obj.valuation[0], freq=dgrain_old)[
+                    :-1
+                ]
                 .to_timestamp()
                 .values
             )
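The `grain` changes above touch regraining with trailing (non-December) period ends. For context, a typical call against a library sample (the specific sample is incidental):

    import chainladder as cl

    tri = cl.load_sample("quarterly")["paid"]
    # collapse origin and development to a yearly grain;
    # trailing=True ends annual periods at the latest origin month rather than December
    yearly = tri.grain("OYDY", trailing=True)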
diff --git a/chainladder/development/base.py b/chainladder/development/base.py
index 2bf8d7a9..095109a6 100644
--- a/chainladder/development/base.py
+++ b/chainladder/development/base.py
@@ -12,17 +12,17 @@ class DevelopmentBase(BaseEstimator, TransformerMixin, EstimatorIO, Common):
-
-    def fit(self,X,y=None,sample_weight=None):
+    def fit(self, X, y=None, sample_weight=None):
         average_ = self._validate_assumption(y, self.average, axis=3)
         self.average_ = average_.flatten()
         exponent = self.xp.array(
-            [{"regression": 0, "volume": 1, "simple": 2}[x]
-             for x in average_[0, 0, 0]]
+            [{"regression": 0, "volume": 1, "simple": 2}[x] for x in average_[0, 0, 0]]
         )
         exponent = self.xp.nan_to_num(exponent * (y * 0 + 1))
         w = num_to_nan(sample_weight / (X ** (exponent)))
-        self.params_ = WeightedRegression(axis=2, thru_orig=True, xp=self.xp).fit(X, y, w)
+        self.params_ = WeightedRegression(axis=2, thru_orig=True, xp=self.xp).fit(
+            X, y, w
+        )
         return self

     def _set_fit_groups(self, X):
@@ -74,11 +74,13 @@ def _assign_n_periods_weight_int(X, n_periods):
         xp = X.get_array_module()

         dict_map = {
-            item: _assign_n_periods_weight_int(X, item) for item in set(n_periods.flatten())
+            item: _assign_n_periods_weight_int(X, item)
+            for item in set(n_periods.flatten())
         }

         conc = [
-            dict_map[item][..., num : num + 1] for num, item in enumerate(n_periods.flatten())
+            dict_map[item][..., num : num + 1]
+            for num, item in enumerate(n_periods.flatten())
         ]

         return xp.concatenate(tuple(conc), -1)
@@ -92,8 +94,9 @@ def _drop_adjustment(self, X, link_ratio):
             weight = weight * self._drop_valuation(X)

         if (self.drop_high is not None) | (self.drop_low is not None):
-            n_periods_ = self._validate_assumption(
-                X, self.n_periods, axis=3)[0, 0, 0, :-1]
+            n_periods_ = self._validate_assumption(X, self.n_periods, axis=3)[
+                0, 0, 0, :-1
+            ]

             w_ = self._assign_n_periods_weight(X, n_periods_)
             w_ = w_.astype("float")
@@ -118,7 +121,7 @@ def _drop_adjustment(self, X, link_ratio):

     # for drop_high and drop_low
     def _drop_n(self, drop_high, drop_low, X, link_ratio, preserve):
-        #this is safe because each triangle by index and column has
+        # this is safe because each triangle by index and column has
         link_ratios_len = link_ratio.shape[3]

         def drop_array_helper(drop_type):
@@ -147,42 +150,69 @@ def drop_array_helper(drop_type):

             return drop_type_array

-        #explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
-        drop_high_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        drop_high_array[:,:,:] = drop_array_helper(drop_high)
-        drop_low_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        drop_low_array[:,:,:] = drop_array_helper(drop_low)
-        n_period_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        n_period_array[:,:,:] = drop_array_helper(self.n_periods)
-        preserve_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        preserve_array[:,:,:] = drop_array_helper(preserve)
-
-        #operationalizing the -1 option for n_period
+        # explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
+        drop_high_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        drop_high_array[:, :, :] = drop_array_helper(drop_high)
+        drop_low_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        drop_low_array[:, :, :] = drop_array_helper(drop_low)
+        n_period_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        n_period_array[:, :, :] = drop_array_helper(self.n_periods)
+        preserve_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        preserve_array[:, :, :] = drop_array_helper(preserve)

+        # operationalizing the -1 option for n_period
         n_period_array = np.where(n_period_array == -1, link_ratios_len, n_period_array)

-        #ranking factors by itself and volume
-        link_ratio_ranks = np.lexsort((X.values[...,:-1],link_ratio),axis = 2).argsort(axis=2)
+        # ranking factors by itself and volume
+        link_ratio_ranks = np.lexsort((X.values[..., :-1], link_ratio), axis=2).argsort(
+            axis=2
+        )

-        #setting up default return
-        weights = ~np.isnan(link_ratio.transpose((0,1,3,2)))
+        # setting up default return
+        weights = ~np.isnan(link_ratio.transpose((0, 1, 3, 2)))

-        #counting valid factors
+        # counting valid factors
         ldf_count = weights.sum(axis=3)

-        #applying n_period
-        ldf_count_n_period = np.where(ldf_count > n_period_array, n_period_array, ldf_count)
+        # applying n_period
+        ldf_count_n_period = np.where(
+            ldf_count > n_period_array, n_period_array, ldf_count
+        )

-        #applying drop_high and drop_low
+        # applying drop_high and drop_low
         max_rank_unpreserve = ldf_count_n_period - drop_high_array
         min_rank_unpreserve = drop_low_array

-        #applying preserve
+        # applying preserve
         warning_flag = np.any(max_rank_unpreserve - min_rank_unpreserve < preserve)
-        max_rank = np.where(max_rank_unpreserve - min_rank_unpreserve < preserve, ldf_count_n_period, max_rank_unpreserve)
-        min_rank = np.where(max_rank_unpreserve - min_rank_unpreserve < preserve, 0, min_rank_unpreserve)
+        max_rank = np.where(
+            max_rank_unpreserve - min_rank_unpreserve < preserve,
+            ldf_count_n_period,
+            max_rank_unpreserve,
+        )
+        min_rank = np.where(
+            max_rank_unpreserve - min_rank_unpreserve < preserve, 0, min_rank_unpreserve
+        )

-        index_array_weights = (link_ratio_ranks.transpose((0,1,3,2)) < max_rank.reshape(max_rank.shape[0],max_rank.shape[1],max_rank.shape[2],1)) & (
-            link_ratio_ranks.transpose((0,1,3,2)) > min_rank.reshape(min_rank.shape[0],min_rank.shape[1],min_rank.shape[2],1) - 1
+        index_array_weights = (
+            link_ratio_ranks.transpose((0, 1, 3, 2))
+            < max_rank.reshape(
+                max_rank.shape[0], max_rank.shape[1], max_rank.shape[2], 1
+            )
+        ) & (
+            link_ratio_ranks.transpose((0, 1, 3, 2))
+            > min_rank.reshape(
+                min_rank.shape[0], min_rank.shape[1], min_rank.shape[2], 1
+            )
+            - 1
         )

         weights = index_array_weights
@@ -203,11 +233,11 @@ def drop_array_helper(drop_type):
             )
             warnings.warn(warning)

-        return weights.transpose((0,1,3,2))
+        return weights.transpose((0, 1, 3, 2))
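These weights back the `drop_high` / `drop_low` / `n_periods` / `preserve` options on the `Development` estimator. A short sketch of the user-facing behavior:

    import chainladder as cl

    raa = cl.load_sample("raa")
    # exclude the highest and lowest link ratio in each column,
    # but keep at least three factors per column (preserve)
    dev = cl.Development(drop_high=1, drop_low=1, preserve=3).fit(raa)
    ldf = dev.ldf_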
     # for drop_above and drop_below
     def _drop_x(self, drop_above, drop_below, X, link_ratio, preserve):
-        #this is safe because each triangle by index and column has
+        # this is safe because each triangle by index and column has
         link_ratios_len = link_ratio.shape[3]

         def drop_array_helper(drop_type, default_value):
@@ -226,31 +256,41 @@ def drop_array_helper(drop_type, default_value):

             return drop_type_array

-        #explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
-        drop_above_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        drop_above_array[:,:,:] = drop_array_helper(drop_above, np.inf)
-        drop_below_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        drop_below_array[:,:,:] = drop_array_helper(drop_below, 0.0)
-        preserve_array = np.zeros((link_ratio.shape[0],link_ratio.shape[1],link_ratios_len))
-        preserve_array[:,:,:] = drop_array_helper(preserve, preserve)
+        # explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
+        drop_above_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        drop_above_array[:, :, :] = drop_array_helper(drop_above, np.inf)
+        drop_below_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        drop_below_array[:, :, :] = drop_array_helper(drop_below, 0.0)
+        preserve_array = np.zeros(
+            (link_ratio.shape[0], link_ratio.shape[1], link_ratios_len)
+        )
+        preserve_array[:, :, :] = drop_array_helper(preserve, preserve)

-        #transposing
-        link_ratio_T = link_ratio.transpose((0,1,3,2))
+        # transposing
+        link_ratio_T = link_ratio.transpose((0, 1, 3, 2))

-        #setting up default return
+        # setting up default return
         weights = ~np.isnan(link_ratio_T)

-        #dropping
-        index_array_weights = (link_ratio_T < drop_above_array[...,None]) & (
-            link_ratio_T > drop_below_array[...,None]
+        # dropping
+        index_array_weights = (link_ratio_T < drop_above_array[..., None]) & (
+            link_ratio_T > drop_below_array[..., None]
         )

-        #counting remaining factors
+        # counting remaining factors
         ldf_count = index_array_weights.sum(axis=3)

-        #applying preserve
+        # applying preserve
         warning_flag = np.any(ldf_count < preserve_array)
-        weights = np.where(ldf_count[...,None] < preserve_array[...,None], weights, index_array_weights)
+        weights = np.where(
+            ldf_count[..., None] < preserve_array[..., None],
+            weights,
+            index_array_weights,
+        )

         if warning_flag:
             if preserve == 1:
@@ -268,9 +308,9 @@ def drop_array_helper(drop_type, default_value):
             )
             warnings.warn(warning)

-        return weights.transpose((0,1,3,2))
+        return weights.transpose((0, 1, 3, 2))

     def _drop_valuation(self, X):
         xp = X.get_array_module()
         if type(self.drop_valuation) is not list:
             drop_valuation = [self.drop_valuation]
@@ -301,9 +342,9 @@ def _drop(self, X):
         ] = 0
         return arr[:, :-1]

-    def _param_array_helper(self,size, param, default_value):
+    def _param_array_helper(self, size, param, default_value):
         # setting default
-        param_array = pd.Series(size * [default_value]).astype('object')
+        param_array = pd.Series(size * [default_value]).astype("object")
         # only a single parameter is provided
         if isinstance(param, list):
             param_array[range(len(param))] = np.array(param)
@@ -316,7 +357,7 @@ def _param_array_helper(self, size, param, default_value):
             param_array = param_array.astype(type(default_value))
         return param_array.to_numpy()

-    def _set_weight_func(self,factor,secondary_rank=None):
+    def _set_weight_func(self, factor, secondary_rank=None):
         w = (~np.isnan(factor.values)).astype(float)
         w = w * self._assign_n_periods_weight_func(factor)
         if self.drop is not None:
@@ -329,17 +370,17 @@ def _set_weight_func(self, factor, secondary_rank=None):
             w = w * self._drop_x_func(factor)

         if (self.drop_high is not None) | (self.drop_low is not None):
-            w = w * self._drop_n_func(factor * num_to_nan(w),secondary_rank)
+            w = w * self._drop_n_func(factor * num_to_nan(w), secondary_rank)

         w_tri = factor.copy()
         w_tri.values = num_to_nan(w)
         return w_tri
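`_set_weight_func` chains the individual weight adjustments; the `drop` and `drop_valuation` branches feed `_drop_func` and `_drop_valuation_func` below. Typical user-facing calls (the labels are illustrative for the RAA sample):

    import chainladder as cl

    raa = cl.load_sample("raa")
    # drop a single link ratio by (origin, development age) ...
    dev_a = cl.Development(drop=("1982", 12)).fit(raa)
    # ... or drop every factor observed at a given valuation
    dev_b = cl.Development(drop_valuation="1985").fit(raa)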
-    def _assign_n_periods_weight_func(self,factor):
+    def _assign_n_periods_weight_func(self, factor):
         """Used to apply the n_periods weight"""
-        #getting dimensions of factor for various manipulation
+        # getting dimensions of factor for various manipulation
         factor_len = factor.shape[3]
-        #putting n_periods into array
-        n_periods_array = self._param_array_helper(factor_len,self.n_periods,-1)
+        # putting n_periods into array
+        n_periods_array = self._param_array_helper(factor_len, self.n_periods, -1)

         def _assign_n_periods_weight_int(X, n_periods):
             xp = X.get_array_module()
@@ -362,83 +403,101 @@ def _assign_n_periods_weight_int(X, n_periods):
         xp = factor.get_array_module()

         dict_map = {
-            item: _assign_n_periods_weight_int(factor, item) for item in set(n_periods_array)
+            item: _assign_n_periods_weight_int(factor, item)
+            for item in set(n_periods_array)
         }
         conc = [
-            dict_map[item][..., num : num + 1] for num, item in enumerate(n_periods_array)
+            dict_map[item][..., num : num + 1]
+            for num, item in enumerate(n_periods_array)
         ]

         return xp.concatenate(tuple(conc), -1)

-    def _drop_func(self,factor):
-        #get the appropriate backend for nan_triangle and nan_to_num
+    def _drop_func(self, factor):
+        # get the appropriate backend for nan_triangle and nan_to_num
         xp = factor.get_array_module()
-        #turn single drop_valuation parameter to list if necessary
-        drop_list = self.drop if isinstance(self.drop,list) else [self.drop]
-        #get an starting array of weights
+        # turn a single drop parameter into a list if necessary
+        drop_list = self.drop if isinstance(self.drop, list) else [self.drop]
+        # get a starting array of weights
         arr = factor.nan_triangle.copy()
-        #accommodate ldf triangle as factor, where the dimensions are '12-24'
-        dev_list = factor.development.str.split("-",expand=True)[0] if factor.development.dtype == object else factor.development.astype("string")
-        #create ndarray of drop_list for further operation in numpy
+        # accommodate ldf triangle as factor, where the dimensions are '12-24'
+        dev_list = (
+            factor.development.str.split("-", expand=True)[0]
+            if factor.development.dtype == object
+            else factor.development.astype("string")
+        )
+        # create ndarray of drop_list for further operation in numpy
         drop_np = np.asarray(drop_list)
-        #find indices of drop_np
-        origin_ind = np.where(np.array([factor.origin.astype("string")]) == drop_np[:,[0]])[1]
-        dev_ind = np.where(np.array([dev_list]) == drop_np[:,[1]])[1]
-        #set weight of dropped factors to 0
-        arr[(origin_ind,dev_ind)] = 0
-        return xp.nan_to_num(arr)[None,None]
+        # find indices of drop_np
+        origin_ind = np.where(
+            np.array([factor.origin.astype("string")]) == drop_np[:, [0]]
+        )[1]
+        dev_ind = np.where(np.array([dev_list]) == drop_np[:, [1]])[1]
+        # set weight of dropped factors to 0
+        arr[(origin_ind, dev_ind)] = 0
+        return xp.nan_to_num(arr)[None, None]

-    def _drop_valuation_func(self,factor):
-        #get the appropriate backend for nan_to_num
+    def _drop_valuation_func(self, factor):
+        # get the appropriate backend for nan_to_num
         xp = factor.get_array_module()
-        #turn single drop_valuation parameter to list if necessary
-        if isinstance(self.drop_valuation,list):
+        # turn a single drop_valuation parameter into a list if necessary
+        if isinstance(self.drop_valuation, list):
             drop_valuation_list = self.drop_valuation
         else:
             drop_valuation_list = [self.drop_valuation]
-        #turn drop_valuation to same valuation freq as factor
-        v = pd.PeriodIndex(drop_valuation_list, freq=factor.development_grain).to_timestamp(how="e")
-        #warn that some drop_valuation are outside of factor
+        # turn drop_valuation to same valuation freq as factor
+        v = pd.PeriodIndex(
+            drop_valuation_list, freq=factor.development_grain
+        ).to_timestamp(how="e")
+        # warn that some drop_valuation are outside of factor
         if np.any(~v.isin(factor.valuation)):
             warnings.warn("Some valuations could not be dropped.")
-        #return triangle of 0/1 where dropped factors have 0
-        b = xp.nan_to_num(factor.iloc[0,0][~factor.valuation.isin(v)].values * 0 + 1)
-        #check to make sure some factors are still left
+        # return triangle of 0/1 where dropped factors have 0
+        b = xp.nan_to_num(factor.iloc[0, 0][~factor.valuation.isin(v)].values * 0 + 1)
+        # check to make sure some factors are still left
         if b.sum() == 0:
             raise Exception("The entire triangle has been dropped via drop_valuation.")

         return b
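`n_periods` limits each development column to its most recent factors, with `-1` (the default encoded above) keeping them all. Sketch:

    import chainladder as cl

    raa = cl.load_sample("raa")
    dev5 = cl.Development(n_periods=5).fit(raa)      # five most recent link ratios per column
    dev_all = cl.Development(n_periods=-1).fit(raa)  # keep every available factor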
-    def _drop_x_func(self,factor):
-        #getting dimensions of factor for various manipulation
+    def _drop_x_func(self, factor):
+        # getting dimensions of factor for various manipulation
         factor_val = factor.values.copy()
         factor_len = factor_val.shape[3]
         indices = factor_val.shape[0]
         columns = factor_val.shape[1]

-        #explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
-        drop_above_array = np.zeros((indices,columns,factor_len))
-        drop_above_array[:,:,:] = self._param_array_helper(factor_len,self.drop_above, np.inf)[None,None]
-        drop_below_array = np.zeros((indices,columns,factor_len))
-        drop_below_array[:,:,:] = self._param_array_helper(factor_len,self.drop_below, 0.0)[None,None]
-        preserve_array = np.zeros((indices,columns,factor_len))
-        preserve_array[:,:,:] = self._param_array_helper(factor_len,self.preserve,self.preserve)[None,None]
-        #transposing so columns of factors (same dev age) are in the last index.
-        #not sure if this is really necessary. will leave for a better dev to find out
-        factor_val_T = factor_val.transpose((0,1,3,2))
-
-        #setting up starting array of weights
+        # explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
+        drop_above_array = np.zeros((indices, columns, factor_len))
+        drop_above_array[:, :, :] = self._param_array_helper(
+            factor_len, self.drop_above, np.inf
+        )[None, None]
+        drop_below_array = np.zeros((indices, columns, factor_len))
+        drop_below_array[:, :, :] = self._param_array_helper(
+            factor_len, self.drop_below, 0.0
+        )[None, None]
+        preserve_array = np.zeros((indices, columns, factor_len))
+        preserve_array[:, :, :] = self._param_array_helper(
+            factor_len, self.preserve, self.preserve
+        )[None, None]
+        # transposing so columns of factors (same dev age) are in the last index.
+        # not sure if this is really necessary; will leave for a better dev to find out
+        factor_val_T = factor_val.transpose((0, 1, 3, 2))
+
+        # setting up starting array of weights
         w = ~np.isnan(factor_val_T)

-        #dropping
-        index_array_weights = (factor_val_T < drop_above_array[...,None]) & (
-            factor_val_T > drop_below_array[...,None]
+        # dropping
+        index_array_weights = (factor_val_T < drop_above_array[..., None]) & (
+            factor_val_T > drop_below_array[..., None]
         )

-        #counting remaining factors
+        # counting remaining factors
         ldf_count = index_array_weights.sum(axis=3)

-        #applying preserve
+        # applying preserve
         warning_flag = np.any(ldf_count < preserve_array)
-        w = np.where(ldf_count[...,None] < preserve_array[...,None], w, index_array_weights)
+        w = np.where(
+            ldf_count[..., None] < preserve_array[..., None], w, index_array_weights
+        )

         if warning_flag:
             if self.preserve == 1:
@@ -456,17 +515,17 @@ def _drop_x_func(self,factor):
             )
             warnings.warn(warning)

-        return w.transpose((0,1,3,2)).astype(float)
+        return w.transpose((0, 1, 3, 2)).astype(float)

     # for drop_high and drop_low
-    def _drop_n_func(self,factor,secondary_rank=None):
-        #getting dimensions of factor for various manipulation
+    def _drop_n_func(self, factor, secondary_rank=None):
+        # getting dimensions of factor for various manipulation
         factor_val = factor.values.copy()
-        #secondary rank is the optional triangle that breaks ties in factor
-        #the original use case is for dropping the link ratio of 1 with the lowest loss value
-        #(pass in a reverse rank of loss to drop link of ratio of 1 with the highest loss value)
-        #leaving to user to ensure that secondary rank is the same dimensions as factor
-        #also leaving to user to pick whether to trim head or tail
+        # secondary rank is the optional triangle that breaks ties in factor.
+        # the original use case is dropping the link ratio of 1 with the lowest loss value
+        # (pass in a reverse rank of loss to drop the link ratio of 1 with the highest loss value).
+        # it is left to the user to ensure secondary rank has the same dimensions as factor,
+        # and to pick whether to trim the head or the tail
         if secondary_rank is None:
             sec_rank_val = factor_val.copy()
         else:
             sec_rank_val = secondary_rank.values.copy()
         factor_len = factor_val.shape[3]
         indices = factor_val.shape[0]
         columns = factor_val.shape[1]

-        #explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
-        drop_high_array = np.zeros((indices,columns,factor_len))
-        drop_high_array[:,:,:] = self._param_array_helper(factor_len,self.drop_high,0)[None,None]
-        drop_low_array = np.zeros((indices,columns,factor_len))
-        drop_low_array[:,:,:] = self._param_array_helper(factor_len,self.drop_low,0)[None,None]
-        preserve_array = np.zeros((indices,columns,factor_len))
-        preserve_array[:,:,:] = self._param_array_helper(factor_len,self.preserve,self.preserve)[None,None]
-
-        #ranking factors by itself and secondary rank
-        factor_ranks = np.lexsort((sec_rank_val,factor_val),axis = 2).argsort(axis=2)
-
-        #setting up starting weights
-        w = ~np.isnan(factor_val.transpose((0,1,3,2)))
-
-        #counting valid factors
+        # explicitly setting up 3D arrays for drop parameters to avoid broadcasting bugs
+        drop_high_array = np.zeros((indices, columns, factor_len))
+        drop_high_array[:, :, :] = self._param_array_helper(
+            factor_len, self.drop_high, 0
+        )[None, None]
+        drop_low_array = np.zeros((indices, columns, factor_len))
+        drop_low_array[:, :, :] = self._param_array_helper(
+            factor_len, self.drop_low, 0
+        )[None, None]
+        preserve_array = np.zeros((indices, columns, factor_len))
+        preserve_array[:, :, :] = self._param_array_helper(
+            factor_len, self.preserve, self.preserve
+        )[None, None]
+
+        # ranking factors by itself and secondary rank
+        factor_ranks = np.lexsort((sec_rank_val, factor_val), axis=2).argsort(axis=2)
+
+        # setting up starting weights
+        w = ~np.isnan(factor_val.transpose((0, 1, 3, 2)))
+
+        # counting valid factors
         ldf_count = w.sum(axis=3)

-        #getting max index after drop high
+        # getting max index after drop high
         max_rank_unpreserve = ldf_count - drop_high_array

-        #applying preserve
+        # applying preserve
         preserve_trigger = (max_rank_unpreserve - drop_low_array) < preserve_array
         warning_flag = np.any(preserve_trigger)
         max_rank = np.where(preserve_trigger, ldf_count, max_rank_unpreserve)
         min_rank = np.where(preserve_trigger, 0, drop_low_array)

-        #dropping
-        index_array_weights = (factor_ranks.transpose((0,1,3,2)) < max_rank[...,None]) & (
-            factor_ranks.transpose((0,1,3,2)) > min_rank[...,None] - 1
-        )
+        # dropping
+        index_array_weights = (
+            factor_ranks.transpose((0, 1, 3, 2)) < max_rank[..., None]
+        ) & (factor_ranks.transpose((0, 1, 3, 2)) > min_rank[..., None] - 1)

         if warning_flag:
             if self.preserve == 1:
@@ -522,4 +587,4 @@ def _drop_n_func(self,factor,secondary_rank=None):
             )
             warnings.warn(warning)

-        return index_array_weights.transpose((0,1,3,2)).astype(float)
+        return index_array_weights.transpose((0, 1, 3, 2)).astype(float)
diff --git a/chainladder/methods/tests/test_predict.py b/chainladder/methods/tests/test_predict.py
index 482f8a15..9cc40217 100644
--- a/chainladder/methods/tests/test_predict.py
+++ b/chainladder/methods/tests/test_predict.py
@@ -9,27 +9,21 @@

 def test_cc_predict():
     cc = cl.CapeCod().fit(raa_1989, sample_weight=apriori_1989)
-    cc.predict(raa, sample_weight=apriori)
+    assert cc.predict(raa, sample_weight=apriori)


 def test_bf_predict():
-    cc = cl.BornhuetterFerguson().fit(raa_1989, sample_weight=apriori_1989)
-    cc.predict(raa, sample_weight=apriori)
+    bf = cl.BornhuetterFerguson().fit(raa_1989, sample_weight=apriori_1989)
+    assert bf.predict(raa, sample_weight=apriori)


 def test_mack_predict():
     mack = cl.MackChainladder().fit(raa_1989)
-    mack.predict(raa_1989)
-    # mack.predict(raa)
+    assert mack.predict(raa_1989)


 def test_bs_random_state_predict(clrd):
-    tri = (
-        clrd
-        .groupby("LOB")
-        .sum()
-        .loc["wkcomp", ["CumPaidLoss", "EarnedPremNet"]]
-    )
+    tri = clrd.groupby("LOB").sum().loc["wkcomp", ["CumPaidLoss", "EarnedPremNet"]]
     X = cl.BootstrapODPSample(random_state=100).fit_transform(tri["CumPaidLoss"])
     bf = cl.BornhuetterFerguson(apriori=0.6, apriori_sigma=0.1, random_state=42).fit(
         X, sample_weight=tri["EarnedPremNet"].latest_diagonal
@@ -56,20 +50,27 @@ def test_basic_transform(raa):
     cl.BootstrapODPSample().fit_transform(raa)
     cl.IncrementalAdditive().fit_transform(raa, sample_weight=raa.latest_diagonal)

+
 def test_misaligned_index(prism):
-    prism = prism['Paid']
-    model = cl.Chainladder().fit(cl.Development(groupby=['Line', 'Type']).fit_transform(prism))
+    prism = prism["Paid"]
+    model = cl.Chainladder().fit(
+        cl.Development(groupby=["Line", "Type"]).fit_transform(prism)
+    )
     a = model.ultimate_.loc[prism.index.iloc[:10]].sum().sum()
     b = model.predict(prism.iloc[:10]).ultimate_.sum().sum()
     assert abs(a - b) < 1e-5


 def test_misaligned_index2(clrd):
-    clrd = clrd['CumPaidLoss']
-    w = cl.load_sample('clrd')['EarnedPremDIR'].latest_diagonal
-    bcl = cl.Chainladder().fit(cl.Development(groupby=['LOB']).fit_transform(clrd))
-    bbk = cl.Benktander().fit(cl.Development(groupby=['LOB']).fit_transform(clrd), sample_weight=w)
-    bcc = cl.CapeCod().fit(cl.Development(groupby=['LOB']).fit_transform(clrd), sample_weight=w)
+    clrd = clrd["CumPaidLoss"]
+    w = cl.load_sample("clrd")["EarnedPremDIR"].latest_diagonal
+    bcl = cl.Chainladder().fit(cl.Development(groupby=["LOB"]).fit_transform(clrd))
+    bbk = cl.Benktander().fit(
+        cl.Development(groupby=["LOB"]).fit_transform(clrd), sample_weight=w
+    )
+    bcc = cl.CapeCod().fit(
+        cl.Development(groupby=["LOB"]).fit_transform(clrd), sample_weight=w
+    )

     a = bcl.ultimate_.iloc[:10].sum().sum()
     b = bcl.predict(clrd.iloc[:10]).ultimate_.sum().sum()
@@ -85,20 +86,36 @@ def test_misaligned_index2(clrd):
     b = bcl.predict(clrd.iloc[150:153]).ultimate_.sum().sum()
     assert abs(a - b) < 1e-5
     a = bbk.ultimate_.iloc[150:153].sum().sum()
-    b = bbk.predict(clrd.iloc[150:153], sample_weight=w.iloc[150:153]).ultimate_.sum().sum()
+    b = (
+        bbk.predict(clrd.iloc[150:153], sample_weight=w.iloc[150:153])
+        .ultimate_.sum()
+        .sum()
+    )
     assert abs(a - b) < 1e-5
     a = bcc.ultimate_.iloc[150:153].sum().sum()
-    b = bcc.predict(clrd.iloc[150:153], sample_weight=w.iloc[150:153]).ultimate_.sum().sum()
+    b = (
+        bcc.predict(clrd.iloc[150:153], sample_weight=w.iloc[150:153])
+        .ultimate_.sum()
+        .sum()
+    )
     assert abs(a - b) < 1e-5

     a = bcl.ultimate_.iloc[150:152].sum().sum()
     b = bcl.predict(clrd.iloc[150:152]).ultimate_.sum().sum()
     assert abs(a - b) < 1e-5
     a = bbk.ultimate_.iloc[150:152].sum().sum()
-    b = bbk.predict(clrd.iloc[150:152], sample_weight=w.iloc[150:152]).ultimate_.sum().sum()
+    b = (
+        bbk.predict(clrd.iloc[150:152], sample_weight=w.iloc[150:152])
+        .ultimate_.sum()
+        .sum()
+    )
     assert abs(a - b) < 1e-5
     a = bcc.ultimate_.iloc[150:152].sum().sum()
-    b = bcc.predict(clrd.iloc[150:152], sample_weight=w.iloc[150:152]).ultimate_.sum().sum()
+    b = (
+        bcc.predict(clrd.iloc[150:152], sample_weight=w.iloc[150:152])
+        .ultimate_.sum()
+        .sum()
+    )
     assert abs(a - b) < 1e-5

     a = bcl.ultimate_.iloc[150].sum().sum()
@@ -111,22 +128,25 @@ def test_misaligned_index2(clrd):
     b = bcc.predict(clrd.iloc[150], sample_weight=w.iloc[150]).ultimate_.sum().sum()
     assert abs(a - b) < 1e-5

+
 def test_align_cdfs():
-    ld = cl.load_sample('raa').latest_diagonal*0+40000
-    model = cl.BornhuetterFerguson().fit(cl.load_sample('raa'), sample_weight=ld)
-    a = model.ultimate_.iloc[..., :4, :]
+    ld = cl.load_sample("raa").latest_diagonal * 0 + 40000
+    model = cl.BornhuetterFerguson().fit(cl.load_sample("raa"), sample_weight=ld)
+    a = model.ultimate_.iloc[..., :4, :]
     b = model.predict(
-        cl.load_sample('raa').dev_to_val().iloc[..., :4, -1].val_to_dev(),
-        sample_weight=ld.iloc[..., :4, :]).ultimate_
+        cl.load_sample("raa").dev_to_val().iloc[..., :4, -1].val_to_dev(),
+        sample_weight=ld.iloc[..., :4, :],
+    ).ultimate_
     assert a == b
-    model = cl.Chainladder().fit(cl.load_sample('raa'), sample_weight=ld)
-    a = model.ultimate_.iloc[..., :4, :]
+    model = cl.Chainladder().fit(cl.load_sample("raa"), sample_weight=ld)
+    a = model.ultimate_.iloc[..., :4, :]
     b = model.predict(
-        cl.load_sample('raa').dev_to_val().iloc[..., :4, -1].val_to_dev(),
-        sample_weight=ld.iloc[..., :4, :]).ultimate_
+        cl.load_sample("raa").dev_to_val().iloc[..., :4, -1].val_to_dev(),
+        sample_weight=ld.iloc[..., :4, :],
+    ).ultimate_
     assert a == b


 def test_check_val_tri_cl(raa):
     model = cl.Chainladder().fit(raa.dev_to_val())
-    assert model.predict(raa.latest_diagonal).ultimate_ == model.ultimate_
\ No newline at end of file
+    assert model.predict(raa.latest_diagonal).ultimate_ == model.ultimate_
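The pattern these tests exercise is fit on one triangle, predict on another; `predict` realigns the new data to the fitted patterns even when the index is a subset. A compact sketch:

    import chainladder as cl

    clrd = cl.load_sample("clrd")["CumPaidLoss"]
    model = cl.Chainladder().fit(cl.Development(groupby=["LOB"]).fit_transform(clrd))
    subset_ultimate = model.predict(clrd.iloc[:10]).ultimate_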
diff --git a/chainladder/utils/tests/test_utilities.py b/chainladder/utils/tests/test_utilities.py
index bf8c4f8c..0fbd020a 100644
--- a/chainladder/utils/tests/test_utilities.py
+++ b/chainladder/utils/tests/test_utilities.py
@@ -2,22 +2,111 @@
 from chainladder.utils.cupy import cp
 import numpy as np
 import copy
+import pandas as pd


 def test_non_vertical_line():
-    true_olf = (1 - 0.5 * ((31 + 31 + 30 + 31 + 30 + 31) / 365.25) ** 2) * 0.2
-    olf_low = (
-        cl.parallelogram_olf([0.20], ["7/1/2017"], grain="Y").loc["2017"].iloc[0] - 1
+    true_olf = (
+        1.20
+        / (
+            (1 - 0.5 * ((31 + 31 + 30 + 31 + 30 + 31) / 365) ** 2) * 1.0
+            + (0.5 * ((31 + 31 + 30 + 31 + 30 + 31) / 365) ** 2) * 1.2
+        )
+        - 1
     )
-    olf_high = (
-        cl.parallelogram_olf([0.20], ["7/2/2017"], grain="Y").loc["2017"].iloc[0] - 1
+
+    result = (
+        cl.parallelogram_olf([0.20], ["7/1/2017"], approximation_grain="D")
+        .loc["2017"]
+        .iloc[0]
+        - 1
+    )
+
+    assert true_olf == result
+
+    # Monthly approximation
+    rate_history = pd.DataFrame(
+        {
+            "EffDate": ["2010-07-01", "2011-01-01", "2012-07-01", "2013-04-01"],
+            "RateChange": [0.035, 0.05, 0.10, -0.01],
+        }
+    )
+
+    data = pd.DataFrame(
+        {"Year": list(range(2006, 2016)), "EarnedPremium": [10_000] * 10}
     )
-    assert olf_low < true_olf < olf_high
+
+    prem_tri = cl.Triangle(
+        data, origin="Year", columns="EarnedPremium", cumulative=True
+    )
+    prem_tri = cl.ParallelogramOLF(
+        rate_history,
+        change_col="RateChange",
+        date_col="EffDate",
+        approximation_grain="M",
+        vertical_line=False,
+    ).fit_transform(prem_tri)
+    assert (
+        np.round(prem_tri.olf_.to_frame().values, 6).flatten()
+        == [
+            1.183471,
+            1.183471,
+            1.183471,
+            1.183471,
+            1.178316,
+            1.120181,
+            1.075556,
+            1.004236,
+            0.999684,
+            1.000000,
+        ]
+    ).all()
+
+    # Daily approximation
+    rate_history = pd.DataFrame(
+        {
+            "EffDate": ["2010-07-01", "2011-01-01", "2012-07-01", "2013-04-01"],
+            "RateChange": [0.035, 0.05, 0.10, -0.01],
+        }
+    )
+
+    data = pd.DataFrame(
+        {"Year": list(range(2006, 2016)), "EarnedPremium": [10_000] * 10}
+    )
+
+    prem_tri = cl.Triangle(
+        data, origin="Year", columns="EarnedPremium", cumulative=True
+    )
+    prem_tri = cl.ParallelogramOLF(
+        rate_history,
+        change_col="RateChange",
+        date_col="EffDate",
+        approximation_grain="D",
+        vertical_line=False,
+    ).fit_transform(prem_tri)
+    assert (
+        np.round(prem_tri.olf_.to_frame().values, 6).flatten()
+        == [
+            1.183471,
+            1.183471,
+            1.183471,
+            1.183471,
+            1.178231,
+            1.120105,
+            1.075410,
+            1.004073,
+            0.999693,
+            1.000000,
+        ]
+    ).all()


 def test_vertical_line():
-    olf = cl.parallelogram_olf([0.20], ["7/1/2017"], grain="Y", vertical_line=True)
-    assert abs(olf.loc["2017"].iloc[0] - ((1 - 184 / 365) * 0.2 + 1)) < 0.00001
+    olf = cl.parallelogram_olf(
+        [0.20], ["7/1/2017"], approximation_grain="D", vertical_line=True
+    )
+    true_olf = 1.2 / ((1 - 184 / 365) * 1.0 + (184 / 365) * 1.2)
+    assert abs(olf.loc["2017"].iloc[0] - true_olf) < 0.00001


 def test_triangle_json_io(clrd):
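The same knob is exposed on the functional form. A quick sketch of calling `parallelogram_olf` directly, as the tests above do:

    import chainladder as cl

    # annual on-level factors for a +20% change effective 7/1/2017,
    # using the daily parallelogram approximation
    olf = cl.parallelogram_olf([0.20], ["7/1/2017"], grain="Y", approximation_grain="D")
    print(olf.loc["2017"])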
"{}-12-31".format(date.max().year) start_date = pd.to_datetime(start_date) - pd.tseries.offsets.DateOffset(days=1) + + date_freq = { + "M": "MS", + "D": "D", + } + + try: + date_freq[approximation_grain] + except: + print("grain must be " "M" " or " "D" "") + date_idx = pd.date_range( - start_date - pd.tseries.offsets.DateOffset(years=1), end_date + start_date - pd.tseries.offsets.DateOffset(years=1), + end_date, + freq=date_freq[approximation_grain], ) - y = pd.Series(np.array(values), np.array(date)) - y = y.reindex(date_idx, fill_value=0) - idx = np.cumprod(y.values + 1) - idx = idx[-1] / idx - y = pd.Series(idx, y.index) - y = y[~((y.index.day == 29) & (y.index.month == 2))] + + rate_changes = pd.Series(np.array(values), np.array(date)) + # print("rate_changes:\n", rate_changes) + rate_changes = rate_changes.reindex(date_idx, fill_value=0) + # print("rate_changes:\n", rate_changes) + cum_rate_changes = np.cumprod(1 + rate_changes.values) + cum_rate_changes = pd.Series(cum_rate_changes, rate_changes.index) + # print("cum_rate_changes:\n", cum_rate_changes) + crl = cum_rate_changes[-1] + # print("crl:", crl) + + cum_avg_rate_non_leaps = cum_rate_changes + cum_avg_rate_leaps = cum_rate_changes + if not vertical_line: - y = y.rolling(365).mean() - y = (y + y.shift(1).values) / 2 - y = y.iloc[366:] - y = y.groupby(y.index.to_period(grain)).mean().reset_index() - y.columns = ["Origin", "OLF"] - y["Origin"] = y["Origin"].astype(str) - return y.set_index("Origin") + rolling_num = { + "M": 12, + "D": 365, + } + + cum_avg_rate_non_leaps = cum_rate_changes.rolling( + rolling_num[approximation_grain] + ).mean() + cum_avg_rate_non_leaps = ( + cum_avg_rate_non_leaps + cum_avg_rate_non_leaps.shift(1).values + ) / 2 + + cum_avg_rate_leaps = cum_rate_changes.rolling( + rolling_num[approximation_grain] + 1 + ).mean() + cum_avg_rate_leaps = ( + cum_avg_rate_leaps + cum_avg_rate_leaps.shift(1).values + ) / 2 + # print("cum_avg_rate_non_leaps\n", cum_avg_rate_non_leaps) + # print("cum_avg_rate_leaps\n", cum_avg_rate_leaps) + + dropdates_num = { + "M": 12, + "D": 366, + } + cum_avg_rate_non_leaps = cum_avg_rate_non_leaps.iloc[ + dropdates_num[approximation_grain] : + ] + cum_avg_rate_leaps = cum_avg_rate_leaps.iloc[ + dropdates_num[approximation_grain] + 1 : + ] + + fcrl_non_leaps = ( + cum_avg_rate_non_leaps.groupby(cum_avg_rate_non_leaps.index.to_period(grain)) + .mean() + .reset_index() + ) + fcrl_non_leaps.columns = ["Origin", "OLF"] + fcrl_non_leaps["Origin"] = fcrl_non_leaps["Origin"].astype(str) + fcrl_non_leaps["OLF"] = crl / fcrl_non_leaps["OLF"] + + fcrl_leaps = ( + cum_avg_rate_leaps.groupby(cum_avg_rate_leaps.index.to_period(grain)) + .mean() + .reset_index() + ) + fcrl_leaps.columns = ["Origin", "OLF"] + fcrl_leaps["Origin"] = fcrl_leaps["Origin"].astype(str) + fcrl_leaps["OLF"] = crl / fcrl_leaps["OLF"] + + combined = fcrl_non_leaps.join(fcrl_leaps, lsuffix="_non_leaps", rsuffix="_leaps") + combined["is_leap"] = pd.to_datetime( + combined["Origin_non_leaps"], format="%Y" + ).dt.is_leap_year + + if approximation_grain == "M": + combined["final_OLF"] = combined["OLF_non_leaps"] + else: + combined["final_OLF"] = np.where( + combined["is_leap"], combined["OLF_leaps"], combined["OLF_non_leaps"] + ) + + combined.drop( + ["OLF_non_leaps", "Origin_leaps", "OLF_leaps", "is_leap"], + axis=1, + inplace=True, + ) + combined.columns = ["Origin", "OLF"] + + return combined.set_index("Origin") def set_common_backend(objs): @@ -377,7 +467,9 @@ def model_diagnostics(model, name=None, groupby=None): latest = 
@@ -377,7 +467,9 @@ def model_diagnostics(model, name=None, groupby=None):
     latest = obj.X_.sum("development")
     run_off = obj.full_expectation_.iloc[..., :-1].dev_to_val().cum_to_incr()
     run_off = run_off[run_off.development > str(obj.X_.valuation_date)]
-    run_off = run_off.iloc[..., : {"M": 12, "S": 6, "Q": 4, "Y": 1}[obj.X_.development_grain]]
+    run_off = run_off.iloc[
+        ..., : {"M": 12, "S": 6, "Q": 4, "Y": 1}[obj.X_.development_grain]
+    ]

     triangles = []
     for col in obj.ultimate_.columns:
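For reference, a minimal sketch of the `model_diagnostics` helper whose run-off slicing is reformatted above (assuming the package-root export used for the other utilities in this module):

    import chainladder as cl

    raa = cl.load_sample("raa")
    # stacks ultimates, the latest diagonal, and near-term run-off for review
    diag = cl.model_diagnostics(cl.Chainladder().fit(raa))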