From a0e61971a6a5759408bd864ab869d8263ec8231d Mon Sep 17 00:00:00 2001 From: John S Bogaardt Date: Sat, 28 Sep 2024 13:50:28 -0600 Subject: [PATCH] #528 --- chainladder/core/dunders.py | 97 ++++++++++++++----------- chainladder/core/tests/test_triangle.py | 37 +--------- chainladder/development/learning.py | 6 +- chainladder/workflow/voting.py | 2 +- environment-dev.yaml | 4 +- 5 files changed, 61 insertions(+), 85 deletions(-) diff --git a/chainladder/core/dunders.py b/chainladder/core/dunders.py index bdd64cd4..0b987985 100644 --- a/chainladder/core/dunders.py +++ b/chainladder/core/dunders.py @@ -60,73 +60,84 @@ def _compatibility_check(self, x, y): return x, y def _prep_index(self, x, y): - """ Preps index and column axes for arithmetic """ if x.kdims.shape[0] == 1 and y.kdims.shape[0] > 1: - # Broadcast x to y x.kdims = y.kdims x.key_labels = y.key_labels return x, y if x.kdims.shape[0] > 1 and y.kdims.shape[0] == 1: - # Broadcast y to x y.kdims = x.kdims y.key_labels = x.key_labels return x, y if x.kdims.shape[0] == y.kdims.shape[0] == 1 and x.key_labels != y.key_labels: - # Broadcast to the triangle with a larger multi-index kdims = x.kdims if len(x.key_labels) > len(y.key_labels) else y.kdims - y.kdims = x.kdims = kdims key_labels = x.key_labels if len(x.key_labels) > len(y.key_labels) else y.key_labels - y.key_labels = x.key_labels = key_labels + x.kdims = y.kdims = kdims + x.key_labels = y.key_labels = key_labels return x, y - a, b = set(x.key_labels), set(y.key_labels) - common = a.intersection(b) - if common in [a, b] and (a != b or (a == b and x.kdims.shape[0] != y.kdims.shape[0])): - # If index labels are subset of other triangle index labels - x = x.groupby(list(common)) - y = y.groupby(list(common)) - return x, y - if common not in [a, b]: - raise ValueError('Index broadcasting is ambiguous between', str(a), 'and', str(b)) - if ( - x.key_labels == y.key_labels - and x.kdims.shape[0] == y.kdims.shape[0] - and y.kdims.shape[0] > 1 - and not x.kdims is y.kdims - and not x.index.equals(y.index) - ): - # Make sure exact but unsorted index labels works - x = x.sort_index() - try: - y = y.loc[x.index] - except: + + # Use sets for faster operations + x_labels = set(x.key_labels) + y_labels = set(y.key_labels) + common = x_labels.intersection(y_labels) + + if common == x_labels or common == y_labels: + if x_labels != y_labels or x.kdims.shape[0] != y.kdims.shape[0]: x = x.groupby(list(common)) y = y.groupby(list(common)) + elif x.kdims.shape[0] > 1 and not np.array_equal(x.kdims, y.kdims) and not x.index.equals(y.index): + x = x.sort_index() + try: + y = y.loc[x.index] + except: + x = x.groupby(list(common)) + y = y.groupby(list(common)) + return x, y + + if common != x_labels and common != y_labels: + raise ValueError('Index broadcasting is ambiguous between ' + str(x_labels) + ' and ' + str(y_labels)) + return x, y def _prep_columns(self, x, y): x_backend, y_backend = x.array_backend, y.array_backend + if len(x.columns) == 1 and len(y.columns) > 1: x.vdims = y.vdims elif len(y.columns) == 1 and len(x.columns) > 1: y.vdims = x.vdims - elif len(y.columns) == 1 and len(x.columns) == 1 and x.columns != y.columns: + elif len(y.columns) == len(x.columns) == 1 and x.columns != y.columns: y.vdims = x.vdims - elif x.shape[1] == y.shape[1] and np.all(x.columns == y.columns): - pass + elif x.shape[1] == y.shape[1] and np.array_equal(x.columns, y.columns): + return x, y else: - col_union = list(x.columns) + [ - item for item in y.columns if item not in x.columns - ] - for item in [item for item in col_union if item not in x.columns]: - x[item] = 0 - x = x[col_union] - for item in [item for item in col_union if item not in y.columns]: - y[item] = 0 - y = y[col_union] - x, y = ( - x.set_backend(x_backend, inplace=True), - y.set_backend(y_backend, inplace=True), - ) + # Use sets for faster operations + x_cols = set(x.columns) + y_cols = set(y.columns) + + # Find columns to add to each triangle + cols_to_add_to_x = y_cols - x_cols + cols_to_add_to_y = x_cols - y_cols + + # Create new columns only if necessary + if cols_to_add_to_x: + new_x_cols = list(x.columns) + list(cols_to_add_to_x) + x = x.reindex(columns=new_x_cols, fill_value=0) + + if cols_to_add_to_y: + new_y_cols = list(y.columns) + list(cols_to_add_to_y) + y = y.reindex(columns=new_y_cols, fill_value=0) + + # Ensure both triangles have the same column order + final_cols = list(x_cols | y_cols) + x = x[final_cols] + y = y[final_cols] + + # Reset backends only if they've changed + if x.array_backend != x_backend: + x = x.set_backend(x_backend, inplace=True) + if y.array_backend != y_backend: + y = y.set_backend(y_backend, inplace=True) + return x, y def _prep_origin_development(self, obj, other): diff --git a/chainladder/core/tests/test_triangle.py b/chainladder/core/tests/test_triangle.py index daed5d65..7f3e8004 100644 --- a/chainladder/core/tests/test_triangle.py +++ b/chainladder/core/tests/test_triangle.py @@ -1,8 +1,6 @@ import chainladder as cl import pandas as pd -import polars as pl import numpy as np -import copy import pytest import io from datetime import datetime @@ -746,9 +744,7 @@ def test_halfyear_development(): ["2012-01-01", "2013-12-31", "incurred", 200.0], ] - df_polars = pl.DataFrame(data) - df_polars.columns = ["origin", "val_date", "idx", "value"] - + assert ( type( cl.Triangle( @@ -760,33 +756,4 @@ def test_halfyear_development(): cumulative=True, ) ) - ) == cl.Triangle - - assert ( - type( - cl.Triangle( - data=df_polars, - index="idx", - columns="value", - origin="origin", - development="val_date", - cumulative=True, - ) - ) - ) == cl.Triangle - - assert cl.Triangle( - data=pd.DataFrame(data, columns=["origin", "val_date", "idx", "value"]), - index="idx", - columns="value", - origin="origin", - development="val_date", - cumulative=True, - ) == cl.Triangle( - data=df_polars, - index="idx", - columns="value", - origin="origin", - development="val_date", - cumulative=True, - ) + ) == cl.Triangle \ No newline at end of file diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py index 1a26b37c..a66bcdc1 100644 --- a/chainladder/development/learning.py +++ b/chainladder/development/learning.py @@ -53,7 +53,7 @@ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False, self.y_ml=y_ml self.weight_ml = weight_ml self.autoregressive=autoregressive - self.fit_incrementals=fit_incrementals + self.fit_incrementals = fit_incrementals def _get_y_names(self): """ private function to get the response column name""" @@ -153,7 +153,7 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- X : Triangle-like - Set of LDFs to which the munich adjustment will be applied. + Set of LDFs to which the estimator will be applied. y : None Ignored, use y_ml to set a reponse variable for the ML algorithm sample_weight : None @@ -180,7 +180,7 @@ def fit(self, X, y=None, sample_weight=None): self.df_ = df # Fit model self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze()) - #return self + #return selffit_incrementals self.triangle_ml_ = self._get_triangle_ml(df) return self diff --git a/chainladder/workflow/voting.py b/chainladder/workflow/voting.py index 9465cb88..896b8385 100644 --- a/chainladder/workflow/voting.py +++ b/chainladder/workflow/voting.py @@ -124,7 +124,7 @@ def fit(self, X, y, sample_weight=None): self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_fit_single_estimator)( clone(clf), X, y, - sample_weight=sample_weight, + fit_params=dict(sample_weight=sample_weight), message_clsname='VotingChainladder', message=self._log_message(names[idx], idx + 1, len(clfs)) diff --git a/environment-dev.yaml b/environment-dev.yaml index 06b56591..a8d74c63 100644 --- a/environment-dev.yaml +++ b/environment-dev.yaml @@ -14,13 +14,11 @@ dependencies: - ipykernel - pandas - - polars - scikit-learn - sparse - - numba - dill - patsy - - matplotlib + - matplotlib-base # testing - lxml