#528

casact · Sep 28, 2024 · a0e6197
1 parent 7e5dd10
commit a0e6197
Show file tree

Hide file tree

Showing 5 changed files with 61 additions and 85 deletions.
diff --git a/chainladder/core/dunders.py b/chainladder/core/dunders.py
@@ -60,73 +60,84 @@ def _compatibility_check(self, x, y):
         return x, y
 
     def _prep_index(self, x, y):
-        """ Preps index and column axes for arithmetic """
         if x.kdims.shape[0] == 1 and y.kdims.shape[0] > 1:
-            # Broadcast x to y
             x.kdims = y.kdims
             x.key_labels = y.key_labels
             return x, y
         if x.kdims.shape[0] > 1 and y.kdims.shape[0] == 1:
-            # Broadcast y to x
             y.kdims = x.kdims
             y.key_labels = x.key_labels
             return x, y
         if x.kdims.shape[0] == y.kdims.shape[0] == 1 and x.key_labels != y.key_labels:
-            # Broadcast to the triangle with a larger multi-index
             kdims = x.kdims if len(x.key_labels) > len(y.key_labels) else y.kdims
-            y.kdims = x.kdims = kdims
             key_labels = x.key_labels if len(x.key_labels) > len(y.key_labels) else y.key_labels
-            y.key_labels = x.key_labels = key_labels
+            x.kdims = y.kdims = kdims
+            x.key_labels = y.key_labels = key_labels
             return x, y
-        a, b = set(x.key_labels), set(y.key_labels)
-        common = a.intersection(b)
-        if common in [a, b] and (a != b or (a == b and x.kdims.shape[0] != y.kdims.shape[0])):
-            # If index labels are subset of other triangle index labels
-            x = x.groupby(list(common))
-            y = y.groupby(list(common))
-            return x, y
-        if common not in [a, b]:
-            raise ValueError('Index broadcasting is ambiguous between', str(a), 'and', str(b))
-        if (
-            x.key_labels == y.key_labels
-            and x.kdims.shape[0] == y.kdims.shape[0]
-            and y.kdims.shape[0] > 1
-            and not x.kdims is y.kdims
-            and not x.index.equals(y.index)
-        ):
-            # Make sure exact but unsorted index labels works
-            x = x.sort_index()
-            try:
-                y = y.loc[x.index]
-            except:
+
+        # Use sets for faster operations
+        x_labels = set(x.key_labels)
+        y_labels = set(y.key_labels)
+        common = x_labels.intersection(y_labels)
+
+        if common == x_labels or common == y_labels:
+            if x_labels != y_labels or x.kdims.shape[0] != y.kdims.shape[0]:
                 x = x.groupby(list(common))
                 y = y.groupby(list(common))
+            elif x.kdims.shape[0] > 1 and not np.array_equal(x.kdims, y.kdims) and not x.index.equals(y.index):
+                x = x.sort_index()
+                try:
+                    y = y.loc[x.index]
+                except:
+                    x = x.groupby(list(common))
+                    y = y.groupby(list(common))
+            return x, y
+
+        if common != x_labels and common != y_labels:
+            raise ValueError('Index broadcasting is ambiguous between ' + str(x_labels) + ' and ' + str(y_labels))
+
         return x, y
 
     def _prep_columns(self, x, y):
         x_backend, y_backend = x.array_backend, y.array_backend
+
         if len(x.columns) == 1 and len(y.columns) > 1:
             x.vdims = y.vdims
         elif len(y.columns) == 1 and len(x.columns) > 1:
             y.vdims = x.vdims
-        elif len(y.columns) == 1 and len(x.columns) == 1 and x.columns != y.columns:
+        elif len(y.columns) == len(x.columns) == 1 and x.columns != y.columns:
             y.vdims = x.vdims
-        elif x.shape[1] == y.shape[1] and np.all(x.columns == y.columns):
-            pass
+        elif x.shape[1] == y.shape[1] and np.array_equal(x.columns, y.columns):
+            return x, y
         else:
-            col_union = list(x.columns) + [
-                item for item in y.columns if item not in x.columns
-            ]
-            for item in [item for item in col_union if item not in x.columns]:
-                x[item] = 0
-            x = x[col_union]
-            for item in [item for item in col_union if item not in y.columns]:
-                y[item] = 0
-            y = y[col_union]
-        x, y = (
-            x.set_backend(x_backend, inplace=True),
-            y.set_backend(y_backend, inplace=True),
-        )
+            # Use sets for faster operations
+            x_cols = set(x.columns)
+            y_cols = set(y.columns)
+
+            # Find columns to add to each triangle
+            cols_to_add_to_x = y_cols - x_cols
+            cols_to_add_to_y = x_cols - y_cols
+
+            # Create new columns only if necessary
+            if cols_to_add_to_x:
+                new_x_cols = list(x.columns) + list(cols_to_add_to_x)
+                x = x.reindex(columns=new_x_cols, fill_value=0)
+
+            if cols_to_add_to_y:
+                new_y_cols = list(y.columns) + list(cols_to_add_to_y)
+                y = y.reindex(columns=new_y_cols, fill_value=0)
+
+            # Ensure both triangles have the same column order
+            final_cols = list(x_cols | y_cols)
+            x = x[final_cols]
+            y = y[final_cols]
+
+        # Reset backends only if they've changed
+        if x.array_backend != x_backend:
+            x = x.set_backend(x_backend, inplace=True)
+        if y.array_backend != y_backend:
+            y = y.set_backend(y_backend, inplace=True)
+
         return x, y
 
     def _prep_origin_development(self, obj, other):

diff --git a/chainladder/core/tests/test_triangle.py b/chainladder/core/tests/test_triangle.py
@@ -1,8 +1,6 @@
 import chainladder as cl
 import pandas as pd
-import polars as pl
 import numpy as np
-import copy
 import pytest
 import io
 from datetime import datetime
@@ -746,9 +744,7 @@ def test_halfyear_development():
         ["2012-01-01", "2013-12-31", "incurred", 200.0],
     ]
 
-    df_polars = pl.DataFrame(data)
-    df_polars.columns = ["origin", "val_date", "idx", "value"]
-
+
     assert (
         type(
             cl.Triangle(
@@ -760,33 +756,4 @@ def test_halfyear_development():
                 cumulative=True,
             )
         )
-    ) == cl.Triangle
-
-    assert (
-        type(
-            cl.Triangle(
-                data=df_polars,
-                index="idx",
-                columns="value",
-                origin="origin",
-                development="val_date",
-                cumulative=True,
-            )
-        )
-    ) == cl.Triangle
-
-    assert cl.Triangle(
-        data=pd.DataFrame(data, columns=["origin", "val_date", "idx", "value"]),
-        index="idx",
-        columns="value",
-        origin="origin",
-        development="val_date",
-        cumulative=True,
-    ) == cl.Triangle(
-        data=df_polars,
-        index="idx",
-        columns="value",
-        origin="origin",
-        development="val_date",
-        cumulative=True,
-    )
+    ) == cl.Triangle
diff --git a/chainladder/development/learning.py b/chainladder/development/learning.py
@@ -53,7 +53,7 @@ def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
         self.y_ml=y_ml
         self.weight_ml = weight_ml
         self.autoregressive=autoregressive
-        self.fit_incrementals=fit_incrementals
+        self.fit_incrementals = fit_incrementals
 
     def _get_y_names(self):
         """ private function to get the response column name"""
@@ -153,7 +153,7 @@ def fit(self, X, y=None, sample_weight=None):
         Parameters
         ----------
         X : Triangle-like
-            Set of LDFs to which the munich adjustment will be applied.
+            Set of LDFs to which the estimator will be applied.
         y : None
             Ignored, use y_ml to set a reponse variable for the ML algorithm
         sample_weight : None
@@ -180,7 +180,7 @@ def fit(self, X, y=None, sample_weight=None):
         self.df_ = df
         # Fit model
         self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
-        #return self
+        #return selffit_incrementals 
         self.triangle_ml_ = self._get_triangle_ml(df)
         return self
 

diff --git a/chainladder/workflow/voting.py b/chainladder/workflow/voting.py
@@ -124,7 +124,7 @@ def fit(self, X, y, sample_weight=None):
         self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                 delayed(_fit_single_estimator)(
                         clone(clf), X, y,
-                        sample_weight=sample_weight,
+                        fit_params=dict(sample_weight=sample_weight),
                         message_clsname='VotingChainladder',
                         message=self._log_message(names[idx],
                                                   idx + 1, len(clfs))

diff --git a/environment-dev.yaml b/environment-dev.yaml
@@ -14,13 +14,11 @@ dependencies:
   - ipykernel
 
   - pandas
-  - polars
   - scikit-learn
   - sparse
-  - numba
   - dill
   - patsy
-  - matplotlib
+  - matplotlib-base
 
   # testing
   - lxml