From 80fc6c70c97d1861685961238d87071945065ef6 Mon Sep 17 00:00:00 2001
From: Philipp Verpoort <philipp.verpoort@pik-potsdam.de>
Date: Mon, 7 Oct 2024 13:10:15 +0200
Subject: [PATCH] Feature: performing manipulations with multiple units per
 variable.

Pivoting the dataframe into wide format or performing manipulations with
multiple units for a variable no longer raises an exception but instead
proceeds with the expected result.
---
 python/posted/team.py | 121 ++++++++++++++++++++++++++++--------------
 1 file changed, 82 insertions(+), 39 deletions(-)

diff --git a/python/posted/team.py b/python/posted/team.py
index 1972810..bf9f6a4 100644
--- a/python/posted/team.py
+++ b/python/posted/team.py
@@ -1,6 +1,7 @@
 import re
 import warnings
 from abc import abstractmethod
+from itertools import product
 from typing import Optional, Callable, TypeAlias, Tuple
 
 import numpy as np
@@ -101,6 +102,17 @@ def __init__(self, df: pd.DataFrame):
                              'dataframes that contain at least the variable, '
                              'unit, and value columns.')
 
+        # Check that there are no nans among variable, unit, and value columns.
+        for c in ('variable', 'unit', 'value'):
+            if df[c].isnull().any():
+                ex_msg = ('Can only use .team accessor with team-like '
+                          'dataframes in which there are no nan entries in '
+                          'the variable, unit, and value columns.')
+                if c == 'unit':
+                    ex_msg += (' Please use "dimensionless" or "No Unit" if '
+                               'the variable has no unit.')
+                raise ValueError(ex_msg)
+
         # Warn if 'unfielded' column exists.
         if 'unfielded' in df.columns:
             warnings.warn("Having a column named 'unfielded' in the dataframe "
@@ -182,16 +194,6 @@ def pivot_wide(self):
         """
         ret = self.explode()
 
-        # Check units are harmonised across variables before pivot.
-        units = ret[['variable', 'unit']].drop_duplicates()
-        if not units['variable'].is_unique:
-            duplicate_units = units.loc[units['variable'].duplicated()] \
-                .loc[:, 'variable'] \
-                .tolist()
-            raise Exception(f"Cannot pivot wide on a dataframe where "
-                            f"variables have multiple units: "
-                            f"{', '.join(duplicate_units)}")
-
         # Create dummy field if non exists.
         if not self._fields:
             ret = ret.assign(unfielded=0)
@@ -206,17 +208,19 @@ def pivot_wide(self):
             values='value',
         )
 
-        # Check unit exists for all columns.
-        if ret.columns.get_level_values(level='unit').isna().any():
-            raise Exception('Unit column may not contain NaN entries. Please '
-                            'use "dimensionless" or "No Unit" if the variable '
-                            'has no unit.')
+        # Raise exception if duplicate cases exist.
+        if ret.index.has_duplicates:
+            raise ValueError('Performed pivot_wide on dataframe with '
+                             'duplicate cases. Each variable should only be '
+                             'defined once for each combination of field '
+                             'values.')
+
         return ret.pint.quantify()
 
     # for performing analyses
     def perform(self,
                 *manipulations: AbstractManipulation,
-                dropna: bool = False,
+                dropna: bool = True,
                 only_new: bool = False):
         """
         Perform manipulation(s).
@@ -238,28 +242,66 @@ def perform(self,
         # Pivot dataframe before manipulation.
         df_pivot = self.pivot_wide()
 
-        # Perform analysis or manipulation and bring rows back to
-        # original long dataframe format.
-        for manipulation in manipulations:
-            original_index = df_pivot.index
-            df_pivot = manipulation.perform(df_pivot)
-            if not isinstance(df_pivot, pd.DataFrame):
-                raise Exception('Manipulation must return a dataframe.')
-            if not df_pivot.index.equals(original_index):
-                raise Exception('Manipulation may not change the index.')
-
-        # Ensure that the axis label still exists before melt.
-        df_pivot.rename_axis('variable', axis=1, inplace=True)
-
-        # Pivot back.
-        ret = df_pivot \
-            .pint.dequantify() \
-            .melt(ignore_index=False) \
+        # Create list of column groups of variables and units.
+        col_groups = (
+            pd.Series(df_pivot.columns)
             .reset_index()
+            .groupby('variable')
+            .groups
+        )
+
+        # Raise exception in case of duplicate variables with different units.
+        for col_name in col_groups:
+            df_pivot_sub = df_pivot[col_name]
+            if isinstance(df_pivot_sub, pd.Series):
+                continue
+            duplicate_indexes = (df_pivot_sub.notnull().sum(axis=1) > 1)
+            if duplicate_indexes.any():
+                warnings.warn(f"Duplicate units in variable '{col_name}' for "
+                              f"fields: {df_pivot.index[duplicate_indexes]}")
+
+        # Loop over groups.
+        df_pivot_list = []
+        for col_ids in product(*col_groups.values()):
+            df_pivot_group = df_pivot.iloc[:, list(col_ids)].dropna(how='all')
+
+            # Perform analysis or manipulation and bring rows back to
+            # original long dataframe format.
+            for manipulation in manipulations:
+                original_index = df_pivot_group.index
+                df_pivot_group = manipulation.perform(df_pivot_group)
+                if not isinstance(df_pivot_group, pd.DataFrame):
+                    raise Exception('Manipulation must return a dataframe.')
+                if not df_pivot_group.index.equals(original_index):
+                    raise Exception('Manipulation may not change the index.')
+
+            # Ensure that the axis label still exists before melt.
+            df_pivot_group.rename_axis('variable', axis=1, inplace=True)
+
+            # Pivot back and append.
+            df_pivot_list.append(
+                df_pivot_group
+                    .pint.dequantify()
+                    .melt(ignore_index=False)
+                    .reset_index()
+            )
+
+        # Combine groups into single dataframe.
+        ret = pd.concat(df_pivot_list)
 
         # Drop rows with nan entries in unit or value columns.
         if dropna:
-            ret.dropna(subset=['unit', 'value'], inplace=True)
+            ret.dropna(subset='value', inplace=True)
+
+        # Drop duplicates arising from multiple var-unit groups.
+        ret.drop_duplicates(inplace=True)
+
+        # Raise exception if index has duplicates after the above.
+        duplicates = ret.duplicated(subset=self._fields + ['variable'])
+        if duplicates.any():
+            duplicate_labels = ret.loc[duplicates, self._fields + ['variable']]
+            raise Exception(f"Internal error: variables should only exist "
+                            f"once per case: {duplicate_labels}")
 
         # Keep only new variables if requested.
         if only_new:
@@ -305,11 +347,11 @@ def varsplit(self,
         # Check that precisely one of the two arguments (either `cmd`
         # or `regex`) is provided.
         if cmd is not None and regex is not None:
-            raise Exception(
-                'Only one of the two arguments may be provided: cmd or regex.')
+            raise Exception('Only one of the two arguments may be provided: '
+                            'cmd or regex.')
         if cmd is None and regex is None:
-            raise Exception(
-                'Either a command or a regex string must be provided.')
+            raise Exception('Either a command or a regex string must be '
+                            'provided.')
 
         # Check that target is in columns of dataframe.
         if target not in self._df.columns:
@@ -413,6 +455,7 @@ def varcombine(self,
 
     def unit_convert(self,
                      to: str | pint.Unit | dict[str, str | pint.Unit],
+
                      flow_id: Optional[str] = None):
         """
         Convert units in dataframe.
@@ -458,7 +501,7 @@ def unit_convert(self,
 KeywordAssignment: TypeAlias = int | float | str | Callable
 
 
-# generic manipulation for calculating variables
+# Generic manipulation for calculating variables.
 class CalcVariable(AbstractManipulation):
     _expr_assignments: tuple[ExprAssignment]
     _kw_assignments: dict[str, KeywordAssignment]