Add parent dimensions into calculation component table #2753

Merged · 5 commits · Jul 25, 2023
52 changes: 26 additions & 26 deletions src/pudl/output/ferc1.py
@@ -1049,15 +1049,17 @@ def calculations(self):
tables in this explosion.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]
not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
calc_explode = calc_explode[
~calc_explode.xbrl_factoid.isin(not_in_explosion_xbrl_factoids)
~calc_explode.xbrl_factoid_parent.isin(not_in_explosion_xbrl_factoids)
].copy()
return calc_explode

@@ -1099,9 +1101,7 @@ def metadata(self):
# At this point all remaining calculation components should exist within the
# exploded metadata.
calc_comps = self.calculations
calc_comps_index = calc_comps.set_index(
["table_name_calc", "xbrl_factoid_calc"]
).index
calc_comps_index = calc_comps.set_index(["table_name", "xbrl_factoid"]).index
meta_index = exploded_metadata.set_index(["table_name", "xbrl_factoid"]).index
nodes_not_in_calculations = [
x
@@ -1137,12 +1137,14 @@ def redefine_calculations_with_components_out_of_explosion(
embedded calculations.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]

not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
meta_explode.loc[
@@ -1379,10 +1381,7 @@ def generate_intertable_calculations(
f"{list(calculations_intertable.xbrl_factoid.unique())}."
)
# compile the lists of columns we are going to use later
calc_component_idx = [
"table_name_calc",
"xbrl_factoid_calc",
] + self.other_dimensions
calc_component_idx = ["table_name", "xbrl_factoid"] + self.other_dimensions
# Merge the reported data and the calculation component metadata to enable
# validation of calculated values. Here the data table exploded is supplying the
# values associated with individual calculation components, and the table_name
@@ -1391,35 +1390,36 @@
# values so they can be summed directly. This gives us aggregated calculated
# values that can later be compared to the higher level reported values.

# the validation is one_to_many in all instances except for the xbrl_factoid_calc
# the validation is one_to_many in all instances except for the xbrl_factoid
# construction_work_in_progress in the balance_sheet_assets_ferc1 explosion.
# this may be a problem in the calculations that we should track down in #2717
validate = (
"one_to_many"
if self.root_table != "balance_sheet_assets_ferc1"
else "many_to_many"
)
# we are going to merge the data onto the calc components with the _parent
# column names, so the groupby after the merge needs a set of by cols with the
# _parent suffix
gby_parent = [
f"{col}_parent" if col in ["table_name", "xbrl_factoid"] else col
for col in self.exploded_pks
]
calc_df = (
pd.merge(
calculations_intertable,
exploded.rename(
columns={
"table_name": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_calc",
}
),
#
exploded,
validate=validate,
on=calc_component_idx,
)
# apply the weight from the calc to convey the sign before summing.
.assign(calculated_amount=lambda x: x[self.value_col] * x.weight)
.groupby(self.exploded_pks, as_index=False, dropna=False)[
["calculated_amount"]
]
.groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]]
.sum(min_count=1)
)

# remove the _parent suffix so we can merge these calculated values back onto
# the data using the original pks
calc_df.columns = calc_df.columns.str.removesuffix("_parent")
calculated_df = pd.merge(
exploded,
calc_df,
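A minimal sketch of the parent-suffix pattern this hunk introduces, with made-up frame contents (only the table_name / xbrl_factoid / weight columns mirror the real schema; the ending_balance value column is a hypothetical stand-in): reported values are merged onto the calculation components, weighted, summed within each parent, and the _parent suffix is stripped so the aggregates line up with the reported records.

import pandas as pd

# Illustrative stand-ins for the calculation component and exploded data tables.
calc_components = pd.DataFrame(
    {
        "table_name_parent": ["tbl_a", "tbl_a"],
        "xbrl_factoid_parent": ["total_assets", "total_assets"],
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["cash", "plant"],
        "weight": [1.0, 1.0],
    }
)
exploded = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["cash", "plant"],
        "ending_balance": [10.0, 90.0],
    }
)

# Merge reported values onto the calculation components, apply the weights,
# sum within each parent, then drop the _parent suffix so the calculated
# totals can be merged back onto the reported data.
gby_parent = ["table_name_parent", "xbrl_factoid_parent"]
calc_df = (
    pd.merge(calc_components, exploded, on=["table_name", "xbrl_factoid"])
    .assign(calculated_amount=lambda x: x.ending_balance * x.weight)
    .groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]]
    .sum(min_count=1)
)
calc_df.columns = calc_df.columns.str.removesuffix("_parent")
# calc_df: table_name="tbl_a", xbrl_factoid="total_assets", calculated_amount=100.0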
116 changes: 96 additions & 20 deletions src/pudl/transform/ferc1.py
@@ -3019,18 +3019,24 @@ def process_xbrl_metadata_calculations(
.explode("source_tables")
.rename(
columns={
"name": "xbrl_factoid_calc",
"source_tables": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_parent",
"name": "xbrl_factoid",
"source_tables": "table_name",
}
)
.merge(
metadata.drop(columns=["calculations"]),
metadata.drop(columns=["calculations"]).rename(
columns={
"xbrl_factoid": "xbrl_factoid_parent",
}
),
left_index=True,
right_index=True,
how="outer",
)
.dropna(subset=["xbrl_factoid_calc"])
.dropna(subset=["xbrl_factoid"])
.reset_index(drop=True)
.assign(table_name_parent=self.table_id.value)
)
return calc_comps

@@ -6684,18 +6690,19 @@ def calculation_components_xbrl_ferc1(**kwargs):
# compile all of the calc comp tables.
calc_metas = []
for table_name, transformer in FERC1_TFR_CLASSES.items():
calc_meta = (
transformer(xbrl_metadata_json=clean_xbrl_metadata_json[table_name])
.process_xbrl_metadata_calculations()
.assign(table_name=table_name)
)
calc_meta = transformer(
xbrl_metadata_json=clean_xbrl_metadata_json[table_name]
).process_xbrl_metadata_calculations()
calc_metas.append(calc_meta)
# squish all of the calc comp tables then add in the implicit table dimensions
calc_components = pd.concat(calc_metas).pipe(
make_calculation_dimensions_explicit,
table_dimensions_ferc1,
dimensions=other_dimensions(),
)
calc_components = add_parent_dimensions(
calc_components, dimensions=other_dimensions()
)
return calc_components


@@ -6745,31 +6752,100 @@ def make_calculation_dimensions_explicit(
"""
logger.info(f"Adding {dimensions=} into calculation component table.")
calc_comps_w_dims = calculation_components.copy()
on_cols = ["table_name", "xbrl_factoid"]
# for each dimension, use split/apply/combine. when there are no dims explicit in
# the calc components, merge in all of the dims.
for dim_col in dimensions:
# extract the unique observed instances of this one dimension column & add the
# _calc suffix so we can merge onto the calculation components.
observed_dim = (
table_dimensions_ferc1[["table_name", "xbrl_factoid", dim_col]]
.drop_duplicates() # bc there are dupes after we removed the other dim cols
.rename(
columns={
"xbrl_factoid": "xbrl_factoid_calc",
"table_name": "table_name_calc",
}
)
)
observed_dim = table_dimensions_ferc1[
on_cols + [dim_col]
].drop_duplicates() # bc there are dupes after we removed the other dim cols
null_dim_mask = calc_comps_w_dims[dim_col].isnull()
null_dim = calc_comps_w_dims[null_dim_mask].drop(columns=[dim_col])
calc_comps_w_implied_dims = pd.merge(
null_dim,
observed_dim,
on=["table_name_calc", "xbrl_factoid_calc"],
on=on_cols,
how="left",
)
calc_comps_w_explicit_dims = calc_comps_w_dims[~null_dim_mask]
calc_comps_w_dims = pd.concat(
[calc_comps_w_implied_dims, calc_comps_w_explicit_dims]
)
return calc_comps_w_dims
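
A condensed sketch of the split/apply/combine step described in the comments above, using toy data in which utility_type stands in for any one of the dimension columns: records with a null dimension pick up every observed value of that dimension, while records with an explicit dimension pass through untouched.

import pandas as pd

# Hypothetical calc component records: fact_1's utility_type was left null by the
# calculation definition, fact_2's was explicit.
calc_comps = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["fact_1", "fact_2"],
        "utility_type": [None, "electric"],
    }
)
# Every (table, factoid, utility_type) combination actually observed in the data.
observed = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a", "tbl_a"],
        "xbrl_factoid": ["fact_1", "fact_1", "fact_2"],
        "utility_type": ["electric", "gas", "electric"],
    }
)

# split: null-dim records get every observed value of the dim merged in;
# explicit-dim records are kept as-is. combine: concatenate the two halves.
null_mask = calc_comps["utility_type"].isnull()
implied = pd.merge(
    calc_comps[null_mask].drop(columns=["utility_type"]),
    observed.drop_duplicates(),
    on=["table_name", "xbrl_factoid"],
    how="left",
)
explicit = calc_comps[~null_mask]
calc_comps_w_dims = pd.concat([implied, explicit])
# fact_1 now appears once per observed utility_type (electric, gas);
# fact_2 keeps its explicit value.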


def add_parent_dimensions(
calc_comps: pd.DataFrame, dimensions: list[str]
) -> pd.DataFrame:
"""Define dimension total calculations and add dimensions to calculation parents.

In addition to calculations defining how values reported in one set of facts can
be aggregated resulting in a value in another fact, there are implied calculation
relationships between values reported within a dimension. In particular, when a
dimension reports a ``total`` value, we assume it is the sum of all the non-total
values for that dimension. This function makes these within-dimension calculation
relationships explicit by defining new calculations for each dimension's ``total``.

Outside of these ``total`` calculations we require that a factoid and its
calculation components share the same dimensional values.

To be able to define these within-dimension calculations we also add dimension
columns to all of the parent factoids in the table.

Args:
calc_comps: a table of calculation component records which have had some manual
calculation fixes applied.
dimensions: list of dimension columns to check.

Returns:
A table associating calculation components with the parents they will be
aggregated into. The components and the parents are each identified by
``table_name``, ``xbrl_factoid``, and columns defining the additional dimensions
(``utility_type``, ``plant_status``, ``plant_function``). The parent columns
have a ``_parent`` suffix.
"""
table_fact_cols = [
"table_name_parent",
"xbrl_factoid_parent",
"table_name",
"xbrl_factoid",
]
# for each dimension col, add calculations defining the within-dimension totals and
# make a new _parent column
for dim in dimensions:
# Define calculations with the total values as parents and non-total values as
# sub-components by broadcast merging the non-total records onto the new total
# records.
total_mask = calc_comps[dim] == "total"
total_w_subdim_components = (
pd.merge(
# the total records will become _parent columns in new records
left=calc_comps.loc[total_mask, table_fact_cols + dimensions],
# the non-total sub-components will become their calculation components
right=calc_comps[~total_mask],
on=table_fact_cols,
how="left",
validate="1:m",
suffixes=("_parent", ""),
)
# The new total -> sub-dimension records must always have the same
# parent and component fact/table so overwrite the original values
.assign(
table_name_parent=lambda x: x.table_name,
xbrl_factoid_parent=lambda x: x.xbrl_factoid,
)
# drop the other non-total parent dim cols (they will be merged back on later)
.drop(columns=[f"{d}_parent" for d in dimensions if d != dim])
)
# now we have a bunch of *new* records linking total records to their non-total
# sub-components.
# Separately, we now add in parent-dimension values for all of the original
# calculation component records. We are assuming all parents should have the
# same dimension values as their child components.
calc_comps = calc_comps.assign(**{f"{dim}_parent": lambda x: x[dim]})
calc_comps = pd.concat([calc_comps, total_w_subdim_components])
# we shouldn't be adding any duplicates in this process!
assert calc_comps[calc_comps.duplicated()].empty
return calc_comps
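
To make the within-dimension total calculations concrete, here is a stripped-down sketch of the core merge in add_parent_dimensions, assuming a single hypothetical dimension (utility_type) and made-up fact names; it only illustrates the total-as-parent relationship, not the full set of _parent columns the real function produces.

import pandas as pd

# Toy calculation component table with a single dimension column. The real table
# carries utility_type, plant_status, and plant_function.
calc_comps = pd.DataFrame(
    {
        "table_name_parent": ["tbl_a"] * 3,
        "xbrl_factoid_parent": ["fact_1"] * 3,
        "table_name": ["tbl_a"] * 3,
        "xbrl_factoid": ["fact_2"] * 3,
        "utility_type": ["electric", "gas", "total"],
    }
)

# The "total" record becomes a parent whose calculation components are every
# non-total record of the same fact: the relationship this function makes explicit.
total_mask = calc_comps["utility_type"] == "total"
total_calcs = (
    pd.merge(
        calc_comps.loc[total_mask, ["table_name", "xbrl_factoid", "utility_type"]],
        calc_comps[~total_mask],
        on=["table_name", "xbrl_factoid"],
        how="left",
        validate="1:m",
        suffixes=("_parent", ""),
    )
    # the total fact is its own parent in these new within-dimension records
    .assign(
        table_name_parent=lambda x: x.table_name,
        xbrl_factoid_parent=lambda x: x.xbrl_factoid,
    )
)
# total_calcs reads: (fact_2, utility_type="total") is calculated from
# (fact_2, "electric") + (fact_2, "gas").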
56 changes: 52 additions & 4 deletions test/unit/transform/ferc1_test.py
@@ -12,6 +12,7 @@
TableIdFerc1,
UnstackBalancesToReportYearInstantXbrl,
WideToTidy,
add_parent_dimensions,
drop_duplicate_rows_dbf,
fill_dbf_to_xbrl_map,
make_calculation_dimensions_explicit,
@@ -391,15 +392,15 @@ def test_unstack_balances_to_report_year_instant_xbrl():
def test_make_calculation_dimensions_explicit():
"""Test :func:`make_calculation_dimensions_explicit`"""
calc_comp_idx = [
"table_name_parent",
"xbrl_factoid_parent",
"table_name",
"xbrl_factoid",
"table_name_calc",
"xbrl_factoid_calc",
]
calc_comps_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,
table_a,fact_1,table_a,fact_4,voyager,
table_a,fact_1,table_a,fact_5,ds9,
@@ -417,10 +418,12 @@ def test_make_calculation_dimensions_explicit():
table_a,fact_3,voyager,in
table_a,fact_3,voyager,that
table_a,fact_3,voyager,nebula
table_a,fact_3,voyager,total
table_a,fact_4,voyager,coffee
table_a,fact_4,voyager,in
table_a,fact_4,voyager,that
table_a,fact_4,voyager,nebula
table_a,fact_4,voyager,total
table_a,fact_5,ds9,
table_b,fact_6,next_gen,resistance
table_b,fact_6,next_gen,is
@@ -446,15 +449,17 @@ def test_make_calculation_dimensions_explicit():
expected_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total
table_a,fact_1,table_a,fact_5,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile
@@ -476,3 +481,46 @@
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(out_trek, out_reordered, check_like=True)

expected_parent_dim_trek = pd.read_csv(
StringIO(
"""
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y,dim_x_parent,dim_y_parent
table_a,fact_1,table_a,fact_3,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total,voyager,total
table_a,fact_1,table_a,fact_5,ds9,,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_8,next_gen,resistance,next_gen,resistance
table_a,fact_2,table_b,fact_8,next_gen,is,next_gen,is
table_a,fact_2,table_b,fact_8,next_gen,futile,next_gen,futile
table_a,fact_3,table_a,fact_3,voyager,coffee,,total
table_a,fact_3,table_a,fact_3,voyager,in,,total
table_a,fact_3,table_a,fact_3,voyager,that,,total
table_a,fact_3,table_a,fact_3,voyager,nebula,,total
table_a,fact_4,table_a,fact_4,voyager,coffee,,total
table_a,fact_4,table_a,fact_4,voyager,in,,total
table_a,fact_4,table_a,fact_4,voyager,that,,total
table_a,fact_4,table_a,fact_4,voyager,nebula,,total
"""
)
)
out_parent_dim_trek = (
add_parent_dimensions(
calc_comps=expected_trek,
dimensions=["dim_x", "dim_y"],
)
.sort_values(calc_comp_idx)
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
expected_parent_dim_trek, out_parent_dim_trek, check_like=True
)