
ensure all the corrections get tags and add the beginning of a rate base asset #3214

Merged

27 commits merged on Feb 2, 2024

Changes from 18 commits

Commits (27)
3a9548c
ensure all the corrections get tags and add the begining of a rate ba…
cmgosnell Jan 4, 2024
86f8f63
Merge branch 'dev' into explode-rate-base
cmgosnell Jan 4, 2024
1169f4f
Merge branch 'main' into explode-rate-base
cmgosnell Jan 10, 2024
f33aa82
Add in cash on hand as an additional factoid into rate base table
cmgosnell Jan 10, 2024
ea8301e
add documentation for rate base table
cmgosnell Jan 10, 2024
24cf1cf
remove _correction record from the expense.
cmgosnell Jan 10, 2024
9a93f63
Merge branch 'main' into explode-rate-base
cmgosnell Jan 10, 2024
6d41c5c
attempt to associate tags with _correction factoids when all child ca…
cmgosnell Jan 16, 2024
80ccf5d
Add a simple XbrlCalculationForest test.
jdangerx Jan 17, 2024
50615cb
WIP: write down some to-dos for test cases.
jdangerx Jan 17, 2024
1ee6c7c
Get leafward propagation working
jdangerx Jan 18, 2024
8bc4a96
Merge branch 'main' into explode-rate-base
cmgosnell Jan 25, 2024
9b19f8b
first pass of adding leafward tags one layer and an attempt at a recu…
cmgosnell Jan 25, 2024
d1347c1
integrate the recursive tag propagation method
cmgosnell Jan 25, 2024
a23d87a
Merge branch 'main' into explode-rate-base
cmgosnell Jan 25, 2024
d1a42b4
remove old correction tagging and standardize unit tests a bit
cmgosnell Jan 26, 2024
829757a
remove metadata from forest builder and cleanup unit tests
cmgosnell Jan 26, 2024
d5c2b69
Merge branch 'main' into explode-rate-base
cmgosnell Jan 26, 2024
e975331
Merge branch 'main' into explode-rate-base
cmgosnell Jan 29, 2024
33fa1ef
add "validation" checks and standardize null tag behavior`
cmgosnell Jan 30, 2024
8341299
Merge branch 'main' into explode-rate-base
cmgosnell Jan 30, 2024
0f3b654
light cleaning
cmgosnell Jan 30, 2024
3e5c2cd
root boose docs!
cmgosnell Jan 31, 2024
b8758dd
respond to dazhong's comments
cmgosnell Jan 31, 2024
d93d46c
Merge branch 'main' into explode-rate-base
cmgosnell Jan 31, 2024
17a5fe4
Merge branch 'main' into explode-rate-base
cmgosnell Feb 2, 2024
da8df11
add a test about pruned nodes and add the NodeId(*n) into the orphans
cmgosnell Feb 2, 2024
246 changes: 201 additions & 45 deletions src/pudl/output/ferc1.py
@@ -1154,7 +1154,10 @@ class OffByFactoid(NamedTuple):


@asset
def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame:
def _out_ferc1__explosion_tags(
table_dimensions_ferc1: pd.DataFrame,
calculation_components_xbrl_ferc1: pd.DataFrame,
) -> pd.DataFrame:
"""Grab the stored tables of tags and add inferred dimension."""
rate_tags = _get_tags("xbrl_factoid_rate_base_tags.csv", table_dimensions_ferc1)
rev_req_tags = _get_tags(
@@ -1180,9 +1183,10 @@ def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame:
plant_function_tags,
utility_type_tags,
]
tags_all = (
tag_idx = list(NodeId._fields)
tags = (
pd.concat(
[df.set_index(list(NodeId._fields)) for df in tag_dfs],
[df.set_index(tag_idx) for df in tag_dfs],
join="outer",
verify_integrity=True,
ignore_index=False,
@@ -1191,7 +1195,7 @@ def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame:
.reset_index()
.drop(columns=["notes"])
)
return tags_all
return tags
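
For context on the pattern above, here is a minimal sketch (not part of this diff; toy dataframes and a two-column index stand in for the real tag CSVs and NodeId._fields) of concatenating per-category tag tables on a shared index, where verify_integrity raises if two inputs tag the same node:

import pandas as pd

idx = ["table_name", "xbrl_factoid"]  # stand-in for list(NodeId._fields)
rate_tags_df = pd.DataFrame(
    {"table_name": ["t1"], "xbrl_factoid": ["plant_a"], "in_rate_base": ["yes"]}
)
rev_req_tags_df = pd.DataFrame(
    {"table_name": ["t1"], "xbrl_factoid": ["rev_b"], "in_revenue_requirement": ["yes"]}
)
tags = pd.concat(
    [df.set_index(idx) for df in [rate_tags_df, rev_req_tags_df]],
    join="outer",
    verify_integrity=True,
).reset_index()
# each row keeps its own tag column; the other table's tag column is NaN for that row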


def _get_tags(file_name: str, table_dimensions_ferc1: pd.DataFrame) -> pd.DataFrame:
@@ -1236,7 +1240,10 @@ def _aggregatable_dimension_tags(
)
.set_index(idx)
)
table_dimensions_ferc1 = table_dimensions_ferc1.set_index(idx)
# don't include the corrections because we will add those in later
table_dimensions_ferc1 = table_dimensions_ferc1[
~table_dimensions_ferc1.xbrl_factoid.str.endswith("_correction")
].set_index(idx)
tags_df = pd.concat(
[
tags_df,
@@ -1658,7 +1665,6 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1":
"""Construct a calculation forest based on class attributes."""
return XbrlCalculationForestFerc1(
exploded_calcs=self.exploded_calcs,
exploded_meta=self.exploded_meta,
seeds=self.seed_nodes,
tags=self.tags,
group_metric_checks=self.group_metric_checks,
@@ -2016,10 +2022,10 @@ class XbrlCalculationForestFerc1(BaseModel):

# Not sure if dynamically basing this on NodeId is really a good idea here.
calc_cols: list[str] = list(NodeId._fields)
exploded_meta: pd.DataFrame = pd.DataFrame()
exploded_calcs: pd.DataFrame = pd.DataFrame()
seeds: list[NodeId] = []
tags: pd.DataFrame = pd.DataFrame()
# TODO: remove the group metric checks and see if things still build / tests still pass
group_metric_checks: GroupMetricChecks = GroupMetricChecks()
model_config = ConfigDict(
arbitrary_types_allowed=True, ignored_types=(cached_property,)
@@ -2135,14 +2141,13 @@ def exploded_calcs_to_digraph(
Then we compile a dictionary of node attributes, based on the individual
calculation components in the exploded calcs dataframe.
"""
source_nodes = list(
exploded_calcs.loc[:, self.parent_cols]
.rename(columns=lambda x: x.removesuffix("_parent"))
.itertuples(name="NodeId", index=False)
)
target_nodes = list(
exploded_calcs.loc[:, self.calc_cols].itertuples(name="NodeId", index=False)
)
source_nodes = [
NodeId(*x)
for x in exploded_calcs.set_index(self.parent_cols).index.to_list()
]
target_nodes = [
NodeId(*x) for x in exploded_calcs.set_index(self.calc_cols).index.to_list()
]
edgelist = pd.DataFrame({"source": source_nodes, "target": target_nodes})
forest = nx.from_pandas_edgelist(edgelist, create_using=nx.DiGraph)
return forest
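
A rough, self-contained illustration (not part of the diff) of the NodeId-based edge construction above, assuming a hypothetical two-field NodeId instead of the real PUDL one:

from collections import namedtuple

import networkx as nx
import pandas as pd

NodeId = namedtuple("NodeId", ["table_name", "xbrl_factoid"])
parent_cols = ["table_name_parent", "xbrl_factoid_parent"]
calc_cols = ["table_name", "xbrl_factoid"]
exploded_calcs = pd.DataFrame(
    {
        "table_name_parent": ["t1", "t1"],
        "xbrl_factoid_parent": ["total", "total"],
        "table_name": ["t1", "t1"],
        "xbrl_factoid": ["part_a", "part_b"],
    }
)
source_nodes = [NodeId(*x) for x in exploded_calcs.set_index(parent_cols).index.to_list()]
target_nodes = [NodeId(*x) for x in exploded_calcs.set_index(calc_cols).index.to_list()]
edgelist = pd.DataFrame({"source": source_nodes, "target": target_nodes})
forest = nx.from_pandas_edgelist(edgelist, create_using=nx.DiGraph)
# "total" is now the parent of both part_a and part_b
assert sorted(n.xbrl_factoid for n in forest.successors(NodeId("t1", "total"))) == ["part_a", "part_b"]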
@@ -2173,32 +2178,9 @@ def node_attrs(self: Self) -> dict[NodeId, dict[str, dict[str, str]]]:
.reset_index()
# Type conversion is necessary to get pd.NA in the index:
.astype({col: pd.StringDtype() for col in self.calc_cols})
# We need a dictionary for *all* nodes, not just those with tags.
.merge(
self.exploded_meta.loc[:, self.calc_cols],
how="left",
on=self.calc_cols,
validate="one_to_many",
indicator=True,
)
# For nodes with no tags, we assign an empty dictionary:
.assign(tags=lambda x: np.where(x["tags"].isna(), {}, x["tags"]))
)
lefties = node_attrs[
(node_attrs._merge == "left_only")
& (node_attrs.table_name.isin(self.table_names))
]
if not lefties.empty:
logger.warning(
f"Found {len(lefties)} tags that only exist in our manually compiled "
"tags when expected none. Ensure the compiled tags match the metadata."
f"Mismatched tags:\n{lefties}"
)
return (
node_attrs.drop(columns=["_merge"])
.set_index(self.calc_cols)
.to_dict(orient="index")
)
return node_attrs.set_index(self.calc_cols).to_dict(orient="index")

@cached_property
def edge_attrs(self: Self) -> dict[Any, Any]:
@@ -2244,6 +2226,7 @@ def annotated_forest(self: Self) -> nx.DiGraph:
annotated_forest = deepcopy(self.forest)
nx.set_node_attributes(annotated_forest, self.node_attrs)
nx.set_edge_attributes(annotated_forest, self.edge_attrs)
annotated_forest = self.propagate_tags(annotated_forest)

logger.info("Checking whether any pruned nodes were also tagged.")
self.check_lost_tags(lost_nodes=self.pruned)
@@ -2252,6 +2235,55 @@ def annotated_forest(self: Self) -> nx.DiGraph:
self.check_conflicting_tags(annotated_forest)
return annotated_forest

def propagate_tags(self: Self, annotated_forest: nx.DiGraph):
"""Propagate tags.

Propagate tags leafwards, rootward & to the _correction nodes.
"""
existing_tags = nx.get_node_attributes(annotated_forest, "tags")
## Leafwards propagation
leafward_inherited_tags = ["in_rate_base"]
for node, parent_tags in existing_tags.items():
descendants = nx.descendants(annotated_forest, node)
descendant_tags = {
desc: {
"tags": {
tag_name: parent_tags[tag_name]
for tag_name in leafward_inherited_tags
if tag_name in parent_tags
}
| existing_tags.get(desc, {})
}
for desc in descendants
}
nx.set_node_attributes(annotated_forest, descendant_tags)

# Rootward propagation
root_node = self.roots(annotated_forest)[0]
_ = recursively_propagate_tags_leafward(
annotated_forest, root_node, "in_rate_base"
)
# Correction Records
existing_tags = nx.get_node_attributes(annotated_forest, "tags")
correction_nodes = [
node
for node in annotated_forest
if node.xbrl_factoid.endswith("_correction")
]
correction_tags = {}
for correction_node in correction_nodes:
# for every correction node, we assume that its parent's tags can apply
parents = list(annotated_forest.predecessors(correction_node))
# all correction records should have exactly one parent
assert len(parents) == 1
parent = parents[0]
correction_tags[correction_node] = {
"tags": existing_tags.get(parent, {})
| existing_tags.get(correction_node, {})
}
nx.set_node_attributes(annotated_forest, correction_tags)
return annotated_forest
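
A minimal sketch (not from the diff; a hypothetical two-field NodeId) of how the correction-record pass lets a _correction node inherit its single parent's tags:

from collections import namedtuple

import networkx as nx

NodeId = namedtuple("NodeId", ["table_name", "xbrl_factoid"])
g = nx.DiGraph()
parent = NodeId("t1", "plant_in_service")
correction = NodeId("t1", "plant_in_service_correction")
g.add_edge(parent, correction)
nx.set_node_attributes(g, {parent: {"tags": {"in_rate_base": "yes"}}})

# the correction's tags become its parent's tags overlaid with any of its own
existing_tags = nx.get_node_attributes(g, "tags")
assert list(g.predecessors(correction)) == [parent]
nx.set_node_attributes(
    g,
    {correction: {"tags": existing_tags.get(parent, {}) | existing_tags.get(correction, {})}},
)
assert g.nodes[correction]["tags"] == {"in_rate_base": "yes"}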

def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None:
"""Check whether any of the input lost nodes were also tagged nodes."""
if lost_nodes:
@@ -2368,7 +2400,7 @@ def seeded_digraph(self: Self) -> nx.DiGraph:

We compile a list of all the :class:`NodeId` values that should be included in
the pruned graph, and then use that list to select a subset of the exploded
metadata to pass to :meth:`exploded_meta_to_digraph`, so that all of the
metadata to pass to :meth:`exploded_calcs_to_digraph`, so that all of the
associated metadata is also added to the pruned graph.
"""
return self.prune_unrooted(self.full_digraph)
@@ -2496,11 +2528,16 @@ def forest_leaves(self: Self) -> list[NodeId]:
def orphans(self: Self) -> list[NodeId]:
"""Identify all nodes that appear in metadata but not in the full digraph."""
nodes = self.full_digraph.nodes
return [
NodeId(*n)
for n in self.exploded_meta.set_index(self.calc_cols).index
if n not in nodes
]
orphans = []
for idx_cols in [self.calc_cols, self.parent_cols]:
orphans.extend(
[
NodeId(*n)
for n in self.exploded_calcs.set_index(idx_cols).index
if n not in nodes
]
)
return list(set(orphans))

@cached_property
def pruned(self: Self) -> list[NodeId]:
@@ -2774,3 +2811,122 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame:
except AttributeError:
tags = pd.DataFrame()
return pd.concat([index, tags], axis="columns")


def recursively_propagate_tags_leafward(
annotated_forest, node, tag_name: Literal["in_rate_base"]
):
"""Set the tags for nodes when all of its children have same tag.

This function returns the value of a tag.
"""

def _get_tag(annotated_forest, node, tag_name):
return annotated_forest.nodes.get(node, {}).get("tags", {}).get(tag_name, pd.NA)
Member Author commented:

this was just a lil helper function to get the tag or a null because, as you can see, it is a lil complicated given the layered-ness and the possibility that the node doesn't exist or that the tag doesn't exist, etc. I suppose it could also be:

annotated_forest.nodes.get(node, {"tags": {tag_name: pd.NA}})["tags"][tag_name]


logger.info(f"propagaging tags leafward from {node}")
cmgosnell marked this conversation as resolved.
Show resolved Hide resolved
tag = pd.NA
# I'm a leaf, so stop looking here
if not list(annotated_forest.successors(node)):
tag = _get_tag(annotated_forest, node, tag_name)
logger.info(f" We found a leaf people. w/ {tag=}")
cmgosnell marked this conversation as resolved.
Show resolved Hide resolved
# if I already have a value, we don't need to keep looking at this node's children
elif not pd.isna(_get_tag(annotated_forest, node, tag_name)):
tag = _get_tag(annotated_forest, node, tag_name)
logger.info(f" We found a node w/ tags. w/ {tag=}")
else:
child_tags = set()
for child_node in annotated_forest.successors(node):
if not child_node.xbrl_factoid.endswith("_correction"):
child_tags.add(
recursively_propagate_tags_leafward(
annotated_forest, child_node, tag_name
)
)
logger.info(f" found {child_tags=}")
# if all the children's tags are the same and non-null
if (len(child_tags) == 1) and {t for t in child_tags if not pd.isna(t)}:
new_node_tag = child_tags.pop()
logger.info(
f" We found a node consitent children tags. w/ {new_node_tag=}"
)
# actually assign the tag here but don't wipe out any other tags
existing_tags = nx.get_node_attributes(annotated_forest, "tags")
node_tags = {
node: {"tags": {tag_name: new_node_tag} | existing_tags.get(node, {})}
}
nx.set_node_attributes(annotated_forest, node_tags)
tag = new_node_tag
return tag
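
A small usage sketch (toy inputs, not from the diff): when every non-correction child of an untagged node carries the same non-null tag value, the node picks it up:

from collections import namedtuple

import networkx as nx

NodeId = namedtuple("NodeId", ["table_name", "xbrl_factoid"])
g = nx.DiGraph()
root = NodeId("t1", "total_assets")
children = [NodeId("t1", "asset_a"), NodeId("t1", "asset_b")]
g.add_edges_from((root, child) for child in children)
nx.set_node_attributes(
    g, {child: {"tags": {"in_rate_base": "yes"}} for child in children}
)
# both leaves agree, so the root inherits in_rate_base="yes"
assert recursively_propagate_tags_leafward(g, root, "in_rate_base") == "yes"
assert g.nodes[root]["tags"]["in_rate_base"] == "yes"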


@asset
def out_ferc1__yearly_rate_base(
exploded_balance_sheet_assets_ferc1: pd.DataFrame,
exploded_balance_sheet_liabilities_ferc1: pd.DataFrame,
core_ferc1__yearly_operating_expenses_sched320: pd.DataFrame,
) -> pd.DataFrame:
"""Make a table of granular utility rate-base data.

This table contains granular data describing what utilities can
include in their rate bases. This information comes from two core
inputs: ``exploded_balance_sheet_assets_ferc1`` and
``exploded_balance_sheet_liabilities_ferc1``. These tables include granular
data from the nested calculations that are built into the accounting tables.
See :class:`Exploder` for more details.

This rate base table also contains one specific addition from
:ref:`core_ferc1__yearly_operating_expenses_sched320`. In standard ratemaking
processes, utilities are allowed to include working capital - sometimes referred
to as cash on hand or cash reserves. A standard ratemaking convention is to consider
the available rate-baseable working capital to be one eighth of the average
operations and maintenance expense. This function grabs that expense and
concatenates it with the rest of the assets and liabilities from the granular
exploded data.

"""
# get the factoid name to grab the right part of the table
xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[
"core_ferc1__yearly_operating_expenses_sched320"
]().params.xbrl_factoid_name
# First grab the working capital out of the operating expense table.
# then prep it for concatenation: calculate working capital & add tags
cash_working_capital = (
core_ferc1__yearly_operating_expenses_sched320[
core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name]
== "operations_and_maintenance_expenses_electric"
]
.assign(
dollar_value=lambda x: x.dollar_value.divide(8),
xbrl_factoid="cash_working_capital", # newly definied (do we need to add it anywhere?)
tags_rate_base_category="net_working_capital",
tags_aggregatable_utility_type="electric",
table_name="core_ferc1__yearly_operating_expenses_sched320",
)
.drop(columns=[xbrl_factoid_name])
# the assets/liabilities tables both use ending_balance as their main dollar value column
.rename(columns={"dollar_value": "ending_balance"})
)
# then select only the leafy exploded records that are in rate base and concat
in_rate_base = (
pd.concat(
[
exploded_balance_sheet_assets_ferc1[
exploded_balance_sheet_assets_ferc1.tags_in_rate_base.isin(
["yes", "partial"]
)
],
exploded_balance_sheet_liabilities_ferc1[
exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base.isin(
["yes", "partial"]
)
],
cash_working_capital,
]
)
# .drop(columns=["tags_in_rate_base"])
.sort_values(
by=["report_year", "utility_id_ferc1", "table_name"], ascending=False
)
)
return in_rate_base
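
As a back-of-the-envelope check of the one-eighth working capital convention described in the docstring (illustrative numbers only):

annual_om_expense = 80_000_000.0  # hypothetical annual electric O&M expense, in dollars
cash_working_capital = annual_om_expense / 8  # one eighth of O&M
assert cash_working_capital == 10_000_000.0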
10 changes: 0 additions & 10 deletions src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv
@@ -26,14 +26,4 @@ core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_
core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,total
core_ferc1__yearly_utility_plant_summary_sched200,amortization_of_plant_acquisition_adjustment,total
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases,in_service
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_plant_purchased_or_sold_correction,in_service
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_experimental_plant_unclassified_correction,in_service
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_unclassified_correction,in_service
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_and_construction_work_in_progress_correction,construction_work_in_progress
core_ferc1__yearly_utility_plant_summary_sched200,accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_correction,total
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_net_correction,total
core_ferc1__yearly_utility_plant_summary_sched200,depreciation_utility_plant_in_service_correction,in_service
core_ferc1__yearly_utility_plant_summary_sched200,depreciation_amortization_and_depletion_utility_plant_leased_to_others_correction,leased
core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_utility_plant_held_for_future_use_correction,future
core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases_correction,in_service
core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,leased