diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 3e81e5f026..c9452b9411 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1207,9 +1207,10 @@ def _out_ferc1__detailed_tags(_core_ferc1__table_dimensions) -> pd.DataFrame: plant_function_tags, utility_type_tags, ] - tags_all = ( + tag_idx = list(NodeId._fields) + tags = ( pd.concat( - [df.set_index(list(NodeId._fields)) for df in tag_dfs], + [df.set_index(tag_idx) for df in tag_dfs], join="outer", verify_integrity=True, ignore_index=False, @@ -1218,7 +1219,7 @@ def _out_ferc1__detailed_tags(_core_ferc1__table_dimensions) -> pd.DataFrame: .reset_index() .drop(columns=["notes"]) ) - return tags_all + return tags def _get_tags( @@ -1265,7 +1266,10 @@ def _aggregatable_dimension_tags( ) .set_index(idx) ) - _core_ferc1__table_dimensions = _core_ferc1__table_dimensions.set_index(idx) + # don't include the corrections because we will add those in later + _core_ferc1__table_dimensions = _core_ferc1__table_dimensions[ + ~_core_ferc1__table_dimensions.xbrl_factoid.str.endswith("_correction") + ].set_index(idx) tags_df = pd.concat( [ tags_df, @@ -1693,7 +1697,6 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1": """Construct a calculation forest based on class attributes.""" return XbrlCalculationForestFerc1( exploded_calcs=self.exploded_calcs, - exploded_meta=self.exploded_meta, seeds=self.seed_nodes, tags=self.tags, group_metric_checks=self.group_metric_checks, @@ -2051,7 +2054,6 @@ class XbrlCalculationForestFerc1(BaseModel): # Not sure if dynamically basing this on NodeId is really a good idea here. 
calc_cols: list[str] = list(NodeId._fields) - exploded_meta: pd.DataFrame = pd.DataFrame() exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() @@ -2170,14 +2172,13 @@ def exploded_calcs_to_digraph( Then we compile a dictionary of node attributes, based on the individual calculation components in the exploded calcs dataframe. """ - source_nodes = list( - exploded_calcs.loc[:, self.parent_cols] - .rename(columns=lambda x: x.removesuffix("_parent")) - .itertuples(name="NodeId", index=False) - ) - target_nodes = list( - exploded_calcs.loc[:, self.calc_cols].itertuples(name="NodeId", index=False) - ) + source_nodes = [ + NodeId(*x) + for x in exploded_calcs.set_index(self.parent_cols).index.to_list() + ] + target_nodes = [ + NodeId(*x) for x in exploded_calcs.set_index(self.calc_cols).index.to_list() + ] edgelist = pd.DataFrame({"source": source_nodes, "target": target_nodes}) forest = nx.from_pandas_edgelist(edgelist, create_using=nx.DiGraph) return forest @@ -2191,7 +2192,10 @@ def node_attrs(self: Self) -> dict[NodeId, dict[str, dict[str, str]]]: # Reshape the tags to turn them into a dictionary of values per-node. This # will make it easier to add arbitrary sets of tags later on. tags_dict = ( - self.tags.convert_dtypes().set_index(self.calc_cols).to_dict(orient="index") + self.tags.convert_dtypes() + .set_index(self.calc_cols) + .dropna(how="all") + .to_dict(orient="index") ) # Drop None tags created by combining multiple tagging CSVs clean_tags_dict = { @@ -2208,32 +2212,9 @@ def node_attrs(self: Self) -> dict[NodeId, dict[str, dict[str, str]]]: .reset_index() # Type conversion is necessary to get pd.NA in the index: .astype({col: pd.StringDtype() for col in self.calc_cols}) - # We need a dictionary for *all* nodes, not just those with tags. 
- .merge( - self.exploded_meta.loc[:, self.calc_cols], - how="left", - on=self.calc_cols, - validate="one_to_many", - indicator=True, - ) - # For nodes with no tags, we assign an empty dictionary: .assign(tags=lambda x: np.where(x["tags"].isna(), {}, x["tags"])) ) - lefties = node_attrs[ - (node_attrs._merge == "left_only") - & (node_attrs.table_name.isin(self.table_names)) - ] - if not lefties.empty: - logger.warning( - f"Found {len(lefties)} tags that only exist in our manually compiled " - "tags when expected none. Ensure the compiled tags match the metadata." - f"Mismatched tags:\n{lefties}" - ) - return ( - node_attrs.drop(columns=["_merge"]) - .set_index(self.calc_cols) - .to_dict(orient="index") - ) + return node_attrs.set_index(self.calc_cols).to_dict(orient="index") @cached_property def edge_attrs(self: Self) -> dict[Any, Any]: @@ -2279,6 +2260,7 @@ def annotated_forest(self: Self) -> nx.DiGraph: annotated_forest = deepcopy(self.forest) nx.set_node_attributes(annotated_forest, self.node_attrs) nx.set_edge_attributes(annotated_forest, self.edge_attrs) + annotated_forest = self.propagate_node_attributes(annotated_forest) logger.info("Checking whether any pruned nodes were also tagged.") self.check_lost_tags(lost_nodes=self.pruned) @@ -2287,6 +2269,19 @@ def annotated_forest(self: Self) -> nx.DiGraph: self.check_conflicting_tags(annotated_forest) return annotated_forest + def propagate_node_attributes(self: Self, annotated_forest: nx.DiGraph): + """Propagate tags. + + Propagate tags leafwards, rootward & to the _correction nodes. 
+ """ + ## Leafwards propagation + annotated_forest = _propagate_tags_leafward(annotated_forest, ["in_rate_base"]) + # Rootward propagation + annotated_forest = _propagate_tag_rootward(annotated_forest, "in_rate_base") + # Correction Records + annotated_forest = _propagate_tags_to_corrections(annotated_forest) + return annotated_forest + def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None: """Check whether any of the input lost nodes were also tagged nodes.""" if lost_nodes: @@ -2403,7 +2398,7 @@ def seeded_digraph(self: Self) -> nx.DiGraph: We compile a list of all the :class:`NodeId` values that should be included in the pruned graph, and then use that list to select a subset of the exploded - metadata to pass to :meth:`exploded_meta_to_digraph`, so that all of the + metadata to pass to :meth:`exploded_calcs_to_digraph`, so that all of the associated metadata is also added to the pruned graph. """ return self.prune_unrooted(self.full_digraph) @@ -2529,13 +2524,22 @@ def forest_leaves(self: Self) -> list[NodeId]: @cached_property def orphans(self: Self) -> list[NodeId]: - """Identify all nodes that appear in metadata but not in the full digraph.""" + """Identify all nodes that appear in the exploded_calcs but not in the full digraph. + + Because we removed the metadata and are now building the tree entirely based on + the exploded_calcs, this should now never produce any orphans and is a bit redundant. 
+ """ nodes = self.full_digraph.nodes - return [ - NodeId(*n) - for n in self.exploded_meta.set_index(self.calc_cols).index - if n not in nodes - ] + orphans = [] + for idx_cols in [self.calc_cols, self.parent_cols]: + orphans.extend( + [ + NodeId(*n) + for n in self.exploded_calcs.set_index(idx_cols).index + if NodeId(*n) not in nodes + ] + ) + return list(set(orphans)) @cached_property def pruned(self: Self) -> list[NodeId]: @@ -2809,3 +2813,241 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: except AttributeError: tags = pd.DataFrame() return pd.concat([index, tags], axis="columns") + + +def _propagate_tags_leafward( + annotated_forest: nx.DiGraph, leafward_inherited_tags: list[str] +) -> nx.DiGraph: + """Push a parent's tags down to its descendants. + + Only push the `leafward_inherited_tags` - others will be left alone. + """ + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + for node, parent_tags in existing_tags.items(): + descendants = nx.descendants(annotated_forest, node) + descendant_tags = { + desc: { + "tags": { + tag_name: parent_tags[tag_name] + for tag_name in leafward_inherited_tags + if tag_name in parent_tags + } + | existing_tags.get(desc, {}) + } + for desc in descendants + } + nx.set_node_attributes(annotated_forest, descendant_tags) + return annotated_forest + + +def _propagate_tag_rootward( + annotated_forest: nx.DiGraph, tag_name: Literal["in_rate_base"] +) -> nx.DiGraph: + """Set the tag for nodes when all of its children have same tag. + + This function returns the value of a tag, but also sets node attributes + down the tree when all children of a node share the same tag. 
+ """ + + def _get_tag(annotated_forest, node, tag_name): + return annotated_forest.nodes.get(node, {}).get("tags", {}).get(tag_name) + + generations = list(nx.topological_generations(annotated_forest)) + for gen in reversed(generations): + untagged_nodes = { + node_id + for node_id in gen + if _get_tag(annotated_forest, node_id, tag_name) is None + } + for parent_node in untagged_nodes: + child_tags = { + _get_tag(annotated_forest, c, tag_name) + for c in annotated_forest.successors(parent_node) + if not c.xbrl_factoid.endswith("_correction") + } + non_null_tags = child_tags - {None} + # sometimes, all children can share same tag but it's null. + if len(child_tags) == 1 and non_null_tags: + # actually assign the tag here but don't wipe out any other tags + new_node_tag = non_null_tags.pop() + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + node_tags = { + parent_node: { + "tags": {tag_name: new_node_tag} + | existing_tags.get(parent_node, {}) + } + } + nx.set_node_attributes(annotated_forest, node_tags) + return annotated_forest + + +def _propagate_tags_to_corrections(annotated_forest: nx.DiGraph) -> nx.DiGraph: + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + correction_nodes = [ + node for node in annotated_forest if node.xbrl_factoid.endswith("_correction") + ] + correction_tags = {} + for correction_node in correction_nodes: + # for every correction node, we assume that its parent's tags can apply + parents = list(annotated_forest.predecessors(correction_node)) + # all correction records should have one and only one parent + assert len(parents) == 1 + parent = parents[0] + correction_tags[correction_node] = { + "tags": existing_tags.get(parent, {}) + | existing_tags.get(correction_node, {}) + } + nx.set_node_attributes(annotated_forest, correction_tags) + return annotated_forest + + +def check_tag_propagation_compared_to_compiled_tags( + df: pd.DataFrame, + propagated_tag: Literal["in_rate_base"], 
+ _out_ferc1__explosion_tags: pd.DataFrame, +): + """Check if tags got propagated. + + Args: + df: table to check. This should be either the + :func:`out_ferc1__yearly_rate_base`, ``exploded_balance_sheet_assets_ferc1`` + or ``exploded_balance_sheet_liabilities_ferc1``. The + ``exploded_income_statement_ferc1`` table does not currently have propagated + tags. + propagated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + _out_ferc1__explosion_tags: manually compiled tags. This table includes tags from + many of the explosion tables so we will filter it before checking if the tag was + propagated. + + Raises: + AssertionError: If more ``xbrl_factoids`` carry a manually compiled tag in + ``_out_ferc1__explosion_tags`` than carry a propagated tag in ``df``. + AssertionError: If more correction ``xbrl_factoids`` carry a manually compiled + tag in ``_out_ferc1__explosion_tags`` than carry a propagated tag in ``df``. + """ + # the tag df has all tags - not just those in a specific explosion + # so we need to drop the nodes that are not in df + node_idx = list(NodeId._fields) + df_filtered = df.filter(node_idx).drop_duplicates() + df_tags = _out_ferc1__explosion_tags.merge( + df_filtered, on=list(df_filtered.columns), how="right" + ) + manually_tagged = df_tags[df_tags[propagated_tag].notnull()].xbrl_factoid.unique() + detailed_tagged = df[df[f"tags_{propagated_tag}"].notnull()].xbrl_factoid.unique() + if len(detailed_tagged) < len(manually_tagged): + raise AssertionError( + f"Found more {len(manually_tagged)} mannually compiled tagged xbrl_factoids" + " than tags in propagated detailed data." 
+ ) + manually_tagged_corrections = df_tags[ + df_tags[propagated_tag].notnull() + & df_tags.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + detailed_tagged_corrections = df[ + df[f"tags_{propagated_tag}"].notnull() + & df.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + if len(detailed_tagged_corrections) < len(manually_tagged_corrections): + raise AssertionError( + f"Found more {len(manually_tagged_corrections)} mannually compiled " + "tagged xbrl_factoids than tags in propagated detailed data." + ) + + +def check_for_correction_xbrl_factoids_with_tag( + df: pd.DataFrame, propagated_tag: Literal["in_rate_base"] +): + """Check if any correction records have tags. + + Args: + df: table to check. This should be either the + :func:`out_ferc1__yearly_rate_base`, ``exploded_balance_sheet_assets_ferc1`` + or ``exploded_balance_sheet_liabilities_ferc1``. The + ``exploded_income_statement_ferc1`` table does not currently have propagated + tags. + propagated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + + Raises: + AssertionError: If there are zero correction ``xbrl_factoids`` in ``df`` with tags. + """ + detailed_tagged_corrections = df[ + df[f"tags_{propagated_tag}"].notnull() + & df.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + if len(detailed_tagged_corrections) == 0: + raise AssertionError( + "We expect there to be more than zero correction recrods with tags, but " + f"found {len(detailed_tagged_corrections)}." + ) + + +@asset +def out_ferc1__yearly_rate_base( + _out_ferc1__detailed_balance_sheet_assets: pd.DataFrame, + _out_ferc1__detailed_balance_sheet_liabilities: pd.DataFrame, + core_ferc1__yearly_operating_expenses_sched320: pd.DataFrame, + _out_ferc1__detailed_tags: pd.DataFrame, +) -> pd.DataFrame: + """Make a table of granular utility rate-base data. + + This table contains granular data consisting of what utilities can + include in their rate bases. 
This information comes from two core + inputs: ``exploded_balance_sheet_assets_ferc1`` and + ``exploded_balance_sheet_liabilities_ferc1``. These tables include granular + data from the nested calculations that are built into the accounting tables. + See :class:`Exploder` for more details. + + This rate base table also contains one specific addition from + :ref:`core_ferc1__yearly_operating_expenses_sched320`. In standard ratemaking + processes, utilities are allowed to include working capital - sometimes referred + to as cash on hand or cash reserves. A standard ratemaking process is to consider + the available rate-baseable working capital to be one eighth of the average + operations and maintenance expense. This function grabs that expense and + concatenates it with the rest of the assets and liabilities from the granular + exploded data. + + """ + # get the factoid name to grab the right part of the table + xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[ + "core_ferc1__yearly_operating_expenses_sched320" + ]().params.xbrl_factoid_name + # First grab the working capital out of the operating expense table. + # then prep it for concatenating. Calculate working capital & add tags + cash_working_capital = ( + core_ferc1__yearly_operating_expenses_sched320[ + core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name] + == "operations_and_maintenance_expenses_electric" + ] + .assign( + dollar_value=lambda x: x.dollar_value.divide(8), + xbrl_factoid="cash_working_capital", # newly defined (do we need to add it anywhere?) 
+ tags_rate_base_category="net_working_capital", + tags_aggregatable_utility_type="electric", + table_name="core_ferc1__yearly_operating_expenses_sched320", + ) + .drop(columns=[xbrl_factoid_name]) + # the assets/liabilites both use ending_balance for its main $$ column + .rename(columns={"dollar_value": "ending_balance"}) + ) + # then select only the leafy exploded records that are in rate base and concat + in_rate_base = pd.concat( + [ + _out_ferc1__detailed_balance_sheet_assets[ + _out_ferc1__detailed_balance_sheet_assets.tags_in_rate_base.isin( + ["yes", "partial"] + ) + ], + _out_ferc1__detailed_balance_sheet_liabilities[ + _out_ferc1__detailed_balance_sheet_liabilities.tags_in_rate_base.isin( + ["yes", "partial"] + ) + ].assign(ending_balance=lambda x: -x.ending_balance), + cash_working_capital, + ] + ).sort_values(by=["report_year", "utility_id_ferc1", "table_name"], ascending=False) + # note: we need the `tags_in_rate_base` column for these checks + check_tag_propagation_compared_to_compiled_tags( + in_rate_base, "in_rate_base", _out_ferc1__detailed_tags + ) + check_for_correction_xbrl_factoids_with_tag(in_rate_base, "in_rate_base") + return in_rate_base diff --git a/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv b/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv index b7df41c279..59fb27acf9 100644 --- a/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv +++ b/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv @@ -26,14 +26,4 @@ core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_ core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,total core_ferc1__yearly_utility_plant_summary_sched200,amortization_of_plant_acquisition_adjustment,total core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases,in_service 
-core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_plant_purchased_or_sold_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_experimental_plant_unclassified_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_unclassified_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_and_construction_work_in_progress_correction,construction_work_in_progress -core_ferc1__yearly_utility_plant_summary_sched200,accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_correction,total -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_net_correction,total -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_utility_plant_in_service_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_amortization_and_depletion_utility_plant_leased_to_others_correction,leased -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_utility_plant_held_for_future_use_correction,future -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases_correction,in_service core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,leased diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index cec121a5db..d86faf1613 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -1,8 +1,6 @@ """Tests for the FERC Form 1 output functions. -These need to be recreated to work with the new XbrlCalculationForest implementation. - -Stuff to test: +Stuff we could test: - construction of basic tree from input metadata - do nodes not part of any calculation get orphaned? - do nodes not in the seeded digraph get pruned? 
@@ -12,54 +10,360 @@ - pruning of passthrough nodes & associated corrections - propagation of weights - conflicting weights -- propagation of tags - conflicting tags - validation of calculations using only leaf-nodes to reproduce root node values +Stuff we are testing: +- propagation of tags + """ -import json import logging +import unittest +import networkx as nx import pandas as pd import pytest -from pudl.output.ferc1 import get_core_ferc1_asset_description - -# from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 +from pudl.helpers import dedupe_n_flatten_list_of_lists +from pudl.output.ferc1 import ( + NodeId, + XbrlCalculationForestFerc1, + get_core_ferc1_asset_description, +) logger = logging.getLogger(__name__) -EXPLODED_META_IDX = ["table_name", "xbrl_factoid"] -TEST_CALC_1 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1"]}, - {"name": "reported_2", "weight": -1.0, "source_tables": ["table_1"]}, -] - -TEST_CALC_2 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1", "table_2"]}, - {"name": "reported_2", "weight": -1.0, "source_tables": ["table_1"]}, -] - -TEST_CALC_3 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1"]}, - {"name": "reported_3", "weight": 1.0, "source_tables": ["table_3"]}, -] - -TEST_EXPLODED_META: pd.DataFrame = ( - pd.DataFrame( - columns=["table_name", "xbrl_factoid", "calculations", "xbrl_factoid_original"], - data=[ - ("table_1", "reported_1", "[]", "reported_original_1"), - ("table_1", "reported_2", "[]", "reported_original_2"), - ("table_1", "calc_1", json.dumps(TEST_CALC_1), "calc_original_1"), - ("table_2", "calc_2", json.dumps(TEST_CALC_2), "calc_original_2"), - ("table_1", "calc_3", json.dumps(TEST_CALC_3), "calc_original_3"), - ], - ) - .convert_dtypes() - .set_index(EXPLODED_META_IDX) -) + +class TestForestSetup(unittest.TestCase): + def setUp(self): + # this is where you add nodes you want to use + pass + + def _exploded_calcs_from_edges(self, edges: 
list[tuple[NodeId, NodeId]]): + records = [] + for parent, child in edges: + record = {"weight": 1} + for field in NodeId._fields: + record[f"{field}_parent"] = parent.__getattribute__(field) + record[field] = child.__getattribute__(field) + records.append(record) + dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} + dtype_child = {col: pd.StringDtype() for col in NodeId._fields} + dtype_weight = {"weight": pd.Int64Dtype()} + + return pd.DataFrame.from_records(records).astype( + dtype_child | dtype_parent | dtype_weight + ) + + def build_forest( + self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None + ): + if not seeds: + seeds = [self.parent] + forest = XbrlCalculationForestFerc1( + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=seeds, + tags=tags, + ) + return forest + + def build_forest_and_annotated_tags( + self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None + ): + """Build a forest, test forest nodes and return annotated tags. + + Args: + edges: list of tuples + tags: dataframe of tags + seeds: list of seed nodes. Default is None and will assume seed node is + ``parent``. 
+ """ + simple_forest = self.build_forest(edges, tags, seeds) + annotated_forest = simple_forest.annotated_forest + # ensure no nodes got dropped + assert len(annotated_forest.nodes) == len(dedupe_n_flatten_list_of_lists(edges)) + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + return annotated_tags + + +class TestPrunnedNode(TestForestSetup): + def setUp(self): + self.root = NodeId( + table_name="table_1", + xbrl_factoid="reported_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_child = NodeId( + table_name="table_1", + xbrl_factoid="reported_11", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_other = NodeId( + table_name="table_1", + xbrl_factoid="reported_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_other_child = NodeId( + table_name="table_1", + xbrl_factoid="reported_21", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + + def test_pruned_nodes(self): + edges = [(self.root, self.root_child), (self.root_other, self.root_other_child)] + tags = pd.DataFrame(columns=list(NodeId._fields)).convert_dtypes() + forest = XbrlCalculationForestFerc1( + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.root], + tags=tags, + ) + pruned = forest.pruned + assert set(pruned) == {self.root_other, self.root_other_child} + + +class TestTagPropagation(TestForestSetup): + def setUp(self): + self.parent = NodeId( + table_name="table_1", + xbrl_factoid="reported_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.parent_correction = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_correction", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child1 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child2 = 
NodeId( + table_name="table_1", + xbrl_factoid="reported_1_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.grand_child11 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.grand_child12 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child1_correction = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_correction", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + + def test_leafward_prop_undecided_children(self): + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", pd.NA, pd.NA] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + for child_node in [self.child1, self.child2]: + assert ( + annotated_tags[child_node]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) + + def test_leafward_prop_disagreeing_child(self): + """Don't force the disagreeing child to follow the parent.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", "no", pd.NA] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert ( + annotated_tags[self.child2]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) + + def test_leafward_prop_preserve_non_propagating_tags(self): + """Only propagate tags that actually get inherited - i.e., not `in_root_boose`.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = 
pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", "no", pd.NA], + in_root_boose=["yus", "nu", pd.NA], + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert ( + annotated_tags[self.child2]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) + assert annotated_tags[self.parent]["in_root_boose"] == "yus" + assert annotated_tags[self.child1]["in_root_boose"] == "nu" + assert not annotated_tags[self.child2].get("in_root_boose") + + def test_rootward_prop_disagreeing_children(self): + """Parents should not pick sides between disagreeing children.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.child1, self.child2]).assign( + in_rate_base=["no", "yes"] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert not annotated_tags.get(self.parent) + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert annotated_tags[self.child2]["in_rate_base"] == "yes" + + def test_prop_no_tags(self): + """If no tags, don't propagate anything. + + This also tests whether a fully null tag input behaves the same as an + empty df. It also checks whether we get the expected behavior when + the propagated tags are all null but there is another non-propagating + tag. 
+ """ + edges = [(self.parent, self.child1), (self.parent, self.child2)] + null_tag_edges = [self.parent, self.child1, self.child2] + tags = pd.DataFrame(null_tag_edges).assign(in_rate_base=[pd.NA, pd.NA, pd.NA]) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for node in null_tag_edges: + assert not annotated_tags.get(node) + + tags = pd.DataFrame(columns=NodeId._fields).convert_dtypes() + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for node in null_tag_edges: + assert not annotated_tags.get(node) + + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=[pd.NA, pd.NA, pd.NA], + a_non_propped_tag=["hi", "hello", "what_am_i_doing_here_even"], + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for node in null_tag_edges: + assert "in_rate_base" not in annotated_tags[node] + # do we still have a non-null value for the non-propped tag + assert annotated_tags[node].get("a_non_propped_tag") + + def test_annotated_forest_propagates_rootward(self): + """If two grandchildren have the same tag, their parent does inherit the tag. + + But, the rootward propagation only happens when all of a node's children have + the same tag. 
+ """ + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( + in_rate_base=["yes", "yes"] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert self.parent not in annotated_tags + assert annotated_tags[self.child1]["in_rate_base"] == "yes" + assert self.child2 not in annotated_tags + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child12]["in_rate_base"] == "yes" + + def test_annotated_forest_propagates_rootward_disagreeing_sibling(self): + """If two siblings disagree, their parent does not inherit either of their tag values.""" + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( + in_rate_base=["yes", "no"] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + assert not annotated_tags.get(self.parent) + assert not annotated_tags.get(self.child1) + assert not annotated_tags.get(self.child2) + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child12]["in_rate_base"] == "no" + + def test_annotated_forest_propagates_rootward_correction(self): + edges = [ + (self.child1, self.grand_child11), + (self.child1, self.child1_correction), + ] + tags = pd.DataFrame([self.child1]).assign(in_rate_base=["yes"]) + annotated_tags = self.build_forest_and_annotated_tags( + edges, tags, seeds=[self.child1] + ) + assert annotated_tags[self.child1]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert ( + annotated_tags[self.child1_correction]["in_rate_base"] + == annotated_tags[self.child1]["in_rate_base"] + ) + + def 
test_annotated_forest_propagates_rootward_two_layers(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + pre_assigned_yes_nodes = [self.child2, self.grand_child11, self.grand_child12] + tags = pd.DataFrame(pre_assigned_yes_nodes).assign( + in_rate_base=["yes"] * len(pre_assigned_yes_nodes), + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for pre_yes_node in pre_assigned_yes_nodes: + assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" + for post_yes_node in [self.child1, self.parent]: + assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" + + def test_annotated_forest_propagates_rootward_two_layers_plus_corrections(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.parent, self.parent_correction), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + (self.child1, self.child1_correction), + ] + pre_assigned_yes_nodes = [self.child2, self.grand_child11, self.grand_child12] + tags = pd.DataFrame(pre_assigned_yes_nodes).assign( + in_rate_base=["yes"] * len(pre_assigned_yes_nodes), + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for pre_yes_node in pre_assigned_yes_nodes: + assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" + for post_yes_node in [ + self.child1, + self.parent, + self.child1_correction, + self.parent_correction, + ]: + assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" def test_get_core_ferc1_asset_description(): @@ -72,21 +376,3 @@ def test_get_core_ferc1_asset_description(): invalid_core_ferc1_asset_name = "core_ferc1__income_statements" with pytest.raises(ValueError): get_core_ferc1_asset_description(invalid_core_ferc1_asset_name) - - -# LEAF_NODE_1 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "reported_1")], -# ) -# LEAF_NODE_2 = 
XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "reported_2")], -# ) -# CALC_TREE_1 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "calc_1")], -# ) -# CALC_TREE_2 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_2", "calc_2")], -# )