From 3a9548c43e0abbe60cac9534e04da69843a5a6c6 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Thu, 4 Jan 2024 13:25:19 -0500 Subject: [PATCH 01/17] ensure all the corrections get tags and add the begining of a rate base asset --- src/pudl/output/ferc1.py | 39 ++++++++++++++++++- .../ferc1/xbrl_factoid_plant_status_tags.csv | 10 ----- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index edf6f68247..509b3f047e 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1191,7 +1191,22 @@ def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame: .reset_index() .drop(columns=["notes"]) ) - return tags_all + # Add the correction records to the tags with the same tags as the parent + idx = list(NodeId._fields) + correction_index = ( + table_dimensions_ferc1[ + ~table_dimensions_ferc1.xbrl_factoid.str.endswith("_correction") + ] + .set_index(idx) + .index + ) + corrections = tags_all.set_index(idx) + corrections = ( + corrections.loc[corrections.index.intersection(correction_index)] + .reset_index() + .assign(xbrl_factoid=lambda x: x.xbrl_factoid + "_correction") + ) + return pd.concat([tags_all, corrections]) def _get_tags(file_name: str, table_dimensions_ferc1: pd.DataFrame) -> pd.DataFrame: @@ -1236,7 +1251,10 @@ def _aggregatable_dimension_tags( ) .set_index(idx) ) - table_dimensions_ferc1 = table_dimensions_ferc1.set_index(idx) + # don't include the corrections + table_dimensions_ferc1 = table_dimensions_ferc1[ + ~table_dimensions_ferc1.xbrl_factoid.str.endswith("_correction") + ].set_index(idx) tags_df = pd.concat( [ tags_df, @@ -2774,3 +2792,20 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: except AttributeError: tags = pd.DataFrame() return pd.concat([index, tags], axis="columns") + + +def out_ferc1__yearly_rate_base( + exploded_balance_sheet_assets_ferc1, exploded_balance_sheet_liabilities_ferc1 +): + """Make a table of only rate-base data.""" + in_rate_base = pd.concat( + [ + exploded_balance_sheet_assets_ferc1[ + exploded_balance_sheet_assets_ferc1.tags_in_rate_base == "yes" + ], + exploded_balance_sheet_liabilities_ferc1[ + exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base == "yes" + ], + ] + ).drop(columns=["tags_in_rate_base"]) + return in_rate_base diff --git a/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv b/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv index b7df41c279..59fb27acf9 100644 --- a/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv +++ b/src/pudl/package_data/ferc1/xbrl_factoid_plant_status_tags.csv @@ -26,14 +26,4 @@ core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_ core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,total core_ferc1__yearly_utility_plant_summary_sched200,amortization_of_plant_acquisition_adjustment,total core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_plant_purchased_or_sold_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_experimental_plant_unclassified_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_unclassified_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_and_construction_work_in_progress_correction,construction_work_in_progress -core_ferc1__yearly_utility_plant_summary_sched200,accumulated_provision_for_depreciation_amortization_and_depletion_of_plant_utility_correction,total -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_net_correction,total -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_utility_plant_in_service_correction,in_service -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_amortization_and_depletion_utility_plant_leased_to_others_correction,leased -core_ferc1__yearly_utility_plant_summary_sched200,depreciation_and_amortization_utility_plant_held_for_future_use_correction,future -core_ferc1__yearly_utility_plant_summary_sched200,utility_plant_in_service_classified_and_property_under_capital_leases_correction,in_service core_ferc1__yearly_utility_plant_summary_sched200,abandonment_of_leases,leased From f33aa8217881a8faeb27702ee09e1f2dabc15fa0 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 10 Jan 2024 11:33:25 -0500 Subject: [PATCH 02/17] Add in cash on hand as an additional factoid into rate base table --- src/pudl/output/ferc1.py | 60 ++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 509b3f047e..a51cacee80 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2795,17 +2795,55 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: def out_ferc1__yearly_rate_base( - exploded_balance_sheet_assets_ferc1, exploded_balance_sheet_liabilities_ferc1 -): + exploded_balance_sheet_assets_ferc1: pd.DataFrame, + exploded_balance_sheet_liabilities_ferc1: pd.DataFrame, + core_ferc1__yearly_operating_expenses_sched320: pd.DataFrame, +) -> pd.DataFrame: """Make a table of only rate-base data.""" - in_rate_base = pd.concat( - [ - exploded_balance_sheet_assets_ferc1[ - exploded_balance_sheet_assets_ferc1.tags_in_rate_base == "yes" - ], - exploded_balance_sheet_liabilities_ferc1[ - exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base == "yes" - ], + # First grab the cash on hand out of the operating expense table. + xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[ + "core_ferc1__yearly_operating_expenses_sched320" + ]().params.xbrl_factoid_name + pks = pudl.metadata.classes.Resource.from_id( + "core_ferc1__yearly_operating_expenses_sched320" + ).schema.primary_key + cash_working_capital = ( + core_ferc1__yearly_operating_expenses_sched320[ + core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name].isin( + [ + f"operations_and_maintenance_expenses_electric{suffix}" + for suffix in ["", "_correction"] + ] + ) ] - ).drop(columns=["tags_in_rate_base"]) + .groupby(pks + ["utility_type"], as_index=False)[["dollar_value"]] + .sum(min_count=1) + .assign( + dollar_value=lambda x: x.dollar_value / 8, + xbrl_factoid="cash_on_hand", # newly definied (do we need to add it anywhere?) + tags_rate_base_category="net_working_capital", + tags_aggregatable_utility_type="electric", + table_name="core_ferc1__yearly_operating_expenses_sched320", + ) + .drop(columns=[xbrl_factoid_name]) + .rename(columns={"dollar_value": "ending_balance"}) + ) + # then select only the leafy exploded records that are in rate base and concat + in_rate_base = ( + pd.concat( + [ + exploded_balance_sheet_assets_ferc1[ + exploded_balance_sheet_assets_ferc1.tags_in_rate_base == "yes" + ], + exploded_balance_sheet_liabilities_ferc1[ + exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base == "yes" + ], + cash_working_capital, + ] + ) + .drop(columns=["tags_in_rate_base"]) + .sort_values( + by=["report_year", "utility_id_ferc1", "table_name"], ascending=False + ) + ) return in_rate_base From ea8301e5853ca816af9b804c100e91e1f41be2eb Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 10 Jan 2024 16:35:20 -0500 Subject: [PATCH 03/17] add documentation for rate base table --- src/pudl/output/ferc1.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index a51cacee80..bd4de0e425 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2799,7 +2799,25 @@ def out_ferc1__yearly_rate_base( exploded_balance_sheet_liabilities_ferc1: pd.DataFrame, core_ferc1__yearly_operating_expenses_sched320: pd.DataFrame, ) -> pd.DataFrame: - """Make a table of only rate-base data.""" + """Make a table of granular utility rate-base data. + + This table contains granular data consisting of what utilities can + include in their rate bases. This information comes from two core + inputs: ``exploded_balance_sheet_assets_ferc1`` and + ``exploded_balance_sheet_liabilities_ferc1``. These tables include granular + data from the nested calculations that are build into the accounting tables. + See :class:`Exploder` for more details. + + This rate base table also contains one specific addition from + :ref:`core_ferc1__yearly_operating_expenses_sched320`. In standard ratemaking + processes, utilities are enabled to include working capital - sometimes referred + to as cash on hand or cash reverves. A standard ratemaking process is to consider + the available rate-baseable working capital to be one eigth of the average + operations and maintenance expense. This function grabs that expense and + concatenates it with the rest of the assets and liabilities from the granular + exploded data. + + """ # First grab the cash on hand out of the operating expense table. xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[ "core_ferc1__yearly_operating_expenses_sched320" @@ -2807,6 +2825,8 @@ def out_ferc1__yearly_rate_base( pks = pudl.metadata.classes.Resource.from_id( "core_ferc1__yearly_operating_expenses_sched320" ).schema.primary_key + # grab the factoid and its correction records - then group them together + # to produce on cash_on_hand factoid to concat cash_working_capital = ( core_ferc1__yearly_operating_expenses_sched320[ core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name].isin( @@ -2826,6 +2846,7 @@ def out_ferc1__yearly_rate_base( table_name="core_ferc1__yearly_operating_expenses_sched320", ) .drop(columns=[xbrl_factoid_name]) + # the assets/liabilites both use ending_balance for its main $$ column .rename(columns={"dollar_value": "ending_balance"}) ) # then select only the leafy exploded records that are in rate base and concat From 24cf1cfb21554777b5b50b8a8e6fefc09e908478 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 10 Jan 2024 16:45:43 -0500 Subject: [PATCH 04/17] remove _correction record from the expense. the correction corrects the calculations of the parent (operating expense) and its child subcomponents. if we were calculating the expense i would want to include the correction but i don't want it if we are just grabbing the reported value --- src/pudl/output/ferc1.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index bd4de0e425..b5523440dd 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2818,26 +2818,17 @@ def out_ferc1__yearly_rate_base( exploded data. """ - # First grab the cash on hand out of the operating expense table. + # get the factoid name to grab the right part of the table xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[ "core_ferc1__yearly_operating_expenses_sched320" ]().params.xbrl_factoid_name - pks = pudl.metadata.classes.Resource.from_id( - "core_ferc1__yearly_operating_expenses_sched320" - ).schema.primary_key - # grab the factoid and its correction records - then group them together - # to produce on cash_on_hand factoid to concat + # First grab the cash on hand out of the operating expense table. + # then prep it for concating. Calculate cash on hand & add tags cash_working_capital = ( core_ferc1__yearly_operating_expenses_sched320[ - core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name].isin( - [ - f"operations_and_maintenance_expenses_electric{suffix}" - for suffix in ["", "_correction"] - ] - ) + core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name] + == "operations_and_maintenance_expenses_electric" ] - .groupby(pks + ["utility_type"], as_index=False)[["dollar_value"]] - .sum(min_count=1) .assign( dollar_value=lambda x: x.dollar_value / 8, xbrl_factoid="cash_on_hand", # newly definied (do we need to add it anywhere?) From 6d41c5cd0326c6fcc7043be613e5800a890b45ee Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 16 Jan 2024 11:00:12 -0500 Subject: [PATCH 05/17] attempt to associate tags with _correction factoids when all child calc componets have same tags --- src/pudl/output/ferc1.py | 109 +++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 27 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index b5523440dd..6955f00317 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1154,7 +1154,10 @@ class OffByFactoid(NamedTuple): @asset -def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame: +def _out_ferc1__explosion_tags( + table_dimensions_ferc1: pd.DataFrame, + calculation_components_xbrl_ferc1: pd.DataFrame, +) -> pd.DataFrame: """Grab the stored tables of tags and add inferred dimension.""" rate_tags = _get_tags("xbrl_factoid_rate_base_tags.csv", table_dimensions_ferc1) rev_req_tags = _get_tags( @@ -1180,9 +1183,10 @@ def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame: plant_function_tags, utility_type_tags, ] - tags_all = ( + tag_idx = list(NodeId._fields) + tags = ( pd.concat( - [df.set_index(list(NodeId._fields)) for df in tag_dfs], + [df.set_index(tag_idx) for df in tag_dfs], join="outer", verify_integrity=True, ignore_index=False, @@ -1191,22 +1195,10 @@ def _out_ferc1__explosion_tags(table_dimensions_ferc1) -> pd.DataFrame: .reset_index() .drop(columns=["notes"]) ) - # Add the correction records to the tags with the same tags as the parent - idx = list(NodeId._fields) - correction_index = ( - table_dimensions_ferc1[ - ~table_dimensions_ferc1.xbrl_factoid.str.endswith("_correction") - ] - .set_index(idx) - .index - ) - corrections = tags_all.set_index(idx) - corrections = ( - corrections.loc[corrections.index.intersection(correction_index)] - .reset_index() - .assign(xbrl_factoid=lambda x: x.xbrl_factoid + "_correction") - ) - return pd.concat([tags_all, corrections]) + # Add the correction records to the tags... + corrections = make_correction_tags(tags, calculation_components_xbrl_ferc1) + tags = pd.concat([tags, corrections]) + return tags def _get_tags(file_name: str, table_dimensions_ferc1: pd.DataFrame) -> pd.DataFrame: @@ -1251,7 +1243,7 @@ def _aggregatable_dimension_tags( ) .set_index(idx) ) - # don't include the corrections + # don't include the corrections because we will add those in later table_dimensions_ferc1 = table_dimensions_ferc1[ ~table_dimensions_ferc1.xbrl_factoid.str.endswith("_correction") ].set_index(idx) @@ -1267,6 +1259,64 @@ def _aggregatable_dimension_tags( return tags_df[tags_df[aggregatable_col] != "total"] +def make_correction_tags( + tags_all: pd.DataFrame, calc_components: pd.DataFrame +) -> pd.DataFrame: + """Make tags for correction records. + + We need to check to see if any of the tags in each of the calculated + parent factoids are the same for all of their child components. So in this + function, we're going to merge on the tags to the children then groupby the + parents. For each tag, see if the childrens'tags contains only one unique value. + If so grab the tag to associate with the correction record of the parent. If not, + no tag will be associated with the record. + """ + tag_idx = list(NodeId._fields) + calcs_w_tags = ( + pd.merge( # remove the correction records bc those are the ones we want to + calc_components[~calc_components.xbrl_factoid.str.contains("_correction")], + tags_all, + on=tag_idx, + how="left", + validate="m:1", + ) + ) + # use the same groupby to get the number of unique tags and the first one + # we will only use the first tag if the tags are unique + tag_cols = list(tags_all.drop(columns=tag_idx).columns) + tag_gb = calcs_w_tags.groupby([f"{c}_parent" for c in tag_idx], dropna=False)[ + tag_cols + ] + tag_check = pd.merge( + tag_gb.nunique( + dropna=False + ), # bc if null and non-null tag we want to know that + tag_gb.first(), + right_index=True, + left_index=True, + suffixes=("_n", ""), + validate="1:1", + ) + # null out all of the tags that have non-unique tags for each parent + for col in tag_cols: + non_unique_mask = tag_check[f"{col}_n"] != 1 + tag_check.loc[non_unique_mask, col] = pd.NA + # specifically for in_rate_base assign partial when it is a mix + tag_check.loc[tag_check["in_rate_base_n"] > 1, "in_rate_base"] = "partial" + # remove the fully null tags bc there's nothing new in there and + # drop all of the _n columns + tag_check = tag_check.dropna(how="all", subset=tag_cols)[tag_cols] + # remove the parent from the index name + tag_check.index.names = [ + col.removesuffix("_parent") for col in tag_check.index.names + ] + correction_tags = tag_check.reset_index().assign( + xbrl_factoid=lambda x: x.xbrl_factoid + "_correction" + ) + logger.info(f"Found {len(correction_tags)=}") + return correction_tags + + def exploded_table_asset_factory( root_table: str, table_names: list[str], @@ -2794,6 +2844,7 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: return pd.concat([index, tags], axis="columns") +@asset def out_ferc1__yearly_rate_base( exploded_balance_sheet_assets_ferc1: pd.DataFrame, exploded_balance_sheet_liabilities_ferc1: pd.DataFrame, @@ -2822,16 +2873,16 @@ def out_ferc1__yearly_rate_base( xbrl_factoid_name = pudl.transform.ferc1.FERC1_TFR_CLASSES[ "core_ferc1__yearly_operating_expenses_sched320" ]().params.xbrl_factoid_name - # First grab the cash on hand out of the operating expense table. - # then prep it for concating. Calculate cash on hand & add tags + # First grab the working capital out of the operating expense table. + # then prep it for concating. Calculate working capital & add tags cash_working_capital = ( core_ferc1__yearly_operating_expenses_sched320[ core_ferc1__yearly_operating_expenses_sched320[xbrl_factoid_name] == "operations_and_maintenance_expenses_electric" ] .assign( - dollar_value=lambda x: x.dollar_value / 8, - xbrl_factoid="cash_on_hand", # newly definied (do we need to add it anywhere?) + dollar_value=lambda x: x.dollar_value.divide(8), + xbrl_factoid="cash_working_capital", # newly definied (do we need to add it anywhere?) tags_rate_base_category="net_working_capital", tags_aggregatable_utility_type="electric", table_name="core_ferc1__yearly_operating_expenses_sched320", @@ -2845,15 +2896,19 @@ def out_ferc1__yearly_rate_base( pd.concat( [ exploded_balance_sheet_assets_ferc1[ - exploded_balance_sheet_assets_ferc1.tags_in_rate_base == "yes" + exploded_balance_sheet_assets_ferc1.tags_in_rate_base.isin( + ["yes", "partial"] + ) ], exploded_balance_sheet_liabilities_ferc1[ - exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base == "yes" + exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base.isin( + ["yes", "partial"] + ) ], cash_working_capital, ] ) - .drop(columns=["tags_in_rate_base"]) + # .drop(columns=["tags_in_rate_base"]) .sort_values( by=["report_year", "utility_id_ferc1", "table_name"], ascending=False ) From 80ccf5d8ceceb20747dd14ba6c559da9fb678215 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 17 Jan 2024 16:21:19 -0500 Subject: [PATCH 06/17] Add a simple XbrlCalculationForest test. Just see if we can get an annotated forest at all right now. TODO: test for tag propagation behavior. Co-authored-by: Christina Gosnell --- src/pudl/output/ferc1.py | 16 +++--- test/unit/output/ferc1_test.py | 99 ++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 6955f00317..f8145c05c0 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2088,6 +2088,7 @@ class XbrlCalculationForestFerc1(BaseModel): exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() + # TODO: remove the group metric checks and see if things still build / tests still pass group_metric_checks: GroupMetricChecks = GroupMetricChecks() model_config = ConfigDict( arbitrary_types_allowed=True, ignored_types=(cached_property,) @@ -2203,14 +2204,13 @@ def exploded_calcs_to_digraph( Then we compile a dictionary of node attributes, based on the individual calculation components in the exploded calcs dataframe. """ - source_nodes = list( - exploded_calcs.loc[:, self.parent_cols] - .rename(columns=lambda x: x.removesuffix("_parent")) - .itertuples(name="NodeId", index=False) - ) - target_nodes = list( - exploded_calcs.loc[:, self.calc_cols].itertuples(name="NodeId", index=False) - ) + source_nodes = [ + NodeId(*x) + for x in exploded_calcs.set_index(self.parent_cols).index.to_list() + ] + target_nodes = [ + NodeId(*x) for x in exploded_calcs.set_index(self.calc_cols).index.to_list() + ] edgelist = pd.DataFrame({"source": source_nodes, "target": target_nodes}) forest = nx.from_pandas_edgelist(edgelist, create_using=nx.DiGraph) return forest diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index b7c02312a0..f5947b8d31 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -18,59 +18,68 @@ """ -import json import logging import pandas as pd -# from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 +from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 logger = logging.getLogger(__name__) -EXPLODED_META_IDX = ["table_name", "xbrl_factoid"] -TEST_CALC_1 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1"]}, - {"name": "reported_2", "weight": -1.0, "source_tables": ["table_1"]}, -] -TEST_CALC_2 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1", "table_2"]}, - {"name": "reported_2", "weight": -1.0, "source_tables": ["table_1"]}, -] - -TEST_CALC_3 = [ - {"name": "reported_1", "weight": 1.0, "source_tables": ["table_1"]}, - {"name": "reported_3", "weight": 1.0, "source_tables": ["table_3"]}, -] +# TODO: give this a better name once we know what behavior we're actually testing +def test_annotated_forest(): + tags = pd.DataFrame( + columns=[ + "table_name", + "xbrl_factoid", + "utility_type", + "plant_status", + "plant_function", + ] + ) + parent = NodeId( + table_name="table_1", + xbrl_factoid="reported_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + child1 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + child2 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) -TEST_EXPLODED_META: pd.DataFrame = ( - pd.DataFrame( - columns=["table_name", "xbrl_factoid", "calculations", "xbrl_factoid_original"], - data=[ - ("table_1", "reported_1", "[]", "reported_original_1"), - ("table_1", "reported_2", "[]", "reported_original_2"), - ("table_1", "calc_1", json.dumps(TEST_CALC_1), "calc_original_1"), - ("table_2", "calc_2", json.dumps(TEST_CALC_2), "calc_original_2"), - ("table_1", "calc_3", json.dumps(TEST_CALC_3), "calc_original_3"), - ], + edges = [(parent, child1), (parent, child2)] + records = [] + for parent, child in edges: + record = {"weight": 1} + for field in NodeId._fields: + record[f"{field}_parent"] = parent.__getattribute__(field) + record[field] = child.__getattribute__(field) + records.append(record) + dtype_child = {col: pd.StringDtype() for col in NodeId._fields} + dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} + dtype_weight = {"weight": pd.Int64Dtype()} + exploded_calcs = pd.DataFrame.from_records(records).astype( + dtype_child | dtype_parent | dtype_weight ) - .convert_dtypes() - .set_index(EXPLODED_META_IDX) -) + exploded_meta = pd.DataFrame([parent, child1, child2]).astype(dtype_child) -# LEAF_NODE_1 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "reported_1")], -# ) -# LEAF_NODE_2 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "reported_2")], -# ) -# CALC_TREE_1 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_1", "calc_1")], -# ) -# CALC_TREE_2 = XbrlCalculationForestFerc1( -# exploded_meta=TEST_EXPLODED_META, -# seeds=[NodeId("table_2", "calc_2")], -# ) + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=exploded_meta, + exploded_calcs=exploded_calcs, + seeds=[parent], + tags=tags, + ) + assert len(simple_forest.annotated_forest.nodes) == 3 From 50615cbab89895e13010d47239e90acdf3213454 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Wed, 17 Jan 2024 16:48:52 -0500 Subject: [PATCH 07/17] WIP: write down some to-dos for test cases. --- test/unit/output/ferc1_test.py | 53 +++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index f5947b8d31..e1c427415f 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -20,6 +20,7 @@ import logging +import networkx as nx import pandas as pd from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 @@ -27,17 +28,11 @@ logger = logging.getLogger(__name__) -# TODO: give this a better name once we know what behavior we're actually testing -def test_annotated_forest(): - tags = pd.DataFrame( - columns=[ - "table_name", - "xbrl_factoid", - "utility_type", - "plant_status", - "plant_function", - ] - ) +# TODO: combine these into a class because we have a lot of similar method names +# TODO: make graph construction easier with helper functions + + +def test_annotated_forest_propagates_leafward(): parent = NodeId( table_name="table_1", xbrl_factoid="reported_1", @@ -71,15 +66,47 @@ def test_annotated_forest(): dtype_child = {col: pd.StringDtype() for col in NodeId._fields} dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} dtype_weight = {"weight": pd.Int64Dtype()} + exploded_calcs = pd.DataFrame.from_records(records).astype( dtype_child | dtype_parent | dtype_weight ) exploded_meta = pd.DataFrame([parent, child1, child2]).astype(dtype_child) - + tags = pd.DataFrame([parent]).assign(in_rate_base="yes") simple_forest = XbrlCalculationForestFerc1( exploded_meta=exploded_meta, exploded_calcs=exploded_calcs, seeds=[parent], tags=tags, ) - assert len(simple_forest.annotated_forest.nodes) == 3 + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[parent]["in_rate_base"] == "yes" + assert ( + annotated_tags[parent]["in_rate_base"] == annotated_tags[child1]["in_rate_base"] + ) + assert ( + annotated_tags[parent]["in_rate_base"] == annotated_tags[child2]["in_rate_base"] + ) + + +def test_annotated_forest_propagates_rootward(): + pass + + +def test_annotated_forest_propagates_corrections(): + pass + + +def test_annotate_forest_propagates_both_dirs_with_corrections(): + pass + + +def test_annotate_forest_does_not_propagate(): + # if a parent has two disagreeing children + pass + + +def test_annoted_forest_does_propagate_null_and_value(): + # if a parent has some children with one value and some with nulls + pass From 1ee6c7cae17be052260d9d06d07a2a8a7062b4b3 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Thu, 18 Jan 2024 17:22:02 -0500 Subject: [PATCH 08/17] Get leafward propagation working --- src/pudl/output/ferc1.py | 25 ++++ test/unit/output/ferc1_test.py | 237 ++++++++++++++++++++++++--------- 2 files changed, 197 insertions(+), 65 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index f8145c05c0..f6f55893ae 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2312,6 +2312,7 @@ def annotated_forest(self: Self) -> nx.DiGraph: annotated_forest = deepcopy(self.forest) nx.set_node_attributes(annotated_forest, self.node_attrs) nx.set_edge_attributes(annotated_forest, self.edge_attrs) + annotated_forest = self.propagate_tags(annotated_forest) logger.info("Checking whether any pruned nodes were also tagged.") self.check_lost_tags(lost_nodes=self.pruned) @@ -2320,6 +2321,30 @@ def annotated_forest(self: Self) -> nx.DiGraph: self.check_conflicting_tags(annotated_forest) return annotated_forest + def propagate_tags(self: Self, annotated_forest: nx.DiGraph): + """Propagate tags. + + Propagate tags leafwards, rootward & to the _correction nodes. + """ + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + leafward_inherited_tags = ["in_rate_base"] + + for node, parent_tags in existing_tags.items(): + descendants = nx.descendants(annotated_forest, node) + descendant_tags = { + desc: { + "tags": { + tag_name: parent_tags[tag_name] + for tag_name in leafward_inherited_tags + if tag_name in parent_tags + } + | existing_tags.get(desc, {}) + } + for desc in descendants + } + nx.set_node_attributes(annotated_forest, descendant_tags) + return annotated_forest + def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None: """Check whether any of the input lost nodes were also tagged nodes.""" if lost_nodes: diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index e1c427415f..a7b9f594ab 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -19,9 +19,11 @@ """ import logging +import unittest import networkx as nx import pandas as pd +import pytest from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 @@ -32,62 +34,177 @@ # TODO: make graph construction easier with helper functions -def test_annotated_forest_propagates_leafward(): - parent = NodeId( - table_name="table_1", - xbrl_factoid="reported_1", - utility_type="electric", - plant_status=pd.NA, - plant_function=pd.NA, - ) - child1 = NodeId( - table_name="table_1", - xbrl_factoid="reported_1_1", - utility_type="electric", - plant_status=pd.NA, - plant_function=pd.NA, - ) - child2 = NodeId( - table_name="table_1", - xbrl_factoid="reported_1_2", - utility_type="electric", - plant_status=pd.NA, - plant_function=pd.NA, - ) - - edges = [(parent, child1), (parent, child2)] - records = [] - for parent, child in edges: - record = {"weight": 1} - for field in NodeId._fields: - record[f"{field}_parent"] = parent.__getattribute__(field) - record[field] = child.__getattribute__(field) - records.append(record) - dtype_child = {col: pd.StringDtype() for col in NodeId._fields} - dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} - dtype_weight = {"weight": pd.Int64Dtype()} - - exploded_calcs = pd.DataFrame.from_records(records).astype( - dtype_child | dtype_parent | dtype_weight - ) - exploded_meta = pd.DataFrame([parent, child1, child2]).astype(dtype_child) - tags = pd.DataFrame([parent]).assign(in_rate_base="yes") - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=exploded_meta, - exploded_calcs=exploded_calcs, - seeds=[parent], - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") - assert annotated_tags[parent]["in_rate_base"] == "yes" - assert ( - annotated_tags[parent]["in_rate_base"] == annotated_tags[child1]["in_rate_base"] - ) - assert ( - annotated_tags[parent]["in_rate_base"] == annotated_tags[child2]["in_rate_base"] +class TestTagPropagation(unittest.TestCase): + def setUp(self): + self.parent = NodeId( + table_name="table_1", + xbrl_factoid="reported_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child1 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child2 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + + dtype_node = {col: pd.StringDtype() for col in NodeId._fields} + self.exploded_meta = pd.DataFrame( + [self.parent, self.child1, self.child2] + ).astype(dtype_node) + + def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): + records = [] + for parent, child in edges: + record = {"weight": 1} + for field in NodeId._fields: + record[f"{field}_parent"] = parent.__getattribute__(field) + record[field] = child.__getattribute__(field) + records.append(record) + dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} + dtype_child = {col: pd.StringDtype() for col in NodeId._fields} + dtype_weight = {"weight": pd.Int64Dtype()} + + return pd.DataFrame.from_records(records).astype( + dtype_child | dtype_parent | dtype_weight + ) + + def test_leafward_prop_undecided_children(self): + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", pd.NA, pd.NA] + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + assert annotated_tags[self.child1]["in_rate_base"] == "yes" + assert annotated_tags[self.child2]["in_rate_base"] == "yes" + + def test_leafward_prop_disagreeing_child(self): + """Don't force the diagreeing child to follow the parent.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", "no", pd.NA] + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert annotated_tags[self.child2]["in_rate_base"] == "yes" + + def test_leafward_prop_preserve_non_propagating_tags(self): + """Don't force the diagreeing child to follow the parent.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", "no", pd.NA], + in_root_boose=["yus", "nu", "purtiul"], + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent]["in_rate_base"] == "yes" + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert annotated_tags[self.child2]["in_rate_base"] == "yes" + assert annotated_tags[self.parent]["in_root_boose"] == "yus" + assert annotated_tags[self.child1]["in_root_boose"] == "nu" + assert annotated_tags[self.child2]["in_root_boose"] == "purtiul" + + def test_rootward_prop_disagreeing_children(self): + """Parents should not pick sides between disagreeing children.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=[pd.NA, "no", "yes"] + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent] == {} + assert annotated_tags[self.child1]["in_rate_base"] == "no" + assert annotated_tags[self.child2]["in_rate_base"] == "yes" + + @pytest.mark.xfail( + reason="we haven't implemented this behavior correctly yet", strict=True ) + def test_prop_no_tags(self): + """If no tags, don't propagate anything.""" + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=[pd.NA, pd.NA, pd.NA] + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent] == {} + assert annotated_tags[self.child1] == {} + assert annotated_tags[self.child2] == {} + + tags = pd.DataFrame(columns=NodeId._fields).convert_dtypes() + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.parent] == {} + assert annotated_tags[self.child1] == {} + assert annotated_tags[self.child2] == {} def test_annotated_forest_propagates_rootward(): @@ -100,13 +217,3 @@ def test_annotated_forest_propagates_corrections(): def test_annotate_forest_propagates_both_dirs_with_corrections(): pass - - -def test_annotate_forest_does_not_propagate(): - # if a parent has two disagreeing children - pass - - -def test_annoted_forest_does_propagate_null_and_value(): - # if a parent has some children with one value and some with nulls - pass From 9b19f8b1d5e6feaabc0cfc78e9e51a36363be88a Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Thu, 25 Jan 2024 17:32:43 -0500 Subject: [PATCH 09/17] first pass of adding leafward tags one layer and an attempt at a recursive method --- src/pudl/output/ferc1.py | 87 +++++++++++++++++++- test/unit/output/ferc1_test.py | 141 ++++++++++++++++++++++++++++++++- 2 files changed, 223 insertions(+), 5 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 7c955ec219..36ab3ee632 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2327,8 +2327,8 @@ def propagate_tags(self: Self, annotated_forest: nx.DiGraph): Propagate tags leafwards, rootward & to the _correction nodes. """ existing_tags = nx.get_node_attributes(annotated_forest, "tags") + ## Leafwards propagation leafward_inherited_tags = ["in_rate_base"] - for node, parent_tags in existing_tags.items(): descendants = nx.descendants(annotated_forest, node) descendant_tags = { @@ -2343,6 +2343,63 @@ def propagate_tags(self: Self, annotated_forest: nx.DiGraph): for desc in descendants } nx.set_node_attributes(annotated_forest, descendant_tags) + + # Rootward propagation + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + rootward_tags = {} + rootward_inherited_tags = ["in_rate_base"] + for node in existing_tags: + # what node is your successor node? + # does that sucessor node have children that all have the same tag? + # if so then apply that tag to the sucessor + # print(nx.ancestors(simple_forest.forest, node)) + + # we assume that no nodes have multiple parents + parents = list(annotated_forest.predecessors(node)) + # if you have no parents then no need to check nothing + if len(parents) == 0: + continue + assert len(parents) == 1 + parent = parents[0] + sibling_tags = { + sib_node: existing_tags.get(sib_node, {}) + for sib_node in annotated_forest.successors(parent) + if not sib_node.xbrl_factoid.endswith("_correction") + } + for rootward_tag in rootward_inherited_tags: + sibling_tag_values = { + # must return na bc we don't want to propagate unless all siblings + # have same tag + sibling_tag.get(rootward_tag, pd.NA) + for sibling_tag in sibling_tags.values() + } + if len(sibling_tag_values) == 1: + parent_tags = { + parent: { + "tags": {rootward_tag: sibling_tag_values.pop()} + | existing_tags.get(parent, {}) + } + } + rootward_tags = rootward_tags | parent_tags + nx.set_node_attributes(annotated_forest, rootward_tags) + # Correction Records + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + correction_nodes = [ + node + for node in annotated_forest + if node.xbrl_factoid.endswith("_correction") + ] + correction_tags = {} + for correction_node in correction_nodes: + # for every correction node, we assume that that nodes parent tags can apply + parents = list(annotated_forest.predecessors(correction_node)) + # all correction records shoul have a parent and only one + assert len(parents) == 1 + parent = parents[0] + correction_tags[correction_node] = { + "tags": existing_tags.get(parent, {}) + | existing_tags.get(correction_node, {}) + } return annotated_forest def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None: @@ -2869,6 +2926,34 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: return pd.concat([index, tags], axis="columns") +def aggregate_child_tags( + annotated_forest, node, tag_name: Literal["in_rate_base"] +) -> dict: + """Set the tags for nodes when all of its children have same tag.""" + tag = pd.NA + # i'm a leaf so i stop looking + if not annotated_forest.successors(node): + tag = annotated_forest.get(node, {}).get(tag_name, pd.NA) + # if i have a value you don't need to keep looking at this nodes childern + elif annotated_forest.get(node, {}).get(tag_name, pd.NA) != pd.NA: + tag = annotated_forest[node][tag_name] + else: + child_tags = {} + for child_node in annotated_forest.successors(node): + child_tags.add(aggregate_child_tags(annotated_forest, child_node, tag_name)) + # if all the children tags are the same and non-null + if (len(child_tags) == 1) and {t for t in child_tags if not pd.isna(t)}: + new_node_tag = child_tags.pop() + # actually assign the tag here but don't wipe out any other tags + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + node_tags = { + node: {"tags": {tag_name: new_node_tag} | existing_tags.get(node, {})} + } + nx.set_node_attributes(annotated_forest, node_tags) + tag = new_node_tag + return tag + + @asset def out_ferc1__yearly_rate_base( exploded_balance_sheet_assets_ferc1: pd.DataFrame, diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index a7b9f594ab..e6e0b9ae09 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -43,6 +43,13 @@ def setUp(self): plant_status=pd.NA, plant_function=pd.NA, ) + self.parent_correction = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_correction", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) self.child1 = NodeId( table_name="table_1", xbrl_factoid="reported_1_1", @@ -57,10 +64,37 @@ def setUp(self): plant_status=pd.NA, plant_function=pd.NA, ) - + self.grand_child11 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.grand_child12 = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.child1_correction = NodeId( + table_name="table_1", + xbrl_factoid="reported_1_1_correction", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) dtype_node = {col: pd.StringDtype() for col in NodeId._fields} self.exploded_meta = pd.DataFrame( - [self.parent, self.child1, self.child2] + [ + self.parent, + self.child1, + self.child2, + self.grand_child11, + self.grand_child12, + self.child1_correction, + ] ).astype(dtype_node) def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): @@ -206,9 +240,108 @@ def test_prop_no_tags(self): assert annotated_tags[self.child1] == {} assert annotated_tags[self.child2] == {} + def test_annotated_forest_propagates_rootward(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( + in_rate_base=["yes", "yes"] + ) -def test_annotated_forest_propagates_rootward(): - pass + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 5 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + # TODO: WHY THO it doesn't show up + # assert annotated_tags[self.parent] == {} + assert annotated_tags.get(self.parent, {}) == {} + assert annotated_tags[self.child1]["in_rate_base"] == "yes" + assert annotated_tags.get(self.child2, {}) == {} + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child12]["in_rate_base"] == "yes" + + def test_annotated_forest_propagates_rootward_disagreeing_sibling(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( + in_rate_base=["yes", "no"] + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 5 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags.get(self.parent, {}) == {} + assert annotated_tags.get(self.child1, {}) == {} + assert annotated_tags.get(self.child2, {}) == {} + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child12]["in_rate_base"] == "no" + + def test_annotated_forest_propagates_rootward_correction(self): + edges = [ + (self.child1, self.grand_child11), + (self.child1, self.child1_correction), + ] + tags = pd.DataFrame([self.child1]).assign(in_rate_base=["yes"]) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.child1], + tags=tags, + ) + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 3 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + assert annotated_tags[self.child1]["in_rate_base"] == "yes" + assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" + assert annotated_tags[self.child1_correction]["in_rate_base"] == "yes" + + @pytest.mark.xfail( + reason="we haven't implemented this behavior correctly yet", strict=True + ) + def test_annotated_forest_propagates_rootward_two_layers(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + ] + pre_assigned_yes_nodes = [self.child2, self.grand_child11, self.grand_child12] + tags = pd.DataFrame(pre_assigned_yes_nodes).assign( + in_rate_base=["yes"] * len(pre_assigned_yes_nodes), + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 5 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + for pre_yes_node in pre_assigned_yes_nodes: + assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" + for post_yes_node in [self.child1, self.parent]: + assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" def test_annotated_forest_propagates_corrections(): From d1347c17103ef4ec753129391c7d296d73dd8ac2 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Thu, 25 Jan 2024 18:13:45 -0500 Subject: [PATCH 10/17] integrate the recursive tag propagation method --- src/pudl/output/ferc1.py | 79 ++++++++++++++-------------------- test/unit/output/ferc1_test.py | 36 ++++++++++++++-- 2 files changed, 66 insertions(+), 49 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 36ab3ee632..230b76c26e 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2345,43 +2345,10 @@ def propagate_tags(self: Self, annotated_forest: nx.DiGraph): nx.set_node_attributes(annotated_forest, descendant_tags) # Rootward propagation - existing_tags = nx.get_node_attributes(annotated_forest, "tags") - rootward_tags = {} - rootward_inherited_tags = ["in_rate_base"] - for node in existing_tags: - # what node is your successor node? - # does that sucessor node have children that all have the same tag? - # if so then apply that tag to the sucessor - # print(nx.ancestors(simple_forest.forest, node)) - - # we assume that no nodes have multiple parents - parents = list(annotated_forest.predecessors(node)) - # if you have no parents then no need to check nothing - if len(parents) == 0: - continue - assert len(parents) == 1 - parent = parents[0] - sibling_tags = { - sib_node: existing_tags.get(sib_node, {}) - for sib_node in annotated_forest.successors(parent) - if not sib_node.xbrl_factoid.endswith("_correction") - } - for rootward_tag in rootward_inherited_tags: - sibling_tag_values = { - # must return na bc we don't want to propagate unless all siblings - # have same tag - sibling_tag.get(rootward_tag, pd.NA) - for sibling_tag in sibling_tags.values() - } - if len(sibling_tag_values) == 1: - parent_tags = { - parent: { - "tags": {rootward_tag: sibling_tag_values.pop()} - | existing_tags.get(parent, {}) - } - } - rootward_tags = rootward_tags | parent_tags - nx.set_node_attributes(annotated_forest, rootward_tags) + root_node = self.roots(annotated_forest)[0] + _ = recursively_propagate_tags_leafward( + annotated_forest, root_node, "in_rate_base" + ) # Correction Records existing_tags = nx.get_node_attributes(annotated_forest, "tags") correction_nodes = [ @@ -2400,6 +2367,7 @@ def propagate_tags(self: Self, annotated_forest: nx.DiGraph): "tags": existing_tags.get(parent, {}) | existing_tags.get(correction_node, {}) } + nx.set_node_attributes(annotated_forest, correction_tags) return annotated_forest def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None: @@ -2926,24 +2894,43 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: return pd.concat([index, tags], axis="columns") -def aggregate_child_tags( +def recursively_propagate_tags_leafward( annotated_forest, node, tag_name: Literal["in_rate_base"] -) -> dict: - """Set the tags for nodes when all of its children have same tag.""" +): + """Set the tags for nodes when all of its children have same tag. + + This function returns the value of a tag. + """ + + def _get_tag(annotated_forest, node, tag_name): + return annotated_forest.nodes.get(node, {}).get("tags", {}).get(tag_name, pd.NA) + + logger.info(f"propagaging tags leafward from {node}") tag = pd.NA # i'm a leaf so i stop looking - if not annotated_forest.successors(node): - tag = annotated_forest.get(node, {}).get(tag_name, pd.NA) + if not list(annotated_forest.successors(node)): + tag = _get_tag(annotated_forest, node, tag_name) + logger.info(f" We found a leaf people. w/ {tag=}") # if i have a value you don't need to keep looking at this nodes childern - elif annotated_forest.get(node, {}).get(tag_name, pd.NA) != pd.NA: - tag = annotated_forest[node][tag_name] + elif not pd.isna(_get_tag(annotated_forest, node, tag_name)): + tag = _get_tag(annotated_forest, node, tag_name) + logger.info(f" We found a node w/ tags. w/ {tag=}") else: - child_tags = {} + child_tags = set() for child_node in annotated_forest.successors(node): - child_tags.add(aggregate_child_tags(annotated_forest, child_node, tag_name)) + if not child_node.xbrl_factoid.endswith("_correction"): + child_tags.add( + recursively_propagate_tags_leafward( + annotated_forest, child_node, tag_name + ) + ) + logger.info(f" found {child_tags=}") # if all the children tags are the same and non-null if (len(child_tags) == 1) and {t for t in child_tags if not pd.isna(t)}: new_node_tag = child_tags.pop() + logger.info( + f" We found a node consitent children tags. w/ {new_node_tag=}" + ) # actually assign the tag here but don't wipe out any other tags existing_tags = nx.get_node_attributes(annotated_forest, "tags") node_tags = { diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index e6e0b9ae09..35f0ac6829 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -314,9 +314,6 @@ def test_annotated_forest_propagates_rootward_correction(self): assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" assert annotated_tags[self.child1_correction]["in_rate_base"] == "yes" - @pytest.mark.xfail( - reason="we haven't implemented this behavior correctly yet", strict=True - ) def test_annotated_forest_propagates_rootward_two_layers(self): edges = [ (self.parent, self.child1), @@ -343,6 +340,39 @@ def test_annotated_forest_propagates_rootward_two_layers(self): for post_yes_node in [self.child1, self.parent]: assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" + def test_annotated_forest_propagates_rootward_two_layers_plus_corrections(self): + edges = [ + (self.parent, self.child1), + (self.parent, self.child2), + (self.parent, self.parent_correction), + (self.child1, self.grand_child11), + (self.child1, self.grand_child12), + (self.child1, self.child1_correction), + ] + pre_assigned_yes_nodes = [self.child2, self.grand_child11, self.grand_child12] + tags = pd.DataFrame(pre_assigned_yes_nodes).assign( + in_rate_base=["yes"] * len(pre_assigned_yes_nodes), + ) + + simple_forest = XbrlCalculationForestFerc1( + exploded_meta=self.exploded_meta, + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.parent], + tags=tags, + ) + annotated_forest = simple_forest.annotated_forest + assert len(annotated_forest.nodes) == 7 + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + for pre_yes_node in pre_assigned_yes_nodes: + assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" + for post_yes_node in [ + self.child1, + self.parent, + self.child1_correction, + self.parent_correction, + ]: + assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" + def test_annotated_forest_propagates_corrections(): pass From d1a42b4f71772e6a6c0702d6b09bf08fcb62398b Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 26 Jan 2024 11:41:46 -0500 Subject: [PATCH 11/17] remove old correction tagging and standardize unit tests a bit --- src/pudl/output/ferc1.py | 61 -------------- test/unit/output/ferc1_test.py | 149 ++++++++------------------------- 2 files changed, 36 insertions(+), 174 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 230b76c26e..fcb76a9faa 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1195,9 +1195,6 @@ def _out_ferc1__explosion_tags( .reset_index() .drop(columns=["notes"]) ) - # Add the correction records to the tags... - corrections = make_correction_tags(tags, calculation_components_xbrl_ferc1) - tags = pd.concat([tags, corrections]) return tags @@ -1259,64 +1256,6 @@ def _aggregatable_dimension_tags( return tags_df[tags_df[aggregatable_col] != "total"] -def make_correction_tags( - tags_all: pd.DataFrame, calc_components: pd.DataFrame -) -> pd.DataFrame: - """Make tags for correction records. - - We need to check to see if any of the tags in each of the calculated - parent factoids are the same for all of their child components. So in this - function, we're going to merge on the tags to the children then groupby the - parents. For each tag, see if the childrens'tags contains only one unique value. - If so grab the tag to associate with the correction record of the parent. If not, - no tag will be associated with the record. - """ - tag_idx = list(NodeId._fields) - calcs_w_tags = ( - pd.merge( # remove the correction records bc those are the ones we want to - calc_components[~calc_components.xbrl_factoid.str.contains("_correction")], - tags_all, - on=tag_idx, - how="left", - validate="m:1", - ) - ) - # use the same groupby to get the number of unique tags and the first one - # we will only use the first tag if the tags are unique - tag_cols = list(tags_all.drop(columns=tag_idx).columns) - tag_gb = calcs_w_tags.groupby([f"{c}_parent" for c in tag_idx], dropna=False)[ - tag_cols - ] - tag_check = pd.merge( - tag_gb.nunique( - dropna=False - ), # bc if null and non-null tag we want to know that - tag_gb.first(), - right_index=True, - left_index=True, - suffixes=("_n", ""), - validate="1:1", - ) - # null out all of the tags that have non-unique tags for each parent - for col in tag_cols: - non_unique_mask = tag_check[f"{col}_n"] != 1 - tag_check.loc[non_unique_mask, col] = pd.NA - # specifically for in_rate_base assign partial when it is a mix - tag_check.loc[tag_check["in_rate_base_n"] > 1, "in_rate_base"] = "partial" - # remove the fully null tags bc there's nothing new in there and - # drop all of the _n columns - tag_check = tag_check.dropna(how="all", subset=tag_cols)[tag_cols] - # remove the parent from the index name - tag_check.index.names = [ - col.removesuffix("_parent") for col in tag_check.index.names - ] - correction_tags = tag_check.reset_index().assign( - xbrl_factoid=lambda x: x.xbrl_factoid + "_correction" - ) - logger.info(f"Found {len(correction_tags)=}") - return correction_tags - - def exploded_table_asset_factory( root_table: str, table_names: list[str], diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index 35f0ac6829..1d2b16ee19 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -25,6 +25,7 @@ import pandas as pd import pytest +from pudl.helpers import dedupe_n_flatten_list_of_lists from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 logger = logging.getLogger(__name__) @@ -113,22 +114,37 @@ def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): dtype_child | dtype_parent | dtype_weight ) - def test_leafward_prop_undecided_children(self): - edges = [(self.parent, self.child1), (self.parent, self.child2)] - tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( - in_rate_base=["yes", pd.NA, pd.NA] - ) - + def build_forest_and_annotated_tags( + self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None + ): + """Build a forest, test forest nodes and return annotated tags. + + Args: + edges: list of tuples + tags: dataframe of tags + seeds: list of seed nodes. Default is None and will assume seed node is + ``parent``. + """ + if not seeds: + seeds = [self.parent] simple_forest = XbrlCalculationForestFerc1( exploded_meta=self.exploded_meta, exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], + seeds=seeds, tags=tags, ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 + # ensure no nodes got dropped + assert len(annotated_forest.nodes) == len(dedupe_n_flatten_list_of_lists(edges)) annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + return annotated_tags + + def test_leafward_prop_undecided_children(self): + edges = [(self.parent, self.child1), (self.parent, self.child2)] + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=["yes", pd.NA, pd.NA] + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" assert annotated_tags[self.child1]["in_rate_base"] == "yes" assert annotated_tags[self.child2]["in_rate_base"] == "yes" @@ -139,17 +155,7 @@ def test_leafward_prop_disagreeing_child(self): tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( in_rate_base=["yes", "no", pd.NA] ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" assert annotated_tags[self.child1]["in_rate_base"] == "no" assert annotated_tags[self.child2]["in_rate_base"] == "yes" @@ -161,17 +167,7 @@ def test_leafward_prop_preserve_non_propagating_tags(self): in_rate_base=["yes", "no", pd.NA], in_root_boose=["yus", "nu", "purtiul"], ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" assert annotated_tags[self.child1]["in_rate_base"] == "no" assert annotated_tags[self.child2]["in_rate_base"] == "yes" @@ -185,17 +181,7 @@ def test_rootward_prop_disagreeing_children(self): tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( in_rate_base=[pd.NA, "no", "yes"] ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent] == {} assert annotated_tags[self.child1]["in_rate_base"] == "no" assert annotated_tags[self.child2]["in_rate_base"] == "yes" @@ -209,33 +195,13 @@ def test_prop_no_tags(self): tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( in_rate_base=[pd.NA, pd.NA, pd.NA] ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent] == {} assert annotated_tags[self.child1] == {} assert annotated_tags[self.child2] == {} tags = pd.DataFrame(columns=NodeId._fields).convert_dtypes() - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent] == {} assert annotated_tags[self.child1] == {} assert annotated_tags[self.child2] == {} @@ -250,16 +216,7 @@ def test_annotated_forest_propagates_rootward(self): tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( in_rate_base=["yes", "yes"] ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 5 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) # TODO: WHY THO it doesn't show up # assert annotated_tags[self.parent] == {} assert annotated_tags.get(self.parent, {}) == {} @@ -278,16 +235,7 @@ def test_annotated_forest_propagates_rootward_disagreeing_sibling(self): tags = pd.DataFrame([self.grand_child11, self.grand_child12]).assign( in_rate_base=["yes", "no"] ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 5 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags.get(self.parent, {}) == {} assert annotated_tags.get(self.child1, {}) == {} assert annotated_tags.get(self.child2, {}) == {} @@ -300,16 +248,9 @@ def test_annotated_forest_propagates_rootward_correction(self): (self.child1, self.child1_correction), ] tags = pd.DataFrame([self.child1]).assign(in_rate_base=["yes"]) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.child1], - tags=tags, + annotated_tags = self.build_forest_and_annotated_tags( + edges, tags, seeds=[self.child1] ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 3 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") assert annotated_tags[self.child1]["in_rate_base"] == "yes" assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" assert annotated_tags[self.child1_correction]["in_rate_base"] == "yes" @@ -325,16 +266,7 @@ def test_annotated_forest_propagates_rootward_two_layers(self): tags = pd.DataFrame(pre_assigned_yes_nodes).assign( in_rate_base=["yes"] * len(pre_assigned_yes_nodes), ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 5 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) for pre_yes_node in pre_assigned_yes_nodes: assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" for post_yes_node in [self.child1, self.parent]: @@ -353,16 +285,7 @@ def test_annotated_forest_propagates_rootward_two_layers_plus_corrections(self): tags = pd.DataFrame(pre_assigned_yes_nodes).assign( in_rate_base=["yes"] * len(pre_assigned_yes_nodes), ) - - simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=[self.parent], - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - assert len(annotated_forest.nodes) == 7 - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) for pre_yes_node in pre_assigned_yes_nodes: assert annotated_tags[pre_yes_node]["in_rate_base"] == "yes" for post_yes_node in [ From 829757adb5e42c1cf0c3e19fb492433bb721cb6d Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 26 Jan 2024 17:13:20 -0500 Subject: [PATCH 12/17] remove metadata from forest builder and cleanup unit tests --- src/pudl/output/ferc1.py | 44 +++++++++---------------------- test/unit/output/ferc1_test.py | 48 ++++++++-------------------------- 2 files changed, 23 insertions(+), 69 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index fcb76a9faa..7bc331d770 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1665,7 +1665,6 @@ def calculation_forest(self: Self) -> "XbrlCalculationForestFerc1": """Construct a calculation forest based on class attributes.""" return XbrlCalculationForestFerc1( exploded_calcs=self.exploded_calcs, - exploded_meta=self.exploded_meta, seeds=self.seed_nodes, tags=self.tags, group_metric_checks=self.group_metric_checks, @@ -2023,7 +2022,6 @@ class XbrlCalculationForestFerc1(BaseModel): # Not sure if dynamically basing this on NodeId is really a good idea here. calc_cols: list[str] = list(NodeId._fields) - exploded_meta: pd.DataFrame = pd.DataFrame() exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() @@ -2180,32 +2178,9 @@ def node_attrs(self: Self) -> dict[NodeId, dict[str, dict[str, str]]]: .reset_index() # Type conversion is necessary to get pd.NA in the index: .astype({col: pd.StringDtype() for col in self.calc_cols}) - # We need a dictionary for *all* nodes, not just those with tags. - .merge( - self.exploded_meta.loc[:, self.calc_cols], - how="left", - on=self.calc_cols, - validate="one_to_many", - indicator=True, - ) - # For nodes with no tags, we assign an empty dictionary: .assign(tags=lambda x: np.where(x["tags"].isna(), {}, x["tags"])) ) - lefties = node_attrs[ - (node_attrs._merge == "left_only") - & (node_attrs.table_name.isin(self.table_names)) - ] - if not lefties.empty: - logger.warning( - f"Found {len(lefties)} tags that only exist in our manually compiled " - "tags when expected none. Ensure the compiled tags match the metadata." - f"Mismatched tags:\n{lefties}" - ) - return ( - node_attrs.drop(columns=["_merge"]) - .set_index(self.calc_cols) - .to_dict(orient="index") - ) + return node_attrs.set_index(self.calc_cols).to_dict(orient="index") @cached_property def edge_attrs(self: Self) -> dict[Any, Any]: @@ -2425,7 +2400,7 @@ def seeded_digraph(self: Self) -> nx.DiGraph: We compile a list of all the :class:`NodeId` values that should be included in the pruned graph, and then use that list to select a subset of the exploded - metadata to pass to :meth:`exploded_meta_to_digraph`, so that all of the + metadata to pass to :meth:`exploded_calcs_to_digraph`, so that all of the associated metadata is also added to the pruned graph. """ return self.prune_unrooted(self.full_digraph) @@ -2553,11 +2528,16 @@ def forest_leaves(self: Self) -> list[NodeId]: def orphans(self: Self) -> list[NodeId]: """Identify all nodes that appear in metadata but not in the full digraph.""" nodes = self.full_digraph.nodes - return [ - NodeId(*n) - for n in self.exploded_meta.set_index(self.calc_cols).index - if n not in nodes - ] + orphans = [] + for idx_cols in [self.calc_cols, self.parent_cols]: + orphans.extend( + [ + NodeId(*n) + for n in self.exploded_calcs.set_index(idx_cols).index + if n not in nodes + ] + ) + return list(set(orphans)) @cached_property def pruned(self: Self) -> list[NodeId]: diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index 1d2b16ee19..8a1dfdc6c1 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -23,7 +23,6 @@ import networkx as nx import pandas as pd -import pytest from pudl.helpers import dedupe_n_flatten_list_of_lists from pudl.output.ferc1 import NodeId, XbrlCalculationForestFerc1 @@ -86,17 +85,6 @@ def setUp(self): plant_status=pd.NA, plant_function=pd.NA, ) - dtype_node = {col: pd.StringDtype() for col in NodeId._fields} - self.exploded_meta = pd.DataFrame( - [ - self.parent, - self.child1, - self.child2, - self.grand_child11, - self.grand_child12, - self.child1_correction, - ] - ).astype(dtype_node) def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): records = [] @@ -128,7 +116,6 @@ def build_forest_and_annotated_tags( if not seeds: seeds = [self.parent] simple_forest = XbrlCalculationForestFerc1( - exploded_meta=self.exploded_meta, exploded_calcs=self._exploded_calcs_from_edges(edges), seeds=seeds, tags=tags, @@ -178,17 +165,14 @@ def test_leafward_prop_preserve_non_propagating_tags(self): def test_rootward_prop_disagreeing_children(self): """Parents should not pick sides between disagreeing children.""" edges = [(self.parent, self.child1), (self.parent, self.child2)] - tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( - in_rate_base=[pd.NA, "no", "yes"] + tags = pd.DataFrame([self.child1, self.child2]).assign( + in_rate_base=["no", "yes"] ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert annotated_tags[self.parent] == {} + assert not annotated_tags.get(self.parent) assert annotated_tags[self.child1]["in_rate_base"] == "no" assert annotated_tags[self.child2]["in_rate_base"] == "yes" - @pytest.mark.xfail( - reason="we haven't implemented this behavior correctly yet", strict=True - ) def test_prop_no_tags(self): """If no tags, don't propagate anything.""" edges = [(self.parent, self.child1), (self.parent, self.child2)] @@ -202,9 +186,9 @@ def test_prop_no_tags(self): tags = pd.DataFrame(columns=NodeId._fields).convert_dtypes() annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert annotated_tags[self.parent] == {} - assert annotated_tags[self.child1] == {} - assert annotated_tags[self.child2] == {} + assert not annotated_tags.get(self.parent) + assert not annotated_tags.get(self.child1) + assert not annotated_tags.get(self.child2) def test_annotated_forest_propagates_rootward(self): edges = [ @@ -217,11 +201,9 @@ def test_annotated_forest_propagates_rootward(self): in_rate_base=["yes", "yes"] ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - # TODO: WHY THO it doesn't show up - # assert annotated_tags[self.parent] == {} - assert annotated_tags.get(self.parent, {}) == {} + assert not annotated_tags.get(self.parent) assert annotated_tags[self.child1]["in_rate_base"] == "yes" - assert annotated_tags.get(self.child2, {}) == {} + assert not annotated_tags.get(self.child2) assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" assert annotated_tags[self.grand_child12]["in_rate_base"] == "yes" @@ -236,9 +218,9 @@ def test_annotated_forest_propagates_rootward_disagreeing_sibling(self): in_rate_base=["yes", "no"] ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert annotated_tags.get(self.parent, {}) == {} - assert annotated_tags.get(self.child1, {}) == {} - assert annotated_tags.get(self.child2, {}) == {} + assert not annotated_tags.get(self.parent) + assert not annotated_tags.get(self.child1) + assert not annotated_tags.get(self.child2) assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" assert annotated_tags[self.grand_child12]["in_rate_base"] == "no" @@ -295,11 +277,3 @@ def test_annotated_forest_propagates_rootward_two_layers_plus_corrections(self): self.parent_correction, ]: assert annotated_tags[post_yes_node]["in_rate_base"] == "yes" - - -def test_annotated_forest_propagates_corrections(): - pass - - -def test_annotate_forest_propagates_both_dirs_with_corrections(): - pass From 33fa1efe36e0b06f7ec42a3c5b8d72fde9dac20d Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 30 Jan 2024 10:54:28 -0500 Subject: [PATCH 13/17] add "validation" checks and standardize null tag behavior` --- src/pudl/output/ferc1.py | 245 ++++++++++++++++++++++----------- test/unit/output/ferc1_test.py | 55 ++++++-- 2 files changed, 205 insertions(+), 95 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 7bc331d770..026ba2fde4 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2161,7 +2161,10 @@ def node_attrs(self: Self) -> dict[NodeId, dict[str, dict[str, str]]]: # Reshape the tags to turn them into a dictionary of values per-node. This # will make it easier to add arbitrary sets of tags later on. tags_dict = ( - self.tags.convert_dtypes().set_index(self.calc_cols).to_dict(orient="index") + self.tags.convert_dtypes() + .set_index(self.calc_cols) + .dropna(how="all") + .to_dict(orient="index") ) # Drop None tags created by combining multiple tagging CSVs clean_tags_dict = { @@ -2226,7 +2229,7 @@ def annotated_forest(self: Self) -> nx.DiGraph: annotated_forest = deepcopy(self.forest) nx.set_node_attributes(annotated_forest, self.node_attrs) nx.set_edge_attributes(annotated_forest, self.edge_attrs) - annotated_forest = self.propagate_tags(annotated_forest) + annotated_forest = self.propagate_node_attributes(annotated_forest) logger.info("Checking whether any pruned nodes were also tagged.") self.check_lost_tags(lost_nodes=self.pruned) @@ -2235,53 +2238,18 @@ def annotated_forest(self: Self) -> nx.DiGraph: self.check_conflicting_tags(annotated_forest) return annotated_forest - def propagate_tags(self: Self, annotated_forest: nx.DiGraph): + def propagate_node_attributes(self: Self, annotated_forest: nx.DiGraph): """Propagate tags. Propagate tags leafwards, rootward & to the _correction nodes. """ - existing_tags = nx.get_node_attributes(annotated_forest, "tags") ## Leafwards propagation - leafward_inherited_tags = ["in_rate_base"] - for node, parent_tags in existing_tags.items(): - descendants = nx.descendants(annotated_forest, node) - descendant_tags = { - desc: { - "tags": { - tag_name: parent_tags[tag_name] - for tag_name in leafward_inherited_tags - if tag_name in parent_tags - } - | existing_tags.get(desc, {}) - } - for desc in descendants - } - nx.set_node_attributes(annotated_forest, descendant_tags) - + annotated_forest = _propagate_tags_leafward(annotated_forest, ["in_rate_base"]) # Rootward propagation root_node = self.roots(annotated_forest)[0] - _ = recursively_propagate_tags_leafward( - annotated_forest, root_node, "in_rate_base" - ) + _ = _propagate_tag_rootward(annotated_forest, root_node, "in_rate_base") # Correction Records - existing_tags = nx.get_node_attributes(annotated_forest, "tags") - correction_nodes = [ - node - for node in annotated_forest - if node.xbrl_factoid.endswith("_correction") - ] - correction_tags = {} - for correction_node in correction_nodes: - # for every correction node, we assume that that nodes parent tags can apply - parents = list(annotated_forest.predecessors(correction_node)) - # all correction records shoul have a parent and only one - assert len(parents) == 1 - parent = parents[0] - correction_tags[correction_node] = { - "tags": existing_tags.get(parent, {}) - | existing_tags.get(correction_node, {}) - } - nx.set_node_attributes(annotated_forest, correction_tags) + annotated_forest = _propagate_tags_to_corrections(annotated_forest) return annotated_forest def check_lost_tags(self: Self, lost_nodes: list[NodeId]) -> None: @@ -2813,12 +2781,34 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: return pd.concat([index, tags], axis="columns") -def recursively_propagate_tags_leafward( - annotated_forest, node, tag_name: Literal["in_rate_base"] -): - """Set the tags for nodes when all of its children have same tag. +def _propagate_tags_leafward( + annotated_forest: nx.DiGraph, leafward_inherited_tags: list[str] +) -> nx.DiGraph: + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + for node, parent_tags in existing_tags.items(): + descendants = nx.descendants(annotated_forest, node) + descendant_tags = { + desc: { + "tags": { + tag_name: parent_tags[tag_name] + for tag_name in leafward_inherited_tags + if tag_name in parent_tags + } + | existing_tags.get(desc, {}) + } + for desc in descendants + } + nx.set_node_attributes(annotated_forest, descendant_tags) + return annotated_forest + + +def _propagate_tag_rootward( + annotated_forest: nx.DiGraph, node, tag_name: Literal["in_rate_base"] +) -> str: + """Set the tag for nodes when all of its children have same tag. - This function returns the value of a tag. + This function returns the value of a tag, but also sets node attributes + down the tree when all children of a node share the same tag. """ def _get_tag(annotated_forest, node, tag_name): @@ -2826,30 +2816,23 @@ def _get_tag(annotated_forest, node, tag_name): logger.info(f"propagaging tags leafward from {node}") tag = pd.NA - # i'm a leaf so i stop looking - if not list(annotated_forest.successors(node)): - tag = _get_tag(annotated_forest, node, tag_name) - logger.info(f" We found a leaf people. w/ {tag=}") - # if i have a value you don't need to keep looking at this nodes childern - elif not pd.isna(_get_tag(annotated_forest, node, tag_name)): + # i'm a leaf so i stop looking or + # if i have a value you don't need to keep looking at this node's childern + if not list(annotated_forest.successors(node)) or not pd.isna( + _get_tag(annotated_forest, node, tag_name) + ): tag = _get_tag(annotated_forest, node, tag_name) - logger.info(f" We found a node w/ tags. w/ {tag=}") + else: child_tags = set() for child_node in annotated_forest.successors(node): if not child_node.xbrl_factoid.endswith("_correction"): child_tags.add( - recursively_propagate_tags_leafward( - annotated_forest, child_node, tag_name - ) + _propagate_tag_rootward(annotated_forest, child_node, tag_name) ) - logger.info(f" found {child_tags=}") # if all the children tags are the same and non-null if (len(child_tags) == 1) and {t for t in child_tags if not pd.isna(t)}: new_node_tag = child_tags.pop() - logger.info( - f" We found a node consitent children tags. w/ {new_node_tag=}" - ) # actually assign the tag here but don't wipe out any other tags existing_tags = nx.get_node_attributes(annotated_forest, "tags") node_tags = { @@ -2857,14 +2840,117 @@ def _get_tag(annotated_forest, node, tag_name): } nx.set_node_attributes(annotated_forest, node_tags) tag = new_node_tag + # elif the children disagree then the node's tag shouldn't be set and + # the og null tag should be returned return tag +def _propagate_tags_to_corrections(annotated_forest: nx.DiGraph) -> nx.DiGraph: + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + correction_nodes = [ + node for node in annotated_forest if node.xbrl_factoid.endswith("_correction") + ] + correction_tags = {} + for correction_node in correction_nodes: + # for every correction node, we assume that that nodes parent tags can apply + parents = list(annotated_forest.predecessors(correction_node)) + # all correction records shoul have a parent and only one + assert len(parents) == 1 + parent = parents[0] + correction_tags[correction_node] = { + "tags": existing_tags.get(parent, {}) + | existing_tags.get(correction_node, {}) + } + nx.set_node_attributes(annotated_forest, correction_tags) + return annotated_forest + + +def check_tag_propagation_compared_to_compiled_tags( + df: pd.DataFrame, + propogated_tag: Literal["in_rate_base"], + _out_ferc1__explosion_tags: pd.DataFrame, +): + """Check if tags got propagated. + + Args: + df: table to check. This should be either the + :func:`out_ferc1__yearly_rate_base`, ``exploded_balance_sheet_assets_ferc1`` + or ``exploded_balance_sheet_liabilities_ferc1``. The + ``exploded_income_statement_ferc1`` table does not currently have propagated + tags. + propogated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + _out_ferc1__explosion_tags: mannually compiled tags. This table includes tags from + many of the explosion tables so we will filter it before checking if the tag was + propagated. + + Raises: + AssertionError: If there are more mannually compiled tags for the ``xbrl_factoids`` + in ``df`` than found in ``_out_ferc1__explosion_tags``. + AssertionError: If there are more mannually compiled tags for the correction + ``xbrl_factoids`` in ``df`` than found in ``_out_ferc1__explosion_tags``. + """ + # the tag df has all tags - not just those in a specific explosion + # so we need to drop + node_idx = list(NodeId._fields) + df_filtered = df.filter(node_idx).drop_duplicates() + df_tags = _out_ferc1__explosion_tags.merge( + df_filtered, on=list(df_filtered.columns), how="right" + ) + mannually_tagged = df_tags[df_tags[propogated_tag].notnull()].xbrl_factoid.unique() + detailed_tagged = df[df[f"tags_{propogated_tag}"].notnull()].xbrl_factoid.unique() + if len(detailed_tagged) < len(mannually_tagged): + raise AssertionError( + f"Found more {len(mannually_tagged)} mannually compiled tagged xbrl_factoids" + " than tags in propagated detailed data." + ) + mannually_tagged_corrections = df_tags[ + df_tags[propogated_tag].notnull() + & df_tags.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + detailed_tagged_corrections = df[ + df[f"tags_{propogated_tag}"].notnull() + & df.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + if len(detailed_tagged_corrections) < len(mannually_tagged_corrections): + raise AssertionError( + f"Found more {len(mannually_tagged)} mannually compiled tagged " + "xbrl_factoids than tags in propagated detailed data." + ) + + +def check_for_correction_xbrl_factoids_with_tag( + df: pd.DataFrame, propogated_tag: Literal["in_rate_base"] +): + """Check if any correction records have tags. + + Args: + df: table to check. This should be either the + :func:`out_ferc1__yearly_rate_base`, ``exploded_balance_sheet_assets_ferc1`` + or ``exploded_balance_sheet_liabilities_ferc1``. The + ``exploded_income_statement_ferc1`` table does not currently have propagated + tags. + propogated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + + Raises: + AssertionError: If there are zero correction ``xbrl_factoids`` in ``df`` with tags. + """ + detailed_tagged_corrections = df[ + df[f"tags_{propogated_tag}"].notnull() + & df.xbrl_factoid.str.endswith("_correction") + ].xbrl_factoid.unique() + if len(detailed_tagged_corrections) == 0: + raise AssertionError( + "We expect there to be more than zero correction recrods with tags, but " + f"found {len(detailed_tagged_corrections)}." + ) + + @asset def out_ferc1__yearly_rate_base( exploded_balance_sheet_assets_ferc1: pd.DataFrame, exploded_balance_sheet_liabilities_ferc1: pd.DataFrame, core_ferc1__yearly_operating_expenses_sched320: pd.DataFrame, + _out_ferc1__explosion_tags: pd.DataFrame, ) -> pd.DataFrame: """Make a table of granular utility rate-base data. @@ -2908,25 +2994,24 @@ def out_ferc1__yearly_rate_base( .rename(columns={"dollar_value": "ending_balance"}) ) # then select only the leafy exploded records that are in rate base and concat - in_rate_base = ( - pd.concat( - [ - exploded_balance_sheet_assets_ferc1[ - exploded_balance_sheet_assets_ferc1.tags_in_rate_base.isin( - ["yes", "partial"] - ) - ], - exploded_balance_sheet_liabilities_ferc1[ - exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base.isin( - ["yes", "partial"] - ) - ], - cash_working_capital, - ] - ) - # .drop(columns=["tags_in_rate_base"]) - .sort_values( - by=["report_year", "utility_id_ferc1", "table_name"], ascending=False - ) + in_rate_base = pd.concat( + [ + exploded_balance_sheet_assets_ferc1[ + exploded_balance_sheet_assets_ferc1.tags_in_rate_base.isin( + ["yes", "partial"] + ) + ], + exploded_balance_sheet_liabilities_ferc1[ + exploded_balance_sheet_liabilities_ferc1.tags_in_rate_base.isin( + ["yes", "partial"] + ) + ].assign(ending_balance=lambda x: -x.ending_balance), + cash_working_capital, + ] + ).sort_values(by=["report_year", "utility_id_ferc1", "table_name"], ascending=False) + # note: we need the `tags_in_rate_base` column for these checks + check_tag_propagation_compared_to_compiled_tags( + in_rate_base, "in_rate_base", _out_ferc1__explosion_tags ) + check_for_correction_xbrl_factoids_with_tag(in_rate_base, "in_rate_base") return in_rate_base diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index 8a1dfdc6c1..e36eb959dd 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -133,8 +133,11 @@ def test_leafward_prop_undecided_children(self): ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" - assert annotated_tags[self.child1]["in_rate_base"] == "yes" - assert annotated_tags[self.child2]["in_rate_base"] == "yes" + for child_node in [self.child1, self.child2]: + assert ( + annotated_tags[child_node]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) def test_leafward_prop_disagreeing_child(self): """Don't force the diagreeing child to follow the parent.""" @@ -145,7 +148,10 @@ def test_leafward_prop_disagreeing_child(self): annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" assert annotated_tags[self.child1]["in_rate_base"] == "no" - assert annotated_tags[self.child2]["in_rate_base"] == "yes" + assert ( + annotated_tags[self.child2]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) def test_leafward_prop_preserve_non_propagating_tags(self): """Don't force the diagreeing child to follow the parent.""" @@ -157,7 +163,10 @@ def test_leafward_prop_preserve_non_propagating_tags(self): annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" assert annotated_tags[self.child1]["in_rate_base"] == "no" - assert annotated_tags[self.child2]["in_rate_base"] == "yes" + assert ( + annotated_tags[self.child2]["in_rate_base"] + == annotated_tags[self.parent]["in_rate_base"] + ) assert annotated_tags[self.parent]["in_root_boose"] == "yus" assert annotated_tags[self.child1]["in_root_boose"] == "nu" assert annotated_tags[self.child2]["in_root_boose"] == "purtiul" @@ -174,21 +183,34 @@ def test_rootward_prop_disagreeing_children(self): assert annotated_tags[self.child2]["in_rate_base"] == "yes" def test_prop_no_tags(self): - """If no tags, don't propagate anything.""" + """If no tags, don't propagate anything. + + This also tests whether a fully null tag input behaves the same as an + empty df. It also checks whether we get the expected behavior when + the propogated tags are all null but there is another non-propagating + tag. + """ edges = [(self.parent, self.child1), (self.parent, self.child2)] - tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( - in_rate_base=[pd.NA, pd.NA, pd.NA] - ) + null_tag_edges = [self.parent, self.child1, self.child2] + tags = pd.DataFrame(null_tag_edges).assign(in_rate_base=[pd.NA, pd.NA, pd.NA]) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert annotated_tags[self.parent] == {} - assert annotated_tags[self.child1] == {} - assert annotated_tags[self.child2] == {} + for node in null_tag_edges: + assert not annotated_tags.get(node) tags = pd.DataFrame(columns=NodeId._fields).convert_dtypes() annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert not annotated_tags.get(self.parent) - assert not annotated_tags.get(self.child1) - assert not annotated_tags.get(self.child2) + for node in null_tag_edges: + assert not annotated_tags.get(node) + + tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( + in_rate_base=[pd.NA, pd.NA, pd.NA], + a_non_propped_tag=["hi", "hello", "what_am_i_doing_here_even"], + ) + annotated_tags = self.build_forest_and_annotated_tags(edges, tags) + for node in null_tag_edges: + assert not annotated_tags[node].get("in_rate_base") + # do we still have a non-null value for the non-propped tag + assert annotated_tags[node].get("a_non_propped_tag") def test_annotated_forest_propagates_rootward(self): edges = [ @@ -235,7 +257,10 @@ def test_annotated_forest_propagates_rootward_correction(self): ) assert annotated_tags[self.child1]["in_rate_base"] == "yes" assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" - assert annotated_tags[self.child1_correction]["in_rate_base"] == "yes" + assert ( + annotated_tags[self.child1_correction]["in_rate_base"] + == annotated_tags[self.child1]["in_rate_base"] + ) def test_annotated_forest_propagates_rootward_two_layers(self): edges = [ From 0f3b6540eb59111528127b9fc7564832e0168694 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Tue, 30 Jan 2024 11:47:38 -0500 Subject: [PATCH 14/17] light cleaning --- src/pudl/output/ferc1.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 026ba2fde4..ecd6bfae37 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -1154,10 +1154,7 @@ class OffByFactoid(NamedTuple): @asset -def _out_ferc1__explosion_tags( - table_dimensions_ferc1: pd.DataFrame, - calculation_components_xbrl_ferc1: pd.DataFrame, -) -> pd.DataFrame: +def _out_ferc1__explosion_tags(table_dimensions_ferc1: pd.DataFrame) -> pd.DataFrame: """Grab the stored tables of tags and add inferred dimension.""" rate_tags = _get_tags("xbrl_factoid_rate_base_tags.csv", table_dimensions_ferc1) rev_req_tags = _get_tags( @@ -2025,7 +2022,6 @@ class XbrlCalculationForestFerc1(BaseModel): exploded_calcs: pd.DataFrame = pd.DataFrame() seeds: list[NodeId] = [] tags: pd.DataFrame = pd.DataFrame() - # TODO: remove the group metric checks and see if things still build / tests still pass group_metric_checks: GroupMetricChecks = GroupMetricChecks() model_config = ConfigDict( arbitrary_types_allowed=True, ignored_types=(cached_property,) From 3e5c2cdd11a47447e6d9063941d70614d786d721 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 31 Jan 2024 07:39:54 -0700 Subject: [PATCH 15/17] root boose docs! Co-authored-by: Dazhong Xia --- test/unit/output/ferc1_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index e36eb959dd..cf78429c22 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -154,7 +154,7 @@ def test_leafward_prop_disagreeing_child(self): ) def test_leafward_prop_preserve_non_propagating_tags(self): - """Don't force the diagreeing child to follow the parent.""" + """Only propagate tags that actually get inherited - i.e., not `in_root_boose`.""" edges = [(self.parent, self.child1), (self.parent, self.child2)] tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( in_rate_base=["yes", "no", pd.NA], From b8758ddfaf65dbcbed913c4a693bbe3cf6d297de Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Wed, 31 Jan 2024 12:06:37 -0500 Subject: [PATCH 16/17] respond to dazhong's comments --- src/pudl/output/ferc1.py | 97 +++++++++++++++++----------------- test/unit/output/ferc1_test.py | 28 +++++----- 2 files changed, 64 insertions(+), 61 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index ecd6bfae37..96054eb85f 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2242,8 +2242,7 @@ def propagate_node_attributes(self: Self, annotated_forest: nx.DiGraph): ## Leafwards propagation annotated_forest = _propagate_tags_leafward(annotated_forest, ["in_rate_base"]) # Rootward propagation - root_node = self.roots(annotated_forest)[0] - _ = _propagate_tag_rootward(annotated_forest, root_node, "in_rate_base") + annotated_forest = _propagate_tag_rootward(annotated_forest, "in_rate_base") # Correction Records annotated_forest = _propagate_tags_to_corrections(annotated_forest) return annotated_forest @@ -2780,6 +2779,10 @@ def nodes_to_df(calc_forest: nx.DiGraph, nodes: list[NodeId]) -> pd.DataFrame: def _propagate_tags_leafward( annotated_forest: nx.DiGraph, leafward_inherited_tags: list[str] ) -> nx.DiGraph: + """Push a parent's tags down to its descendants. + + Only push the `leafward_inherited_tags` - others will be left alone. + """ existing_tags = nx.get_node_attributes(annotated_forest, "tags") for node, parent_tags in existing_tags.items(): descendants = nx.descendants(annotated_forest, node) @@ -2799,7 +2802,7 @@ def _propagate_tags_leafward( def _propagate_tag_rootward( - annotated_forest: nx.DiGraph, node, tag_name: Literal["in_rate_base"] + annotated_forest: nx.DiGraph, tag_name: Literal["in_rate_base"] ) -> str: """Set the tag for nodes when all of its children have same tag. @@ -2808,37 +2811,35 @@ def _propagate_tag_rootward( """ def _get_tag(annotated_forest, node, tag_name): - return annotated_forest.nodes.get(node, {}).get("tags", {}).get(tag_name, pd.NA) - - logger.info(f"propagaging tags leafward from {node}") - tag = pd.NA - # i'm a leaf so i stop looking or - # if i have a value you don't need to keep looking at this node's childern - if not list(annotated_forest.successors(node)) or not pd.isna( - _get_tag(annotated_forest, node, tag_name) - ): - tag = _get_tag(annotated_forest, node, tag_name) - - else: - child_tags = set() - for child_node in annotated_forest.successors(node): - if not child_node.xbrl_factoid.endswith("_correction"): - child_tags.add( - _propagate_tag_rootward(annotated_forest, child_node, tag_name) - ) - # if all the children tags are the same and non-null - if (len(child_tags) == 1) and {t for t in child_tags if not pd.isna(t)}: - new_node_tag = child_tags.pop() - # actually assign the tag here but don't wipe out any other tags - existing_tags = nx.get_node_attributes(annotated_forest, "tags") - node_tags = { - node: {"tags": {tag_name: new_node_tag} | existing_tags.get(node, {})} + return annotated_forest.nodes.get(node, {}).get("tags", {}).get(tag_name) + + generations = list(nx.topological_generations(annotated_forest)) + for gen in reversed(generations): + untagged_nodes = { + node_id + for node_id in gen + if _get_tag(annotated_forest, node_id, tag_name) is None + } + for parent_node in untagged_nodes: + child_tags = { + _get_tag(annotated_forest, c, tag_name) + for c in annotated_forest.successors(parent_node) + if not c.xbrl_factoid.endswith("_correction") } - nx.set_node_attributes(annotated_forest, node_tags) - tag = new_node_tag - # elif the children disagree then the node's tag shouldn't be set and - # the og null tag should be returned - return tag + non_null_tags = child_tags - {None} + # sometimes, all children can share same tag but it's null. + if len(child_tags) == 1 and non_null_tags: + # actually assign the tag here but don't wipe out any other tags + new_node_tag = non_null_tags.pop() + existing_tags = nx.get_node_attributes(annotated_forest, "tags") + node_tags = { + parent_node: { + "tags": {tag_name: new_node_tag} + | existing_tags.get(parent_node, {}) + } + } + nx.set_node_attributes(annotated_forest, node_tags) + return annotated_forest def _propagate_tags_to_corrections(annotated_forest: nx.DiGraph) -> nx.DiGraph: @@ -2863,7 +2864,7 @@ def _propagate_tags_to_corrections(annotated_forest: nx.DiGraph) -> nx.DiGraph: def check_tag_propagation_compared_to_compiled_tags( df: pd.DataFrame, - propogated_tag: Literal["in_rate_base"], + propagated_tag: Literal["in_rate_base"], _out_ferc1__explosion_tags: pd.DataFrame, ): """Check if tags got propagated. @@ -2874,7 +2875,7 @@ def check_tag_propagation_compared_to_compiled_tags( or ``exploded_balance_sheet_liabilities_ferc1``. The ``exploded_income_statement_ferc1`` table does not currently have propagated tags. - propogated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + propagated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. _out_ferc1__explosion_tags: mannually compiled tags. This table includes tags from many of the explosion tables so we will filter it before checking if the tag was propagated. @@ -2892,30 +2893,30 @@ def check_tag_propagation_compared_to_compiled_tags( df_tags = _out_ferc1__explosion_tags.merge( df_filtered, on=list(df_filtered.columns), how="right" ) - mannually_tagged = df_tags[df_tags[propogated_tag].notnull()].xbrl_factoid.unique() - detailed_tagged = df[df[f"tags_{propogated_tag}"].notnull()].xbrl_factoid.unique() - if len(detailed_tagged) < len(mannually_tagged): + manually_tagged = df_tags[df_tags[propagated_tag].notnull()].xbrl_factoid.unique() + detailed_tagged = df[df[f"tags_{propagated_tag}"].notnull()].xbrl_factoid.unique() + if len(detailed_tagged) < len(manually_tagged): raise AssertionError( - f"Found more {len(mannually_tagged)} mannually compiled tagged xbrl_factoids" + f"Found more {len(manually_tagged)} mannually compiled tagged xbrl_factoids" " than tags in propagated detailed data." ) - mannually_tagged_corrections = df_tags[ - df_tags[propogated_tag].notnull() + manually_tagged_corrections = df_tags[ + df_tags[propagated_tag].notnull() & df_tags.xbrl_factoid.str.endswith("_correction") ].xbrl_factoid.unique() detailed_tagged_corrections = df[ - df[f"tags_{propogated_tag}"].notnull() + df[f"tags_{propagated_tag}"].notnull() & df.xbrl_factoid.str.endswith("_correction") ].xbrl_factoid.unique() - if len(detailed_tagged_corrections) < len(mannually_tagged_corrections): + if len(detailed_tagged_corrections) < len(manually_tagged_corrections): raise AssertionError( - f"Found more {len(mannually_tagged)} mannually compiled tagged " - "xbrl_factoids than tags in propagated detailed data." + f"Found more {len(manually_tagged_corrections)} mannually compiled " + "tagged xbrl_factoids than tags in propagated detailed data." ) def check_for_correction_xbrl_factoids_with_tag( - df: pd.DataFrame, propogated_tag: Literal["in_rate_base"] + df: pd.DataFrame, propagated_tag: Literal["in_rate_base"] ): """Check if any correction records have tags. @@ -2925,13 +2926,13 @@ def check_for_correction_xbrl_factoids_with_tag( or ``exploded_balance_sheet_liabilities_ferc1``. The ``exploded_income_statement_ferc1`` table does not currently have propagated tags. - propogated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. + propagated_tag: name of tag. Currently ``in_rate_base`` is the only propagated tag. Raises: AssertionError: If there are zero correction ``xbrl_factoids`` in ``df`` with tags. """ detailed_tagged_corrections = df[ - df[f"tags_{propogated_tag}"].notnull() + df[f"tags_{propagated_tag}"].notnull() & df.xbrl_factoid.str.endswith("_correction") ].xbrl_factoid.unique() if len(detailed_tagged_corrections) == 0: diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index cf78429c22..8a6ea43a8b 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -1,8 +1,6 @@ """Tests for the FERC Form 1 output functions. -These need to be recreated to work with the new XbrlCalculationForest implementation. - -Stuff to test: +Stuff we could test: - construction of basic tree from input metadata - do nodes not part of any calculation get orphaned? - do nodes not in the seeded digraph get pruned? @@ -12,10 +10,12 @@ - pruning of passthrough nodes & associated corrections - propagation of weights - conflicting weights -- propagation of tags - conflicting tags - validation of calculations using only leaf-nodes to reproduce root node values +Stuff we are testing: +- propagation of tags + """ import logging @@ -30,10 +30,6 @@ logger = logging.getLogger(__name__) -# TODO: combine these into a class because we have a lot of similar method names -# TODO: make graph construction easier with helper functions - - class TestTagPropagation(unittest.TestCase): def setUp(self): self.parent = NodeId( @@ -158,7 +154,7 @@ def test_leafward_prop_preserve_non_propagating_tags(self): edges = [(self.parent, self.child1), (self.parent, self.child2)] tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign( in_rate_base=["yes", "no", pd.NA], - in_root_boose=["yus", "nu", "purtiul"], + in_root_boose=["yus", "nu", pd.NA], ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) assert annotated_tags[self.parent]["in_rate_base"] == "yes" @@ -169,7 +165,7 @@ def test_leafward_prop_preserve_non_propagating_tags(self): ) assert annotated_tags[self.parent]["in_root_boose"] == "yus" assert annotated_tags[self.child1]["in_root_boose"] == "nu" - assert annotated_tags[self.child2]["in_root_boose"] == "purtiul" + assert not annotated_tags[self.child2].get("in_root_boose") def test_rootward_prop_disagreeing_children(self): """Parents should not pick sides between disagreeing children.""" @@ -208,11 +204,16 @@ def test_prop_no_tags(self): ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) for node in null_tag_edges: - assert not annotated_tags[node].get("in_rate_base") + assert "in_rate_base" not in annotated_tags[node] # do we still have a non-null value for the non-propped tag assert annotated_tags[node].get("a_non_propped_tag") def test_annotated_forest_propagates_rootward(self): + """If two grandchildren have the same tag, their parent does inhert the tag. + + But, the rootward propagation only happens when all of a nodes children have + the same tag. + """ edges = [ (self.parent, self.child1), (self.parent, self.child2), @@ -223,13 +224,14 @@ def test_annotated_forest_propagates_rootward(self): in_rate_base=["yes", "yes"] ) annotated_tags = self.build_forest_and_annotated_tags(edges, tags) - assert not annotated_tags.get(self.parent) + assert self.parent not in annotated_tags assert annotated_tags[self.child1]["in_rate_base"] == "yes" - assert not annotated_tags.get(self.child2) + assert self.child2 not in annotated_tags assert annotated_tags[self.grand_child11]["in_rate_base"] == "yes" assert annotated_tags[self.grand_child12]["in_rate_base"] == "yes" def test_annotated_forest_propagates_rootward_disagreeing_sibling(self): + """If two siblings disagree, their parent does not inherit either of their tag values.""" edges = [ (self.parent, self.child1), (self.parent, self.child2), From da8df115e5a8d04815c80dd87ade65e73e003aa6 Mon Sep 17 00:00:00 2001 From: Christina Gosnell Date: Fri, 2 Feb 2024 10:27:07 -0500 Subject: [PATCH 17/17] add a test about pruned nodes and add the NodeId(*n) into the orphans --- src/pudl/output/ferc1.py | 10 ++- test/unit/output/ferc1_test.py | 137 +++++++++++++++++++++++---------- 2 files changed, 103 insertions(+), 44 deletions(-) diff --git a/src/pudl/output/ferc1.py b/src/pudl/output/ferc1.py index 2eab3ed65f..c9452b9411 100644 --- a/src/pudl/output/ferc1.py +++ b/src/pudl/output/ferc1.py @@ -2524,7 +2524,11 @@ def forest_leaves(self: Self) -> list[NodeId]: @cached_property def orphans(self: Self) -> list[NodeId]: - """Identify all nodes that appear in metadata but not in the full digraph.""" + """Identify all nodes that appear in the exploded_calcs but not in the full digraph. + + Because we removed the metadata and are now building the tree entirely based on + the exploded_calcs, this should now never produce any orphans and is a bit redundant. + """ nodes = self.full_digraph.nodes orphans = [] for idx_cols in [self.calc_cols, self.parent_cols]: @@ -2532,7 +2536,7 @@ def orphans(self: Self) -> list[NodeId]: [ NodeId(*n) for n in self.exploded_calcs.set_index(idx_cols).index - if n not in nodes + if NodeId(*n) not in nodes ] ) return list(set(orphans)) @@ -2838,7 +2842,7 @@ def _propagate_tags_leafward( def _propagate_tag_rootward( annotated_forest: nx.DiGraph, tag_name: Literal["in_rate_base"] -) -> str: +) -> nx.DiGraph: """Set the tag for nodes when all of its children have same tag. This function returns the value of a tag, but also sets node attributes diff --git a/test/unit/output/ferc1_test.py b/test/unit/output/ferc1_test.py index 16e04a68f1..d86faf1613 100644 --- a/test/unit/output/ferc1_test.py +++ b/test/unit/output/ferc1_test.py @@ -35,7 +35,102 @@ logger = logging.getLogger(__name__) -class TestTagPropagation(unittest.TestCase): +class TestForestSetup(unittest.TestCase): + def setUp(self): + # this is where you add nodes you want to use + pass + + def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): + records = [] + for parent, child in edges: + record = {"weight": 1} + for field in NodeId._fields: + record[f"{field}_parent"] = parent.__getattribute__(field) + record[field] = child.__getattribute__(field) + records.append(record) + dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} + dtype_child = {col: pd.StringDtype() for col in NodeId._fields} + dtype_weight = {"weight": pd.Int64Dtype()} + + return pd.DataFrame.from_records(records).astype( + dtype_child | dtype_parent | dtype_weight + ) + + def build_forest( + self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None + ): + if not seeds: + seeds = [self.parent] + forest = XbrlCalculationForestFerc1( + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=seeds, + tags=tags, + ) + return forest + + def build_forest_and_annotated_tags( + self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None + ): + """Build a forest, test forest nodes and return annotated tags. + + Args: + edges: list of tuples + tags: dataframe of tags + seeds: list of seed nodes. Default is None and will assume seed node is + ``parent``. + """ + simple_forest = self.build_forest(edges, tags, seeds) + annotated_forest = simple_forest.annotated_forest + # ensure no nodes got dropped + assert len(annotated_forest.nodes) == len(dedupe_n_flatten_list_of_lists(edges)) + annotated_tags = nx.get_node_attributes(annotated_forest, "tags") + return annotated_tags + + +class TestPrunnedNode(TestForestSetup): + def setUp(self): + self.root = NodeId( + table_name="table_1", + xbrl_factoid="reported_1", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_child = NodeId( + table_name="table_1", + xbrl_factoid="reported_11", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_other = NodeId( + table_name="table_1", + xbrl_factoid="reported_2", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + self.root_other_child = NodeId( + table_name="table_1", + xbrl_factoid="reported_21", + utility_type="electric", + plant_status=pd.NA, + plant_function=pd.NA, + ) + + def test_pruned_nodes(self): + edges = [(self.root, self.root_child), (self.root_other, self.root_other_child)] + tags = pd.DataFrame(columns=list(NodeId._fields)).convert_dtypes() + forest = XbrlCalculationForestFerc1( + exploded_calcs=self._exploded_calcs_from_edges(edges), + seeds=[self.root], + tags=tags, + ) + pruned = forest.pruned + assert set(pruned) == {self.root_other, self.root_other_child} + + +class TestTagPropagation(TestForestSetup): def setUp(self): self.parent = NodeId( table_name="table_1", @@ -87,46 +182,6 @@ def setUp(self): plant_function=pd.NA, ) - def _exploded_calcs_from_edges(self, edges: list[tuple[NodeId, NodeId]]): - records = [] - for parent, child in edges: - record = {"weight": 1} - for field in NodeId._fields: - record[f"{field}_parent"] = parent.__getattribute__(field) - record[field] = child.__getattribute__(field) - records.append(record) - dtype_parent = {f"{col}_parent": pd.StringDtype() for col in NodeId._fields} - dtype_child = {col: pd.StringDtype() for col in NodeId._fields} - dtype_weight = {"weight": pd.Int64Dtype()} - - return pd.DataFrame.from_records(records).astype( - dtype_child | dtype_parent | dtype_weight - ) - - def build_forest_and_annotated_tags( - self, edges: list[tuple[NodeId, NodeId]], tags: pd.DataFrame, seeds=None - ): - """Build a forest, test forest nodes and return annotated tags. - - Args: - edges: list of tuples - tags: dataframe of tags - seeds: list of seed nodes. Default is None and will assume seed node is - ``parent``. - """ - if not seeds: - seeds = [self.parent] - simple_forest = XbrlCalculationForestFerc1( - exploded_calcs=self._exploded_calcs_from_edges(edges), - seeds=seeds, - tags=tags, - ) - annotated_forest = simple_forest.annotated_forest - # ensure no nodes got dropped - assert len(annotated_forest.nodes) == len(dedupe_n_flatten_list_of_lists(edges)) - annotated_tags = nx.get_node_attributes(annotated_forest, "tags") - return annotated_tags - def test_leafward_prop_undecided_children(self): edges = [(self.parent, self.child1), (self.parent, self.child2)] tags = pd.DataFrame([self.parent, self.child1, self.child2]).assign(