Add parent dimensions into calculation component table #2753

Merged · 5 commits · Jul 25, 2023
52 changes: 26 additions & 26 deletions src/pudl/output/ferc1.py
@@ -1049,15 +1049,17 @@ def calculations(self):
tables in this explosion.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]
not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
calc_explode = calc_explode[
~calc_explode.xbrl_factoid.isin(not_in_explosion_xbrl_factoids)
~calc_explode.xbrl_factoid_parent.isin(not_in_explosion_xbrl_factoids)
].copy()
return calc_explode

@@ -1099,9 +1101,7 @@ def metadata(self):
# At this point all remaining calculation components should exist within the
# exploded metadata.
calc_comps = self.calculations
calc_comps_index = calc_comps.set_index(
["table_name_calc", "xbrl_factoid_calc"]
).index
calc_comps_index = calc_comps.set_index(["table_name", "xbrl_factoid"]).index
meta_index = exploded_metadata.set_index(["table_name", "xbrl_factoid"]).index
nodes_not_in_calculations = [
x
@@ -1137,12 +1137,14 @@ def redefine_calculations_with_components_out_of_explosion(
embedded calculations.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]

not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
meta_explode.loc[
@@ -1379,10 +1381,7 @@ def generate_intertable_calculations(
f"{list(calculations_intertable.xbrl_factoid.unique())}."
)
# compile the lists of columns we are going to use later
calc_component_idx = [
"table_name_calc",
"xbrl_factoid_calc",
] + self.other_dimensions
calc_component_idx = ["table_name", "xbrl_factoid"] + self.other_dimensions
# Merge the reported data and the calculation component metadata to enable
# validation of calculated values. Here the data table exploded is supplying the
# values associated with individual calculation components, and the table_name
@@ -1391,35 +1390,36 @@
# values so they can be summed directly. This gives us aggregated calculated
# values that can later be compared to the higher level reported values.

# the validation is one_to_many in all instances except for the xbrl_factoid_calc
# the validation is one_to_many in all instances except for the xbrl_factoid
# construction_work_in_progress in the balance_sheet_assets_ferc1 explosion.
# this may be a problem in the calculations that we should track down in #2717
validate = (
"one_to_many"
if self.root_table != "balance_sheet_assets_ferc1"
else "many_to_many"
)
# we are going to merge the data onto the calc components with the _parent
# column names, so the groupby after the merge needs a set of by cols with the
# _parent suffix
gby_parent = [
f"{col}_parent" if col in ["table_name", "xbrl_factoid"] else col
for col in self.exploded_pks
]
calc_df = (
pd.merge(
calculations_intertable,
exploded.rename(
columns={
"table_name": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_calc",
}
),
#
exploded,
validate=validate,
on=calc_component_idx,
)
# apply the weight from the calc to convey the sign before summing.
.assign(calculated_amount=lambda x: x[self.value_col] * x.weight)
.groupby(self.exploded_pks, as_index=False, dropna=False)[
["calculated_amount"]
]
.groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]]
.sum(min_count=1)
)

# remove the _parent suffix so we can merge these calculated values back onto
# the data using the original pks
calc_df.columns = calc_df.columns.str.removesuffix("_parent")
calculated_df = pd.merge(
exploded,
calc_df,
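A minimal sketch of the parent-suffix pattern this hunk introduces, with made-up frame contents (only the table_name / xbrl_factoid / weight columns mirror the real schema; the ending_balance value column is a hypothetical stand-in): reported values are merged onto the calculation components, weighted, summed within each parent, and the _parent suffix is stripped so the aggregates line up with the reported records.

import pandas as pd

# Illustrative stand-ins for the calculation component and exploded data tables.
calc_components = pd.DataFrame(
    {
        "table_name_parent": ["tbl_a", "tbl_a"],
        "xbrl_factoid_parent": ["total_assets", "total_assets"],
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["cash", "plant"],
        "weight": [1.0, 1.0],
    }
)
exploded = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["cash", "plant"],
        "ending_balance": [10.0, 90.0],
    }
)

# Merge reported values onto the calculation components, apply the weights,
# sum within each parent, then drop the _parent suffix so the calculated
# totals can be merged back onto the reported data.
gby_parent = ["table_name_parent", "xbrl_factoid_parent"]
calc_df = (
    pd.merge(calc_components, exploded, on=["table_name", "xbrl_factoid"])
    .assign(calculated_amount=lambda x: x.ending_balance * x.weight)
    .groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]]
    .sum(min_count=1)
)
calc_df.columns = calc_df.columns.str.removesuffix("_parent")
# calc_df: table_name="tbl_a", xbrl_factoid="total_assets", calculated_amount=100.0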
116 changes: 96 additions & 20 deletions src/pudl/transform/ferc1.py
@@ -3019,18 +3019,24 @@ def process_xbrl_metadata_calculations(
.explode("source_tables")
.rename(
columns={
"name": "xbrl_factoid_calc",
"source_tables": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_parent",
"name": "xbrl_factoid",
"source_tables": "table_name",
}
)
.merge(
metadata.drop(columns=["calculations"]),
metadata.drop(columns=["calculations"]).rename(
columns={
"xbrl_factoid": "xbrl_factoid_parent",
}
),
left_index=True,
right_index=True,
how="outer",
)
.dropna(subset=["xbrl_factoid_calc"])
.dropna(subset=["xbrl_factoid"])
.reset_index(drop=True)
.assign(table_name_parent=self.table_id.value)
)
return calc_comps

@@ -6684,18 +6690,19 @@ def calculation_components_xbrl_ferc1(**kwargs):
# compile all of the calc comp tables.
calc_metas = []
for table_name, transformer in FERC1_TFR_CLASSES.items():
calc_meta = (
transformer(xbrl_metadata_json=clean_xbrl_metadata_json[table_name])
.process_xbrl_metadata_calculations()
.assign(table_name=table_name)
)
calc_meta = transformer(
xbrl_metadata_json=clean_xbrl_metadata_json[table_name]
).process_xbrl_metadata_calculations()
calc_metas.append(calc_meta)
# squish all of the calc comp tables then add in the implicit table dimensions
calc_components = pd.concat(calc_metas).pipe(
make_calculation_dimensions_explicit,
table_dimensions_ferc1,
dimensions=other_dimensions(),
)
calc_components = add_parent_dimensions(
calc_components, dimensions=other_dimensions()
)
return calc_components


@@ -6745,31 +6752,100 @@ def make_calculation_dimensions_explicit(
"""
logger.info(f"Adding {dimensions=} into calculation component table.")
calc_comps_w_dims = calculation_components.copy()
on_cols = ["table_name", "xbrl_factoid"]
# for each dimension, use split/apply/combine. when there are no dims explicit in
# the calc components, merge in all of the dims.
for dim_col in dimensions:
# extract the unique observed instances of this one dimension column & add the
# _calc suffix so we can merge onto the calculation components.
observed_dim = (
table_dimensions_ferc1[["table_name", "xbrl_factoid", dim_col]]
.drop_duplicates() # bc there are dupes after we removed the other dim cols
.rename(
columns={
"xbrl_factoid": "xbrl_factoid_calc",
"table_name": "table_name_calc",
}
)
)
observed_dim = table_dimensions_ferc1[
on_cols + [dim_col]
].drop_duplicates() # bc there are dupes after we removed the other dim cols
null_dim_mask = calc_comps_w_dims[dim_col].isnull()
null_dim = calc_comps_w_dims[null_dim_mask].drop(columns=[dim_col])
calc_comps_w_implied_dims = pd.merge(
null_dim,
observed_dim,
on=["table_name_calc", "xbrl_factoid_calc"],
on=on_cols,
how="left",
)
calc_comps_w_explicit_dims = calc_comps_w_dims[~null_dim_mask]
calc_comps_w_dims = pd.concat(
[calc_comps_w_implied_dims, calc_comps_w_explicit_dims]
)
return calc_comps_w_dims
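
A condensed sketch of the split/apply/combine step described in the comments above, using toy data in which utility_type stands in for any one of the dimension columns: records with a null dimension pick up every observed value of that dimension, while records with an explicit dimension pass through untouched.

import pandas as pd

# Hypothetical calc component records: fact_1's utility_type was left null by the
# calculation definition, fact_2's was explicit.
calc_comps = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a"],
        "xbrl_factoid": ["fact_1", "fact_2"],
        "utility_type": [None, "electric"],
    }
)
# Every (table, factoid, utility_type) combination actually observed in the data.
observed = pd.DataFrame(
    {
        "table_name": ["tbl_a", "tbl_a", "tbl_a"],
        "xbrl_factoid": ["fact_1", "fact_1", "fact_2"],
        "utility_type": ["electric", "gas", "electric"],
    }
)

# split: null-dim records get every observed value of the dim merged in;
# explicit-dim records are kept as-is. combine: concatenate the two halves.
null_mask = calc_comps["utility_type"].isnull()
implied = pd.merge(
    calc_comps[null_mask].drop(columns=["utility_type"]),
    observed.drop_duplicates(),
    on=["table_name", "xbrl_factoid"],
    how="left",
)
explicit = calc_comps[~null_mask]
calc_comps_w_dims = pd.concat([implied, explicit])
# fact_1 now appears once per observed utility_type (electric, gas);
# fact_2 keeps its explicit value.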


def add_parent_dimensions(
calc_comps: pd.DataFrame, dimensions: list[str]
) -> pd.DataFrame:
"""Define dimension total calculations and add dimensions to calculation parents.

In addition to calculations defining how values reported in one set of facts can
be aggregated resulting in a value in another fact, there are implied calculation
relationships between values reported within a dimension. In particular, when a
dimension reports a ``total`` value, we assume it is the sum of all the non-total
values for that dimension. This function makes these within-dimension calculation
relationships explicit by defining new calculations for each dimension's ``total``.

Outside of these ``total`` calculations we require that a factoid and its
calculation components share the same dimensional values.

To be able to define these within-dimension calculations we also add dimension
columns to all of the parent factoids in the table.

Args:
calc_comps: a table of calculation component records which have had some manual
calculation fixes applied.
dimensions: list of dimension columns to check.

Returns:
A table associating calculation components with the parents they will be
aggregated into. The components and the parents are each identified by
``table_name``, ``xbrl_factoid``, and columns defining the additional dimensions
(``utility_type``, ``plant_status``, ``plant_function``). The parent columns
have a ``_parent`` suffix.
"""
table_fact_cols = [
"table_name_parent",
"xbrl_factoid_parent",
"table_name",
"xbrl_factoid",
]
# for each dimension col, add calculations defining the within-dimension totals and
# make a new _parent column
for dim in dimensions:
# Define calculations with the total values as parents and non-total values as
# sub-components by broadcast merging the non-total records onto the new total
# records.
total_mask = calc_comps[dim] == "total"
total_w_subdim_components = (
pd.merge(
# the total records will become _parent columns in new records
left=calc_comps.loc[total_mask, table_fact_cols + dimensions],
# the non-total sub-components will become their calculation components
right=calc_comps[~total_mask],
on=table_fact_cols,
how="left",
validate="1:m",
suffixes=("_parent", ""),
)
# The new total -> sub-dimension records must always have the same
# parent and component fact/table so overwrite the original values
.assign(
table_name_parent=lambda x: x.table_name,
xbrl_factoid_parent=lambda x: x.xbrl_factoid,
)
# drop the other non-total parent dim cols (they will be merged back on later)
.drop(columns=[f"{d}_parent" for d in dimensions if d != dim])
)
# now we have a bunch of *new* records linking total records to their non-total
# sub-components.
# Separately, we now add in parent-dimension values for all of the original
# calculation component records. We are assuming all parents should have the
# same dimension values as their child components.
calc_comps = calc_comps.assign(**{f"{dim}_parent": lambda x: x[dim]})
calc_comps = pd.concat([calc_comps, total_w_subdim_components])
# we shouldn't be adding any duplicates in this process!
assert calc_comps[calc_comps.duplicated()].empty
return calc_comps
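
To make the within-dimension total calculations concrete, here is a stripped-down sketch of the core merge in add_parent_dimensions, assuming a single hypothetical dimension (utility_type) and made-up fact names; it only illustrates the total-as-parent relationship, not the full set of _parent columns the real function produces.

import pandas as pd

# Toy calculation component table with a single dimension column. The real table
# carries utility_type, plant_status, and plant_function.
calc_comps = pd.DataFrame(
    {
        "table_name_parent": ["tbl_a"] * 3,
        "xbrl_factoid_parent": ["fact_1"] * 3,
        "table_name": ["tbl_a"] * 3,
        "xbrl_factoid": ["fact_2"] * 3,
        "utility_type": ["electric", "gas", "total"],
    }
)

# The "total" record becomes a parent whose calculation components are every
# non-total record of the same fact: the relationship this function makes explicit.
total_mask = calc_comps["utility_type"] == "total"
total_calcs = (
    pd.merge(
        calc_comps.loc[total_mask, ["table_name", "xbrl_factoid", "utility_type"]],
        calc_comps[~total_mask],
        on=["table_name", "xbrl_factoid"],
        how="left",
        validate="1:m",
        suffixes=("_parent", ""),
    )
    # the total fact is its own parent in these new within-dimension records
    .assign(
        table_name_parent=lambda x: x.table_name,
        xbrl_factoid_parent=lambda x: x.xbrl_factoid,
    )
)
# total_calcs reads: (fact_2, utility_type="total") is calculated from
# (fact_2, "electric") + (fact_2, "gas").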
56 changes: 52 additions & 4 deletions test/unit/transform/ferc1_test.py
@@ -12,6 +12,7 @@
TableIdFerc1,
UnstackBalancesToReportYearInstantXbrl,
WideToTidy,
add_parent_dimensions,
drop_duplicate_rows_dbf,
fill_dbf_to_xbrl_map,
make_calculation_dimensions_explicit,
@@ -391,15 +392,15 @@ def test_unstack_balances_to_report_year_instant_xbrl():
def test_make_calculation_dimensions_explicit():
"""Test :func:`make_calculation_dimensions_explicit`"""
calc_comp_idx = [
"table_name_parent",
"xbrl_factoid_parent",
"table_name",
"xbrl_factoid",
"table_name_calc",
"xbrl_factoid_calc",
]
calc_comps_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,
table_a,fact_1,table_a,fact_4,voyager,
table_a,fact_1,table_a,fact_5,ds9,
@@ -417,10 +418,12 @@ def test_make_calculation_dimensions_explicit():
table_a,fact_3,voyager,in
table_a,fact_3,voyager,that
table_a,fact_3,voyager,nebula
table_a,fact_3,voyager,total
table_a,fact_4,voyager,coffee
table_a,fact_4,voyager,in
table_a,fact_4,voyager,that
table_a,fact_4,voyager,nebula
table_a,fact_4,voyager,total
table_a,fact_5,ds9,
table_b,fact_6,next_gen,resistance
table_b,fact_6,next_gen,is
@@ -446,15 +449,17 @@ def test_make_calculation_dimensions_explicit():
expected_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total
table_a,fact_1,table_a,fact_5,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile
@@ -476,3 +481,46 @@
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(out_trek, out_reordered, check_like=True)

expected_parent_dim_trek = pd.read_csv(
StringIO(
"""
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y,dim_x_parent,dim_y_parent
table_a,fact_1,table_a,fact_3,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total,voyager,total
table_a,fact_1,table_a,fact_5,ds9,,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_8,next_gen,resistance,next_gen,resistance
table_a,fact_2,table_b,fact_8,next_gen,is,next_gen,is
table_a,fact_2,table_b,fact_8,next_gen,futile,next_gen,futile
table_a,fact_3,table_a,fact_3,voyager,coffee,,total
table_a,fact_3,table_a,fact_3,voyager,in,,total
table_a,fact_3,table_a,fact_3,voyager,that,,total
table_a,fact_3,table_a,fact_3,voyager,nebula,,total
table_a,fact_4,table_a,fact_4,voyager,coffee,,total
table_a,fact_4,table_a,fact_4,voyager,in,,total
table_a,fact_4,table_a,fact_4,voyager,that,,total
table_a,fact_4,table_a,fact_4,voyager,nebula,,total
"""
)
)
out_parent_dim_trek = (
add_parent_dimensions(
calc_comps=expected_trek,
dimensions=["dim_x", "dim_y"],
)
.sort_values(calc_comp_idx)
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
expected_parent_dim_trek, out_parent_dim_trek, check_like=True
)