Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parent dimensions into calculation component table #2753

Merged
merged 5 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions src/pudl/output/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,15 +1049,17 @@ def calculations(self):
tables in this explosion.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]
not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
calc_explode = calc_explode[
~calc_explode.xbrl_factoid.isin(not_in_explosion_xbrl_factoids)
~calc_explode.xbrl_factoid_parent.isin(not_in_explosion_xbrl_factoids)
].copy()
return calc_explode

Expand Down Expand Up @@ -1099,9 +1101,7 @@ def metadata(self):
# At this point all remaining calculation components should exist within the
# exploded metadata.
calc_comps = self.calculations
calc_comps_index = calc_comps.set_index(
["table_name_calc", "xbrl_factoid_calc"]
).index
calc_comps_index = calc_comps.set_index(["table_name", "xbrl_factoid"]).index
meta_index = exploded_metadata.set_index(["table_name", "xbrl_factoid"]).index
nodes_not_in_calculations = [
x
Expand Down Expand Up @@ -1137,12 +1137,14 @@ def redefine_calculations_with_components_out_of_explosion(
embedded calculations.
"""
calc_explode = self.calculation_components_xbrl_ferc1
calc_explode = calc_explode[calc_explode.table_name.isin(self.table_names)]
calc_explode = calc_explode[
calc_explode.table_name_parent.isin(self.table_names)
]

not_in_explosion_xbrl_factoids = list(
calc_explode.loc[
~calc_explode.table_name_calc.isin(self.table_names),
"xbrl_factoid",
~calc_explode.table_name.isin(self.table_names),
"xbrl_factoid_parent",
].unique()
)
meta_explode.loc[
Expand Down Expand Up @@ -1379,10 +1381,7 @@ def generate_intertable_calculations(
f"{list(calculations_intertable.xbrl_factoid.unique())}."
)
# compile the lists of columns we are going to use later
calc_component_idx = [
"table_name_calc",
"xbrl_factoid_calc",
] + self.other_dimensions
calc_component_idx = ["table_name", "xbrl_factoid"] + self.other_dimensions
# Merge the reported data and the calculation component metadata to enable
# validation of calculated values. Here the data table exploded is supplying the
# values associated with individual calculation components, and the table_name
Expand All @@ -1391,35 +1390,36 @@ def generate_intertable_calculations(
# values so they can be summed directly. This gives us aggregated calculated
# values that can later be compared to the higher level reported values.

# the validation is one_many in all instances expect for the xbrl_factoid_calc
# the validation is one_to_many in all instances except for the xbrl_factoid
# construction_work_in_progress in the balance_sheet_assets_ferc1 explosion.
# this may be a problem in the calculations that we should track down in #2717
validate = (
"one_to_many"
if self.root_table != "balance_sheet_assets_ferc1"
else "many_to_many"
)
# we are going to merge the data onto the calc components with the _parent
# column names, so the groupby after the merge needs a set of by cols with the
# _parent suffix
gby_parent = [
f"{col}_parent" if col in ["table_name", "xbrl_factoid"] else col
for col in self.exploded_pks
]
calc_df = (
pd.merge(
calculations_intertable,
exploded.rename(
columns={
"table_name": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_calc",
}
),
#
exploded,
validate=validate,
on=calc_component_idx,
)
# apply the weight from the calc to convey the sign before summing.
.assign(calculated_amount=lambda x: x[self.value_col] * x.weight)
.groupby(self.exploded_pks, as_index=False, dropna=False)[
["calculated_amount"]
]
.groupby(gby_parent, as_index=False, dropna=False)[["calculated_amount"]]
.sum(min_count=1)
)

# remove the _parent suffix so we can merge these calculated values back onto
# the data using the original pks
calc_df.columns = calc_df.columns.str.removesuffix("_parent")
calculated_df = pd.merge(
exploded,
calc_df,
Expand Down
116 changes: 96 additions & 20 deletions src/pudl/transform/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3019,18 +3019,24 @@ def process_xbrl_metadata_calculations(
.explode("source_tables")
.rename(
columns={
"name": "xbrl_factoid_calc",
"source_tables": "table_name_calc",
"xbrl_factoid": "xbrl_factoid_parent",
"name": "xbrl_factoid",
"source_tables": "table_name",
}
)
.merge(
metadata.drop(columns=["calculations"]),
metadata.drop(columns=["calculations"]).rename(
columns={
"xbrl_factoid": "xbrl_factoid_parent",
}
),
left_index=True,
right_index=True,
how="outer",
)
.dropna(subset=["xbrl_factoid_calc"])
.dropna(subset=["xbrl_factoid"])
.reset_index(drop=True)
.assign(table_name_parent=self.table_id.value)
)
return calc_comps

Expand Down Expand Up @@ -6684,18 +6690,19 @@ def calculation_components_xbrl_ferc1(**kwargs):
# compile all of the calc comp tables.
calc_metas = []
for table_name, transformer in FERC1_TFR_CLASSES.items():
calc_meta = (
transformer(xbrl_metadata_json=clean_xbrl_metadata_json[table_name])
.process_xbrl_metadata_calculations()
.assign(table_name=table_name)
)
calc_meta = transformer(
xbrl_metadata_json=clean_xbrl_metadata_json[table_name]
).process_xbrl_metadata_calculations()
calc_metas.append(calc_meta)
# squish all of the calc comp tables then add in the implicit table dimensions
calc_components = pd.concat(calc_metas).pipe(
make_calculation_dimensions_explicit,
table_dimensions_ferc1,
dimensions=other_dimensions(),
)
calc_components = add_parent_dimensions(
calc_components, dimensions=other_dimensions()
)
return calc_components


Expand Down Expand Up @@ -6745,31 +6752,100 @@ def make_calculation_dimensions_explicit(
"""
logger.info(f"Adding {dimensions=} into calculation component table.")
calc_comps_w_dims = calculation_components.copy()
on_cols = ["table_name", "xbrl_factoid"]
# for each dimension, use split/apply/combine. When there are no dims explicit in
# the calc components, merge in all of the dims.
for dim_col in dimensions:
# extract the unique observed instances of this one dimension column & add the
# _calc suffix so we can merge onto the calculation components.
observed_dim = (
table_dimensions_ferc1[["table_name", "xbrl_factoid", dim_col]]
.drop_duplicates() # bc there are dupes after we removed the other dim cols
.rename(
columns={
"xbrl_factoid": "xbrl_factoid_calc",
"table_name": "table_name_calc",
}
)
)
observed_dim = table_dimensions_ferc1[
on_cols + [dim_col]
].drop_duplicates() # bc there are dupes after we removed the other dim cols
null_dim_mask = calc_comps_w_dims[dim_col].isnull()
null_dim = calc_comps_w_dims[null_dim_mask].drop(columns=[dim_col])
calc_comps_w_implied_dims = pd.merge(
null_dim,
observed_dim,
on=["table_name_calc", "xbrl_factoid_calc"],
on=on_cols,
how="left",
)
calc_comps_w_explicit_dims = calc_comps_w_dims[~null_dim_mask]
calc_comps_w_dims = pd.concat(
[calc_comps_w_implied_dims, calc_comps_w_explicit_dims]
)
return calc_comps_w_dims


def add_parent_dimensions(calc_comps, dimensions):
    """Add the parent-side dimension values into the calculation components table.

    Together :meth:`process_xbrl_metadata_calculations` and
    :func:`make_calculation_dimensions_explicit` generate a calculation component
    table with pk columns of: ``table_name_parent``, ``xbrl_factoid_parent``,
    ``table_name``, ``xbrl_factoid``. The table also contains dimension columns
    (ex: ``utility_type``, ``plant_function``, etc.) which indicate the dimensions
    of the calculation component factoid - not the parent factoid. This function
    adds additional ``{dimension}_parent`` columns for the parent-side of the
    calculation.

    First, we treat the totals in these dimension columns by assuming that any
    "total" we observe (i.e. ``utility_type == "total"``) can be summed up of all
    non-total values within that same dimension (i.e.
    ``utility_type.isin(["electric", "gas", "other"])``). We implement this by
    creating new calculation component records which have a parent of dimension
    "total" and calculation components of all observed non-total dimension values.
    This is done by a broadcast merge of the non-total dimension calculation
    component values onto parent total records. The result is the creation of new
    records which communicate this total -> non-total calculation that was not
    previously encapsulated within the calculation components.

    Then, we add parent-level dimension values for all of the original calculation
    component records in ``calc_comps``. We assume that every parent factoid should
    have the same dimension values as its calculation component dimension values.

    Args:
        calc_comps: a table of calculation component records which have had some
            manual calculation fixes applied.
        dimensions: list of dimension columns to check.

    Returns:
        The calculation components table with one added ``{dimension}_parent``
        column per dimension, plus new total -> non-total calculation records.
    """
    calc_comp_idx = [
        "table_name_parent",
        "xbrl_factoid_parent",
        "table_name",
        "xbrl_factoid",
    ]
    # for each dimension col, make a new _parent column
    for dim in dimensions:
        # Treat the totals. Broadcast merge the sub-components (non-totals) onto
        # the total records, making new records where the totals are the parents
        # and the non-totals are the sub-components.
        total_mask = calc_comps[dim] == "total"
        total_w_subdim_components = (
            pd.merge(
                # the totals will become the parent records
                left=calc_comps.loc[total_mask, calc_comp_idx + dimensions],
                # the sub-components will become the calc component records
                right=calc_comps[~total_mask],
                on=calc_comp_idx,
                how="left",
                validate="1:m",
                # overlapping (dimension) columns: left side gets the _parent suffix
                suffixes=("_parent", ""),
            )
            # these new total -> sub-dimension records should have the same
            # parent and component fact/table, so overwrite the originals
            .assign(
                table_name_parent=lambda x: x.table_name,
                xbrl_factoid_parent=lambda x: x.xbrl_factoid,
            )
            # drop the other non-total parent dim cols (they get built on their
            # own loop iteration)
            .drop(columns=[f"{d}_parent" for d in dimensions if d != dim])
        )
        # now we have a bunch of *new* records linking total records to their
        # non-total sub-components.
        # Separately, we can add in parent-dimension values for all of the
        # original calculation component records. We are assuming all parents
        # should have the same dimension values as their child components.
        calc_comps = calc_comps.assign(**{f"{dim}_parent": lambda x: x[dim]})
        calc_comps = pd.concat([calc_comps, total_w_subdim_components])
    # we shouldn't be adding any duplicates in this process!
    assert calc_comps[calc_comps.duplicated()].empty
    return calc_comps
56 changes: 52 additions & 4 deletions test/unit/transform/ferc1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
TableIdFerc1,
UnstackBalancesToReportYearInstantXbrl,
WideToTidy,
add_parent_dimensions,
drop_duplicate_rows_dbf,
fill_dbf_to_xbrl_map,
make_calculation_dimensions_explicit,
Expand Down Expand Up @@ -391,15 +392,15 @@ def test_unstack_balances_to_report_year_instant_xbrl():
def test_make_calculation_dimensions_explicit():
"""Test :func:`make_calculation_dimensions_explicit`"""
calc_comp_idx = [
"table_name_parent",
"xbrl_factoid_parent",
"table_name",
"xbrl_factoid",
"table_name_calc",
"xbrl_factoid_calc",
]
calc_comps_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,
table_a,fact_1,table_a,fact_4,voyager,
table_a,fact_1,table_a,fact_5,ds9,
Expand All @@ -417,10 +418,12 @@ def test_make_calculation_dimensions_explicit():
table_a,fact_3,voyager,in
table_a,fact_3,voyager,that
table_a,fact_3,voyager,nebula
table_a,fact_3,voyager,total
table_a,fact_4,voyager,coffee
table_a,fact_4,voyager,in
table_a,fact_4,voyager,that
table_a,fact_4,voyager,nebula
table_a,fact_4,voyager,total
table_a,fact_5,ds9,
table_b,fact_6,next_gen,resistance
table_b,fact_6,next_gen,is
Expand All @@ -446,15 +449,17 @@ def test_make_calculation_dimensions_explicit():
expected_trek = pd.read_csv(
StringIO(
"""
table_name,xbrl_factoid,table_name_calc,xbrl_factoid_calc,dim_x,dim_y
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y
table_a,fact_1,table_a,fact_3,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total
table_a,fact_1,table_a,fact_5,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile
Expand All @@ -476,3 +481,46 @@ def test_make_calculation_dimensions_explicit():
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(out_trek, out_reordered, check_like=True)

expected_parent_dim_trek = pd.read_csv(
StringIO(
"""
table_name_parent,xbrl_factoid_parent,table_name,xbrl_factoid,dim_x,dim_y,dim_x_parent,dim_y_parent
table_a,fact_1,table_a,fact_3,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_3,voyager,in,voyager,in
table_a,fact_1,table_a,fact_3,voyager,that,voyager,that
table_a,fact_1,table_a,fact_3,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_3,voyager,total,voyager,total
table_a,fact_1,table_a,fact_4,voyager,coffee,voyager,coffee
table_a,fact_1,table_a,fact_4,voyager,in,voyager,in
table_a,fact_1,table_a,fact_4,voyager,that,voyager,that
table_a,fact_1,table_a,fact_4,voyager,nebula,voyager,nebula
table_a,fact_1,table_a,fact_4,voyager,total,voyager,total
table_a,fact_1,table_a,fact_5,ds9,,ds9,
table_a,fact_2,table_b,fact_6,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_7,next_gen,futile,next_gen,futile
table_a,fact_2,table_b,fact_8,next_gen,resistance,next_gen,resistance
table_a,fact_2,table_b,fact_8,next_gen,is,next_gen,is
table_a,fact_2,table_b,fact_8,next_gen,futile,next_gen,futile
table_a,fact_3,table_a,fact_3,voyager,coffee,,total
table_a,fact_3,table_a,fact_3,voyager,in,,total
table_a,fact_3,table_a,fact_3,voyager,that,,total
table_a,fact_3,table_a,fact_3,voyager,nebula,,total
table_a,fact_4,table_a,fact_4,voyager,coffee,,total
table_a,fact_4,table_a,fact_4,voyager,in,,total
table_a,fact_4,table_a,fact_4,voyager,that,,total
table_a,fact_4,table_a,fact_4,voyager,nebula,,total
"""
)
)
out_parent_dim_trek = (
add_parent_dimensions(
calc_comps=expected_trek,
dimensions=["dim_x", "dim_y"],
)
.sort_values(calc_comp_idx)
.reset_index(drop=True)
)
pd.testing.assert_frame_equal(
expected_parent_dim_trek, out_parent_dim_trek, check_like=True
)
Loading