Skip to content

Commit

Permalink
Make new extractor compatible with 2021 data
Browse files Browse the repository at this point in the history
The new extractor added some data to the 2021 XBRL archives, which caused some integration and validation tests to fail. I added some plants to the pudl_id mapping spreadsheet, all of which are considered totals, i.e., they are not real plants, but we map them for the sake of giving them an ID (they are not connected to EIA records). This is how we treat other total records reported to FERC1.

This also updates the way that values were assigned to a slice of the ferc1_eia_train output spreadsheets. NA values were causing an issue, so I had to change how the values were being converted.

This also updates the test_minmax_rows test to reflect the new rows in the 2021 data.
  • Loading branch information
aesharpe committed Sep 26, 2023
1 parent 031c848 commit cf34e93
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 30 deletions.
244 changes: 244 additions & 0 deletions migrations/versions/11a43f756905_idk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
"""idk
Revision ID: 11a43f756905
Revises: 273a78878b74
Create Date: 2023-09-25 13:06:55.676082
"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "11a43f756905"
down_revision = "273a78878b74"
branch_labels = None
depends_on = None


# Tables that gain a nullable ``utility_type`` text column in this revision.
_UTILITY_TYPE_TABLES = (
    "denorm_depreciation_amortization_summary_ferc1",
    "denorm_electric_operating_expenses_ferc1",
    "denorm_electric_operating_revenues_ferc1",
    "denorm_plant_in_service_ferc1",
    "depreciation_amortization_summary_ferc1",
    "electric_operating_expenses_ferc1",
    "electric_operating_revenues_ferc1",
    "plant_in_service_ferc1",
)

# Tables in which ``ferc_account`` is replaced by ``depreciation_type``.
_DEPRECIATION_TYPE_TABLES = (
    "denorm_electric_plant_depreciation_functional_ferc1",
    "electric_plant_depreciation_functional_ferc1",
)


def upgrade() -> None:
    """Apply the schema changes of revision 11a43f756905.

    * Adds ``utility_type`` to every table in ``_UTILITY_TYPE_TABLES``.
    * Adds ``row_type_xbrl`` to ``depreciation_amortization_summary_ferc1``.
    * Replaces ``ferc_account`` with ``depreciation_type`` in every table in
      ``_DEPRECIATION_TYPE_TABLES``.

    The ``purchased_storage_mwh`` / ``purchased_other_than_storage_mwh``
    columns of the purchased power tables are intentionally NOT added here:
    the parent revision (273a78878b74) already creates them, and adding them
    a second time makes ``ALTER TABLE ... ADD COLUMN`` fail with a duplicate
    column error when the migration chain is run in order.
    """
    for table_name in _UTILITY_TYPE_TABLES:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.add_column(
                sa.Column(
                    "utility_type",
                    sa.Text(),
                    nullable=True,
                    comment="Listing of utility plant types. Examples include Electric Utility, Gas Utility, and Other Utility.",
                )
            )

    with op.batch_alter_table(
        "depreciation_amortization_summary_ferc1", schema=None
    ) as batch_op:
        batch_op.add_column(
            sa.Column(
                "row_type_xbrl",
                sa.Enum("calculated_value", "reported_value", "correction"),
                nullable=True,
                comment="Indicates whether the value reported in the row is calculated, or uniquely reported within the table.",
            )
        )

    for table_name in _DEPRECIATION_TYPE_TABLES:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            # Add the replacement column before dropping the old one, in the
            # same batch, so both changes are applied to the table together.
            batch_op.add_column(
                sa.Column(
                    "depreciation_type",
                    sa.Text(),
                    nullable=True,
                    comment="Type of depreciation provision within FERC Account 108, including cost ofremoval, depreciation expenses, salvage, cost of retired plant, etc.",
                )
            )
            batch_op.drop_column("ferc_account")


def downgrade() -> None:
    """Revert the schema changes of revision 11a43f756905.

    Mirrors :func:`upgrade` in reverse order. The purchased power storage
    columns are left in place because they belong to the parent revision
    (273a78878b74), whose own ``downgrade`` is responsible for dropping them.
    Note that the restored ``ferc_account`` column is re-created empty; the
    data it held before the upgrade is not recovered.
    """
    for table_name in reversed(_DEPRECIATION_TYPE_TABLES):
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.add_column(sa.Column("ferc_account", sa.TEXT(), nullable=True))
            batch_op.drop_column("depreciation_type")

    with op.batch_alter_table(
        "depreciation_amortization_summary_ferc1", schema=None
    ) as batch_op:
        batch_op.drop_column("row_type_xbrl")

    for table_name in reversed(_UTILITY_TYPE_TABLES):
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            batch_op.drop_column("utility_type")
56 changes: 42 additions & 14 deletions migrations/versions/273a78878b74_purchased_storage_mwh.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,33 +9,61 @@
from alembic import op

# revision identifiers, used by Alembic.
revision = '273a78878b74'
down_revision = 'b5226cb31143'
revision = "273a78878b74"
down_revision = "b5226cb31143"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('denorm_purchased_power_ferc1', schema=None) as batch_op:
batch_op.add_column(sa.Column('purchased_storage_mwh', sa.Float(), nullable=True, comment='Number of megawatt hours purchased for energy storage during the period.'))
batch_op.add_column(sa.Column('purchased_other_than_storage_mwh', sa.Float(), nullable=True, comment='Number of megawatt hours purchased for other than energy storage during the period.'))
with op.batch_alter_table("denorm_purchased_power_ferc1", schema=None) as batch_op:
batch_op.add_column(
sa.Column(
"purchased_storage_mwh",
sa.Float(),
nullable=True,
comment="Number of megawatt hours purchased for energy storage during the period.",
)
)
batch_op.add_column(
sa.Column(
"purchased_other_than_storage_mwh",
sa.Float(),
nullable=True,
comment="Number of megawatt hours purchased for other than energy storage during the period.",
)
)

with op.batch_alter_table('purchased_power_ferc1', schema=None) as batch_op:
batch_op.add_column(sa.Column('purchased_storage_mwh', sa.Float(), nullable=True, comment='Number of megawatt hours purchased for energy storage during the period.'))
batch_op.add_column(sa.Column('purchased_other_than_storage_mwh', sa.Float(), nullable=True, comment='Number of megawatt hours purchased for other than energy storage during the period.'))
with op.batch_alter_table("purchased_power_ferc1", schema=None) as batch_op:
batch_op.add_column(
sa.Column(
"purchased_storage_mwh",
sa.Float(),
nullable=True,
comment="Number of megawatt hours purchased for energy storage during the period.",
)
)
batch_op.add_column(
sa.Column(
"purchased_other_than_storage_mwh",
sa.Float(),
nullable=True,
comment="Number of megawatt hours purchased for other than energy storage during the period.",
)
)

# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('purchased_power_ferc1', schema=None) as batch_op:
batch_op.drop_column('purchased_other_than_storage_mwh')
batch_op.drop_column('purchased_storage_mwh')
with op.batch_alter_table("purchased_power_ferc1", schema=None) as batch_op:
batch_op.drop_column("purchased_other_than_storage_mwh")
batch_op.drop_column("purchased_storage_mwh")

with op.batch_alter_table('denorm_purchased_power_ferc1', schema=None) as batch_op:
batch_op.drop_column('purchased_other_than_storage_mwh')
batch_op.drop_column('purchased_storage_mwh')
with op.batch_alter_table("denorm_purchased_power_ferc1", schema=None) as batch_op:
batch_op.drop_column("purchased_other_than_storage_mwh")
batch_op.drop_column("purchased_storage_mwh")

# ### end Alembic commands ###
19 changes: 9 additions & 10 deletions src/pudl/analysis/ferc1_eia_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ def _prep_ferc1_eia(ferc1_eia, utils_eia860) -> pd.DataFrame:
logger.debug("Prepping FERC-EIA table")
# Only want to keep the plant_name_ppe field which replaces plant_name_eia
ferc1_eia_prep = ferc1_eia.copy().drop(columns="plant_name_eia")

# Add utility_name_eia - this must happen before renaming the cols or else there
# will be duplicate utility_name_eia columns.
utils_eia860.loc[:, "report_year"] = utils_eia860.report_date.dt.year
Expand All @@ -183,23 +182,24 @@ def _prep_ferc1_eia(ferc1_eia, utils_eia860) -> pd.DataFrame:
ferc1_eia_prep = ferc1_eia_prep.rename(columns=RENAME_COLS_FERC1_EIA)[
list(RENAME_COLS_FERC1_EIA.values())
]

# Add in pct diff values
for pct_diff_col in [x for x in RENAME_COLS_FERC1_EIA.values() if "_pct_diff" in x]:
ferc1_eia_prep = _pct_diff(ferc1_eia_prep, pct_diff_col)

# Add in fuel_type_code_pudl diff (qualitative bool)
ferc1_eia_prep.loc[
ferc1_eia_prep["fuel_type_code_pudl_diff"] = False
ferc1_eia_prep_nona = ferc1_eia_prep[
ferc1_eia_prep.fuel_type_code_pudl_eia.notna()
& ferc1_eia_prep.fuel_type_code_pudl_ferc1.notna(),
"fuel_type_code_pudl_diff",
] = ferc1_eia_prep.fuel_type_code_pudl_eia == (
ferc1_eia_prep.fuel_type_code_pudl_ferc1
& ferc1_eia_prep.fuel_type_code_pudl_ferc1.notna()
].copy()
ferc1_eia_prep_nona["fuel_type_code_pudl_diff"] = (
ferc1_eia_prep_nona.fuel_type_code_pudl_eia
== ferc1_eia_prep_nona.fuel_type_code_pudl_ferc1
)
ferc1_eia_prep.update(ferc1_eia_prep_nona)

# Add in installation_year diff (diff vs. pct_diff)
ferc1_eia_prep.loc[
:, "installation_year_ferc1"
ferc1_eia_prep.installation_year_ferc1.notna(), "installation_year_ferc1"
] = ferc1_eia_prep.installation_year_ferc1.astype("Int64")

ferc1_eia_prep.loc[
Expand All @@ -212,7 +212,6 @@ def _prep_ferc1_eia(ferc1_eia, utils_eia860) -> pd.DataFrame:

# Add best match col
ferc1_eia_prep = _is_best_match(ferc1_eia_prep)

return ferc1_eia_prep


Expand Down
Binary file modified src/pudl/package_data/glue/pudl_id_mapping.xlsx
Binary file not shown.
12 changes: 6 additions & 6 deletions test/validate/ferc1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,16 @@ def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name):
@pytest.mark.parametrize(
"df_name,expected_rows",
[
("fbp_ferc1", 25_421),
("fuel_ferc1", 48_841),
("plant_in_service_ferc1", 315_206),
("plants_all_ferc1", 54_284),
("fbp_ferc1", 25_423),
("fuel_ferc1", 48_843),
("plant_in_service_ferc1", 315_208),
("plants_all_ferc1", 54_384),
("plants_hydro_ferc1", 6_796),
("plants_pumped_storage_ferc1", 544),
("plants_small_ferc1", 16_235),
("plants_steam_ferc1", 30_709),
("plants_steam_ferc1", 30_809),
("pu_ferc1", 7_425),
("purchased_power_ferc1", 197_523),
("purchased_power_ferc1", 197_665),
],
)
def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
Expand Down

0 comments on commit cf34e93

Please sign in to comment.