Skip to content

Commit

Permalink
Integrate 2023 FERC1 Update (#3701)
Browse files Browse the repository at this point in the history
* very draft first pass of transforming the 2023 ferc1 data

this was done with a manual hand-off of the xbrl sqlite db

* update the filter_for_freshest_data threshold

* Update XBRL settings & metadata to extract 2023 XBRL data

* Disable experiment tracking by default

* map new ferc1 plants

* update min/max rows with new year of data

* add release notes

* bump the apply_diffs threshold for the fast tests

* light docs updates for project_num fix

---------

Co-authored-by: Zane Selvans <[email protected]>
Co-authored-by: zschira <[email protected]>
  • Loading branch information
3 people authored Jul 13, 2024
1 parent cba40ab commit 1c83595
Show file tree
Hide file tree
Showing 10 changed files with 37 additions and 23 deletions.
5 changes: 5 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ v2024.X.X (2024-XX-XX)
New Data Coverage
^^^^^^^^^^^^^^^^^

FERC Form 1
~~~~~~~~~~~

* Added FERC Form 1 data from 2023. See issue :issue:`3700` and PR :pr:`3701`.

EIA AEO
~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion src/pudl/analysis/ml_tools/experiment_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class ExperimentTrackerConfig(Config):
"""Dagster config to setup experiment tracking with mlflow."""

tracking_uri: str = f"sqlite:///{Path('./').absolute()}/experiments.sqlite"
tracking_enabled: bool = True
tracking_enabled: bool = False
run_context: str = "production"
#: Location to store artifacts. Artifact storage not currently used.
artifact_location: str = str(Path("./").absolute())
Expand Down
18 changes: 11 additions & 7 deletions src/pudl/io_managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,14 +758,18 @@ def __compare_dedupe_methodologies(
)

# 2024-04-10: this threshold set by looking at existing values for FERC
# <=2022.
threshold_pct = 0.3
if n_diffs / n_best > (1 + threshold_pct / 100):
# <=2022. It was updated from .3 to .44 during the 2023 update.
threshold_ratio = 1.0044
if (found_ratio := n_diffs / n_best) > threshold_ratio:
raise ValueError(
f"Found {n_diffs} non-null values with apply-diffs"
f"methodology, and {n_best} with best-snapshot. "
f"apply-diffs shouldn't be more than {threshold_pct}% "
"greater than best-snapshot."
"Found more than expected excess non-null values using the "
f"currently implemented apply_diffs methodology (#{n_diffs}) as "
f"compared to the best_snapshot methodology (#{n_best}). We expected"
" the apply_diffs methodology to result in no more than "
f"{threshold_ratio:.2%} non-null records but found {found_ratio:.2%}.\n\n"
"We are concerned about excess non-null values because apply-diffs "
"grabs the most recent non-null values. If this error is raised, "
"investigate filter_for_freshest_data."
)

filing_metadata_cols = {"publication_time", "filing_name"}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ core_ferc1__yearly_operating_revenues_sched300,other_operating_revenues,core_fer
core_ferc1__yearly_operating_revenues_sched300,other_operating_revenues,core_ferc1__yearly_operating_revenues_sched300,sales_of_water_and_water_power,1.0,,,
core_ferc1__yearly_operating_revenues_sched300,sales_to_ultimate_consumers,core_ferc1__yearly_operating_revenues_sched300,large_or_industrial,1.0,,,
core_ferc1__yearly_operating_revenues_sched300,sales_to_ultimate_consumers,core_ferc1__yearly_operating_revenues_sched300,small_or_commercial,1.0,,,
core_ferc1__yearly_operating_revenues_sched300,sales_to_ultimate_consumers,core_ferc1__yearly_sales_by_rate_schedules_sched304,commercial_and_industrial,,,,
core_ferc1__yearly_depreciation_changes_sched219,ending_balance,core_ferc1__yearly_depreciation_changes_sched219,book_cost_of_asset_retirement_costs,1.0,,,
core_ferc1__yearly_depreciation_changes_sched219,ending_balance,core_ferc1__yearly_depreciation_changes_sched219,depreciation_provision,1.0,,,
core_ferc1__yearly_depreciation_changes_sched219,ending_balance,core_ferc1__yearly_depreciation_changes_sched219,net_charges_for_retired_plant,1.0,,,
Expand Down
Binary file modified src/pudl/package_data/glue/pudl_id_mapping.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions src/pudl/package_data/settings/etl_fast.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ferc_to_sqlite_settings:
# What years of original FERC data should be cloned into the SQLite DB?
years: [2019, 2020]
ferc1_xbrl_to_sqlite_settings:
years: [2021, 2022]
years: [2021, 2023]
ferc2_dbf_to_sqlite_settings:
years: [2019, 2020]
ferc2_xbrl_to_sqlite_settings:
Expand Down Expand Up @@ -35,7 +35,7 @@ description: >
version: 0.1.0
datasets:
ferc1:
years: [2020, 2021, 2022]
years: [2020, 2021, 2023]
ferc714:
years: [2019, 2020]
eia:
Expand Down
1 change: 1 addition & 0 deletions src/pudl/package_data/settings/etl_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ datasets:
2020,
2021,
2022,
2023,
]
ferc714:
years:
Expand Down
8 changes: 6 additions & 2 deletions src/pudl/transform/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3406,8 +3406,12 @@ class HydroelectricPlantsTableTransformer(Ferc1AbstractTableTransformer):
table_id: TableIdFerc1 = TableIdFerc1.HYDROELECTRIC_PLANTS

def transform_main(self, df):
"""Add bespoke removal of duplicate record after standard transform_main."""
return super().transform_main(df).pipe(self.targeted_drop_duplicates)
"""Standard transform_main, bespoke remove duplicate record & remove ``.`` from project_num column."""
df = super().transform_main(df).pipe(self.targeted_drop_duplicates)
# project_num is an integer column but in 2023 some of them have .'s
# as prefixes
df.project_num = df.project_num.str.removeprefix(".")
return df

def targeted_drop_duplicates(self, df):
"""Targeted removal of known duplicate record.
Expand Down
1 change: 1 addition & 0 deletions src/pudl/transform/params/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,7 @@
"natural",
"natural gas",
"natural gas",
"natural gas/fuel oil",
"ng",
"ng, fo",
"prop",
Expand Down
20 changes: 10 additions & 10 deletions test/validate/ferc1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,16 @@ def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name):
@pytest.mark.parametrize(
"df_name,expected_rows",
[
("fbp_ferc1", 26_188),
("fuel_ferc1", 50_039),
("plant_in_service_ferc1", 335_756),
("plants_all_ferc1", 56_407),
("plants_hydro_ferc1", 6_979),
("plants_pumped_storage_ferc1", 562),
("plants_small_ferc1", 16_987),
("plants_steam_ferc1", 31_879),
("pu_ferc1", 7_698),
("purchased_power_ferc1", 204_720),
("fbp_ferc1", 26_947),
("fuel_ferc1", 51_238),
("plant_in_service_ferc1", 355_918),
("plants_all_ferc1", 58_520),
("plants_hydro_ferc1", 7_202),
("plants_pumped_storage_ferc1", 580),
("plants_small_ferc1", 17_763),
("plants_steam_ferc1", 32_975),
("pu_ferc1", 7_887),
("purchased_power_ferc1", 211_794),
],
)
def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
Expand Down

0 comments on commit 1c83595

Please sign in to comment.