From e6b07f18b50bae85f9168dbf2103889630fdd89f Mon Sep 17 00:00:00 2001 From: Roberto Zanchi Date: Thu, 9 Nov 2023 16:58:28 -0500 Subject: [PATCH 01/14] new eia923-2022 DOI update --- src/pudl/workspace/datastore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 94fdfa5ade..98b9a08449 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -175,8 +175,8 @@ class ZenodoDoiSettings(BaseSettings): # eia860m: ZenodoDoi = "10.5072/zenodo.1225517" eia861: ZenodoDoi = "10.5281/zenodo.8231268" # eia861: ZenodoDoi = "10.5072/zenodo.1229930" - eia923: ZenodoDoi = "10.5281/zenodo.8172818" - # eia923: ZenodoDoi = "10.5072/zenodo.1217724" + # eia923: ZenodoDoi = "10.5281/zenodo.8172818" + eia923: ZenodoDoi = "10.5281/zenodo.10067550" eia_bulk_elec: ZenodoDoi = "10.5281/zenodo.7067367" # eia_bulk_elec: ZenodoDoi = "10.5072/zenodo.1103572" epacamd_eia: ZenodoDoi = "10.5281/zenodo.7900974" From 5ca4d2a53d1584d0ed35c9ed64bda5d48ab740c1 Mon Sep 17 00:00:00 2001 From: Roberto Zanchi Date: Tue, 14 Nov 2023 09:45:09 -0500 Subject: [PATCH 02/14] Updated DOI --- src/pudl/workspace/input-output.ts | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 src/pudl/workspace/input-output.ts diff --git a/src/pudl/workspace/input-output.ts b/src/pudl/workspace/input-output.ts new file mode 100644 index 0000000000..848d0a2d0d --- /dev/null +++ b/src/pudl/workspace/input-output.ts @@ -0,0 +1,3 @@ +export PUDL_OUTPUT=/Users/rzanchi/Desktop/pudl/pudl_output + +export PUDL_INPUT=/Users/rzanchi/Desktop/pudl/pudl_input From 2f390120275ac6a4b3cd5defe784071301cc495a Mon Sep 17 00:00:00 2001 From: Roberto Zanchi Date: Tue, 21 Nov 2023 14:34:07 -0500 Subject: [PATCH 03/14] Update file_map and skiprows --- src/pudl/package_data/eia923/file_map.csv | 20 ++++++++++---------- src/pudl/package_data/eia923/skiprows.csv | 22 +++++++++++----------- 2 files changed, 21 insertions(+), 21 
deletions(-) diff --git a/src/pudl/package_data/eia923/file_map.csv b/src/pudl/package_data/eia923/file_map.csv index 73dc128416..de7407422b 100644 --- a/src/pudl/package_data/eia923/file_map.csv +++ b/src/pudl/package_data/eia923/file_map.csv @@ -1,14 +1,14 @@ page,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023 -boiler_fuel,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx +boiler_fuel,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx coal_stocks,-1,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,-1,-1,-1 -energy_storage,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -fuel_receipts_costs,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -generation_fuel,f906920y2001.xls,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -generator,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -oil_stocks,-1,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx +energy_storage,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +fuel_receipts_costs,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +generation_fuel,f906920y2001.xls,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +generator,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +oil_stocks,-1,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx petcoke_stocks,-1,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,-1,-1,-1 -plant_frame,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -puerto_rico,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 
2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx -stocks,f906920y2001.xls,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx 
-plant_frame_puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Early_Release.xlsx,EIA923_Schedules_2_3_4_5_M_04_2023_21JUN2023.xlsx +plant_frame,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +puerto_rico,-1,-1,-1,-1,-1,-1,-1,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx 
+stocks,f906920y2001.xls,f906920y2002.xls,f906920_2003.xls,f906920_2004.xls,f906920_2005.xls,f906920_2006.xls,f906920_2007.xls,eia923December2008.xls,EIA923 SCHEDULES 2_3_4_5 M Final 2009 REVISED 05252011.XLS,EIA923 SCHEDULES 2_3_4_5 Final 2010.xls,EIA923_Schedules_2_3_4_5_2011_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2012_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_2013_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2014_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2015_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2016_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2017_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx +plant_frame_puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2020_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx,EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx,EIA923_Schedules_2_3_4_5_M_08_2023_19OCT2023.xlsx 
emissions_control,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,EIA923_Schedule_8_Annual_Environmental_Information_2012_Final_Revision.xlsx,EIA923_Schedule_8_PartsA-D_EnvData_2013_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2014_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2015_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2016_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Envir_Infor_2017_Final.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2018_Final.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2019_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2020_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2021_Final_Revision.xlsx,EIA923_Schedule_8_Annual_Environmental_Information_2022_Early_Release.xlsx,-1 \ No newline at end of file diff --git a/src/pudl/package_data/eia923/skiprows.csv b/src/pudl/package_data/eia923/skiprows.csv index 758744ab14..8bd8fd0531 100644 --- a/src/pudl/package_data/eia923/skiprows.csv +++ b/src/pudl/package_data/eia923/skiprows.csv @@ -1,14 +1,14 @@ year_index,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023 -generation_fuel,7,7,7,7,7,7,7,7,7,7,5,5,5,5,5,5,5,5,5,5,5,6,5 -puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,5,5,6,6,6,7,5 -stocks,7,7,7,7,7,7,7,7,7,7,5,5,5,5,5,5,5,5,5,5,5,6,4 -oil_stocks,-1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,4 +generation_fuel,7,7,7,7,7,7,7,7,7,7,5,5,5,5,5,5,5,5,5,5,5,5,5 +puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,5,5,6,6,6,6,5 +stocks,7,7,7,7,7,7,7,7,7,7,5,5,5,5,5,5,5,5,5,5,5,5,4 +oil_stocks,-1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4 coal_stocks,-1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,-1,-1,-1 petcoke_stocks,-1,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,-1,-1,-1 -energy_storage,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,5,5,5,5,6,4 
-boiler_fuel,-1,-1,-1,-1,-1,-1,-1,7,7,7,5,5,5,5,5,5,5,5,5,5,5,6,4 -generator,-1,-1,-1,-1,-1,-1,-1,7,7,7,5,5,5,5,5,5,5,5,5,5,5,6,4 -fuel_receipts_costs,-1,-1,-1,-1,-1,-1,-1,7,6,7,4,4,4,4,4,4,4,4,4,4,4,5,3 -plant_frame,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,4,4,4,4,4,4,4,4,5,3 -plant_frame_puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,5,3 -emissions_control,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,4,4,4,4,4,4,4,5,-1 \ No newline at end of file +energy_storage,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,5,5,5,5,5,4 +boiler_fuel,-1,-1,-1,-1,-1,-1,-1,7,7,7,5,5,5,5,5,5,5,5,5,5,5,5,4 +generator,-1,-1,-1,-1,-1,-1,-1,7,7,7,5,5,5,5,5,5,5,5,5,5,5,5,4 +fuel_receipts_costs,-1,-1,-1,-1,-1,-1,-1,7,6,7,4,4,4,4,4,4,4,4,4,4,4,4,3 +plant_frame,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,4,4,4,4,4,4,4,4,4,3 +plant_frame_puerto_rico,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,4,3 +emissions_control,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,4,4,4,4,4,4,4,4,4,4,4,-1 \ No newline at end of file From 861ead147eb8f752d92d4937345fb4a5a230141e Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 22 Nov 2023 11:02:47 -0300 Subject: [PATCH 04/14] Remove accidentally committed input-output.ts file --- src/pudl/workspace/input-output.ts | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 src/pudl/workspace/input-output.ts diff --git a/src/pudl/workspace/input-output.ts b/src/pudl/workspace/input-output.ts deleted file mode 100644 index 848d0a2d0d..0000000000 --- a/src/pudl/workspace/input-output.ts +++ /dev/null @@ -1,3 +0,0 @@ -export PUDL_OUTPUT=/Users/rzanchi/Desktop/pudl/pudl_output - -export PUDL_INPUT=/Users/rzanchi/Desktop/pudl/pudl_input From 8e174bbfde63c2e1cab24d6e40110fad6b673b08 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Wed, 22 Nov 2023 11:07:05 -0300 Subject: [PATCH 05/14] Remove sandbox reference DOIs because they are now invalid since Zenodo wiped their sandbox server --- src/pudl/workspace/datastore.py | 15 --------------- 1 file 
changed, 15 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index cce538abd0..7683e9fcef 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -166,35 +166,20 @@ def get_json_string(self) -> str: class ZenodoDoiSettings(BaseSettings): """Digital Object Identifiers pointing to currently used Zenodo archives.""" - # Sandbox DOIs are provided for reference censusdp1tract: ZenodoDoi = "10.5281/zenodo.4127049" - # censusdp1tract: ZenodoDoi = "10.5072/zenodo.674992" eia860: ZenodoDoi = "10.5281/zenodo.10067566" - # eia860: ZenodoDoi = "10.5072/zenodo.1222854" eia860m: ZenodoDoi = "10.5281/zenodo.8188017" - # eia860m: ZenodoDoi = "10.5072/zenodo.1225517" eia861: ZenodoDoi = "10.5281/zenodo.10093091" - # eia861: ZenodoDoi = "10.5072/zenodo.1229930" - # eia923: ZenodoDoi = "10.5281/zenodo.8172818" eia923: ZenodoDoi = "10.5281/zenodo.10067550" eia_bulk_elec: ZenodoDoi = "10.5281/zenodo.7067367" - # eia_bulk_elec: ZenodoDoi = "10.5072/zenodo.1103572" epacamd_eia: ZenodoDoi = "10.5281/zenodo.7900974" - # epacamd_eia: ZenodoDoi = "10.5072/zenodo.1199170" epacems: ZenodoDoi = "10.5281/zenodo.8235497" - # epacems: ZenodoDoi = "10.5072/zenodo.1228519" ferc1: ZenodoDoi = "10.5281/zenodo.8326634" - # ferc1: ZenodoDoi = "10.5072/zenodo.1234455" ferc2: ZenodoDoi = "10.5281/zenodo.8326697" - # ferc2: ZenodoDoi = "10.5072/zenodo.1236695" ferc6: ZenodoDoi = "10.5281/zenodo.8326696" - # ferc6: ZenodoDoi = "10.5072/zenodo.1236703" ferc60: ZenodoDoi = "10.5281/zenodo.8326695" - # ferc60: ZenodoDoi = "10.5072/zenodo.1236694" ferc714: ZenodoDoi = "10.5281/zenodo.8326694" - # ferc714: ZenodoDoi = "10.5072/zenodo.1237565" phmsagas: ZenodoDoi = "10.5281/zenodo.8346646" - # phmsagas: ZenodoDoi = "10.5072/zenodo.1239253" class Config: """Pydantic config, reads from .env file.""" From cdd4bf52c15ca111071054f1c48bec350444cf26 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Mon, 27 Nov 2023 14:21:17 -0300 Subject: [PATCH 
06/14] Update minmax rows for new EIA923 data --- test/validate/eia_test.py | 16 ++++++++-------- test/validate/mcoe_test.py | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/validate/eia_test.py b/test/validate/eia_test.py index 2201ef4ac0..3d9743e4dd 100644 --- a/test/validate/eia_test.py +++ b/test/validate/eia_test.py @@ -45,15 +45,15 @@ def test_no_null_cols_eia(pudl_out_eia, live_dbs, cols, df_name): @pytest.mark.parametrize( "df_name,raw_rows,monthly_rows,annual_rows", [ - ("bf_eia923", 1_559_257, 1_559_257, 127_412), - ("bga_eia860", 141_652, 141_652, 141_652), - ("boil_eia860", 83_356, 83_356, 83_356), - ("frc_eia923", 639_647, 261_583, 25_370), - ("gen_eia923", None, 5_179_377, 433_332), - ("gens_eia860", 556_948, 556_948, 556_948), - ("gf_eia923", 2_879_884, 2_879_884, 244_795), + ("bf_eia923", 1_569_568, 1_569_568, 128_252), + ("bga_eia860", 142_391, 142_391, 142_391), + ("boil_eia860", 83_416, 83_416, 83_416), + ("frc_eia923", 646_677, 264_043, 25_443), + ("gen_eia923", None, 5_179_478, 433_336), + ("gens_eia860", 556_949, 556_949, 556_949), + ("gf_eia923", 2_907_735, 2_907_735, 246_324), ("own_eia860", 89_741, 89_741, 89_741), - ("plants_eia860", 200_514, 200_514, 200_514), + ("plants_eia860", 200_511, 200_511, 200_511), ("pu_eia860", 199_635, 199_635, 199_635), ("utils_eia860", 139_883, 139_883, 139_883), ("emissions_control_equipment_eia860", 56_616, 56_616, 56_616), diff --git a/test/validate/mcoe_test.py b/test/validate/mcoe_test.py index b1efd9c982..4d3e845b73 100644 --- a/test/validate/mcoe_test.py +++ b/test/validate/mcoe_test.py @@ -106,11 +106,11 @@ def test_no_null_rows_mcoe(pudl_out_mcoe, live_dbs, df_name, thresh): @pytest.mark.parametrize( "df_name,monthly_rows,annual_rows", [ - ("hr_by_unit", 387_670, 32_414), - ("hr_by_gen", 599_496, 50_070), - ("fuel_cost", 599_496, 50_070), - ("capacity_factor", 5_179_377, 433_332), - ("mcoe", 5_179_785, 433_366), + ("hr_by_unit", 389_530, 32_569), + ("hr_by_gen", 
602_580, 50_327), + ("fuel_cost", 602_580, 50_327), + ("capacity_factor", 5_179_478, 433_336), + ("mcoe", 5_179_886, 433_370), ], ) def test_minmax_rows_mcoe(pudl_out_mcoe, live_dbs, monthly_rows, annual_rows, df_name): From 741f2dd8fdfbcda5c334a089e0cef71f80cc7746 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Mon, 27 Nov 2023 15:38:41 -0300 Subject: [PATCH 07/14] Remove ref: env.GITHUB_REF in zenodo-cache-sync to avoid errors related to receiving pushes from forked repos --- .github/workflows/zenodo-cache-sync.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/zenodo-cache-sync.yml b/.github/workflows/zenodo-cache-sync.yml index 10564ede56..22da3fa684 100644 --- a/.github/workflows/zenodo-cache-sync.yml +++ b/.github/workflows/zenodo-cache-sync.yml @@ -41,8 +41,6 @@ jobs: - name: Checkout desired branch uses: actions/checkout@v4 - with: - ref: ${{ env.GITHUB_REF }} - name: Install Conda environment using mamba uses: mamba-org/setup-micromamba@v1 From 49dcd1c9c8d4a707a69857726138c7d397f5a45f Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Mon, 27 Nov 2023 16:24:05 -0300 Subject: [PATCH 08/14] Add release notes note about 923 final release and quarterly update --- docs/release_notes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 2f3c936634..5173f35357 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -72,8 +72,8 @@ Data Coverage * Updated :doc:`data_sources/eia860` to include final release data from 2022. * Updated :doc:`data_sources/eia861` to include final release data from 2022. -* Updated :doc:`data_sources/eia923` to include early release data from 2022 and - monthly YTD data as of April 2023. +* Updated :doc:`data_sources/eia923` to include final release data from 2022 and + monthly YTD data as of October 2023. * Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new CAMPD API, and to include 2022 data. 
Due to changes in the ETL, Alaska, Puerto Rico and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs From 3a7bfa6e68dd0ef97dceaf1dcd1e1bad484965bb Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Mon, 27 Nov 2023 19:06:35 -0300 Subject: [PATCH 09/14] Fix merge failure with dev --- src/pudl/workspace/datastore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 7683e9fcef..a85db32660 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -168,8 +168,8 @@ class ZenodoDoiSettings(BaseSettings): censusdp1tract: ZenodoDoi = "10.5281/zenodo.4127049" eia860: ZenodoDoi = "10.5281/zenodo.10067566" - eia860m: ZenodoDoi = "10.5281/zenodo.8188017" - eia861: ZenodoDoi = "10.5281/zenodo.10093091" + eia860m: ZenodoDoi = "10.5281/zenodo.10204686" + eia861: ZenodoDoi = "10.5281/zenodo.10204708" eia923: ZenodoDoi = "10.5281/zenodo.10067550" eia_bulk_elec: ZenodoDoi = "10.5281/zenodo.7067367" epacamd_eia: ZenodoDoi = "10.5281/zenodo.7900974" From f0ec29bf57f5f4588eb09dead21a5ebd04a9384f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Nov 2023 17:41:33 +0000 Subject: [PATCH 10/14] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- src/pudl/workspace/datastore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index c78e9fa217..e1043945a0 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -186,6 +186,7 @@ class ZenodoDoiSettings(BaseSettings): phmsagas: ZenodoDoi = "10.5281/zenodo.8346646" model_config = SettingsConfigDict(env_prefix="pudl_zenodo_doi_", env_file=".env") + class ZenodoFetcher: """API for fetching datapackage descriptors and resource contents from zenodo.""" From 
855b701b5e7e03176b3cb96ea8a4604d5bd0b32c Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Tue, 28 Nov 2023 23:25:57 -0300 Subject: [PATCH 11/14] Update references to years of EIA923 data ingested by PUDL. Also update the 923 data sources page to say that we do ingest the monthly data. --- README.rst | 8 ++++---- docs/templates/eia923_child.rst.jinja | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 8eea0e93b4..1b194c859a 100644 --- a/README.rst +++ b/README.rst @@ -52,11 +52,11 @@ What data is available? PUDL currently integrates data from: -* `EIA Form 860 `__: 2001-2022 +* `EIA Form 860 `__: 2001 - 2022 * `EIA Form 860m `__: 2023-06 -* `EIA Form 861 `__: 2001-2022 -* `EIA Form 923 `__: 2001-2022 -* `EPA Continuous Emissions Monitoring System (CEMS) `__: 1995-2022 +* `EIA Form 861 `__: 2001 - 2022 +* `EIA Form 923 `__: 2001 - 2023-10 +* `EPA Continuous Emissions Monitoring System (CEMS) `__: 1995 - 2022 * `FERC Form 1 `__: 1994-2021 * `FERC Form 714 `__: 2006-2020 * `US Census Demographic Profile 1 Geodatabase `__: 2010 diff --git a/docs/templates/eia923_child.rst.jinja b/docs/templates/eia923_child.rst.jinja index af6bca536a..7f04f7b12f 100644 --- a/docs/templates/eia923_child.rst.jinja +++ b/docs/templates/eia923_child.rst.jinja @@ -35,7 +35,8 @@ in `EIA Form 423 replaced the earlier FERC Form 423). If you're interested in this earlier data, get in touch with us! -Monthly interim EIA-923 data releases are not yet integrated into PUDL. In addition, We +Monthly interim EIA-923 data are periodically integrated into PUDL as well. Incomplete +year-to-date data are excluded from the annualized tables to avoid confusion. We have not yet integrated tables reporting fuel stocks, data from Puerto Rico, or EIA-923 schedules 6, 7, and 8. 
{% endblock %} From b29c0421662f045e50c017e7d1211bb03313f6a3 Mon Sep 17 00:00:00 2001 From: Austen Sharpe Date: Tue, 28 Nov 2023 23:44:49 -0300 Subject: [PATCH 12/14] Fix the month included in the EIA923 data years in the README --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 1b194c859a..f56642a3af 100644 --- a/README.rst +++ b/README.rst @@ -55,7 +55,7 @@ PUDL currently integrates data from: * `EIA Form 860 `__: 2001 - 2022 * `EIA Form 860m `__: 2023-06 * `EIA Form 861 `__: 2001 - 2022 -* `EIA Form 923 `__: 2001 - 2023-10 +* `EIA Form 923 `__: 2001 - 2023-08 * `EPA Continuous Emissions Monitoring System (CEMS) `__: 1995 - 2022 * `FERC Form 1 `__: 1994-2021 * `FERC Form 714 `__: 2006-2020 From c4f3d98d4aa8afc006290240b0e82d194cef61dd Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Wed, 29 Nov 2023 17:28:46 -0700 Subject: [PATCH 13/14] Split xbrl2sqlite into smaller ops. Instead of using single monolith op that loops over all forms, we can use ops factory and XbrlRuntimeSettings resource to simplify how stuff is passed in. This way, single runtime settings exists for all xbrl ops and one op is generated for each form. This should allow for better parallelism, even though it might interfere with the num_workers being set to num cpus by default (i.e. this way we will oversubscribe available cores twice, once for dagster workers, and once for xbrl num workers). This, however, should be an easy fix. 
--- src/pudl/extract/xbrl.py | 75 +++++++++++------------------ src/pudl/ferc_to_sqlite/__init__.py | 16 ++++-- 2 files changed, 39 insertions(+), 52 deletions(-) diff --git a/src/pudl/extract/xbrl.py b/src/pudl/extract/xbrl.py index 108fa5e025..4e54ed88f9 100644 --- a/src/pudl/extract/xbrl.py +++ b/src/pudl/extract/xbrl.py @@ -1,9 +1,10 @@ """Generic extractor for all FERC XBRL data.""" import io +from collections.abc import Callable from datetime import date from pathlib import Path -from dagster import Field, Noneable, op +from dagster import ConfigurableResource, op from ferc_xbrl_extractor.cli import run_main import pudl @@ -14,6 +15,14 @@ logger = pudl.logging_helpers.get_logger(__name__) +class XbrlRuntimeSettings(ConfigurableResource): + """Encodes runtime setting for the XBRL extraction.""" + # TODO(rousik): Using BaseSettings here might allow configuring this via environment variables. + clobber: bool = False + num_workers: None | int = None + batch_size: int = 50 + + class FercXbrlDatastore: """Simple datastore wrapper for accessing ferc1 xbrl resources.""" @@ -43,52 +52,24 @@ def get_filings(self, year: int, form: XbrlFormNumber) -> io.BytesIO: ) ) - -@op( - config_schema={ - "clobber": Field( - bool, description="Clobber existing ferc1 database.", default_value=False - ), - "workers": Field( - Noneable(int), - description="Specify number of worker processes for parsing XBRL filings.", - default_value=None, - ), - "batch_size": Field( - int, - description="Specify number of XBRL instances to be processed at a time (defaults to 50)", - default_value=50, - ), - }, - required_resource_keys={"ferc_to_sqlite_settings", "datastore"}, -) -def xbrl2sqlite(context) -> None: - """Clone the FERC Form 1 XBRL Database to SQLite.""" - output_path = PudlPaths().output_dir - clobber = context.op_config["clobber"] - batch_size = context.op_config["batch_size"] - workers = context.op_config["workers"] - ferc_to_sqlite_settings = context.resources.ferc_to_sqlite_settings 
- datastore = context.resources.datastore - datastore = FercXbrlDatastore(datastore) - - # Loop through all other forms and perform conversion - for form in XbrlFormNumber: - # Get desired settings object - settings = ferc_to_sqlite_settings.get_xbrl_dataset_settings(form) - - # If no settings for form in question, skip - if settings is None: - continue - - if settings.disabled: - logger.info(f"Dataset ferc{form}_xbrl is disabled, skipping") - continue - +def xbrl2sqlite_op_factory(form: XbrlFormNumber) -> Callable: + """Generates xbrl2sqlite op for a given FERC form.""" + @op( + name=f"ferc{form.value}_xbrl", + required_resource_keys={"ferc_to_sqlite_settings", "datastore", "xbrl_runtime_settings"} + ) + def inner_xbrl2sqlite(context) -> None: + output_path = PudlPaths().output_dir + runtime_settings: XbrlRuntimeSettings = context.resources.xbrl_runtime_settings + settings = context.resources.ferc_to_sqlite_settings.get_xbrl_dataset_settings(form) + datastore = FercXbrlDatastore(context.resources.datastore) + + if settings is None or settings.disabled: + logger.info(f"Skipping dataset ferc{form}_xbrl: no config or is disabled.") sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl") if sql_path.exists(): - if clobber: + if runtime_settings.clobber: sql_path.unlink() else: raise RuntimeError( @@ -101,10 +82,10 @@ def xbrl2sqlite(context) -> None: datastore, output_path=output_path, sql_path=sql_path, - batch_size=batch_size, - workers=workers, + batch_size=runtime_settings.batch_size, + workers=runtime_settings.num_workers, ) - + return inner_xbrl2sqlite def convert_form( form_settings: FercGenericXbrlToSqliteSettings, diff --git a/src/pudl/ferc_to_sqlite/__init__.py b/src/pudl/ferc_to_sqlite/__init__.py index d9dd5b48ef..2b8c8f3df7 100644 --- a/src/pudl/ferc_to_sqlite/__init__.py +++ b/src/pudl/ferc_to_sqlite/__init__.py @@ -5,9 +5,9 @@ import pudl from pudl.extract.ferc import dbf2sqlite -from pudl.extract.xbrl import xbrl2sqlite +from pudl.extract.xbrl 
import XbrlRuntimeSettings, xbrl2sqlite_op_factory from pudl.resources import datastore, ferc_to_sqlite_settings -from pudl.settings import EtlSettings +from pudl.settings import EtlSettings, XbrlFormNumber logger = pudl.logging_helpers.get_logger(__name__) @@ -16,7 +16,8 @@ def ferc_to_sqlite(): """Clone the FERC FoxPro databases and XBRL filings into SQLite.""" dbf2sqlite() - xbrl2sqlite() + for form in XbrlFormNumber: + xbrl2sqlite_op_factory(form)() @graph @@ -28,11 +29,12 @@ def ferc_to_sqlite_dbf_only(): @graph def ferc_to_sqlite_xbrl_only(): """Clone the FERC XBRL databases into SQLite.""" - xbrl2sqlite() - + for form in XbrlFormNumber: + xbrl2sqlite_op_factory(form)() default_resources_defs = { "ferc_to_sqlite_settings": ferc_to_sqlite_settings, + "xbrl_runtime_settings": XbrlRuntimeSettings(), "datastore": datastore, } @@ -53,6 +55,10 @@ def ferc_to_sqlite_xbrl_only(): "ferc_to_sqlite_settings": { "config": ferc_to_sqlite_fast_settings.model_dump(), }, + "xbrl_runtime_settings": { + # TODO(rousik): do we need to set some defaults here? + "config": {}, + } }, }, ) From 428d04f7a7cb18b65ef9225b33c0cf03e25b5ea1 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Thu, 30 Nov 2023 00:29:09 -0700 Subject: [PATCH 14/14] Break ferc_to_sqlite op monoliths. Refactor monolithic dbf2sqlite and xbrl2sqlite methods into per-dataset smaller ops that are invoked within the graphs. This should allow us to better make use of dagster parallelism and speed up ferc_to_sqlite processing. It seems that current unit/integration tests only use FERC1 raw data, so I've modified the fixtures to only run the relevant pieces of processing. 
--- src/pudl/extract/dbf.py | 28 ++++++++- src/pudl/extract/ferc.py | 37 ++--------- src/pudl/extract/xbrl.py | 42 +++++++------ src/pudl/ferc_to_sqlite/__init__.py | 24 ++++--- src/pudl/ferc_to_sqlite/cli.py | 11 +--- src/pudl/resources.py | 11 +++- test/conftest.py | 88 ++++++++++++-------------- test/unit/extract/xbrl_test.py | 98 ++++++++++++----------------- 8 files changed, 165 insertions(+), 174 deletions(-) diff --git a/src/pudl/extract/dbf.py b/src/pudl/extract/dbf.py index e48b9c3f25..838825cdf0 100644 --- a/src/pudl/extract/dbf.py +++ b/src/pudl/extract/dbf.py @@ -4,13 +4,14 @@ import importlib.resources import zipfile from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Callable, Iterator from functools import lru_cache from pathlib import Path from typing import IO, Any, Protocol, Self import pandas as pd import sqlalchemy as sa +from dagster import op from dbfread import DBF, FieldParser import pudl @@ -18,6 +19,7 @@ from pudl.metadata.classes import DataSource from pudl.settings import FercToSqliteSettings, GenericDatasetSettings from pudl.workspace.datastore import Datastore +from pudl.workspace.setup import PudlPaths logger = pudl.logging_helpers.get_logger(__name__) @@ -464,6 +466,30 @@ def get_db_path(self) -> str: db_path = str(Path(self.output_path) / self.DATABASE_NAME) return f"sqlite:///{db_path}" + @classmethod + def get_dagster_op(cls) -> Callable: + """Returns a dagster op that runs this extractor.""" + + @op( + name=f"dbf_{cls.DATASET}", + required_resource_keys={ + "ferc_to_sqlite_settings", + "datastore", + "runtime_settings", + }, + ) + def inner_method(context) -> None: + """Instantiates dbf extractor and runs it.""" + dbf_extractor = cls( + datastore=context.resources.datastore, + settings=context.resources.ferc_to_sqlite_settings, + clobber=context.resources.runtime_settings.clobber, + output_path=PudlPaths().output_dir, + ) + dbf_extractor.execute() + + return inner_method + def 
execute(self): """Runs the extraction of the data from dbf to sqlite.""" logger.info( diff --git a/src/pudl/extract/ferc.py b/src/pudl/extract/ferc.py index bf7a8514f0..823f4024db 100644 --- a/src/pudl/extract/ferc.py +++ b/src/pudl/extract/ferc.py @@ -1,42 +1,17 @@ """Hooks to integrate ferc to sqlite functionality into dagster graph.""" -from dagster import Field, op - import pudl from pudl.extract.ferc1 import Ferc1DbfExtractor from pudl.extract.ferc2 import Ferc2DbfExtractor from pudl.extract.ferc6 import Ferc6DbfExtractor from pudl.extract.ferc60 import Ferc60DbfExtractor -from pudl.workspace.setup import PudlPaths logger = pudl.logging_helpers.get_logger(__name__) - -@op( - config_schema={ - "clobber": Field( - bool, description="Clobber existing ferc1 database.", default_value=False - ), - }, - required_resource_keys={"ferc_to_sqlite_settings", "datastore"}, -) -def dbf2sqlite(context) -> None: - """Clone the FERC Form 1 Visual FoxPro databases into SQLite.""" - # TODO(rousik): this thin wrapper seems to be somewhat quirky. Maybe there's a way - # to make the integration # between the class and dagster better? Investigate. 
- logger.info(f"dbf2sqlite settings: {context.resources.ferc_to_sqlite_settings}") - - extractors = [ - Ferc1DbfExtractor, - Ferc2DbfExtractor, - Ferc6DbfExtractor, - Ferc60DbfExtractor, - ] - for xclass in extractors: - xclass( - datastore=context.resources.datastore, - settings=context.resources.ferc_to_sqlite_settings, - clobber=context.op_config["clobber"], - output_path=PudlPaths().output_dir, - ).execute() +ALL_DBF_EXTRACTORS = [ + Ferc1DbfExtractor, + Ferc2DbfExtractor, + Ferc6DbfExtractor, + Ferc60DbfExtractor, +] diff --git a/src/pudl/extract/xbrl.py b/src/pudl/extract/xbrl.py index 4e54ed88f9..bc3812e369 100644 --- a/src/pudl/extract/xbrl.py +++ b/src/pudl/extract/xbrl.py @@ -4,10 +4,11 @@ from datetime import date from pathlib import Path -from dagster import ConfigurableResource, op +from dagster import op from ferc_xbrl_extractor.cli import run_main import pudl +from pudl.resources import RuntimeSettings from pudl.settings import FercGenericXbrlToSqliteSettings, XbrlFormNumber from pudl.workspace.datastore import Datastore from pudl.workspace.setup import PudlPaths @@ -15,14 +16,6 @@ logger = pudl.logging_helpers.get_logger(__name__) -class XbrlRuntimeSettings(ConfigurableResource): - """Encodes runtime setting for the XBRL extraction.""" - # TODO(rousik): Using BaseSettings here might allow configuring this via environment variables. 
- clobber: bool = False - num_workers: None | int = None - batch_size: int = 50 - - class FercXbrlDatastore: """Simple datastore wrapper for accessing ferc1 xbrl resources.""" @@ -52,22 +45,33 @@ def get_filings(self, year: int, form: XbrlFormNumber) -> io.BytesIO: ) ) + def xbrl2sqlite_op_factory(form: XbrlFormNumber) -> Callable: """Generates xbrl2sqlite op for a given FERC form.""" + @op( name=f"ferc{form.value}_xbrl", - required_resource_keys={"ferc_to_sqlite_settings", "datastore", "xbrl_runtime_settings"} + required_resource_keys={ + "ferc_to_sqlite_settings", + "datastore", + "runtime_settings", + }, ) - def inner_xbrl2sqlite(context) -> None: + def inner_op(context) -> None: output_path = PudlPaths().output_dir - runtime_settings: XbrlRuntimeSettings = context.resources.xbrl_runtime_settings - settings = context.resources.ferc_to_sqlite_settings.get_xbrl_dataset_settings(form) + runtime_settings: RuntimeSettings = context.resources.runtime_settings + settings = context.resources.ferc_to_sqlite_settings.get_xbrl_dataset_settings( + form + ) datastore = FercXbrlDatastore(context.resources.datastore) if settings is None or settings.disabled: - logger.info(f"Skipping dataset ferc{form}_xbrl: no config or is disabled.") - sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl") + logger.info( + f"Skipping dataset ferc{form.value}_xbrl: no config or is disabled." 
+ ) + return + sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl") if sql_path.exists(): if runtime_settings.clobber: sql_path.unlink() @@ -82,10 +86,12 @@ def inner_xbrl2sqlite(context) -> None: datastore, output_path=output_path, sql_path=sql_path, - batch_size=runtime_settings.batch_size, - workers=runtime_settings.num_workers, + batch_size=runtime_settings.xbrl_batch_size, + workers=runtime_settings.xbrl_num_workers, ) - return inner_xbrl2sqlite + + return inner_op + def convert_form( form_settings: FercGenericXbrlToSqliteSettings, diff --git a/src/pudl/ferc_to_sqlite/__init__.py b/src/pudl/ferc_to_sqlite/__init__.py index 2b8c8f3df7..2c5848d0c9 100644 --- a/src/pudl/ferc_to_sqlite/__init__.py +++ b/src/pudl/ferc_to_sqlite/__init__.py @@ -4,9 +4,13 @@ from dagster import Definitions, graph import pudl -from pudl.extract.ferc import dbf2sqlite -from pudl.extract.xbrl import XbrlRuntimeSettings, xbrl2sqlite_op_factory -from pudl.resources import datastore, ferc_to_sqlite_settings +from pudl.extract.ferc import ALL_DBF_EXTRACTORS +from pudl.extract.ferc1 import Ferc1DbfExtractor +from pudl.extract.ferc2 import Ferc2DbfExtractor +from pudl.extract.ferc6 import Ferc6DbfExtractor +from pudl.extract.ferc60 import Ferc60DbfExtractor +from pudl.extract.xbrl import xbrl2sqlite_op_factory +from pudl.resources import RuntimeSettings, datastore, ferc_to_sqlite_settings from pudl.settings import EtlSettings, XbrlFormNumber logger = pudl.logging_helpers.get_logger(__name__) @@ -15,7 +19,8 @@ @graph def ferc_to_sqlite(): """Clone the FERC FoxPro databases and XBRL filings into SQLite.""" - dbf2sqlite() + for extractor in ALL_DBF_EXTRACTORS: + extractor.get_dagster_op()() for form in XbrlFormNumber: xbrl2sqlite_op_factory(form)() @@ -23,7 +28,8 @@ def ferc_to_sqlite(): @graph def ferc_to_sqlite_dbf_only(): """Clone the FERC FoxPro databases into SQLite.""" - dbf2sqlite() + for extractor in ALL_DBF_EXTRACTORS: + extractor.get_dagster_op()() @graph @@ -32,9 +38,10 @@ 
def ferc_to_sqlite_xbrl_only(): for form in XbrlFormNumber: xbrl2sqlite_op_factory(form)() + default_resources_defs = { "ferc_to_sqlite_settings": ferc_to_sqlite_settings, - "xbrl_runtime_settings": XbrlRuntimeSettings(), + "runtime_settings": RuntimeSettings(), "datastore": datastore, } @@ -55,10 +62,9 @@ def ferc_to_sqlite_xbrl_only(): "ferc_to_sqlite_settings": { "config": ferc_to_sqlite_fast_settings.model_dump(), }, - "xbrl_runtime_settings": { - # TODO(rousik): do we need to set some defaults here? + "runtime_settings": { "config": {}, - } + }, }, }, ) diff --git a/src/pudl/ferc_to_sqlite/cli.py b/src/pudl/ferc_to_sqlite/cli.py index 5754ee97df..067d4beb74 100644 --- a/src/pudl/ferc_to_sqlite/cli.py +++ b/src/pudl/ferc_to_sqlite/cli.py @@ -154,18 +154,13 @@ def main(): # noqa: C901 else "", }, }, - }, - "ops": { - "xbrl2sqlite": { + "runtime_settings": { "config": { - "workers": args.workers, - "batch_size": args.batch_size, "clobber": args.clobber, + "xbrl_num_workers": args.workers, + "xbrl_batch_size": args.batch_size, }, }, - "dbf2sqlite": { - "config": {"clobber": args.clobber}, - }, }, }, raise_on_error=True, diff --git a/src/pudl/resources.py b/src/pudl/resources.py index 13d2a50471..ab58b715b5 100644 --- a/src/pudl/resources.py +++ b/src/pudl/resources.py @@ -1,12 +1,21 @@ """Collection of Dagster resources for PUDL.""" -from dagster import Field, resource +from dagster import ConfigurableResource, Field, resource from pudl.settings import DatasetsSettings, FercToSqliteSettings, create_dagster_config from pudl.workspace.datastore import Datastore from pudl.workspace.setup import PudlPaths +class RuntimeSettings(ConfigurableResource): + """Encodes runtime settings for the ferc_to_sqlite graphs.""" + + # TODO(rousik): Using BaseSettings here might allow configuring this via environment variables. 
+ clobber: bool = False + xbrl_num_workers: None | int = None + xbrl_batch_size: int = 50 + + @resource(config_schema=create_dagster_config(DatasetsSettings())) def dataset_settings(init_context) -> DatasetsSettings: """Dagster resource for parameterizing PUDL ETL assets. diff --git a/test/conftest.py b/test/conftest.py index a09dc516f0..dbb5b81915 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -9,13 +9,13 @@ import pytest import sqlalchemy as sa -from dagster import build_init_resource_context, materialize_to_memory +from dagster import build_init_resource_context, graph, materialize_to_memory import pudl from pudl import resources from pudl.cli.etl import pudl_etl_job_factory -from pudl.extract.ferc1 import raw_xbrl_metadata_json -from pudl.ferc_to_sqlite.cli import ferc_to_sqlite_job_factory +from pudl.extract.ferc1 import Ferc1DbfExtractor, raw_xbrl_metadata_json +from pudl.extract.xbrl import xbrl2sqlite_op_factory from pudl.io_managers import ( PudlSQLiteIOManager, ferc1_dbf_sqlite_io_manager, @@ -24,7 +24,12 @@ ) from pudl.metadata.classes import Package from pudl.output.pudltabl import PudlTabl -from pudl.settings import DatasetsSettings, EtlSettings, FercToSqliteSettings +from pudl.settings import ( + DatasetsSettings, + EtlSettings, + FercToSqliteSettings, + XbrlFormNumber, +) from pudl.workspace.datastore import Datastore from pudl.workspace.setup import PudlPaths @@ -180,14 +185,22 @@ def pudl_out_orig(live_dbs: bool, pudl_engine: sa.Engine) -> PudlTabl: @pytest.fixture(scope="session") -def ferc_to_sqlite_dbf_only( - live_dbs: bool, pudl_datastore_config, etl_settings: EtlSettings +def ferc1_dbf_extract( + live_dbs: bool, + pudl_datastore_config, + etl_settings: EtlSettings, ): - """Create raw FERC 1 SQLite DBs, but only based on DBF sources.""" + """Creates raw FERC 1 SQlite DBs, based only on DBF sources.""" + + @graph + def local_dbf_ferc1_graph(): + Ferc1DbfExtractor.get_dagster_op()() + if not live_dbs: - ferc_to_sqlite_job_factory( - 
enable_xbrl=False, - )().execute_in_process( + local_dbf_ferc1_graph.to_job( + name="ferc_to_sqlite_dbf_ferc1", + resource_defs=pudl.ferc_to_sqlite.default_resources_defs, + ).execute_in_process( run_config={ "resources": { "ferc_to_sqlite_settings": { @@ -196,62 +209,43 @@ def ferc_to_sqlite_dbf_only( "datastore": { "config": pudl_datastore_config, }, + "runtime_settings": {"config": {}}, }, }, ) @pytest.fixture(scope="session") -def ferc_to_sqlite_xbrl_only( +def ferc1_xbrl_extract( live_dbs: bool, pudl_datastore_config, etl_settings: EtlSettings ): - """Create raw FERC 1 SQLite DBs, but only based on XBRL sources.""" - if not live_dbs: - ferc_to_sqlite_job_factory( - enable_dbf=False, - )().execute_in_process( - run_config={ - "resources": { - "ferc_to_sqlite_settings": { - "config": etl_settings.ferc_to_sqlite_settings.model_dump() - }, - "datastore": { - "config": pudl_datastore_config, - }, - }, - }, - ) - + """Runs ferc_to_sqlite dagster job for FERC Form 1 XBRL data.""" -@pytest.fixture(scope="session") -def ferc_to_sqlite(live_dbs, pudl_datastore_config, etl_settings: EtlSettings): - """Create raw FERC 1 SQLite DBs. + @graph + def local_xbrl_ferc1_graph(): + xbrl2sqlite_op_factory(XbrlFormNumber.FERC1)() - If we are using the test database, we initialize it from scratch first. 
If we're - using the live database, then the sql engine fixtures will return connections to the - existing databases - """ if not live_dbs: - logger.info( - f"ferc_to_sqlite_settings: {etl_settings.ferc_to_sqlite_settings.model_dump()}" - ) - logger.info(f"ferc_to_sqlite PUDL_OUTPUT: {os.getenv('PUDL_OUTPUT')}") - ferc_to_sqlite_job_factory()().execute_in_process( + local_xbrl_ferc1_graph.to_job( + name="ferc_to_sqlite_xbrl_ferc1", + resource_defs=pudl.ferc_to_sqlite.default_resources_defs, + ).execute_in_process( run_config={ "resources": { "ferc_to_sqlite_settings": { - "config": etl_settings.ferc_to_sqlite_settings.model_dump() + "config": etl_settings.ferc_to_sqlite_settings.model_dump(), }, "datastore": { "config": pudl_datastore_config, }, + "runtime_settings": {"config": {}}, }, - }, + } ) @pytest.fixture(scope="session", name="ferc1_engine_dbf") -def ferc1_dbf_sql_engine(ferc_to_sqlite_dbf_only: FercToSqliteSettings) -> sa.Engine: +def ferc1_dbf_sql_engine(ferc1_dbf_extract, dataset_settings_config) -> sa.Engine: """Grab a connection to the FERC Form 1 DB clone.""" context = build_init_resource_context( resources={"dataset_settings": dataset_settings_config} @@ -260,9 +254,7 @@ def ferc1_dbf_sql_engine(ferc_to_sqlite_dbf_only: FercToSqliteSettings) -> sa.En @pytest.fixture(scope="session", name="ferc1_engine_xbrl") -def ferc1_xbrl_sql_engine( - ferc_to_sqlite_xbrl_only: FercToSqliteSettings, dataset_settings_config -) -> sa.Engine: +def ferc1_xbrl_sql_engine(ferc1_xbrl_extract, dataset_settings_config) -> sa.Engine: """Grab a connection to the FERC Form 1 DB clone.""" context = build_init_resource_context( resources={"dataset_settings": dataset_settings_config} @@ -342,9 +334,7 @@ def configure_paths_for_tests(tmp_path_factory, request): gha = os.environ.get("GITHUB_ACTIONS", False) # Under what circumstances do we want to use a temporary input directory? 
# This will force a re-download of raw inputs from Zenodo or the GCS cache: - if (gha and "PUDL_INPUT" not in os.environ) or ( - request.config.getoption("--tmp-data") - ): + if request.config.getoption("--tmp-data") or ("PUDL_INPUT" not in os.environ): in_tmp = pudl_tmpdir / "input" in_tmp.mkdir() PudlPaths.set_path_overrides( diff --git a/test/unit/extract/xbrl_test.py b/test/unit/extract/xbrl_test.py index 61ff1bdb07..12f13caca9 100644 --- a/test/unit/extract/xbrl_test.py +++ b/test/unit/extract/xbrl_test.py @@ -1,9 +1,11 @@ """Tests for xbrl extraction module.""" import pytest -from dagster import build_op_context +from dagster import ResourceDefinition, build_op_context -from pudl.extract.xbrl import FercXbrlDatastore, convert_form, xbrl2sqlite +from pudl.extract.xbrl import FercXbrlDatastore, convert_form, xbrl2sqlite_op_factory +from pudl.ferc_to_sqlite import ferc_to_sqlite_xbrl_only +from pudl.resources import RuntimeSettings from pudl.settings import ( Ferc1DbfToSqliteSettings, Ferc1XbrlToSqliteSettings, @@ -99,28 +101,21 @@ def test_xbrl2sqlite(settings, forms, mocker, tmp_path): mock_datastore = mocker.MagicMock() mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore) - # always use tmp path here so that we don't clobber the live DB when --live-dbs is passed - mock_pudl_paths = mocker.MagicMock( - spec=PudlPaths(), - sqlite_db_path=lambda form_name: tmp_path / f"{form_name}.sqlite", - output_dir=PudlPaths().output_dir, - ) - mocker.patch("pudl.extract.xbrl.PudlPaths", return_value=mock_pudl_paths) - - # Construct xbrl2sqlite op context - context = build_op_context( + # always use tmp path here so that we don't clobber the live DB when --live-dbs is passed + ferc_to_sqlite_xbrl_only.execute_in_process( resources={ "ferc_to_sqlite_settings": settings, - "datastore": "datastore", - }, - config={ - "workers": 10, - "batch_size": 20, - "clobber": True, - }, + "datastore": ResourceDefinition.mock_resource(), + "runtime_settings": 
RuntimeSettings( + xbrl_batch_size=20, + xbrl_num_workers=10, + clobber=True, + ), + } ) - xbrl2sqlite(context) + # TODO(rousik): do we need to use this, or can we simply set PUDL_OUTPUT env + # variable to some random path? assert convert_form_mock.call_count == len(forms) @@ -130,13 +125,16 @@ def test_xbrl2sqlite(settings, forms, mocker, tmp_path): form, mock_datastore, output_path=PudlPaths().output_dir, - sql_path=tmp_path / f"ferc{form.value}_xbrl.sqlite", + sql_path=PudlPaths().output_dir / f"ferc{form.value}_xbrl.sqlite", batch_size=20, workers=10, ) -def test_xbrl2sqlite_db_exists_no_clobber(mocker): +def test_xbrl2sqlite_db_exists_no_clobber(mocker, live_dbs): + if live_dbs: + return + convert_form_mock = mocker.MagicMock() mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock) @@ -147,31 +145,30 @@ def test_xbrl2sqlite_db_exists_no_clobber(mocker): ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite" ferc1_sqlite_path.touch() settings = FercToSqliteSettings( - ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(), ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(), - ferc2_xbrl_to_sqlite_settings=None, - ferc6_xbrl_to_sqlite_settings=None, - ferc60_xbrl_to_sqlite_settings=None, - ferc714_xbrl_to_sqlite_settings=None, ) - # Construct xbrl2sqlite op context context = build_op_context( resources={ "ferc_to_sqlite_settings": settings, "datastore": "datastore", - }, - config={ - "workers": 10, - "batch_size": 20, - "clobber": False, + "runtime_settings": RuntimeSettings( + clobber=False, + xbrl_batch_size=20, + xbrl_num_workers=10, + ), }, ) + assert ferc1_sqlite_path.exists() with pytest.raises(RuntimeError, match="Found existing DB"): - xbrl2sqlite(context) + xbrl2sqlite_op_factory(XbrlFormNumber.FORM1)(context) + assert ferc1_sqlite_path.exists() + +def test_xbrl2sqlite_db_exists_yes_clobber(mocker, live_dbs): + if live_dbs: + return -def test_xbrl2sqlite_db_exists_yes_clobber(mocker, tmp_path): convert_form_mock = 
mocker.MagicMock() mocker.patch("pudl.extract.xbrl.convert_form", new=convert_form_mock) @@ -180,40 +177,27 @@ def test_xbrl2sqlite_db_exists_yes_clobber(mocker, tmp_path): mocker.patch("pudl.extract.xbrl.FercXbrlDatastore", return_value=mock_datastore) # always use tmp path here so that we don't clobber the live DB when --live-dbs is passed - ferc1_sqlite_path = tmp_path / "ferc1_xbrl.sqlite" + ferc1_sqlite_path = PudlPaths().output_dir / "ferc1_xbrl.sqlite" ferc1_sqlite_path.touch() - - # mock the db path so we can assert it gets clobbered - mock_db_path = mocker.MagicMock(spec=ferc1_sqlite_path) - mock_pudl_paths = mocker.MagicMock( - spec=PudlPaths(), sqlite_db_path=lambda _x: mock_db_path - ) - mocker.patch("pudl.extract.xbrl.PudlPaths", return_value=mock_pudl_paths) - settings = FercToSqliteSettings( - ferc1_dbf_to_sqlite_settings=Ferc1DbfToSqliteSettings(), ferc1_xbrl_to_sqlite_settings=Ferc1XbrlToSqliteSettings(), - ferc2_xbrl_to_sqlite_settings=None, - ferc6_xbrl_to_sqlite_settings=None, - ferc60_xbrl_to_sqlite_settings=None, - ferc714_xbrl_to_sqlite_settings=None, ) context = build_op_context( resources={ "ferc_to_sqlite_settings": settings, "datastore": "datastore", - }, - config={ - "workers": 10, - "batch_size": 20, - "clobber": True, + "runtime_settings": RuntimeSettings( + clobber=True, + xbrl_batch_size=20, + xbrl_num_workers=10, + ), }, ) - xbrl2sqlite(context) - - mock_db_path.unlink.assert_any_call() + assert ferc1_sqlite_path.exists() + xbrl2sqlite_op_factory(XbrlFormNumber.FORM1)(context) + assert not ferc1_sqlite_path.exists() def test_convert_form(mocker):