Skip to content

Commit

Permalink
Merge branch 'split-ferc2sqlite-ops' into pytest-with-f2s-ops
Browse files Browse the repository at this point in the history
  • Loading branch information
rousik committed Nov 30, 2023
2 parents 1ea46f5 + 428d04f commit f6f8d97
Show file tree
Hide file tree
Showing 17 changed files with 221 additions and 244 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/zenodo-cache-sync.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ jobs:
- name: Checkout desired branch
uses: actions/checkout@v4
with:
ref: ${{ env.GITHUB_REF }}

- name: Install Conda environment using mamba
uses: mamba-org/setup-micromamba@v1
Expand Down
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ What data is available?

PUDL currently integrates data from:

* `EIA Form 860 <https://www.eia.gov/electricity/data/eia860/>`__: 2001-2022
* `EIA Form 860 <https://www.eia.gov/electricity/data/eia860/>`__: 2001 - 2022
* `EIA Form 860m <https://www.eia.gov/electricity/data/eia860m/>`__: 2023-06
* `EIA Form 861 <https://www.eia.gov/electricity/data/eia861/>`__: 2001-2022
* `EIA Form 923 <https://www.eia.gov/electricity/data/eia923/>`__: 2001-2022
* `EPA Continuous Emissions Monitoring System (CEMS) <https://campd.epa.gov/>`__: 1995-2022
* `EIA Form 861 <https://www.eia.gov/electricity/data/eia861/>`__: 2001 - 2022
* `EIA Form 923 <https://www.eia.gov/electricity/data/eia923/>`__: 2001 - 2023-08
* `EPA Continuous Emissions Monitoring System (CEMS) <https://campd.epa.gov/>`__: 1995 - 2022
* `FERC Form 1 <https://www.ferc.gov/industries-data/electric/general-information/electric-industry-forms/form-1-electric-utility-annual>`__: 1994-2021
* `FERC Form 714 <https://www.ferc.gov/industries-data/electric/general-information/electric-industry-forms/form-no-714-annual-electric/data>`__: 2006-2020
* `US Census Demographic Profile 1 Geodatabase <https://www.census.gov/geographies/mapping-files/2010/geo/tiger-data.html>`__: 2010
Expand Down
4 changes: 2 additions & 2 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ Data Coverage

* Updated :doc:`data_sources/eia860` to include final release data from 2022.
* Updated :doc:`data_sources/eia861` to include final release data from 2022.
* Updated :doc:`data_sources/eia923` to include early release data from 2022 and
monthly YTD data as of April 2023.
* Updated :doc:`data_sources/eia923` to include final release data from 2022 and
monthly YTD data as of October 2023.
* Updated :doc:`data_sources/epacems` to switch from the old FTP server to the new
CAMPD API, and to include 2022 data. Due to changes in the ETL, Alaska, Puerto Rico
and Hawaii are now included in CEMS processing. See issue :issue:`1264` & PRs
Expand Down
3 changes: 2 additions & 1 deletion docs/templates/eia923_child.rst.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ in `EIA Form 423
replaced the earlier FERC Form 423). If you're interested in this earlier data, get in
touch with us!

Monthly interim EIA-923 data releases are not yet integrated into PUDL. In addition, we
Monthly interim EIA-923 data are periodically integrated into PUDL as well. Incomplete
year-to-date data are excluded from the annualized tables to avoid confusion. We
have not yet integrated tables reporting fuel stocks, data from Puerto Rico, or EIA-923
schedules 6, 7, and 8.
{% endblock %}
Expand Down
28 changes: 27 additions & 1 deletion src/pudl/extract/dbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@
import importlib.resources
import zipfile
from collections import defaultdict
from collections.abc import Iterator
from collections.abc import Callable, Iterator
from functools import lru_cache
from pathlib import Path
from typing import IO, Any, Protocol, Self

import pandas as pd
import sqlalchemy as sa
from dagster import op
from dbfread import DBF, FieldParser

import pudl
import pudl.logging_helpers
from pudl.metadata.classes import DataSource
from pudl.settings import FercToSqliteSettings, GenericDatasetSettings
from pudl.workspace.datastore import Datastore
from pudl.workspace.setup import PudlPaths

logger = pudl.logging_helpers.get_logger(__name__)

Expand Down Expand Up @@ -464,6 +466,30 @@ def get_db_path(self) -> str:
db_path = str(Path(self.output_path) / self.DATABASE_NAME)
return f"sqlite:///{db_path}"

@classmethod
def get_dagster_op(cls) -> Callable:
    """Return a dagster op that runs this DBF extractor.

    The op is named ``dbf_{cls.DATASET}`` and pulls everything it needs from
    dagster resources rather than op config: the raw-input datastore, the
    per-dataset settings, and the runtime settings (which carry the
    ``clobber`` flag).

    Returns:
        A dagster op (callable) that instantiates this extractor class and
        runs its DBF-to-SQLite conversion.
    """

    @op(
        name=f"dbf_{cls.DATASET}",
        required_resource_keys={
            "ferc_to_sqlite_settings",
            "datastore",
            "runtime_settings",
        },
    )
    def inner_method(context) -> None:
        """Instantiate the DBF extractor and run it."""
        dbf_extractor = cls(
            datastore=context.resources.datastore,
            settings=context.resources.ferc_to_sqlite_settings,
            clobber=context.resources.runtime_settings.clobber,
            output_path=PudlPaths().output_dir,
        )
        dbf_extractor.execute()

    return inner_method

def execute(self):
"""Runs the extraction of the data from dbf to sqlite."""
logger.info(
Expand Down
37 changes: 6 additions & 31 deletions src/pudl/extract/ferc.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,17 @@
"""Hooks to integrate ferc to sqlite functionality into dagster graph."""


from dagster import Field, op

import pudl
from pudl.extract.ferc1 import Ferc1DbfExtractor
from pudl.extract.ferc2 import Ferc2DbfExtractor
from pudl.extract.ferc6 import Ferc6DbfExtractor
from pudl.extract.ferc60 import Ferc60DbfExtractor
from pudl.workspace.setup import PudlPaths

logger = pudl.logging_helpers.get_logger(__name__)


@op(
    config_schema={
        "clobber": Field(
            bool, description="Clobber existing ferc1 database.", default_value=False
        ),
    },
    required_resource_keys={"ferc_to_sqlite_settings", "datastore"},
)
def dbf2sqlite(context) -> None:
    """Clone the FERC Form 1 Visual FoxPro databases into SQLite.

    Runs every known FERC DBF extractor (forms 1, 2, 6 and 60) in sequence,
    reading raw inputs via the ``datastore`` resource and writing SQLite
    output under the PUDL output directory. The ``clobber`` op config
    controls whether pre-existing output databases are overwritten.
    """
    # TODO(rousik): this thin wrapper seems to be somewhat quirky. Maybe there's a way
    # to make the integration between the class and dagster better? Investigate.
    logger.info(f"dbf2sqlite settings: {context.resources.ferc_to_sqlite_settings}")

    extractors = [
        Ferc1DbfExtractor,
        Ferc2DbfExtractor,
        Ferc6DbfExtractor,
        Ferc60DbfExtractor,
    ]
    # Each extractor shares the same datastore/settings resources and is
    # executed immediately; any failure aborts the remaining extractions.
    for xclass in extractors:
        xclass(
            datastore=context.resources.datastore,
            settings=context.resources.ferc_to_sqlite_settings,
            clobber=context.op_config["clobber"],
            output_path=PudlPaths().output_dir,
        ).execute()
# All known FERC DBF (Visual FoxPro) extractor classes, one per supported
# FERC form. Consumers iterate this list to build one dagster op per form.
ALL_DBF_EXTRACTORS = [
    Ferc1DbfExtractor,
    Ferc2DbfExtractor,
    Ferc6DbfExtractor,
    Ferc60DbfExtractor,
]
77 changes: 32 additions & 45 deletions src/pudl/extract/xbrl.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Generic extractor for all FERC XBRL data."""
import io
from collections.abc import Callable
from datetime import date
from pathlib import Path

from dagster import Field, Noneable, op
from dagster import op
from ferc_xbrl_extractor.cli import run_main

import pudl
from pudl.resources import RuntimeSettings
from pudl.settings import FercGenericXbrlToSqliteSettings, XbrlFormNumber
from pudl.workspace.datastore import Datastore
from pudl.workspace.setup import PudlPaths
Expand Down Expand Up @@ -44,51 +46,34 @@ def get_filings(self, year: int, form: XbrlFormNumber) -> io.BytesIO:
)


@op(
config_schema={
"clobber": Field(
bool, description="Clobber existing ferc1 database.", default_value=False
),
"workers": Field(
Noneable(int),
description="Specify number of worker processes for parsing XBRL filings.",
default_value=None,
),
"batch_size": Field(
int,
description="Specify number of XBRL instances to be processed at a time (defaults to 50)",
default_value=50,
),
},
required_resource_keys={"ferc_to_sqlite_settings", "datastore"},
)
def xbrl2sqlite(context) -> None:
"""Clone the FERC Form 1 XBRL Database to SQLite."""
output_path = PudlPaths().output_dir
clobber = context.op_config["clobber"]
batch_size = context.op_config["batch_size"]
workers = context.op_config["workers"]
ferc_to_sqlite_settings = context.resources.ferc_to_sqlite_settings
datastore = context.resources.datastore
datastore = FercXbrlDatastore(datastore)

# Loop through all other forms and perform conversion
for form in XbrlFormNumber:
# Get desired settings object
settings = ferc_to_sqlite_settings.get_xbrl_dataset_settings(form)

# If no settings for form in question, skip
if settings is None:
continue

if settings.disabled:
logger.info(f"Dataset ferc{form}_xbrl is disabled, skipping")
continue
def xbrl2sqlite_op_factory(form: XbrlFormNumber) -> Callable:
"""Generates xbrl2sqlite op for a given FERC form."""

@op(
name=f"ferc{form.value}_xbrl",
required_resource_keys={
"ferc_to_sqlite_settings",
"datastore",
"runtime_settings",
},
)
def inner_op(context) -> None:
output_path = PudlPaths().output_dir
runtime_settings: RuntimeSettings = context.resources.runtime_settings
settings = context.resources.ferc_to_sqlite_settings.get_xbrl_dataset_settings(
form
)
datastore = FercXbrlDatastore(context.resources.datastore)

sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl")
if settings is None or settings.disabled:
logger.info(
f"Skipping dataset ferc{form.value}_xbrl: no config or is disabled."
)
return

sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl")
if sql_path.exists():
if clobber:
if runtime_settings.clobber:
sql_path.unlink()
else:
raise RuntimeError(
Expand All @@ -101,10 +86,12 @@ def xbrl2sqlite(context) -> None:
datastore,
output_path=output_path,
sql_path=sql_path,
batch_size=batch_size,
workers=workers,
batch_size=runtime_settings.xbrl_batch_size,
workers=runtime_settings.xbrl_num_workers,
)

return inner_op


def convert_form(
form_settings: FercGenericXbrlToSqliteSettings,
Expand Down
28 changes: 20 additions & 8 deletions src/pudl/ferc_to_sqlite/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,44 @@
from dagster import Definitions, graph

import pudl
from pudl.extract.ferc import dbf2sqlite
from pudl.extract.xbrl import xbrl2sqlite
from pudl.resources import datastore, ferc_to_sqlite_settings
from pudl.settings import EtlSettings
from pudl.extract.ferc import ALL_DBF_EXTRACTORS
from pudl.extract.ferc1 import Ferc1DbfExtractor
from pudl.extract.ferc2 import Ferc2DbfExtractor
from pudl.extract.ferc6 import Ferc6DbfExtractor
from pudl.extract.ferc60 import Ferc60DbfExtractor
from pudl.extract.xbrl import xbrl2sqlite_op_factory
from pudl.resources import RuntimeSettings, datastore, ferc_to_sqlite_settings
from pudl.settings import EtlSettings, XbrlFormNumber

logger = pudl.logging_helpers.get_logger(__name__)


@graph
def ferc_to_sqlite():
    """Clone the FERC FoxPro databases and XBRL filings into SQLite."""
    # Build one op per DBF extractor class and one per XBRL form number,
    # then wire each of them into the graph.
    dbf_ops = [extractor_cls.get_dagster_op() for extractor_cls in ALL_DBF_EXTRACTORS]
    xbrl_ops = [xbrl2sqlite_op_factory(xbrl_form) for xbrl_form in XbrlFormNumber]
    for sqlite_op in dbf_ops + xbrl_ops:
        sqlite_op()


@graph
def ferc_to_sqlite_dbf_only():
    """Clone the FERC FoxPro databases into SQLite."""
    # One dagster op per registered DBF extractor class, invoked in order.
    for extractor_cls in ALL_DBF_EXTRACTORS:
        dbf_op = extractor_cls.get_dagster_op()
        dbf_op()


@graph
def ferc_to_sqlite_xbrl_only():
    """Clone the FERC XBRL databases into SQLite."""
    # One dagster op per FERC form with XBRL filings, invoked in order.
    for xbrl_form in XbrlFormNumber:
        xbrl_op = xbrl2sqlite_op_factory(xbrl_form)
        xbrl_op()


# Default dagster resource definitions shared by the ferc_to_sqlite jobs:
# per-dataset settings, runtime settings (clobber flag and — presumably —
# the XBRL worker/batch knobs; confirm against RuntimeSettings), and the
# datastore used to fetch raw inputs.
default_resources_defs = {
    "ferc_to_sqlite_settings": ferc_to_sqlite_settings,
    "runtime_settings": RuntimeSettings(),
    "datastore": datastore,
}

Expand All @@ -53,6 +62,9 @@ def ferc_to_sqlite_xbrl_only():
"ferc_to_sqlite_settings": {
"config": ferc_to_sqlite_fast_settings.model_dump(),
},
"runtime_settings": {
"config": {},
},
},
},
)
Expand Down
11 changes: 3 additions & 8 deletions src/pudl/ferc_to_sqlite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,13 @@ def main(): # noqa: C901
else "",
},
},
},
"ops": {
"xbrl2sqlite": {
"runtime_settings": {
"config": {
"workers": args.workers,
"batch_size": args.batch_size,
"clobber": args.clobber,
"xbrl_num_workers": args.workers,
"xbrl_batch_size": args.batch_size,
},
},
"dbf2sqlite": {
"config": {"clobber": args.clobber},
},
},
},
raise_on_error=True,
Expand Down
Loading

0 comments on commit f6f8d97

Please sign in to comment.