Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into parallel-pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
rousik committed Nov 14, 2023
2 parents ca9854e + fcf4ccc commit b08ead3
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 15 deletions.
7 changes: 4 additions & 3 deletions src/pudl/extract/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ def process_raw(self, df, page, **partition):
df = remove_leading_zeros_from_numeric_strings(df=df, col_name=col)
df = self.add_data_maturity(df, page, **partition)
# Fill in blank reporting_frequency_code for monthly data
df.loc[
df["data_maturity"] == "incremental_ytd", "reporting_frequency_code"
] = "M"
if "reporting_frequency_code" in df.columns:
df.loc[
df["data_maturity"] == "incremental_ytd", "reporting_frequency_code"
] = "M"
# the 2021 early release data had some ding dang "."'s and nulls in the year column
if "report_year" in df.columns:
mask = (df.report_year == ".") | df.report_year.isnull()
Expand Down
16 changes: 9 additions & 7 deletions src/pudl/output/ferc714.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,18 +652,20 @@ def summarized_demand_ferc714(
demand_hourly_pa_ferc714.loc[
:, ["report_date", "respondent_id_ferc714", "demand_mwh"]
],
on=["report_date", "respondent_id_ferc714"],
how="left",
)
.groupby(["report_date", "respondent_id_ferc714"])
.agg({"demand_mwh": sum})
.groupby(["report_date", "respondent_id_ferc714"], as_index=False)[
["demand_mwh"]
]
.sum(min_count=1)
.rename(columns={"demand_mwh": "demand_annual_mwh"})
.reset_index()
.merge(
georeferenced_counties_ferc714.groupby(
["report_date", "respondent_id_ferc714"]
)
.agg({"population": sum, "area_km2": sum})
.reset_index()
["report_date", "respondent_id_ferc714"], as_index=False
)[["population", "area_km2"]].sum(min_count=1),
on=["report_date", "respondent_id_ferc714"],
how="left",
)
.assign(
population_density_km2=lambda x: x.population / x.area_km2,
Expand Down
4 changes: 2 additions & 2 deletions test/validate/mcoe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ def test_no_null_rows_mcoe(pudl_out_mcoe, live_dbs, df_name, thresh):
("hr_by_unit", 387_694, 32_416),
("hr_by_gen", 599_496, 50_070),
("fuel_cost", 599_496, 50_070),
("capacity_factor", 5_178_892, 433_277),
("mcoe", 5_179_300, 433_311),
("capacity_factor", 5_178_828, 433_286),
("mcoe", 5_179_236, 433_320),
],
)
def test_minmax_rows_mcoe(pudl_out_mcoe, live_dbs, monthly_rows, annual_rows, df_name):
Expand Down
42 changes: 41 additions & 1 deletion test/validate/service_territory_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"df_name,expected_rows",
[
("summarized_demand_ferc714", 3_195),
("fipsified_respondents_ferc714", 135_627),
("fipsified_respondents_ferc714", 135_537),
("compiled_geometry_balancing_authority_eia861", 112_507),
("compiled_geometry_utility_eia861", 247_705),
],
Expand Down Expand Up @@ -46,3 +46,43 @@ def test_minmax_rows(
pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
)
)


@pytest.mark.parametrize(
    "df_name,expected_rows",
    [("demand_hourly_pa_ferc714", 15_608_154)],
)
def test_minmax_rows_and_year_in_demand_hourly_pa_ferc714(
    pudl_out_orig: "pudl.output.pudltabl.PudlTabl",
    live_dbs: bool,
    expected_rows: int,
    df_name: str,
) -> None:
    """Check the row count and the year agreement of the two date columns.

    The table named by ``df_name`` is built by calling the same-named method on
    ``pudl_out_orig``. It is then run through ``pv.check_min_rows`` and
    ``pv.check_max_rows`` with ``margin=0.0``, and finally the calendar year of
    ``utc_datetime`` is compared against the calendar year of ``report_date``.

    We are parameterizing this test even though it only has one input because
    ``test_minmax_rows`` is a common test across many tables and we wanted to
    preserve the format.

    Args:
        pudl_out_orig: Output object exposing one method per table name.
        live_dbs: Whether the tests are running against a live PUDL DB; the
            test is skipped when this is false.
        expected_rows: Row count handed to the min/max row checks.
        df_name: Name of the output method (and table) under test.

    Raises:
        AssertionError: If more than 0.1% of the records have a
            ``utc_datetime`` year that differs from the ``report_date`` year.
    """
    if not live_dbs:
        pytest.skip("Data validation only works with a live PUDL DB.")

    # Use the getattr() builtin rather than calling the __getattribute__ dunder
    # directly: it is the idiomatic by-name lookup and also honours any
    # __getattr__ fallback the output object may define.
    demand_hourly_pa_ferc714 = getattr(pudl_out_orig, df_name)()

    # NOTE(review): with margin=0.0 these two checks presumably pin the row
    # count to exactly expected_rows -- confirm against pudl.validate.
    _ = demand_hourly_pa_ferc714.pipe(
        pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
    ).pipe(pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name)

    logger.info("Checking the consistency of the year in the multiple date columns.")
    mismatched_report_years = demand_hourly_pa_ferc714[
        (
            demand_hourly_pa_ferc714.utc_datetime.dt.year
            != demand_hourly_pa_ferc714.report_date.dt.year
        )
    ]
    # A small share of mismatches is tolerated; only fail above the 0.1% ceiling.
    if (
        off_ratio := len(mismatched_report_years) / len(demand_hourly_pa_ferc714)
    ) > 0.001:
        raise AssertionError(
            f"Found more ({off_ratio:.2%}) than expected (>.1%) FERC714 records"
            " where the report year from the utc_datetime differs from the "
            "report_date column."
        )
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ commands =
{[testenv:unit]commands}
{[testenv:integration]commands}
bash -c 'rm -f tox-nuke.log'
bash -c 'coverage run --append src/pudl/convert/ferc_to_sqlite.py --logfile tox-nuke.log --clobber src/pudl/package_data/settings/etl_full.yml'
bash -c 'coverage run --append src/pudl/cli/etl.py --logfile tox-nuke.log --clobber src/pudl/package_data/settings/etl_full.yml'
bash -c 'coverage run --append src/pudl/ferc_to_sqlite/cli.py --logfile tox-nuke.log --clobber src/pudl/package_data/settings/etl_full.yml'
bash -c 'coverage run --append src/pudl/cli/etl.py --logfile tox-nuke.log src/pudl/package_data/settings/etl_full.yml'
pytest {tty:--color=yes} --live-dbs {posargs} {[testenv]covargs} \
--etl-settings src/pudl/package_data/settings/etl_full.yml \
test/integration
Expand Down

0 comments on commit b08ead3

Please sign in to comment.