Skip to content

Commit

Permalink
move vectorizers into op
Browse files Browse the repository at this point in the history
  • Loading branch information
katie-lamb committed Jan 8, 2024
1 parent 190a4f3 commit 35260b0
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 249 deletions.
111 changes: 54 additions & 57 deletions src/pudl/analysis/record_linkage/classify_plants_ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,57 +27,60 @@
]


# Per-feature embedding configuration, keyed by a short feature name.
# Every value is a ColumnVectorizer that names the input column(s) it reads and
# the ordered transform steps applied to them. ``weight`` is passed explicitly
# only for the features that should not use ColumnVectorizer's default.
dataframe_vectorizers = {
    # Free-text plant name: cleaned, then text-vectorized; weighted 2.0.
    "plant_name": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.NameCleaner(),
            embed_dataframe.TextVectorizer(),
        ],
        weight=2.0,
        columns=["plant_name_ferc1"],
    ),
    # Plant type: nulls become empty strings before categorical encoding.
    "plant_type": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.ColumnCleaner(cleaning_function="null_to_empty_str"),
            embed_dataframe.CategoricalVectorizer(),
        ],
        weight=2.0,
        columns=["plant_type"],
    ),
    # Construction type: same cleaning/encoding as plant type, default weight.
    "construction_type": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.ColumnCleaner(cleaning_function="null_to_empty_str"),
            embed_dataframe.CategoricalVectorizer(),
        ],
        columns=["construction_type"],
    ),
    # Capacity: nulls become zero before numerical vectorization.
    "capacity_mw": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.ColumnCleaner(cleaning_function="null_to_zero"),
            embed_dataframe.NumericalVectorizer(),
        ],
        columns=["capacity_mw"],
    ),
    # Construction year is encoded as a category (not a number) after the
    # "fix_int_na" cleaning step.
    "construction_year": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.ColumnCleaner(cleaning_function="fix_int_na"),
            embed_dataframe.CategoricalVectorizer(),
        ],
        columns=["construction_year"],
    ),
    # Utility ID: categorical encoding with no cleaning step.
    "utility_id_ferc1": embed_dataframe.ColumnVectorizer(
        transform_steps=[embed_dataframe.CategoricalVectorizer()],
        columns=["utility_id_ferc1"],
    ),
    # Fuel fraction columns (listed in _FUEL_COLS): nulls become zero, then
    # the values are vectorized and normalized.
    "fuel_fractions": embed_dataframe.ColumnVectorizer(
        transform_steps=[
            embed_dataframe.ColumnCleaner(cleaning_function="null_to_zero"),
            embed_dataframe.NumericalVectorizer(),
            embed_dataframe.NumericalNormalizer(),
        ],
        columns=_FUEL_COLS,
    ),
}
@op
def get_vectorizers():
    """Build and return the mapping of feature name to column vectorizer.

    A new dictionary, with newly constructed vectorizer objects, is created
    each time the function body runs. Every value is an
    ``embed_dataframe.ColumnVectorizer`` configured with the input column(s)
    it reads and the ordered transform steps applied to them.

    ``plant_name`` and ``plant_type`` are given an explicit ``weight`` of 2.0;
    the remaining features do not pass ``weight`` and so use whatever default
    ``ColumnVectorizer`` defines.

    Returns:
        A dict with the keys ``plant_name``, ``plant_type``,
        ``construction_type``, ``capacity_mw``, ``construction_year``,
        ``utility_id_ferc1`` and ``fuel_fractions``, in that order.
    """
    # Short local aliases keep the individual definitions below readable.
    make_vectorizer = embed_dataframe.ColumnVectorizer
    make_cleaner = embed_dataframe.ColumnCleaner
    make_categorical = embed_dataframe.CategoricalVectorizer
    make_numerical = embed_dataframe.NumericalVectorizer

    # Free-text plant name: cleaned, then text-vectorized.
    plant_name = make_vectorizer(
        transform_steps=[
            embed_dataframe.NameCleaner(),
            embed_dataframe.TextVectorizer(),
        ],
        weight=2.0,
        columns=["plant_name_ferc1"],
    )

    # Categorical text features: nulls become empty strings before encoding.
    plant_type = make_vectorizer(
        transform_steps=[
            make_cleaner(cleaning_function="null_to_empty_str"),
            make_categorical(),
        ],
        weight=2.0,
        columns=["plant_type"],
    )
    construction_type = make_vectorizer(
        transform_steps=[
            make_cleaner(cleaning_function="null_to_empty_str"),
            make_categorical(),
        ],
        columns=["construction_type"],
    )

    # Capacity: nulls become zero before numerical vectorization.
    capacity_mw = make_vectorizer(
        transform_steps=[
            make_cleaner(cleaning_function="null_to_zero"),
            make_numerical(),
        ],
        columns=["capacity_mw"],
    )

    # Construction year is encoded as a category rather than as a number.
    construction_year = make_vectorizer(
        transform_steps=[
            make_cleaner(cleaning_function="fix_int_na"),
            make_categorical(),
        ],
        columns=["construction_year"],
    )

    # Utility ID needs no cleaning step.
    utility_id_ferc1 = make_vectorizer(
        transform_steps=[make_categorical()],
        columns=["utility_id_ferc1"],
    )

    # Fuel fraction columns: zero-fill, vectorize, then normalize.
    fuel_fractions = make_vectorizer(
        transform_steps=[
            make_cleaner(cleaning_function="null_to_zero"),
            make_numerical(),
            embed_dataframe.NumericalNormalizer(),
        ],
        columns=_FUEL_COLS,
    )

    return {
        "plant_name": plant_name,
        "plant_type": plant_type,
        "construction_type": construction_type,
        "capacity_mw": capacity_mw,
        "construction_year": construction_year,
        "utility_id_ferc1": utility_id_ferc1,
        "fuel_fractions": fuel_fractions,
    }


@op
Expand Down Expand Up @@ -139,12 +142,6 @@ def merge_steam_fuel_dfs(
).astype({"plant_type": str, "construction_type": str})


@op
def get_vectorizers():
    """Return the module-level ``dataframe_vectorizers`` mapping.

    The shared dictionary object itself is returned, not a copy, so every
    caller receives the same vectorizer instances.
    """
    vectorizers = dataframe_vectorizers
    return vectorizers


@graph_asset
def _out_ferc1__yearly_steam_plants_sched402_with_plant_ids(
core_ferc1__yearly_steam_plants_sched402: pd.DataFrame,
Expand Down
Loading

0 comments on commit 35260b0

Please sign in to comment.