Skip to content

Commit

Permalink
Refine filter_df logic
Browse files (browse the repository at this point in the history)
* Rename filter argument to to_keep
* Make target dataframe positional-only
* Revise docstring
  • Loading branch information
AdrianSosic committed Dec 20, 2024
1 parent 3d1c0eb commit a6cc05a
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions baybe/utils/dataframe.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -607,7 +607,7 @@ def get_transform_objects(


def filter_df(
df: pd.DataFrame, filter: pd.DataFrame, complement: bool = False
df: pd.DataFrame, /, to_keep: pd.DataFrame, complement: bool = False
) -> pd.DataFrame:
"""Filter a dataframe based on a second dataframe defining filtering conditions.
Expand All @@ -616,9 +616,11 @@ def filter_df(
Args:
df: The dataframe to be filtered.
filter: The dataframe defining the filtering conditions.
to_keep: The dataframe defining the filtering conditions. By default
(see ``complement`` argument), it defines the rows to be kept in the sense
of an inner join.
complement: If ``False``, the filter dataframe determines the rows to be kept
(i.e. selection via regular join). If ``True``, the filtering mechanism is
(i.e. selection via inner join). If ``True``, the filtering mechanism is
inverted so that the complement set of rows is kept (i.e. selection
via anti-join).
Expand Down Expand Up @@ -661,15 +663,15 @@ def filter_df(
"""
# Handle special case of empty filter
if filter.empty:
if to_keep.empty:
return df if complement else pd.DataFrame(columns=df.columns)

# Remember original index name
index_name = df.index.name

# Identify rows to be dropped
out = pd.merge(
df.reset_index(names="_df_index"), filter, how="left", indicator=True
df.reset_index(names="_df_index"), to_keep, how="left", indicator=True
).set_index("_df_index")
to_drop = out["_merge"] == ("both" if complement else "left_only")

Expand Down

0 comments on commit a6cc05a

Please sign in to comment.