Skip to content

Commit

Permalink
Add an optional `subset` columns argument to `drop_nans`
Browse files: browse the repository at this point in the history
  • Loading branch information
brifordwylie committed Apr 21, 2024
1 parent 6e65b3a commit d4c1c7b
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,12 @@ def transform_impl(self, **kwargs):
# Compute/add all the Molecular Descriptors
self.output_df = self.compute_molecular_descriptors(self.input_df)

# Get the columns that are descriptors
desc_columns = set(self.output_df.columns) - set(self.input_df.columns)

# Drop any NaNs (and INFs)
current_rows = self.output_df.shape[0]
self.output_df = pandas_utils.drop_nans(self.output_df, how="any")
self.output_df = pandas_utils.drop_nans(self.output_df, how="any", subset=desc_columns)
self.log.warning(f"Dropped {current_rows - self.output_df.shape[0]} NaN rows")

def compute_molecular_descriptors(self, process_df: pd.DataFrame) -> pd.DataFrame:
Expand Down
23 changes: 16 additions & 7 deletions src/sageworks/utils/pandas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,13 @@ def numeric_stats(df):
return df.describe().round(2).T.drop("count", axis=1)


def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50) -> pd.DataFrame:
"""Dropping NaNs in rows and columns. Obviously lots of ways to do this, so picked some reasonable defaults,
we can certainly change this later with a more formal set of operations and arguments
def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50, subset: list = None) -> pd.DataFrame:
"""Dropping NaNs in rows and columns. Optionally, focus on specific columns.
Args:
input_df (pd.DataFrame): Input data frame.
how (str): 'all' to drop rows where all values are NaN, 'any' to drop rows where any value is NaN.
nan_drop_percent (float): Percentage threshold to drop columns with missing values exceeding this rate.
subset (list): Specific subset of columns to check for NaNs when dropping rows.
"""

# Grab input number of rows
Expand All @@ -111,12 +115,17 @@ def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float
if percent > nan_warn_percent:
log.important(f"Column ({name}) has {percent}% NaN Values")
if percent > nan_drop_percent:
log.warning(f"Dropping Column ({name}) with {percent}% NaN Values!")
log.warning(f"Column ({name}) with {percent}% NaN Values!")

# Conditionally drop rows based on NaNs in specified columns
if subset is not None:
output_df.dropna(axis=0, how=how, subset=subset, inplace=True)
else:
output_df.dropna(axis=0, how=how, inplace=True)

# Drop Rows that have NaNs in them
output_df.dropna(axis=0, how=how, inplace=True)
if len(output_df) != orig_num_rows:
log.important(f"Dropping {orig_num_rows - len(output_df)} rows that have a NaN in them")
dropped_rows = orig_num_rows - len(output_df)
log.important(f"Dropping {dropped_rows} rows that have a NaN in them")
output_df.reset_index(drop=True, inplace=True)

return output_df
Expand Down

0 comments on commit d4c1c7b

Please sign in to comment.