From d4c1c7b78821b3949e859a566798af363972b696 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Sun, 21 Apr 2024 15:57:18 -0600 Subject: [PATCH] adding a subset columns arg to drop nans --- .../light/molecular_descriptors.py | 5 +++- src/sageworks/utils/pandas_utils.py | 23 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py b/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py index bf0f067ec..1baf58fe8 100644 --- a/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py +++ b/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py @@ -74,9 +74,12 @@ def transform_impl(self, **kwargs): # Compute/add all the Molecular Descriptors self.output_df = self.compute_molecular_descriptors(self.input_df) + # Get the columns that are descriptors + desc_columns = set(self.output_df.columns) - set(self.input_df.columns) + # Drop any NaNs (and INFs) current_rows = self.output_df.shape[0] - self.output_df = pandas_utils.drop_nans(self.output_df, how="any") + self.output_df = pandas_utils.drop_nans(self.output_df, how="any", subset=desc_columns) self.log.warning(f"Dropped {current_rows - self.output_df.shape[0]} NaN rows") def compute_molecular_descriptors(self, process_df: pd.DataFrame) -> pd.DataFrame: diff --git a/src/sageworks/utils/pandas_utils.py b/src/sageworks/utils/pandas_utils.py index fb5c3c2c2..15b320144 100644 --- a/src/sageworks/utils/pandas_utils.py +++ b/src/sageworks/utils/pandas_utils.py @@ -89,9 +89,13 @@ def numeric_stats(df): return df.describe().round(2).T.drop("count", axis=1) -def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50) -> pd.DataFrame: - """Dropping NaNs in rows and columns. Obviously lots of ways to do this, so picked some reasonable defaults, - we can certainly change this later with a more formal set of operations and arguments +def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50, subset: list = None) -> pd.DataFrame: + """Dropping NaNs in rows and columns. Optionally, focus on specific columns. + Args: + input_df (pd.DataFrame): Input data frame. + how (str): 'all' to drop rows where all values are NaN, 'any' to drop rows where any value is NaN. + nan_drop_percent (float): Percentage threshold to drop columns with missing values exceeding this rate. + subset (list): Specific subset of columns to check for NaNs when dropping rows. """ # Grab input number of rows @@ -111,12 +115,17 @@ def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float if percent > nan_warn_percent: log.important(f"Column ({name}) has {percent}% NaN Values") if percent > nan_drop_percent: - log.warning(f"Dropping Column ({name}) with {percent}% NaN Values!") + log.warning(f"Column ({name}) with {percent}% NaN Values!") + + # Conditionally drop rows based on NaNs in specified columns + if subset is not None: + output_df.dropna(axis=0, how=how, subset=subset, inplace=True) + else: + output_df.dropna(axis=0, how=how, inplace=True) - # Drop Rows that have NaNs in them - output_df.dropna(axis=0, how=how, inplace=True) if len(output_df) != orig_num_rows: - log.important(f"Dropping {orig_num_rows - len(output_df)} rows that have a NaN in them") + dropped_rows = orig_num_rows - len(output_df) + log.important(f"Dropping {dropped_rows} rows that have a NaN in them") output_df.reset_index(drop=True, inplace=True) return output_df