Add a `subset` columns argument to `drop_nans`

SuperCowPowers · Apr 21, 2024 · d4c1c7b · d4c1c7b
1 parent 6e65b3a
commit d4c1c7b
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 8 deletions.
diff --git a/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py b/src/sageworks/core/transforms/data_to_features/light/molecular_descriptors.py
@@ -74,9 +74,12 @@ def transform_impl(self, **kwargs):
         # Compute/add all the Molecular Descriptors
         self.output_df = self.compute_molecular_descriptors(self.input_df)
 
+        # Get the columns that are descriptors
+        desc_columns = set(self.output_df.columns) - set(self.input_df.columns)
+
         # Drop any NaNs (and INFs)
         current_rows = self.output_df.shape[0]
-        self.output_df = pandas_utils.drop_nans(self.output_df, how="any")
+        self.output_df = pandas_utils.drop_nans(self.output_df, how="any", subset=desc_columns)
         self.log.warning(f"Dropped {current_rows - self.output_df.shape[0]} NaN rows")
 
     def compute_molecular_descriptors(self, process_df: pd.DataFrame) -> pd.DataFrame:

diff --git a/src/sageworks/utils/pandas_utils.py b/src/sageworks/utils/pandas_utils.py
@@ -89,9 +89,13 @@ def numeric_stats(df):
     return df.describe().round(2).T.drop("count", axis=1)
 
 
-def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50) -> pd.DataFrame:
-    """Dropping NaNs in rows and columns. Obviously lots of ways to do this, so picked some reasonable defaults,
-    we can certainly change this later with a more formal set of operations and arguments
+def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float = 50, subset: list = None) -> pd.DataFrame:
+    """Dropping NaNs in rows and columns. Optionally, focus on specific columns.
+    Args:
+        input_df (pd.DataFrame): Input data frame.
+        how (str): 'all' to drop rows where all values are NaN, 'any' to drop rows where any value is NaN.
+        nan_drop_percent (float): Percentage threshold to drop columns with missing values exceeding this rate.
+        subset (list): Specific subset of columns to check for NaNs when dropping rows.
     """
 
     # Grab input number of rows
@@ -111,12 +115,17 @@ def drop_nans(input_df: pd.DataFrame, how: str = "all", nan_drop_percent: float
         if percent > nan_warn_percent:
             log.important(f"Column ({name}) has {percent}% NaN Values")
         if percent > nan_drop_percent:
-            log.warning(f"Dropping Column ({name}) with {percent}% NaN Values!")
+            log.warning(f"Column ({name}) with {percent}% NaN Values!")
+
+    # Conditionally drop rows based on NaNs in specified columns
+    if subset is not None:
+        output_df.dropna(axis=0, how=how, subset=subset, inplace=True)
+    else:
+        output_df.dropna(axis=0, how=how, inplace=True)
 
-    # Drop Rows that have NaNs in them
-    output_df.dropna(axis=0, how=how, inplace=True)
     if len(output_df) != orig_num_rows:
-        log.important(f"Dropping {orig_num_rows - len(output_df)} rows that have a NaN in them")
+        dropped_rows = orig_num_rows - len(output_df)
+        log.important(f"Dropping {dropped_rows} rows that have a NaN in them")
         output_df.reset_index(drop=True, inplace=True)
 
     return output_df