[GEN-1622] remove sample class filter (#581)

* remove sample_class_filter function
* add function to check both SAMPLE_CLASS fieldName and cfDNA sample
* add function to check SAMPLE_CLASS and cfDNA existence
Sage-Bionetworks · Dec 3, 2024 · 567f3f0 · 567f3f0
1 parent 28c33ee
commit 567f3f0
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 27 deletions.
diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py
@@ -2,10 +2,10 @@
 
 import logging
 import os
+import sys
 
-import synapseutils
 import pandas as pd
-
+import synapseutils
 from genie import (
     create_case_lists,
     database_to_staging,
@@ -15,6 +15,9 @@
 )
 
 logger = logging.getLogger(__name__)
+stdout_handler = logging.StreamHandler(stream=sys.stdout)
+stdout_handler.setLevel(logging.INFO)
+logger.addHandler(stdout_handler)
 
 
 # TODO: Add to transform.py
@@ -119,8 +122,6 @@ def consortiumToPublic(
         clinicalDf, processingDate, publicReleaseCutOff
     )
     logger.info("SAMPLE CLASS FILTER")
-    remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
-    removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
     # comment back in when public release filter back on
     # publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
     # Make sure all null oncotree codes are removed
@@ -147,7 +148,19 @@ def consortiumToPublic(
         query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
     )
 
+    # check if SAMPLE_CLASS is present
+    if not process_functions.check_values_in_column(
+        publicRelease, "fieldName", "SAMPLE_CLASS"
+    ):
+        logger.error("Must have SAMPLE_CLASS column in the public release scope.")
+
     allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
+    # check if cfDNA samples are present
+    if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
+        logger.error(
+            "cfDNA samples should not be filtered out in the clinical dataframe."
+        )
+
     allClin.to_csv(clinical_path, sep="\t", index=False)
 
     gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -501,24 +501,6 @@ def seq_date_filter(clinicalDf, processingDate, consortiumReleaseCutOff):
     return removeSeqDateSamples
 
 
-def sample_class_filter(clinical_df: pd.DataFrame) -> list:
-    """Filter samples by SAMPLE_CLASS
-
-    Args:
-        clinical_df (pd.DataFrame): Clinical dataframe
-
-    Returns:
-        list: List of samples to filter out
-    """
-    if clinical_df.get("SAMPLE_CLASS") is not None:
-        remove_samples = clinical_df["SAMPLE_ID"][
-            clinical_df["SAMPLE_CLASS"] == "cfDNA"
-        ].tolist()
-    else:
-        remove_samples = []
-    return remove_samples
-
-
 # TODO: Add to transform.py
 def mutation_in_cis_filter(
     syn,

diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -980,3 +980,24 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
         elif data_type == "boolean":
             dataset[column] = dataset[column].astype(pd.BooleanDtype())
     return dataset[list(schema.keys())]
+
+
+def check_values_in_column(
+    df: pd.DataFrame, col: str, values: Union[str, list]
+) -> bool:
+    """Check if a column in a dataframe contains any of the specified values
+
+    Args:
+        df (pd.DataFrame): The dataframe to check
+        col (str): The column name
+        values (Union[str, list]): A single value or a list of values to look for
+
+    Returns:
+        bool: True if a value is found; False if not or if the column is missing
+    """
+    if not checkColExist(df, col):
+        logger.error(f"Must have {col} column in the dataframe.")
+        return False
+    # isin() needs a list-like, so wrap a single string value
+    expected = [values] if isinstance(values, str) else values
+    return bool(df[col].isin(expected).any())
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
@@ -1,18 +1,17 @@
-from unittest.mock import Mock, patch
 import uuid
+from unittest.mock import Mock, patch
 
 import pandas as pd
+import pytest
+import synapseclient
+from genie import process_functions
 from pandas.api.types import (
     is_bool_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_string_dtype,
 )
 from pandas.testing import assert_frame_equal
-import pytest
-import synapseclient
-
-from genie import process_functions
 
 DATABASE_DF = pd.DataFrame(
     {
@@ -715,3 +714,81 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
     assert result.isna().sum().sum() == 11
 
     assert_frame_equal(result, expected_output, check_exact=True)
+
+
+@pytest.mark.parametrize(
+    "input_df,col,values",
+    [(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
+    ids=["missing_the_column"],
+)
+def test_check_values_in_column_no_column(input_df, col, values):
+    """A missing column is logged as an error and yields a falsy result."""
+    with patch.object(process_functions, "logger") as mock_logger:
+        assert not process_functions.check_values_in_column(input_df, col, values)
+    expected_msg = "Must have test_col column in the dataframe."
+    mock_logger.error.assert_called_once_with(expected_msg)
+
+
+@pytest.mark.parametrize(
+    "input_df,col,values,expected_results",
+    # Each case: dataframe, column to inspect, value(s) to look for, expected result
+    [
+        pytest.param(
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            "cfDNA",
+            False,
+            id="no_expected_single_value",
+        ),
+        pytest.param(
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["test_value", "cfDNA"],
+            False,
+            id="no_expected_value_list",
+        ),
+        pytest.param(
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            "cfDNA",
+            True,
+            id="have_expected_single_value",
+        ),
+        pytest.param(
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor"],
+            True,
+            id="have_expected_value_list",
+        ),
+        pytest.param(
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor", "test_value"],
+            True,
+            id="have_partial_expected_value_list",
+        ),
+        pytest.param(
+            pd.DataFrame({"SAMPLE_ID": [], "SAMPLE_CLASS": []}),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor", "test_value"],
+            False,
+            id="empty_dataframe_with_required_column",
+        ),
+    ],
+)
+def test_check_values_in_column_has_column(input_df, col, values, expected_results):
+    """With the column present, the result reflects whether any value is found."""
+    found = process_functions.check_values_in_column(input_df, col, values)
+
+    assert found == expected_results