add function to check SAMPLE_CLASS and cfDNA existence

Sage-Bionetworks · Nov 25, 2024 · 9a3ba06 · 9a3ba06
1 parent 28c33ee
commit 9a3ba06
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 9 deletions.
diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py
@@ -3,9 +3,8 @@
 import logging
 import os
 
-import synapseutils
 import pandas as pd
-
+import synapseutils
 from genie import (
     create_case_lists,
     database_to_staging,
@@ -119,8 +118,6 @@ def consortiumToPublic(
         clinicalDf, processingDate, publicReleaseCutOff
     )
     logger.info("SAMPLE CLASS FILTER")
-    remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
-    removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
     # comment back in when public release filter back on
     # publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
     # Make sure all null oncotree codes are removed
@@ -147,7 +144,15 @@ def consortiumToPublic(
         query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
     )
 
+    # check if SAMPLE_CLASS is present
+    if not process_functions.checkColExist(publicRelease, "SAMPLE_CLASS"):
+        logger.error("Must have SAMPLE_CLASS column in the public release scope.")
+
     allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
+    # check if cfDNA samples are present
+    if not process_functions.has_cfDNA_samples(allClin):
+        logger.error("cfDNA samples should not be filtered out.")
+
     allClin.to_csv(clinical_path, sep="\t", index=False)
 
     gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]

diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -980,3 +980,17 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
         elif data_type == "boolean":
             dataset[column] = dataset[column].astype(pd.BooleanDtype())
     return dataset[list(schema.keys())]
+
+
+def has_cfDNA_samples(df: pd.DataFrame) -> bool:
+    """Check if any cfDNA sample exists in the clinical dataframe.
+    Args:
+        df (pd.DataFrame): The clinical dataframe
+    Returns:
+        bool: True if SAMPLE_CLASS contains "cfDNA". False otherwise, or
+            when the SAMPLE_CLASS column is missing (an error is logged).
+    """
+    if not checkColExist(df, "SAMPLE_CLASS"):
+        logger.error("Must have SAMPLE_CLASS column in the dataframe.")
+        return False
+    return bool(df["SAMPLE_CLASS"].isin(["cfDNA"]).any())
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
@@ -1,18 +1,17 @@
-from unittest.mock import Mock, patch
 import uuid
+from unittest.mock import Mock, patch
 
 import pandas as pd
+import pytest
+import synapseclient
+from genie import process_functions
 from pandas.api.types import (
     is_bool_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_string_dtype,
 )
 from pandas.testing import assert_frame_equal
-import pytest
-import synapseclient
-
-from genie import process_functions
 
 DATABASE_DF = pd.DataFrame(
     {
@@ -715,3 +714,44 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
     assert result.isna().sum().sum() == 11
 
     assert_frame_equal(result, expected_output, check_exact=True)
+
+
+@pytest.mark.parametrize(
+    "input_df",
+    [
+        pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}),
+    ],
+    ids=["missing_SAMPLE_CLASS_column"],
+)
+def test_has_cfDNA_samples_no_SAMPLE_CLASS_column(input_df):
+    """A missing SAMPLE_CLASS column logs an error and gives a falsy result."""
+    with patch.object(process_functions, "logger") as mock_logger:
+        results = process_functions.has_cfDNA_samples(input_df)
+    mock_logger.error.assert_called_once_with(
+        "Must have SAMPLE_CLASS column in the dataframe."
+    )
+    # Callers only rely on truthiness (`if not has_cfDNA_samples(...)`).
+    assert not results
+
+
+@pytest.mark.parametrize(
+    "input_df, expected_results",
+    [
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
+            ),
+            True,
+        ),
+    ],
+    ids=["no_cfDNA_samples", "have_cfDNA_samples"],
+)
+def test_has_cfDNA_samples_has_SAMPLE_CLASS_column(input_df, expected_results):
+    """With SAMPLE_CLASS present, the result tells whether "cfDNA" occurs."""
+    results = process_functions.has_cfDNA_samples(input_df)
+    assert results == expected_results