From 9a3ba06d9e00261d91614f6ea3839cd5359f6e18 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 25 Nov 2024 23:05:47 +0000 Subject: [PATCH] add function to check SAMPLE_CLASS and cfDNA existence --- genie/consortium_to_public.py | 13 ++++++--- genie/process_functions.py | 14 +++++++++ tests/test_process_functions.py | 50 +++++++++++++++++++++++++++++---- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index eead602c..4d67869e 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -3,9 +3,8 @@ import logging import os -import synapseutils import pandas as pd - +import synapseutils from genie import ( create_case_lists, database_to_staging, @@ -119,8 +118,6 @@ def consortiumToPublic( clinicalDf, processingDate, publicReleaseCutOff ) logger.info("SAMPLE CLASS FILTER") - remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf) - removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples)) # comment back in when public release filter back on # publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples) # Make sure all null oncotree codes are removed @@ -147,7 +144,15 @@ def consortiumToPublic( query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'", ) + # check if SAMPLE_CLASS is present + if not process_functions.checkColExist(publicRelease, "SAMPLE_CLASS"): + logger.error("Must have SAMPLE_CLASS column in the public release scope.") + allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)] + # check if cfDNA samples are present + if not process_functions.has_cfDNA_samples(allClin): + logger.error("cfDNA samples should not be filtered out.") + allClin.to_csv(clinical_path, sep="\t", index=False) gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)] diff --git a/genie/process_functions.py b/genie/process_functions.py 
def has_cfDNA_samples(df: pd.DataFrame) -> bool:
    """Check whether any cfDNA sample exists in the clinical dataframe.

    Args:
        df (pd.DataFrame): The clinical dataframe

    Returns:
        bool: True if at least one row has SAMPLE_CLASS equal to "cfDNA".
            False if no row does, or if the SAMPLE_CLASS column is missing
            (an error is logged in that case).
    """
    if not checkColExist(df, "SAMPLE_CLASS"):
        logger.error("Must have SAMPLE_CLASS column in the dataframe.")
        # Return explicitly: falling through to a `return result` with
        # `result` never assigned would raise UnboundLocalError here.
        return False
    # Series.any() yields a numpy.bool_; coerce to a plain bool so the
    # return value matches the annotation.
    return bool(df["SAMPLE_CLASS"].isin(["cfDNA"]).any())
@pytest.mark.parametrize(
    "input_df, expected_results",
    [
        (
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            False,
        ),
        (
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
            ),
            True,
        ),
    ],
    ids=["no_cfDNA_sampless", "have_cfDNA_samples"],
)
def test_has_cfDNA_samples_has_SAMPLE_CLASS_column(input_df, expected_results):
    """has_cfDNA_samples reports whether SAMPLE_CLASS contains "cfDNA".

    Both parametrized frames include a SAMPLE_CLASS column; only the second
    one contains a "cfDNA" value.
    """
    # No debugger breakpoint here: pdb.set_trace() would block or error
    # any non-interactive test run before the assertion is reached.
    results = process_functions.has_cfDNA_samples(input_df)
    assert results == expected_results