diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py
index eead602c..4ba7c5a7 100644
--- a/genie/consortium_to_public.py
+++ b/genie/consortium_to_public.py
@@ -2,10 +2,10 @@
 
 import logging
 import os
+import sys
 
-import synapseutils
 import pandas as pd
-
+import synapseutils
 from genie import (
     create_case_lists,
     database_to_staging,
@@ -15,6 +15,9 @@
 )
 
 logger = logging.getLogger(__name__)
+stdout_handler = logging.StreamHandler(stream=sys.stdout)
+stdout_handler.setLevel(logging.INFO)
+logger.addHandler(stdout_handler)
 
 
 # TODO: Add to transform.py
@@ -119,8 +122,6 @@ def consortiumToPublic(
         clinicalDf, processingDate, publicReleaseCutOff
     )
     logger.info("SAMPLE CLASS FILTER")
-    remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
-    removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
     # comment back in when public release filter back on
     # publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
     # Make sure all null oncotree codes are removed
@@ -147,7 +148,18 @@ def consortiumToPublic(
         query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
     )
 
+    # check if SAMPLE_CLASS is present
+    if not process_functions.check_values_in_column(
+        publicRelease, "fieldName", "SAMPLE_CLASS"
+    ):
+        logger.error("Must have SAMPLE_CLASS column in the public release scope.")
     allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
+    # check if cfDNA samples are present
+    if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
+        logger.error(
+            "cfDNA samples should not be filtered out in the clinical dataframe."
+        )
+
     allClin.to_csv(clinical_path, sep="\t", index=False)
 
     gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]
diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 78acd297..d8edd578 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -501,24 +501,6 @@ def seq_date_filter(clinicalDf, processingDate, consortiumReleaseCutOff):
     return removeSeqDateSamples
 
 
-def sample_class_filter(clinical_df: pd.DataFrame) -> list:
-    """Filter samples by SAMPLE_CLASS
-
-    Args:
-        clinical_df (pd.DataFrame): Clinical dataframe
-
-    Returns:
-        list: List of samples to filter out
-    """
-    if clinical_df.get("SAMPLE_CLASS") is not None:
-        remove_samples = clinical_df["SAMPLE_ID"][
-            clinical_df["SAMPLE_CLASS"] == "cfDNA"
-        ].tolist()
-    else:
-        remove_samples = []
-    return remove_samples
-
-
 # TODO: Add to transform.py
 def mutation_in_cis_filter(
     syn,
diff --git a/genie/process_functions.py b/genie/process_functions.py
index adea2f4f..61327a9a 100644
--- a/genie/process_functions.py
+++ b/genie/process_functions.py
@@ -980,3 +980,27 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
         elif data_type == "boolean":
             dataset[column] = dataset[column].astype(pd.BooleanDtype())
     return dataset[list(schema.keys())]
+
+
+def check_values_in_column(
+    df: pd.DataFrame, col: str, values: Union[str, list]
+) -> bool:
+    """Check if a column in a dataframe contains any of the given values
+
+    Args:
+        df (pd.DataFrame): The dataframe to inspect
+        col (str): The column name
+        values (Union[str, list]): Value or list of values to look for
+
+    Returns:
+        bool: True if at least one of the values occurs in the column.
+            False if none occurs or if the column is missing, in which
+            case an error is also logged.
+    """
+    if not checkColExist(df, col):
+        logger.error(f"Must have {col} column in the dataframe.")
+        return False
+    # Series.isin only accepts list-like input, so wrap a bare string
+    if isinstance(values, str):
+        values = [values]
+    return bool(df[col].isin(values).any())
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
index 72f72663..5463e8be 100644
--- a/tests/test_process_functions.py
+++ b/tests/test_process_functions.py
@@ -1,7 +1,10 @@
-from unittest.mock import Mock, patch
 import uuid
+from unittest.mock import Mock, patch
 
 import pandas as pd
+import pytest
+import synapseclient
+from genie import process_functions
 from pandas.api.types import (
     is_bool_dtype,
     is_float_dtype,
@@ -9,10 +12,6 @@
     is_string_dtype,
 )
 from pandas.testing import assert_frame_equal
-import pytest
-import synapseclient
-
-from genie import process_functions
 
 DATABASE_DF = pd.DataFrame(
     {
@@ -715,3 +714,82 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
     assert result.isna().sum().sum() == 11
 
     assert_frame_equal(result, expected_output, check_exact=True)
+
+
+@pytest.mark.parametrize(
+    "input_df,col,values",
+    [(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
+    ids=["missing_the_column"],
+)
+def test_check_values_in_column_no_column(input_df, col, values):
+    with patch.object(process_functions, "logger") as mock_logger:
+        results = process_functions.check_values_in_column(input_df, col, values)
+        mock_logger.error.assert_called_once_with(
+            "Must have test_col column in the dataframe."
+        )
+    assert results is False
+
+
+@pytest.mark.parametrize(
+    "input_df,col,values,expected_results",
+    [
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            "cfDNA",
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["test_value", "cfDNA"],
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            "cfDNA",
+            True,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor"],
+            True,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor", "test_value"],
+            True,
+        ),
+        (
+            pd.DataFrame({"SAMPLE_ID": [], "SAMPLE_CLASS": []}),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor", "test_value"],
+            False,
+        ),
+    ],
+    ids=[
+        "no_expected_single_value",
+        "no_expected_value_list",
+        "have_expected_single_value",
+        "have_expected_value_list",
+        "have_partial_expected_value_list",
+        "empty_dataframe_with_required_column",
+    ],
+)
+def test_check_values_in_column_has_column(input_df, col, values, expected_results):
+    results = process_functions.check_values_in_column(input_df, col, values)
+
+    assert results == expected_results