Skip to content

Commit

Permalink
[GEN-1622] remove sample class filter (#581)
Browse files Browse the repository at this point in the history
* remove sample_class_filter function
* add function to check both SAMPLE_CLASS fieldName and cfDNA sample
* add function to check SAMPLE_CLASS and cfDNA existence
  • Loading branch information
danlu1 authored Dec 3, 2024
1 parent 28c33ee commit 567f3f0
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 27 deletions.
21 changes: 17 additions & 4 deletions genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import logging
import os
import sys

import synapseutils
import pandas as pd

import synapseutils
from genie import (
create_case_lists,
database_to_staging,
Expand All @@ -15,6 +15,9 @@
)

# Handler that echoes this module's INFO-and-above records to standard output.
stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)

# Module-level logger, named after the module, with the stdout handler attached.
logger = logging.getLogger(__name__)
logger.addHandler(stdout_handler)


# TODO: Add to transform.py
Expand Down Expand Up @@ -119,8 +122,6 @@ def consortiumToPublic(
clinicalDf, processingDate, publicReleaseCutOff
)
logger.info("SAMPLE CLASS FILTER")
remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
# comment back in when public release filter back on
# publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
# Make sure all null oncotree codes are removed
Expand All @@ -147,7 +148,19 @@ def consortiumToPublic(
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
)

# check if SAMPLE_CLASS is present
if not process_functions.check_values_in_column(
publicRelease, "fieldName", "SAMPLE_CLASS"
):
logger.error("Must have SAMPLE_CLASS column in the public release scope.")

allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
# check if cfDNA samples are present
if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
logger.error(
"cfDNA samples should not be filtered out in the clinical dataframe."
)

allClin.to_csv(clinical_path, sep="\t", index=False)

gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]
Expand Down
18 changes: 0 additions & 18 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,24 +501,6 @@ def seq_date_filter(clinicalDf, processingDate, consortiumReleaseCutOff):
return removeSeqDateSamples


def sample_class_filter(clinical_df: pd.DataFrame) -> list:
    """Collect the SAMPLE_IDs of samples whose SAMPLE_CLASS is "cfDNA".

    Args:
        clinical_df (pd.DataFrame): Clinical dataframe

    Returns:
        list: SAMPLE_ID values to filter out; empty when the dataframe has
            no SAMPLE_CLASS column
    """
    # Without a SAMPLE_CLASS column there is nothing to filter on.
    if clinical_df.get("SAMPLE_CLASS") is None:
        return []

    sample_ids = clinical_df["SAMPLE_ID"]
    is_cfdna = clinical_df["SAMPLE_CLASS"] == "cfDNA"
    return sample_ids[is_cfdna].tolist()


# TODO: Add to transform.py
def mutation_in_cis_filter(
syn,
Expand Down
21 changes: 21 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,3 +980,24 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
elif data_type == "boolean":
dataset[column] = dataset[column].astype(pd.BooleanDtype())
return dataset[list(schema.keys())]


def check_values_in_column(
    df: pd.DataFrame, col: str, values: Union[str, list]
) -> bool:
    """Check whether a dataframe column contains at least one of the given values.

    Args:
        df (pd.DataFrame): The dataframe to inspect
        col (str): The column name
        values (Union[str, list]): A single expected value, or a list of
            expected values

    Returns:
        bool: True if any row of ``col`` equals one of ``values``. False if
            no row matches, or if ``col`` is missing from ``df`` (an error
            is logged in that case).
    """
    if not checkColExist(df, col):
        logger.error(f"Must have {col} column in the dataframe.")
        # Return an explicit bool so the missing-column path honours the
        # declared return type instead of falling through without a value.
        return False

    # Series.isin only accepts list-like input, so wrap a lone string.
    if isinstance(values, str):
        values = [values]

    # .any() yields a numpy bool; convert to the built-in bool that the
    # signature promises.
    return bool(df[col].isin(values).any())
87 changes: 82 additions & 5 deletions tests/test_process_functions.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
from unittest.mock import Mock, patch
import uuid
from unittest.mock import Mock, patch

import pandas as pd
import pytest
import synapseclient
from genie import process_functions
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.testing import assert_frame_equal
import pytest
import synapseclient

from genie import process_functions

DATABASE_DF = pd.DataFrame(
{
Expand Down Expand Up @@ -715,3 +714,81 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
assert result.isna().sum().sum() == 11

assert_frame_equal(result, expected_output, check_exact=True)


@pytest.mark.parametrize(
    "input_df,col,values",
    [(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
    ids=["missing_the_column"],
)
def test_check_values_in_column_no_column(input_df, col, values):
    """A missing column is reported via the module logger and gives a falsy result."""
    with patch.object(process_functions, "logger") as mock_logger:
        results = process_functions.check_values_in_column(input_df, col, values)

    mock_logger.error.assert_called_once_with(
        "Must have test_col column in the dataframe."
    )
    # The result is consumed as `if not check_values_in_column(...)`, so the
    # missing-column path must evaluate as falsy.
    assert not results


@pytest.mark.parametrize(
    "input_df,col,values,expected_results",
    [
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            "cfDNA",
            False,
            id="no_expected_single_value",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["test_value", "cfDNA"],
            False,
            id="no_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            "cfDNA",
            True,
            id="have_expected_single_value",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor"],
            True,
            id="have_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor", "test_value"],
            True,
            id="have_partial_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame({"SAMPLE_ID": [], "SAMPLE_CLASS": []}),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor", "test_value"],
            False,
            id="empty_dataframe_with_required_column",
        ),
    ],
)
def test_check_values_in_column_has_column(input_df, col, values, expected_results):
    """With the column present, the result says whether any expected value occurs."""
    actual = process_functions.check_values_in_column(input_df, col, values)

    assert actual == expected_results

0 comments on commit 567f3f0

Please sign in to comment.