Skip to content

Commit

Permalink
add function to check SAMPLE_CLASS and cfDNA existence
Browse files Browse the repository at this point in the history
  • Loading branch information
danlu1 committed Nov 25, 2024
1 parent 28c33ee commit 9a3ba06
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 9 deletions.
13 changes: 9 additions & 4 deletions genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
import logging
import os

import synapseutils
import pandas as pd

import synapseutils
from genie import (
create_case_lists,
database_to_staging,
Expand Down Expand Up @@ -119,8 +118,6 @@ def consortiumToPublic(
clinicalDf, processingDate, publicReleaseCutOff
)
logger.info("SAMPLE CLASS FILTER")
remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
# comment back in when public release filter back on
# publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
# Make sure all null oncotree codes are removed
Expand All @@ -147,7 +144,15 @@ def consortiumToPublic(
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
)

# check if SAMPLE_CLASS is present
if not process_functions.checkColExist(publicRelease, "SAMPLE_CLASS"):
logger.error("Must have SAMPLE_CLASS column in the public release scope.")

allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
# check if cfDNA samples are present
if not process_functions.has_cfDNA_samples(allClin):
logger.error("cfDNA samples should not be filtered out.")

allClin.to_csv(clinical_path, sep="\t", index=False)

gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]
Expand Down
14 changes: 14 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,3 +980,17 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
elif data_type == "boolean":
dataset[column] = dataset[column].astype(pd.BooleanDtype())
return dataset[list(schema.keys())]


def has_cfDNA_samples(df: pd.DataFrame) -> bool:
    """Check whether any cfDNA sample exists in the clinical dataframe.

    Args:
        df (pd.DataFrame): The clinical dataframe

    Returns:
        bool: True if at least one row has SAMPLE_CLASS equal to "cfDNA".
            False if no such row exists, or if the SAMPLE_CLASS column is
            missing (an error is logged in that case).
    """
    if not checkColExist(df, "SAMPLE_CLASS"):
        logger.error("Must have SAMPLE_CLASS column in the dataframe.")
        # Return an explicit bool so callers never receive None (or hit an
        # unbound local) when the column is absent.
        return False
    # Series.any() yields a numpy bool; convert it to honor the annotated
    # return type.
    return bool(df["SAMPLE_CLASS"].isin(["cfDNA"]).any())
50 changes: 45 additions & 5 deletions tests/test_process_functions.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
from unittest.mock import Mock, patch
import uuid
from unittest.mock import Mock, patch

import pandas as pd
import pytest
import synapseclient
from genie import process_functions
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.testing import assert_frame_equal
import pytest
import synapseclient

from genie import process_functions

DATABASE_DF = pd.DataFrame(
{
Expand Down Expand Up @@ -715,3 +714,44 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
assert result.isna().sum().sum() == 11

assert_frame_equal(result, expected_output, check_exact=True)


@pytest.mark.parametrize(
    "input_df",
    [
        pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}),
    ],
    ids=["missing_SAMPLE_CLASS_column"],
)
def test_has_cfDNA_samples_no_SAMPLE_CLASS_column(input_df):
    """An error is logged when the dataframe has no SAMPLE_CLASS column."""
    with patch.object(process_functions, "logger") as mock_logger:
        # Only the logged error is under test here, so the return value is
        # intentionally not captured.
        process_functions.has_cfDNA_samples(input_df)
        mock_logger.error.assert_called_once_with(
            "Must have SAMPLE_CLASS column in the dataframe."
        )


@pytest.mark.parametrize(
    "input_df, expected_results",
    [
        (
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            False,
        ),
        (
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
            ),
            True,
        ),
    ],
    ids=["no_cfDNA_sampless", "have_cfDNA_samples"],
)
def test_has_cfDNA_samples_has_SAMPLE_CLASS_column(input_df, expected_results):
    """has_cfDNA_samples reports whether any SAMPLE_CLASS value is cfDNA."""
    # The leftover pdb.set_trace() breakpoint was removed: it stops the test
    # at a debugger prompt, which breaks non-interactive (CI) runs.
    results = process_functions.has_cfDNA_samples(input_df)
    assert results == expected_results

0 comments on commit 9a3ba06

Please sign in to comment.