Skip to content

Commit

Permalink
[GEN-1313] Export detailed columns for NAACCR codes (#567)
Browse files Browse the repository at this point in the history
* export detailed columns for NAACCR codes

* remove GENIE data analysis group since it no longer exists

* replace hard-coded Synapse table IDs in the consortium and public release code with lookups in the database-to-Synapse-ID mapping
  • Loading branch information
danlu1 authored May 28, 2024
1 parent d3e0730 commit 2f3f2f2
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 17 deletions.
15 changes: 11 additions & 4 deletions genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ def consortiumToPublic(
)

# Clinical release scope filter
# If consortium -> Don't release to public
# TODO: check why this synapse id is hard coded?
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
publicRelease = extract.get_syntabledf(
syn=syn, query_string="SELECT * FROM syn8545211 where releaseScope = 'public'"
syn=syn,
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
)

allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
Expand Down Expand Up @@ -186,7 +188,12 @@ def consortiumToPublic(
)

# Grab mapping table to fill in clinical headers
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
genePanelEntities = []
for entName, entId in consortiumRelease[2]:
is_deprecated_file = entName in ["data_fusions.txt"]
Expand Down
17 changes: 14 additions & 3 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,7 @@ def store_clinical_files(
release_synid,
current_release_staging,
center_mappingdf,
databaseSynIdMappingDf,
used=None,
):
"""
Expand All @@ -1030,6 +1031,7 @@ def store_clinical_files(
release_synid: Synapse id to store release file
current_release_staging: Staging flag
center_mappingdf: Center mapping dataframe
databaseSynIdMappingDf: Database to Synapse Id mapping
Returns:
pandas.DataFrame: configured clinical dataframe
Expand Down Expand Up @@ -1154,7 +1156,12 @@ def store_clinical_files(
keep_merged_consortium_samples = clinicaldf.SAMPLE_ID
# This mapping table is the GENIE clinical code to description
# mapping to generate the headers of the clinical file
mapping = extract.get_syntabledf(syn=syn, query_string="SELECT * FROM syn9621600")
clinical_code_to_desc_map_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_code_to_desc_map"
][0]
mapping = extract.get_syntabledf(
syn=syn, query_string=f"SELECT * FROM {clinical_code_to_desc_map_synid}"
)
clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt")
clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt")
clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt")
Expand Down Expand Up @@ -1564,6 +1571,9 @@ def stagingToCbio(
sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][
0
]
clinical_tier_release_scope_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "clinical_tier_release_scope"
][0]
# Grab assay information
assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo"
assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0]
Expand Down Expand Up @@ -1592,7 +1602,8 @@ def stagingToCbio(
# Clinical release scope filter
# If private -> Don't release to public
clinicalReleaseScopeDf = extract.get_syntabledf(
syn, "SELECT * FROM syn8545211 where releaseScope <> 'private'"
syn,
f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope <> 'private'",
)

patientCols = clinicalReleaseScopeDf["fieldName"][
Expand Down Expand Up @@ -1655,6 +1666,7 @@ def stagingToCbio(
consortiumReleaseSynId,
current_release_staging,
CENTER_MAPPING_DF,
databaseSynIdMappingDf,
used=[sample_used, patient_used],
)

Expand Down Expand Up @@ -1884,7 +1896,6 @@ def create_link_version(
]
if clinical_ent:
# Set private permission for the data_clinical.txt link
syn.setPermissions(clinical_ent[0], principalId=3346558, accessType=[])
syn.setPermissions(clinical_ent[0], principalId=3326313, accessType=[])

for ents in case_list_entities:
Expand Down
23 changes: 18 additions & 5 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,16 @@ def remap_clinical_values(
sampletype_mapping.index = sampletype_mapping["CODE"]
sampletype_dict = sampletype_mapping.to_dict()

if clinicaldf.get("SAMPLE_TYPE") is not None:
clinicaldf["SAMPLE_TYPE_DETAILED"] = clinicaldf["SAMPLE_TYPE"]
for column in [
"PRIMARY_RACE",
"SECONDARY_RACE",
"TERTIARY_RACE",
"SEX",
"ETHNICITY",
"SAMPLE_TYPE",
]:
if column in clinicaldf.columns:
clinicaldf[f"{column}_DETAILED"] = clinicaldf[column]

# Use pandas mapping feature
clinicaldf = clinicaldf.replace(
Expand All @@ -316,9 +324,14 @@ def remap_clinical_values(
"SECONDARY_RACE": race_dict["CBIO_LABEL"],
"TERTIARY_RACE": race_dict["CBIO_LABEL"],
"SAMPLE_TYPE": sampletype_dict["CBIO_LABEL"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX": sex_dict["CBIO_LABEL"],
"ETHNICITY": ethnicity_dict["CBIO_LABEL"],
"PRIMARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SECONDARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"TERTIARY_RACE_DETAILED": race_dict["DESCRIPTION"],
"SAMPLE_TYPE_DETAILED": sampletype_dict["DESCRIPTION"],
"SEX_DETAILED": sex_dict["DESCRIPTION"],
"ETHNICITY_DETAILED": ethnicity_dict["DESCRIPTION"],
}
)

Expand Down Expand Up @@ -481,12 +494,12 @@ def preprocess(self, newpath):
# The clinical tier release scope table id is read from the GENIE config
# key 'clinical_tier_release_scope' (it was previously hard-coded as syn8545211)
patient_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"patient is True and inClinicalDb is True"
)
patient_cols = patient_cols_table.asDataFrame()["fieldName"].tolist()
sample_cols_table = self.syn.tableQuery(
"select fieldName from syn8545211 where "
f"select fieldName from {self.genie_config['clinical_tier_release_scope']} where "
"sample is True and inClinicalDb is True"
)
sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def genie_config():
"race_mapping": "syn7434236",
"sex_mapping": "syn7434222",
"sampletype_mapping": "syn7434273",
"clinical_tier_release_scope": "syn8545211",
}
return config

Expand Down
100 changes: 95 additions & 5 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import synapseclient
from genie import process_functions, validate
from genie_registry.clinical import Clinical
import pdb


def createMockTable(dataframe):
Expand Down Expand Up @@ -37,11 +38,32 @@ def table_query_results(*args):
)
)

patientdf = pd.DataFrame(
dict(
fieldName=["PATIENT_ID", "SEX", "PRIMARY_RACE"],
patient=[True, True, True],
sample=[True, False, False],
)
)
sampledf = pd.DataFrame(
dict(
fieldName=["PATIENT_ID", "SAMPLE_ID"],
patient=[True, False],
sample=[True, True],
)
)

table_query_results_map = {
("select * from syn7434222",): createMockTable(sexdf),
("select * from syn7434236",): createMockTable(no_nan),
("select * from syn7434242",): createMockTable(no_nan),
("select * from syn7434273",): createMockTable(no_nan),
(
"select fieldName from syn8545211 where patient is True and inClinicalDb is True",
): createMockTable(patientdf),
(
"select fieldName from syn8545211 where sample is True and inClinicalDb is True",
): createMockTable(sampledf),
}

json_oncotreeurl = (
Expand Down Expand Up @@ -952,16 +974,61 @@ def test_remap_clinical_values_sampletype():


@pytest.mark.parametrize(
"col", ["SEX", "PRIMARY_RACE", "SECONDARY_RACE", "TERTIARY_RACE", "ETHNICITY"]
("testdf", "expecteddf"),
[
(
pd.DataFrame(
{
"SEX": [1, 2, 99],
"PRIMARY_RACE": [1, 2, 99],
"SECONDARY_RACE": [1, 2, 99],
"TERTIARY_RACE": [1, 2, 99],
"ETHNICITY": [1, 2, 99],
}
),
pd.DataFrame(
{
"SEX": ["Male", "Female", "Unknown"],
"PRIMARY_RACE": ["Male", "Female", "Unknown"],
"SECONDARY_RACE": ["Male", "Female", "Unknown"],
"TERTIARY_RACE": ["Male", "Female", "Unknown"],
"ETHNICITY": ["Male", "Female", "Unknown"],
"ETHNICITY_DETAILED": ["Male", "Female", "Not coded"],
"PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SECONDARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SEX_DETAILED": ["Male", "Female", "Not coded"],
"TERTIARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
}
),
),
(
pd.DataFrame({"SEX": [1, 2, 99], "PRIMARY_RACE": [1, 2, 99]}),
pd.DataFrame(
{
"SEX": ["Male", "Female", "Unknown"],
"PRIMARY_RACE": ["Male", "Female", "Unknown"],
"PRIMARY_RACE_DETAILED": ["Male", "Female", "Not coded"],
"SEX_DETAILED": ["Male", "Female", "Not coded"],
}
),
),
(
pd.DataFrame({"CENTER": [1, 2, 99]}),
pd.DataFrame(
{
"CENTER": [1, 2, 99],
}
),
),
],
ids=["all_detailed_columns", "some_detailed_columns", "no_detailed_columns"],
)
def test_remap_clinical_values(col):
def test_remap_clinical_values(testdf, expecteddf):
"""Test Remapping clinical values"""
testdf = pd.DataFrame({col: [1, 2, 99]})
expecteddf = pd.DataFrame({col: ["Male", "Female", "Unknown"]})
remappeddf = genie_registry.clinical.remap_clinical_values(
testdf, sexdf, sexdf, sexdf, sexdf
)
assert expecteddf.equals(remappeddf)
assert expecteddf.sort_index(axis=1).equals(remappeddf.sort_index(axis=1))


def test__check_int_year_consistency_valid():
Expand Down Expand Up @@ -1555,3 +1622,26 @@ def test_that__cross_validate_assay_info_has_seq_returns_expected_msg_if_valid(
)
assert warnings == expected_warning
assert errors == expected_error


def test_preprocess(clin_class, newpath=None):
    """Verify the dictionary returned by ``clin_class.preprocess``.

    The returned mapping must contain:

    * ``clinicalTemplate``: a row-less frame whose columns are PATIENT_ID,
      SEX, PRIMARY_RACE and SAMPLE_ID (column order is ignored),
    * ``sample`` and ``patient``: values equal to ``True``,
    * ``patientCols`` / ``sampleCols``: the exact expected column lists.

    Args:
        clin_class: Object exposing ``preprocess``; presumably the Clinical
            fixture defined elsewhere in this test module (confirm there).
        newpath: Value forwarded unchanged to ``preprocess``.
    """
    template_columns = ["PATIENT_ID", "SEX", "PRIMARY_RACE", "SAMPLE_ID"]
    wanted_template = pd.DataFrame(columns=template_columns).sort_index(axis=1)
    wanted_values = {
        "sample": True,
        "patient": True,
        "patientCols": ["PATIENT_ID", "SEX", "PRIMARY_RACE"],
        "sampleCols": ["PATIENT_ID", "SAMPLE_ID"],
    }

    outcome = clin_class.preprocess(newpath)

    # Sort the columns on both sides so the comparison ignores column order.
    actual_template = outcome["clinicalTemplate"].sort_index(axis=1)
    assert actual_template.equals(wanted_template)

    # Dicts preserve insertion order, so the checks run in the same sequence
    # as before: sample, patient, patientCols, sampleCols.
    for key, wanted in wanted_values.items():
        assert outcome[key] == wanted

0 comments on commit 2f3f2f2

Please sign in to comment.