Skip to content

Commit

Permalink
Merge branch 'develop' into GEN-1299-align-col-names
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasyu888 authored Aug 15, 2024
2 parents c5be347 + 238970f commit 0a7226f
Show file tree
Hide file tree
Showing 11 changed files with 195 additions and 66 deletions.
1 change: 0 additions & 1 deletion R/install_packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ renv::restore()
library(synapser)
library(dplyr)
library(argparse)
library(UpSetR)
library(rmarkdown)
library(testthat)
library(VariantAnnotation)
Expand Down
2 changes: 1 addition & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

# create version in __init__.py
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
__version__ = "16.3.0"
__version__ = "16.4.0"

__all__ = ["__version__"]
6 changes: 0 additions & 6 deletions genie/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,6 @@ def get_genie_config(
center_mapping_df.index = center_mapping_df.center
# Add center configurations including input/staging synapse ids
genie_config["center_config"] = center_mapping_df.to_dict("index")

genie_config["ethnicity_mapping"] = "syn7434242"
genie_config["race_mapping"] = "syn7434236"
genie_config["sex_mapping"] = "syn7434222"
genie_config["sampletype_mapping"] = "syn7434273"

return genie_config


Expand Down
1 change: 1 addition & 0 deletions genie/process_mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@
"Exon_Number",
"genomic_location_explanation",
"Annotation_Status",
"Variant_Classification",
]


Expand Down
29 changes: 25 additions & 4 deletions genie_registry/cna.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
from typing import Union

import pandas as pd
import synapseclient
Expand All @@ -10,17 +11,37 @@
logger = logging.getLogger(__name__)


def validateSymbol(gene, bedDf, returnMappedDf=True):
def validateSymbol(
gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
) -> Union[str, float, bool]:
"""
Validate gene symbol
Validates the gene symbol against the gene symbol in the bed database.
Note that gene symbols in the bed database have gone through processing and
have been remapped to allowed actual genes if needed.
At least one of the following two conditions must be met for the gene to be VALID:
1. The gene exists in the bed database table's Hugo_Symbol column
2. The gene exists in the bed database table's ID column. Under this condition,
the gene in the cna file will be REMAPPED temporarily to the bed database
table's Hugo_Symbol value for the purpose of validation. The ID column is the
original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
database table. See the bed fileformat module's remap_symbols function and
how it gets used in processing for more info on this.
The validation throws a WARNING if the gene doesn't satisfy
either of the above two conditions
Args:
gene: Gene name
bedDf: Bed pandas dataframe
bedDf: The bed database table as a pandas dataframe
returnMappedDf: Return a mapped gene. Defaults to True
Returns:
gene name or boolean for whether a gene is valid
Union[str, float, bool]:
Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True
Returns boolean for whether a gene is valid if returnMappedDf is False
"""
valid = False
if sum(bedDf["Hugo_Symbol"] == gene) > 0:
Expand Down
39 changes: 37 additions & 2 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,43 @@


def _check_allele_col_validity(df):
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""There are two linked validation rules in this function:
1) If maf file has ALL three of the following columns:
- TUMOR_SEQ_ALLELE1 (TSA1)
- TUMOR_SEQ_ALLELE2 (TSA2)
- REFERENCE_ALLELE (REF)
THEN
ALL rows of TSA1 must equal REF
OR
ALL rows of TSA1 must equal TSA2
TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
regarding which variant (TSA1 vs TSA2) to use. This is
why there cannot be mixed rows where some rows have TSA1 == REF and some rows
have TSA1 == TSA2.
e.g:
VALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | C | A
| T | T | C
VALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | A | A
| T | C | C
INVALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | C | A
| C | A | A
See https://github.com/genome-nexus/annotation-tools/issues/26 for
more background regarding why this validation rule was implemented.
2) There can't be ANY rows where REF == TSA2. This is a missense mutation
flagged as invalid by GN
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand Down
41 changes: 1 addition & 40 deletions templates/dashboardTemplate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ title: '`r release`'
suppressMessages(library(synapser))
foo = capture.output(synLogin())
suppressMessages(library(ggplot2))
suppressMessages(library(UpSetR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(jsonlite))
suppressMessages(library(knitr))
Expand Down Expand Up @@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) {
}
}
makePanelList = function(assay, bed) {
return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay])))
}
plotPanelOverlap <- function(bed, assays) {
listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) })
names(listInput) = assays
upset(fromList(listInput),
order.by = "freq",
nsets = length(assays),
nintersects = 30)
}
plotCenterXRace <- function(genieClinData) {
t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE))
t = data.frame(n = rowSums(t),t)
Expand Down Expand Up @@ -260,11 +246,7 @@ if (is.null(this_bed)) {
this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID))
this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles)
assay_infodf = getFileDf("assay_information.txt", releaseFiles)
black_list_variants <- synTableQuery(
"select * from syn18459663 where filter_variant is true",
includeRowIdAndRowVersion = F
)
black_list_variantsdf = black_list_variants$asDataFrame()
# this_cna <- getFileDf("data_CNA.txt", releaseFiles)
#this_fus <- getFileDf("data_fusions.txt", releaseFiles)
Expand Down Expand Up @@ -438,27 +420,6 @@ Files &rarr; Centers &rarr; [Center name] &rarr; Errors &rarr; failed_annotation

View the version comment column in Synapse for your report to find the version associated with this release.

---

## Flagged Mutations
This is a count of how many flagged mutations a center has. Most of these variants are potential
artifacts flagged by manual review of cBioPortal. Please inform Sage Bionetworks about:

* Suggestions for variants that should be part of this list
* Any variants that shouldn't be part of this list

```{r blacklist}
blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol,
black_list_variantsdf$HGVSp_Short)
subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ]
subset_mut$blacklist = paste(subset_mut$Hugo_Symbol,
subset_mut$HGVSp_Short)
subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,]
kable(table(subset_mut$blacklist, subset_mut$Center),
caption = "Blacklist variant count")
```


---

## Distribution of Clinical Attributes
Expand Down
10 changes: 6 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def genie_config():
"releaseFolder": "syn17079016",
"assayinfo": "syn18404286",
"logs": "syn10155804",
"sv": "syn51663925",
"center_config": {
"SAGE": {
"center": "SAGE",
Expand Down Expand Up @@ -73,11 +74,12 @@ def genie_config():
},
},
"genie_annotation_pkg": "/path/to/nexus",
"ethnicity_mapping": "syn7434242",
"race_mapping": "syn7434236",
"sex_mapping": "syn7434222",
"sampletype_mapping": "syn7434273",
"ethnicity_mapping": "syn60548943",
"race_mapping": "syn60548944",
"sex_mapping": "syn60548946",
"sampletype_mapping": "syn60548941",
"clinical_tier_release_scope": "syn8545211",
"clinical_code_to_desc_map": "syn59486337",
}
return config

Expand Down
8 changes: 4 additions & 4 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def table_query_results(*args):
)

table_query_results_map = {
("select * from syn7434222",): createMockTable(sexdf),
("select * from syn7434236",): createMockTable(no_nan),
("select * from syn7434242",): createMockTable(no_nan),
("select * from syn7434273",): createMockTable(no_nan),
("select * from syn60548946",): createMockTable(sexdf),
("select * from syn60548944",): createMockTable(no_nan),
("select * from syn60548943",): createMockTable(no_nan),
("select * from syn60548941",): createMockTable(no_nan),
(
"select fieldName from syn8545211 where patient is True and inClinicalDb is True",
): createMockTable(patientdf),
Expand Down
51 changes: 50 additions & 1 deletion tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Test genie.extract module"""

from unittest.mock import patch
from unittest.mock import patch, MagicMock

import pandas as pd
import pytest
Expand Down Expand Up @@ -72,3 +72,52 @@ def test_none__getoncotreelink(syn, genie_config):
oncolink = extract._get_oncotreelink(syn, genie_config)
patch_synget.assert_called_once_with(genie_config["oncotreeLink"])
assert oncolink == url


def test_that_get_genie_config_has_expected_calls(syn):
    """Test that get_genie_config makes the expected calls and builds the config.

    The Synapse project lookup, the database mapping lookup and the center
    mapping table query are all patched, so this test only checks that:

    - each patched dependency is called exactly once with the expected
      arguments, and
    - the returned config is the database mapping config plus a
      ``center_config`` entry keyed by center name.
    """
    # Project entity whose annotations point at the database mapping table
    mock_project = MagicMock()
    mock_project.annotations = {"dbMapping": ["db_syn_id"]}
    mock_db_mapping_config = {
        "errorTracker": "syn11601244",
        "centerMapping": "syn11601248",
        "processTracker": "syn11604890",
    }
    # Single-row center mapping table as returned by the patched table query
    mock_center_config = pd.DataFrame(
        {
            "center": ["TEST"],
            "inputSynId": ["syn11601340"],
            "stagingSynId": ["syn11601342"],
            "errorsSynId": ["syn53239081"],
            "release": [True],
            "mutationInCisFilter": ["ON"],
        }
    )
    expected_genie_config = {
        "errorTracker": "syn11601244",
        "centerMapping": "syn11601248",
        "processTracker": "syn11604890",
        "center_config": {
            "TEST": {
                "center": "TEST",
                "inputSynId": "syn11601340",
                "stagingSynId": "syn11601342",
                "errorsSynId": "syn53239081",
                "release": True,
                "mutationInCisFilter": "ON",
            },
        },
    }

    with patch.object(syn, "get", return_value=mock_project) as patch_get, patch.object(
        extract, "_get_database_mapping_config", return_value=mock_db_mapping_config
    ) as patch_get_db_mapping, patch.object(
        extract, "get_syntabledf", return_value=mock_center_config
    ) as patch_get_center_mapping:
        result = extract.get_genie_config(syn, project_id="syn7208886")

        patch_get.assert_called_once_with("syn7208886")
        patch_get_db_mapping.assert_called_once_with(syn=syn, synid="db_syn_id")
        # Plain string literal: the query has no placeholders, so no f-prefix
        patch_get_center_mapping.assert_called_once_with(
            syn=syn, query_string="SELECT * FROM syn11601248 where release is true"
        )
        assert result == expected_genie_config
73 changes: 70 additions & 3 deletions tests/test_process_mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,79 @@ def test_determine_dtype(self):
col_types = process_mutation.determine_dtype("test.csv")
assert col_types == self.column_types

def test__convert_to_str_dtype(self):
@pytest.mark.parametrize(
"input_columns_types, known_str_cols, expected_new_column_types",
[
(
{"foo": "int64", "bar": "object"},
["foo"],
{"foo": "object", "bar": "object"},
),
(
{
"Hugo_Symbol": "object",
"Entrez_Gene_Id": "int64",
"Chromosome": "object",
"Start_Position": "int64",
"End_Position": "int64",
"Reference_Allele": "object",
"Variant_Classification": "float64",
"Annotation_Status": "int64",
},
process_mutation.KNOWN_STRING_COLS,
{
"Hugo_Symbol": "object",
"Entrez_Gene_Id": "int64",
"Chromosome": "object",
"Start_Position": "int64",
"End_Position": "int64",
"Reference_Allele": "object",
"Variant_Classification": "object",
"Annotation_Status": "object",
},
),
(
{
"Hugo_Symbol": "object",
"Entrez_Gene_Id": "int64",
"Chromosome": "object",
"Start_Position": "int64",
"End_Position": "int64",
"Reference_Allele": "object",
"Tumor_Seq_Allele1": "object",
"Tumor_Seq_Allele2": "object",
"Tumor_Sample_Barcode": "object",
"Annotation_Status": "object",
},
process_mutation.KNOWN_STRING_COLS,
{
"Hugo_Symbol": "object",
"Entrez_Gene_Id": "int64",
"Chromosome": "object",
"Start_Position": "int64",
"End_Position": "int64",
"Reference_Allele": "object",
"Tumor_Seq_Allele1": "object",
"Tumor_Seq_Allele2": "object",
"Tumor_Sample_Barcode": "object",
"Annotation_Status": "object",
},
),
],
ids=[
"test_int_to_obj",
"test_changes_with_constant",
"test_no_changes_with_constant",
],
)
def test__convert_to_str_dtype(
self, input_columns_types, known_str_cols, expected_new_column_types
):
"""Tests converting dtypes to str dtypes"""
new_column_types = process_mutation._convert_to_str_dtype(
self.column_types, ["foo"]
input_columns_types, known_str_cols
)
assert new_column_types == {"foo": "object", "bar": "object"}
assert new_column_types == expected_new_column_types

def test_move_maf_rename(self):
"""Test moving mafs when maf column headers need to be remapped"""
Expand Down

0 comments on commit 0a7226f

Please sign in to comment.