Merge branch 'develop' into GEN-1299-align-col-names

Sage-Bionetworks · Aug 15, 2024 · 0a7226f · 0a7226f
2 parents c5be347 + 238970f
commit 0a7226f
Show file tree

Hide file tree

Showing 11 changed files with 195 additions and 66 deletions.
diff --git a/R/install_packages.R b/R/install_packages.R
@@ -3,7 +3,6 @@ renv::restore()
 library(synapser)
 library(dplyr)
 library(argparse)
-library(UpSetR)
 library(rmarkdown)
 library(testthat)
 library(VariantAnnotation)

diff --git a/genie/__init__.py b/genie/__init__.py
@@ -7,6 +7,6 @@
 
 # create version in __init__.py
 # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
-__version__ = "16.3.0"
+__version__ = "16.4.0"
 
 __all__ = ["__version__"]
diff --git a/genie/extract.py b/genie/extract.py
@@ -279,12 +279,6 @@ def get_genie_config(
     center_mapping_df.index = center_mapping_df.center
     # Add center configurations including input/staging synapse ids
     genie_config["center_config"] = center_mapping_df.to_dict("index")
-
-    genie_config["ethnicity_mapping"] = "syn7434242"
-    genie_config["race_mapping"] = "syn7434236"
-    genie_config["sex_mapping"] = "syn7434222"
-    genie_config["sampletype_mapping"] = "syn7434273"
-
     return genie_config
 
 

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
@@ -115,6 +115,7 @@
     "Exon_Number",
     "genomic_location_explanation",
     "Annotation_Status",
+    "Variant_Classification",
 ]
 
 

diff --git a/genie_registry/cna.py b/genie_registry/cna.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import Union
 
 import pandas as pd
 import synapseclient
@@ -10,17 +11,37 @@
 logger = logging.getLogger(__name__)
 
 
-def validateSymbol(gene, bedDf, returnMappedDf=True):
+def validateSymbol(
+    gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
+) -> Union[str, float, bool]:
     """
-    Validate gene symbol
+    Validates the gene symbol against the gene symbol in the bed database.
+    Note that gene symbols in the bed database have gone through processing and
+    have been remapped to allowed actual genes if needed.
+
+    The gene is VALID if it meets either of the following two conditions:
+        1. The gene exists in the bed database table's Hugo_Symbol column
+
+        2. The gene exists in the bed database table's ID column. Under this condition,
+        the gene in the cna file will be REMAPPED temporarily to the bed database
+        table's Hugo_Symbol value for the purpose of validation. The ID column is the
+        original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
+        mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
+        database table. See the bed fileformat module's remap_symbols function and
+        how it gets used in processing for more info on this.
+
+    The validation throws a WARNING if the gene doesn't satisfy
+    either of the above two conditions
 
     Args:
         gene: Gene name
-        bedDf: Bed pandas dataframe
+        bedDf: The bed database table as a pandas dataframe
         returnMappedDf: Return a mapped gene. Defaults to True
 
     Returns:
-        gene name or boolean for whether a gene is valid
+        Union[str, float, bool]:
+        If returnMappedDf is True: the gene symbol (str if valid, float("nan") if invalid)
+        If returnMappedDf is False: a boolean for whether the gene is valid
     """
     valid = False
     if sum(bedDf["Hugo_Symbol"] == gene) > 0:

diff --git a/genie_registry/maf.py b/genie_registry/maf.py
@@ -12,8 +12,43 @@
 
 
 def _check_allele_col_validity(df):
-    """If maf file has both TSA1 and TSA2,
-    TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
+    """There are two linked validation rules in this function:
+
+    1) If maf file has ALL three of the following columns:
+        - TUMOR_SEQ_ALLELE1 (TSA1)
+        - TUMOR_SEQ_ALLELE2 (TSA2)
+        - REFERENCE_ALLELE (REF)
+        THEN
+        ALL rows of TSA1 must equal REF
+        OR
+        ALL rows of TSA1 must equal TSA2
+
+        TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
+        regarding which variant (TSA1 vs TSA2) to use. This is
+        why there cannot be mixed rows where some rows have TSA1 == REF and some rows
+        have TSA1 == TSA2.
+
+        e.g:
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | T                | T                 | C
+
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | A                 | A
+        | T                | C                 | C
+
+        INVALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | C                | A                 | A
+
+        See https://github.com/genome-nexus/annotation-tools/issues/26 for
+        more background regarding why this validation rule was implemented.
+
+    2) There can't be ANY rows where REF == TSA2. This is a missense mutation
+    flagged as invalid by GN
     """
     tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
     tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")

diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd
@@ -25,7 +25,6 @@ title: '`r release`'
 suppressMessages(library(synapser))
 foo = capture.output(synLogin())
 suppressMessages(library(ggplot2))
-suppressMessages(library(UpSetR))
 suppressMessages(library(RColorBrewer))
 suppressMessages(library(jsonlite))
 suppressMessages(library(knitr))
@@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) {
   }
 }
 
-makePanelList = function(assay, bed) {
-  return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay])))
-}
-
-plotPanelOverlap <- function(bed, assays) {
-  listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) })
-  names(listInput) = assays
-  upset(fromList(listInput),
-        order.by = "freq",
-        nsets = length(assays),
-        nintersects = 30)
-}
-
 plotCenterXRace <- function(genieClinData) {
   t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE))
   t = data.frame(n = rowSums(t),t)
@@ -260,11 +246,7 @@ if (is.null(this_bed)) {
 this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID))
 this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles)
 assay_infodf = getFileDf("assay_information.txt", releaseFiles)
-black_list_variants <- synTableQuery(
-  "select * from syn18459663 where filter_variant is true",
-  includeRowIdAndRowVersion = F
-)
-black_list_variantsdf = black_list_variants$asDataFrame()
+
 # this_cna <- getFileDf("data_CNA.txt", releaseFiles)
 #this_fus <- getFileDf("data_fusions.txt", releaseFiles)
 
@@ -438,27 +420,6 @@ Files &rarr; Centers &rarr; [Center name] &rarr; Errors &rarr; failed_annotation
 
 View the version comment column in Synapse for your report to find the version associated with this release.
 
----
-
-## Flagged Mutations
-This is a count of how many flagged mutations a center has. Most of these variants are potential
-artifacts flagged by manual review of cBioPortal. Please inform Sage Bionetworks about:
-
-* Suggestions for variants that should be part of this list
-* Any variant shouldn't be part of this list
-
-```{r blacklist}
-blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol,
-                           black_list_variantsdf$HGVSp_Short)
-subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ]
-subset_mut$blacklist = paste(subset_mut$Hugo_Symbol,
-                             subset_mut$HGVSp_Short)
-subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,]
-kable(table(subset_mut$blacklist, subset_mut$Center),
-      caption = "Blacklist variant count")
-```
-
-
 ---
 
 ## Distribution of Clinical Attributes

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -46,6 +46,7 @@ def genie_config():
         "releaseFolder": "syn17079016",
         "assayinfo": "syn18404286",
         "logs": "syn10155804",
+        "sv": "syn51663925",
         "center_config": {
             "SAGE": {
                 "center": "SAGE",
@@ -73,11 +74,12 @@ def genie_config():
             },
         },
         "genie_annotation_pkg": "/path/to/nexus",
-        "ethnicity_mapping": "syn7434242",
-        "race_mapping": "syn7434236",
-        "sex_mapping": "syn7434222",
-        "sampletype_mapping": "syn7434273",
+        "ethnicity_mapping": "syn60548943",
+        "race_mapping": "syn60548944",
+        "sex_mapping": "syn60548946",
+        "sampletype_mapping": "syn60548941",
         "clinical_tier_release_scope": "syn8545211",
+        "clinical_code_to_desc_map": "syn59486337",
     }
     return config
 

diff --git a/tests/test_clinical.py b/tests/test_clinical.py
@@ -54,10 +54,10 @@ def table_query_results(*args):
 )
 
 table_query_results_map = {
-    ("select * from syn7434222",): createMockTable(sexdf),
-    ("select * from syn7434236",): createMockTable(no_nan),
-    ("select * from syn7434242",): createMockTable(no_nan),
-    ("select * from syn7434273",): createMockTable(no_nan),
+    ("select * from syn60548946",): createMockTable(sexdf),
+    ("select * from syn60548944",): createMockTable(no_nan),
+    ("select * from syn60548943",): createMockTable(no_nan),
+    ("select * from syn60548941",): createMockTable(no_nan),
     (
         "select fieldName from syn8545211 where patient is True and inClinicalDb is True",
     ): createMockTable(patientdf),

diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -1,6 +1,6 @@
 """Test genie.extract module"""
 
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 
 import pandas as pd
 import pytest
@@ -72,3 +72,52 @@ def test_none__getoncotreelink(syn, genie_config):
         oncolink = extract._get_oncotreelink(syn, genie_config)
         patch_synget.assert_called_once_with(genie_config["oncotreeLink"])
         assert oncolink == url
+
+
+def test_that_get_genie_config_has_expected_calls(syn):
+    mock_project = MagicMock()
+    mock_project.annotations = {"dbMapping": ["db_syn_id"]}
+    mock_db_mapping_config = {
+        "errorTracker": "syn11601244",
+        "centerMapping": "syn11601248",
+        "processTracker": "syn11604890",
+    }
+    mock_center_config = pd.DataFrame(
+        {
+            "center": ["TEST"],
+            "inputSynId": ["syn11601340"],
+            "stagingSynId": ["syn11601342"],
+            "errorsSynId": ["syn53239081"],
+            "release": [True],
+            "mutationInCisFilter": ["ON"],
+        }
+    )
+    expected_genie_config = {
+        "errorTracker": "syn11601244",
+        "centerMapping": "syn11601248",
+        "processTracker": "syn11604890",
+        "center_config": {
+            "TEST": {
+                "center": "TEST",
+                "inputSynId": "syn11601340",
+                "stagingSynId": "syn11601342",
+                "errorsSynId": "syn53239081",
+                "release": True,
+                "mutationInCisFilter": "ON",
+            },
+        },
+    }
+
+    with patch.object(syn, "get", return_value=mock_project) as patch_get, patch.object(
+        extract, "_get_database_mapping_config", return_value=mock_db_mapping_config
+    ) as patch_get_db_mapping, patch.object(
+        extract, "get_syntabledf", return_value=mock_center_config
+    ) as patch_get_center_mapping:
+        result = extract.get_genie_config(syn, project_id="syn7208886")
+
+        patch_get.assert_called_once_with("syn7208886")
+        patch_get_db_mapping.assert_called_once_with(syn=syn, synid="db_syn_id")
+        patch_get_center_mapping.assert_called_once_with(
+            syn=syn, query_string=f"SELECT * FROM syn11601248 where release is true"
+        )
+        assert result == expected_genie_config
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
@@ -54,12 +54,79 @@ def test_determine_dtype(self):
             col_types = process_mutation.determine_dtype("test.csv")
             assert col_types == self.column_types
 
-    def test__convert_to_str_dtype(self):
+    @pytest.mark.parametrize(
+        "input_columns_types, known_str_cols, expected_new_column_types",
+        [
+            (
+                {"foo": "int64", "bar": "object"},
+                ["foo"],
+                {"foo": "object", "bar": "object"},
+            ),
+            (
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Variant_Classification": "float64",
+                    "Annotation_Status": "int64",
+                },
+                process_mutation.KNOWN_STRING_COLS,
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Variant_Classification": "object",
+                    "Annotation_Status": "object",
+                },
+            ),
+            (
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Tumor_Seq_Allele1": "object",
+                    "Tumor_Seq_Allele2": "object",
+                    "Tumor_Sample_Barcode": "object",
+                    "Annotation_Status": "object",
+                },
+                process_mutation.KNOWN_STRING_COLS,
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Tumor_Seq_Allele1": "object",
+                    "Tumor_Seq_Allele2": "object",
+                    "Tumor_Sample_Barcode": "object",
+                    "Annotation_Status": "object",
+                },
+            ),
+        ],
+        ids=[
+            "test_int_to_obj",
+            "test_changes_with_constant",
+            "test_no_changes_with_constant",
+        ],
+    )
+    def test__convert_to_str_dtype(
+        self, input_columns_types, known_str_cols, expected_new_column_types
+    ):
         """Tests converting dtypes to str dtypes"""
         new_column_types = process_mutation._convert_to_str_dtype(
-            self.column_types, ["foo"]
+            input_columns_types, known_str_cols
         )
-        assert new_column_types == {"foo": "object", "bar": "object"}
+        assert new_column_types == expected_new_column_types
 
     def test_move_maf_rename(self):
         """Test moving mafs when maf column headers need to be remapped"""