From c064b406e548c07e5f44df98ccc149be25730816 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Wed, 3 Apr 2024 11:30:02 -0700
Subject: [PATCH 1/7] [GEN-1237] Add documentation to maf allele validation
 (#557)

* add documentation for _check_allele_col_validity in maf validation

* add visual ex

* lint
---
 genie_registry/maf.py | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/genie_registry/maf.py b/genie_registry/maf.py
index 3a97b97f..29cbb26d 100644
--- a/genie_registry/maf.py
+++ b/genie_registry/maf.py
@@ -12,8 +12,43 @@
 
 
 def _check_allele_col_validity(df):
-    """If maf file has both TSA1 and TSA2,
-    TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
+    """There are two linked validation rules in this function:
+
+    1) If maf file has ALL three of the following columns:
+        - TUMOR_SEQ_ALLELE1 (TSA1)
+        - TUMOR_SEQ_ALLELE2 (TSA2)
+        - REFERENCE_ALLELE (REF)
+        THEN
+        ALL rows of TSA1 must equal REF
+        OR
+        ALL rows of TSA1 must equal TSA2
+
+        TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
+        regarding which variant (TSA1 vs TSA2) to use. This is
+        why there cannot be mixed rows where some rows have TSA1 == REF and some rows
+        have TSA1 == TSA2.
+
+        e.g:
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | T                | T                 | C
+
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | A                 | A
+        | T                | C                 | C
+
+        INVALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | C                | A                 | A
+
+        See https://github.com/genome-nexus/annotation-tools/issues/26 for
+        more background regarding why this validation rule was implemented.
+
+    2) There can't be ANY rows where REF == TSA2. This is a missense mutation
+    flagged as invalid by GN
     """
     tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
     tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")

From 9e0ca87af5f8a61cb86069b5ed63174c9631f87c Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:21:41 -0700
Subject: [PATCH 2/7] [GEN-1348] Allow in mapping tables (#572)

* remove hardcoded mapping table synids

* update synids
---
 genie/extract.py       |  6 -----
 tests/conftest.py      | 10 +++++----
 tests/test_clinical.py |  8 +++----
 tests/test_extract.py  | 51 +++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/genie/extract.py b/genie/extract.py
index 7ea84225..8ca5831a 100644
--- a/genie/extract.py
+++ b/genie/extract.py
@@ -279,12 +279,6 @@ def get_genie_config(
     center_mapping_df.index = center_mapping_df.center
     # Add center configurations including input/staging synapse ids
     genie_config["center_config"] = center_mapping_df.to_dict("index")
-
-    genie_config["ethnicity_mapping"] = "syn7434242"
-    genie_config["race_mapping"] = "syn7434236"
-    genie_config["sex_mapping"] = "syn7434222"
-    genie_config["sampletype_mapping"] = "syn7434273"
-
     return genie_config
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 6879b915..59a449e3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,7 @@ def genie_config():
         "releaseFolder": "syn17079016",
         "assayinfo": "syn18404286",
         "logs": "syn10155804",
+        "sv": "syn51663925",
         "center_config": {
             "SAGE": {
                 "center": "SAGE",
@@ -73,11 +74,12 @@ def genie_config():
             },
         },
         "genie_annotation_pkg": "/path/to/nexus",
-        "ethnicity_mapping": "syn7434242",
-        "race_mapping": "syn7434236",
-        "sex_mapping": "syn7434222",
-        "sampletype_mapping": "syn7434273",
+        "ethnicity_mapping": "syn60548943",
+        "race_mapping": "syn60548944",
+        "sex_mapping": "syn60548946",
+        "sampletype_mapping": "syn60548941",
         "clinical_tier_release_scope": "syn8545211",
+        "clinical_code_to_desc_map": "syn59486337",
     }
     return config
 
diff --git a/tests/test_clinical.py b/tests/test_clinical.py
index 75248e41..af71b903 100644
--- a/tests/test_clinical.py
+++ b/tests/test_clinical.py
@@ -54,10 +54,10 @@ def table_query_results(*args):
 )
 
 table_query_results_map = {
-    ("select * from syn7434222",): createMockTable(sexdf),
-    ("select * from syn7434236",): createMockTable(no_nan),
-    ("select * from syn7434242",): createMockTable(no_nan),
-    ("select * from syn7434273",): createMockTable(no_nan),
+    ("select * from syn60548946",): createMockTable(sexdf),
+    ("select * from syn60548944",): createMockTable(no_nan),
+    ("select * from syn60548943",): createMockTable(no_nan),
+    ("select * from syn60548941",): createMockTable(no_nan),
     (
         "select fieldName from syn8545211 where patient is True and inClinicalDb is True",
     ): createMockTable(patientdf),
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 2d8c5ac2..6fc745f8 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -1,6 +1,6 @@
 """Test genie.extract module"""
 
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 
 import pandas as pd
 import pytest
@@ -72,3 +72,52 @@ def test_none__getoncotreelink(syn, genie_config):
         oncolink = extract._get_oncotreelink(syn, genie_config)
         patch_synget.assert_called_once_with(genie_config["oncotreeLink"])
         assert oncolink == url
+
+
+def test_that_get_genie_config_has_expected_calls(syn):
+    """Check get_genie_config's helper calls and the config dict it returns."""
+    mock_project = MagicMock(annotations={"dbMapping": ["db_syn_id"]})
+    mock_db_mapping_config = {
+        "errorTracker": "syn11601244",
+        "centerMapping": "syn11601248",
+        "processTracker": "syn11604890",
+    }
+    mock_center_config = pd.DataFrame(
+        {
+            "center": ["TEST"],
+            "inputSynId": ["syn11601340"],
+            "stagingSynId": ["syn11601342"],
+            "errorsSynId": ["syn53239081"],
+            "release": [True],
+            "mutationInCisFilter": ["ON"],
+        }
+    )
+    expected_genie_config = {
+        "errorTracker": "syn11601244",
+        "centerMapping": "syn11601248",
+        "processTracker": "syn11604890",
+        "center_config": {
+            "TEST": {
+                "center": "TEST",
+                "inputSynId": "syn11601340",
+                "stagingSynId": "syn11601342",
+                "errorsSynId": "syn53239081",
+                "release": True,
+                "mutationInCisFilter": "ON",
+            },
+        },
+    }
+
+    with patch.object(syn, "get", return_value=mock_project) as patch_get, patch.object(
+        extract, "_get_database_mapping_config", return_value=mock_db_mapping_config
+    ) as patch_get_db_mapping, patch.object(
+        extract, "get_syntabledf", return_value=mock_center_config
+    ) as patch_get_center_mapping:
+        result = extract.get_genie_config(syn, project_id="syn7208886")
+
+        patch_get.assert_called_once_with("syn7208886")
+        patch_get_db_mapping.assert_called_once_with(syn=syn, synid="db_syn_id")
+        patch_get_center_mapping.assert_called_once_with(
+            syn=syn, query_string="SELECT * FROM syn11601248 where release is true"
+        )
+        assert result == expected_genie_config

From ad0dc9328af3f30ab8224125affad19dacf7b2dd Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Thu, 11 Jul 2024 15:37:51 -0700
Subject: [PATCH 3/7] Update docstring for cna's validateSymbol function (#575)

* update docstring for cna's validateSymbol function

* add more clarity
---
 genie_registry/cna.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/genie_registry/cna.py b/genie_registry/cna.py
index 13b793d9..c6bed1ed 100644
--- a/genie_registry/cna.py
+++ b/genie_registry/cna.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import Union
 
 import pandas as pd
 import synapseclient
@@ -10,17 +11,37 @@
 logger = logging.getLogger(__name__)
 
 
-def validateSymbol(gene, bedDf, returnMappedDf=True):
+def validateSymbol(
+    gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
+) -> Union[str, float, bool]:
     """
-    Validate gene symbol
+    Validates the gene symbol against the gene symbol in the bed database.
+    Note that gene symbols in the bed database have gone through processing and
+    have been remapped to allowed actual genes if needed.
+
+    At least one of the following two conditions must be met for the gene to be VALID:
+        1. The gene exists in the bed database table's Hugo_Symbol column
+
+        2. The gene exists in the bed database table's ID column. Under this condition,
+        the gene in the cna file will be REMAPPED temporarily to the bed database
+        table's Hugo_Symbol value for the purpose of validation. The ID column is the
+        original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
+        mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
+        database table. See the bed fileformat module's remap_symbols function and
+        how it gets used in processing for more info on this.
+
+    The validation throws a WARNING if the gene doesn't satisfy
+    either of the above two conditions
 
     Args:
         gene: Gene name
-        bedDf: Bed pandas dataframe
+        bedDf: The bed database table as a pandas dataframe
         returnMappedDf: Return a mapped gene. Defaults to True
 
     Returns:
-        gene name or boolean for whether a gene is valid
+        Union[str, float, bool]:
+        Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True
+        Returns boolean for whether a gene is valid if returnMappedDf is False
     """
     valid = False
     if sum(bedDf["Hugo_Symbol"] == gene) > 0:

From f08ac29e4b6b22e7fd9fe36c7e5725a189071d60 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Sun, 14 Jul 2024 23:48:41 -0700
Subject: [PATCH 4/7] add variant_classification as known str col

---
 genie/process_mutation.py      |  1 +
 tests/test_process_mutation.py | 73 ++++++++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/genie/process_mutation.py b/genie/process_mutation.py
index 2d54617d..ed151539 100644
--- a/genie/process_mutation.py
+++ b/genie/process_mutation.py
@@ -115,6 +115,7 @@
     "Exon_Number",
     "genomic_location_explanation",
     "Annotation_Status",
+    "Variant_Classification",
 ]
 
 
diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
index b24e6209..e828ff6b 100644
--- a/tests/test_process_mutation.py
+++ b/tests/test_process_mutation.py
@@ -54,12 +54,79 @@ def test_determine_dtype(self):
             col_types = process_mutation.determine_dtype("test.csv")
             assert col_types == self.column_types
 
-    def test__convert_to_str_dtype(self):
+    @pytest.mark.parametrize(
+        "input_columns_types, known_str_cols, expected_new_column_types",
+        [
+            (
+                {"foo": "int64", "bar": "object"},
+                ["foo"],
+                {"foo": "object", "bar": "object"},
+            ),
+            (
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Variant_Classification": "float64",
+                    "Annotation_Status": "int64",
+                },
+                process_mutation.KNOWN_STRING_COLS,
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Variant_Classification": "object",
+                    "Annotation_Status": "object",
+                },
+            ),
+            (
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Tumor_Seq_Allele1": "object",
+                    "Tumor_Seq_Allele2": "object",
+                    "Tumor_Sample_Barcode": "object",
+                    "Annotation_Status": "object",
+                },
+                process_mutation.KNOWN_STRING_COLS,
+                {
+                    "Hugo_Symbol": "object",
+                    "Entrez_Gene_Id": "int64",
+                    "Chromosome": "object",
+                    "Start_Position": "int64",
+                    "End_Position": "int64",
+                    "Reference_Allele": "object",
+                    "Tumor_Seq_Allele1": "object",
+                    "Tumor_Seq_Allele2": "object",
+                    "Tumor_Sample_Barcode": "object",
+                    "Annotation_Status": "object",
+                },
+            ),
+        ],
+        ids=[
+            "test_int_to_obj",
+            "test_changes_with_constant",
+            "test_no_changes_with_constant",
+        ],
+    )
+    def test__convert_to_str_dtype(
+        self, input_columns_types, known_str_cols, expected_new_column_types
+    ):
         """Tests converting dtypes to str dtypes"""
         new_column_types = process_mutation._convert_to_str_dtype(
-            self.column_types, ["foo"]
+            input_columns_types, known_str_cols
         )
-        assert new_column_types == {"foo": "object", "bar": "object"}
+        assert new_column_types == expected_new_column_types
 
     def test_move_maf_rename(self):
         """Test moving mafs when maf column headers need to be remapped"""

From 8bd546461a7072418e1ef7bcef6a48e1a532b76a Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Tue, 23 Jul 2024 05:00:55 -0700
Subject: [PATCH 5/7] release version 16.4.0

---
 genie/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genie/__init__.py b/genie/__init__.py
index f4e1680f..b43a1cf4 100644
--- a/genie/__init__.py
+++ b/genie/__init__.py
@@ -7,6 +7,6 @@
 
 # create version in __init__.py
 # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
-__version__ = "16.3.0"
+__version__ = "16.4.0"
 
 __all__ = ["__version__"]

From 2aabb555450a937c31ed52a7b360a141e72b8c75 Mon Sep 17 00:00:00 2001
From: Thomas Yu <thomas.yu@sagebase.org>
Date: Tue, 13 Aug 2024 22:10:29 -0700
Subject: [PATCH 6/7] [GEN-1325] Remove section on flagged mutations (#569)

* Remove section on flagged mutations

* Remove black list variants table query
---
 templates/dashboardTemplate.Rmd | 27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd
index bb984c21..8344ec80 100644
--- a/templates/dashboardTemplate.Rmd
+++ b/templates/dashboardTemplate.Rmd
@@ -260,11 +260,7 @@ if (is.null(this_bed)) {
 this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID))
 this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles)
 assay_infodf = getFileDf("assay_information.txt", releaseFiles)
-black_list_variants <- synTableQuery(
-  "select * from syn18459663 where filter_variant is true",
-  includeRowIdAndRowVersion = F
-)
-black_list_variantsdf = black_list_variants$asDataFrame()
+
 # this_cna <- getFileDf("data_CNA.txt", releaseFiles)
 #this_fus <- getFileDf("data_fusions.txt", releaseFiles)
 
@@ -438,27 +434,6 @@ Files &rarr; Centers &rarr; [Center name] &rarr; Errors &rarr; failed_annotation
 
 View the version comment column in Synapse for your report to find the version associated with this release.
 
----
-
-## Flagged Mutations
-This is a count of how many flagged mutations a center has. Most of these variants are potential
-artifacts flagged by manual review of cBioPortal. Please inform Sage Bionetworks about:
-
-* Suggestions for variants that should be part of this list
-* Any variant shouldn't be part of this list
-
-```{r blacklist}
-blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol,
-                           black_list_variantsdf$HGVSp_Short)
-subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ]
-subset_mut$blacklist = paste(subset_mut$Hugo_Symbol,
-                             subset_mut$HGVSp_Short)
-subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,]
-kable(table(subset_mut$blacklist, subset_mut$Center),
-      caption = "Blacklist variant count")
-```
-
-
 ---
 
 ## Distribution of Clinical Attributes

From 238970f5728fbf86ee3298f188c347868532ec07 Mon Sep 17 00:00:00 2001
From: Thomas Yu <thomas.yu@sagebase.org>
Date: Tue, 13 Aug 2024 22:17:31 -0700
Subject: [PATCH 7/7] [GEN-1326] Remove panel overlap charts (#570)

* Remove panel overlap charts

* Remove unused function

* remove upset

* Remove upsetR
---
 R/install_packages.R            |  1 -
 templates/dashboardTemplate.Rmd | 82 ---------------------------------
 2 files changed, 83 deletions(-)

diff --git a/R/install_packages.R b/R/install_packages.R
index 33d1a2c1..0878bdd3 100644
--- a/R/install_packages.R
+++ b/R/install_packages.R
@@ -3,7 +3,6 @@ renv::restore()
 library(synapser)
 library(dplyr)
 library(argparse)
-library(UpSetR)
 library(rmarkdown)
 library(testthat)
 library(VariantAnnotation)
diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd
index 8344ec80..5f5bcd10 100644
--- a/templates/dashboardTemplate.Rmd
+++ b/templates/dashboardTemplate.Rmd
@@ -25,7 +25,6 @@ title: '`r release`'
 suppressMessages(library(synapser))
 foo = capture.output(synLogin())
 suppressMessages(library(ggplot2))
-suppressMessages(library(UpSetR))
 suppressMessages(library(RColorBrewer))
 suppressMessages(library(jsonlite))
 suppressMessages(library(knitr))
@@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) {
   }
 }
 
-makePanelList = function(assay, bed) {
-  return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay])))
-}
-
-plotPanelOverlap <- function(bed, assays) {
-  listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) })
-  names(listInput) = assays
-  upset(fromList(listInput),
-        order.by = "freq",
-        nsets = length(assays),
-        nintersects = 30)
-}
-
 plotCenterXRace <- function(genieClinData) {
   t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE))
   t = data.frame(n = rowSums(t),t)
@@ -672,74 +658,6 @@ if (!is.null(assay_infodf) & !is.null(this_bed)) {
 
 ---
 
-## Gene Panel Overlaps
-This section will show the number of genes that overlap between the myeloid, small and large GENIE panels.
-
-```{r bed_process, include=F}
-if (!is.null(this_bed)) {
-  this_bed <- this_bed[this_bed$Feature_Type == "exon",]
-  #Make it so that I use include in panel
-  if (!is.null(this_bed$includeInPanel)) {
-    this_bed <- this_bed[this_bed$includeInPanel == "True",]
-  }
-  noneExistentAssays = this_assays[!this_assays %in% this_bed$SEQ_ASSAY_ID]
-  if (length(noneExistentAssays) > 0) {
-    print(paste("These assays do not have bed files associated with them: ",
-                paste(noneExistentAssays, collapse = ", ")))
-  }
-  this_assays = this_assays[this_assays %in% this_bed$SEQ_ASSAY_ID]
-  myeloid_panels = c("VICC-01-MYELOID","UHN-54-V1","UCHI-ONCOHEME55-V1","CHOP-HEMEP","MSK-IMPACT-HEME-399")
-  myeloid = this_assays[this_assays %in% myeloid_panels]
-
-  normal = this_assays[!this_assays %in% myeloid_panels]
-  seq_pipeline_ids = unique(
-    this_bed$SEQ_PIPELINE_ID[this_bed$SEQ_ASSAY_ID %in% normal]
-  )
-  smallPanels = c()
-  largePanels = c()
-  for (panel in seq_pipeline_ids) {
-    panelDf = this_bed[this_bed$SEQ_PIPELINE_ID == panel,]
-    if (length(table(panelDf$Hugo_Symbol)) < 100) {
-      smallPanels = c(smallPanels, panel)
-      # Don't add to panel if more than 1500 genes
-    } else if (length(table(panelDf$Hugo_Symbol)) < 1500) {
-      largePanels = c(largePanels, panel)
-    }
-  }
-} else {
-  largePanels = c()
-  smallPanels = c()
-  myeloid = c()
-}
-```
-
-### Meyloid Gene Panels
-
-```{r myeloid_panel_upset}
-if (length(myeloid) > 1) {
-  plotPanelOverlap(this_bed, myeloid)
-}
-
-```
-
-### Small (<100) Gene panels
-
-```{r small_panel_upset, fig.height=12}
-if (length(smallPanels) > 1) {
-  plotPanelOverlap(this_bed, smallPanels)
-}
-```
-
-### Large ($\geq$ 100) Gene panels
-
-```{r large_panel_upset, fig.height=12}
-if (length(largePanels) > 1) {
-  plotPanelOverlap(this_bed, largePanels)
-}
-```
-
----
-
 ## Possible Non-center Related Data Issues
 
 This section includes QC issues that are mostly related to the Sage Bionetworks pipeline, Genome Nexus or _maybe_ center related issues.