Merge branch 'develop' into gen-1330-run-staging

Sage-Bionetworks · Aug 21, 2024 · add32eb · add32eb
2 parents 268ee02 + 9295c5c
commit add32eb
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 120 deletions.
diff --git a/R/install_packages.R b/R/install_packages.R
@@ -3,7 +3,6 @@ renv::restore()
 library(synapser)
 library(dplyr)
 library(argparse)
-library(UpSetR)
 library(rmarkdown)
 library(testthat)
 library(VariantAnnotation)

diff --git a/genie/__init__.py b/genie/__init__.py
@@ -7,6 +7,6 @@
 
 # create version in __init__.py
 # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
-__version__ = "16.3.0"
+__version__ = "16.4.0"
 
 __all__ = ["__version__"]
diff --git a/genie_registry/cna.py b/genie_registry/cna.py
@@ -1,5 +1,6 @@
 import logging
 import os
+from typing import Union
 
 import pandas as pd
 import synapseclient
@@ -10,17 +11,37 @@
 logger = logging.getLogger(__name__)
 
 
-def validateSymbol(gene, bedDf, returnMappedDf=True):
+def validateSymbol(
+    gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
+) -> Union[str, float, bool]:
     """
-    Validate gene symbol
+    Validates the gene symbol against the gene symbol in the bed database.
+    Note that gene symbols in the bed database have gone through processing and
+    have been remapped to allowed actual genes if needed.
+
+    Two conditions must be met for the gene to be VALID:
+        1. The gene exists in the bed database table's Hugo_Symbol column
+
+        2. The gene exists in the bed database table's ID column. Under this condition,
+        the gene in the cna file will be REMAPPED temporarily to the bed database
+        table's Hugo_Symbol value for the purpose of validation. The ID column is the
+        original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
+        mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
+        database table. See the bed fileformat module's remap_symbols function and
+        how it gets used in processing for more info on this.
+
+    The validation throws a WARNING if the gene doesn't satisfy
+    either of the above two conditions
 
     Args:
         gene: Gene name
-        bedDf: Bed pandas dataframe
+        bedDf: The bed database table as a pandas dataframe
         returnMappedDf: Return a mapped gene. Defaults to True
 
     Returns:
-        gene name or boolean for whether a gene is valid
+        Union[str, float, bool]:
+        Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True
+        Returns boolean for whether a gene is valid if returnMappedDf is False
     """
     valid = False
     if sum(bedDf["Hugo_Symbol"] == gene) > 0:

diff --git a/genie_registry/maf.py b/genie_registry/maf.py
@@ -12,8 +12,43 @@
 
 
 def _check_allele_col_validity(df):
-    """If maf file has both TSA1 and TSA2,
-    TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
+    """There are two linked validation rules in this function:
+
+    1) If maf file has ALL three of the following columns:
+        - TUMOR_SEQ_ALLELE1 (TSA1)
+        - TUMOR_SEQ_ALLELE2 (TSA2)
+        - REFERENCE ALLELE (REF)
+        THEN
+        ALL rows of TSA1 must equal REF
+        OR
+        ALL rows of TSA1 must equal TSA2
+
+        TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
+        regarding which variant (TSA1 vs TSA2) to use. This is
+        why there cannot be mixed rows where some rows have TSA1 == REF and some rows
+        have TSA1 == TSA2.
+
+        e.g:
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | T                | T                 | C
+
+        VALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | A                 | A
+        | T                | C                 | C
+
+        INVALID
+        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
+        | C                | C                 | A
+        | C                | A                 | A
+
+        See https://github.com/genome-nexus/annotation-tools/issues/26 for
+        more background regarding why this validation rule was implemented.
+
+    2) There can't be ANY rows where REF == TSA2. This is a missense mutation
+    flagged as invalid by GN
     """
     tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
     tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")

diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd
@@ -25,7 +25,6 @@ title: '`r release`'
 suppressMessages(library(synapser))
 foo = capture.output(synLogin())
 suppressMessages(library(ggplot2))
-suppressMessages(library(UpSetR))
 suppressMessages(library(RColorBrewer))
 suppressMessages(library(jsonlite))
 suppressMessages(library(knitr))
@@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) {
   }
 }
 
-makePanelList = function(assay, bed) {
-  return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay])))
-}
-
-plotPanelOverlap <- function(bed, assays) {
-  listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) })
-  names(listInput) = assays
-  upset(fromList(listInput),
-        order.by = "freq",
-        nsets = length(assays),
-        nintersects = 30)
-}
-
 plotCenterXRace <- function(genieClinData) {
   t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE))
   t = data.frame(n = rowSums(t),t)
@@ -267,11 +253,7 @@ if (is.null(this_bed)) {
 this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID))
 this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles)
 assay_infodf = getFileDf("assay_information.txt", releaseFiles)
-black_list_variants <- synTableQuery(
-  "select * from syn18459663 where filter_variant is true",
-  includeRowIdAndRowVersion = F
-)
-black_list_variantsdf = black_list_variants$asDataFrame()
+
 # this_cna <- getFileDf("data_CNA.txt", releaseFiles)
 #this_fus <- getFileDf("data_fusions.txt", releaseFiles)
 
@@ -445,27 +427,6 @@ Files &rarr; Centers &rarr; [Center name] &rarr; Errors &rarr; failed_annotation
 
 View the version comment column in Synapse for your report to find the version associated with this release.
 
----
-
-## Flagged Mutations
-This is a count of how many flagged mutations a center has. Most of these variants are potential
-artifacts flagged by manual review of cBioPortal. Please inform Sage Bionetworks about:
-
-* Suggestions for variants that should be part of this list
-* Any variant shouldn't be part of this list
-
-```{r blacklist}
-blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol,
-                           black_list_variantsdf$HGVSp_Short)
-subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ]
-subset_mut$blacklist = paste(subset_mut$Hugo_Symbol,
-                             subset_mut$HGVSp_Short)
-subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,]
-kable(table(subset_mut$blacklist, subset_mut$Center),
-      caption = "Blacklist variant count")
-```
-
-
 ---
 
 ## Distribution of Clinical Attributes
@@ -691,87 +652,19 @@ if (!is.null(assay_infodf) & !is.null(this_bed)) {
   num_submitted_genes = colSums(submitted_bool_per_panel)
   colnames(number_of_genes) = "num_genes_remapped"
   number_of_genes$num_genes_submitted = NA
-  number_of_genes$num_expected_genes = NA
+  number_of_genes$num_genes_expected = NA
   number_of_genes[names(num_submitted_genes), "num_genes_submitted"] = num_submitted_genes
-  number_of_genes[assay_infodf$SEQ_ASSAY_ID, "num_expected_genes"] = assay_infodf$number_of_genes
+  number_of_genes[assay_infodf$SEQ_ASSAY_ID, "num_genes_expected"] = assay_infodf$number_of_genes
 
-  different_from_expected = number_of_genes[c(which(number_of_genes$num_genes_remapped != number_of_genes$num_expected_genes),
-                                              which(is.na(number_of_genes$num_expected_genes))),]
+  different_from_expected = number_of_genes[c(which(number_of_genes$num_genes_remapped != number_of_genes$num_genes_expected),
+                                              which(is.na(number_of_genes$num_genes_expected))),]
   kable(different_from_expected,
         caption = "Number of submitted vs expected genes in a bed file")
 }
 ```
 
 ---
 
-## Gene Panel Overlaps
-This section will show the number of genes that overlap between the myeloid, small and large GENIE panels.
-
-```{r bed_process, include=F}
-if (!is.null(this_bed)) {
-  this_bed <- this_bed[this_bed$Feature_Type == "exon",]
-  #Make it so that I use include in panel
-  if (!is.null(this_bed$includeInPanel)) {
-    this_bed <- this_bed[this_bed$includeInPanel == "True",]
-  }
-  noneExistentAssays = this_assays[!this_assays %in% this_bed$SEQ_ASSAY_ID]
-  if (length(noneExistentAssays) > 0) {
-    print(paste("These assays do not have bed files associated with them: ",
-                paste(noneExistentAssays, collapse = ", ")))
-  }
-  this_assays = this_assays[this_assays %in% this_bed$SEQ_ASSAY_ID]
-  myeloid_panels = c("VICC-01-MYELOID","UHN-54-V1","UCHI-ONCOHEME55-V1","CHOP-HEMEP","MSK-IMPACT-HEME-399")
-  myeloid = this_assays[this_assays %in% myeloid_panels]
-
-  normal = this_assays[!this_assays %in% myeloid_panels]
-  seq_pipeline_ids = unique(
-    this_bed$SEQ_PIPELINE_ID[this_bed$SEQ_ASSAY_ID %in% normal]
-  )
-  smallPanels = c()
-  largePanels = c()
-  for (panel in seq_pipeline_ids) {
-    panelDf = this_bed[this_bed$SEQ_PIPELINE_ID == panel,]
-    if (length(table(panelDf$Hugo_Symbol)) < 100) {
-      smallPanels = c(smallPanels, panel)
-      # Don't add to panel if more than 1500 genes
-    } else if (length(table(panelDf$Hugo_Symbol)) < 1500) {
-      largePanels = c(largePanels, panel)
-    }
-  }
-} else {
-  largePanels = c()
-  smallPanels = c()
-  myeloid = c()
-}
-```
-
-### Meyloid Gene Panels
-
-```{r myeloid_panel_upset}
-if (length(myeloid) > 1) {
-  plotPanelOverlap(this_bed, myeloid)
-}
-
-```
-
-### Small (<100) Gene panels
-
-```{r small_panel_upset, fig.height=12}
-if (length(smallPanels) > 1) {
-  plotPanelOverlap(this_bed, smallPanels)
-}
-```
-
-### Large ($\geq$ 100) Gene panels
-
-```{r large_panel_upset, fig.height=12}
-if (length(largePanels) > 1) {
-  plotPanelOverlap(this_bed, largePanels)
-}
-```
-
----
-
 ## Possible Non-center Related Data Issues
 
 This section includes QC issues that are mostly related to the Sage Bionetworks pipeline, Genome Nexus or _maybe_ center related issues.