Skip to content

Commit

Permalink
Merge branch 'develop' into gen-1330-run-staging
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Aug 21, 2024
2 parents 268ee02 + 9295c5c commit add32eb
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 120 deletions.
1 change: 0 additions & 1 deletion R/install_packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ renv::restore()
library(synapser)
library(dplyr)
library(argparse)
library(UpSetR)
library(rmarkdown)
library(testthat)
library(VariantAnnotation)
Expand Down
2 changes: 1 addition & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

# create version in __init__.py
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
__version__ = "16.3.0"
__version__ = "16.4.0"

__all__ = ["__version__"]
29 changes: 25 additions & 4 deletions genie_registry/cna.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
from typing import Union

import pandas as pd
import synapseclient
Expand All @@ -10,17 +11,37 @@
logger = logging.getLogger(__name__)


def validateSymbol(gene, bedDf, returnMappedDf=True):
def validateSymbol(
gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
) -> Union[str, float, bool]:
"""
Validate gene symbol
Validates the gene symbol against the gene symbol in the bed database.
Note that gene symbols in the bed database have gone through processing and
have been remapped to allowed actual genes if needed.
Two conditions must be met for the gene to be VALID:
1. The gene exists in the bed database table's Hugo_Symbol column
2. The gene exists in the bed database table's ID column. Under this condition,
the gene in the cna file will be REMAPPED temporarily to the bed database
table's Hugo_Symbol value for the purpose of validation. The ID column is the
original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
database table. See the bed fileformat module's remap_symbols function and
how it gets used in processing for more info on this.
The validation throws a WARNING if the gene doesn't satisfy
either of the above two conditions
Args:
gene: Gene name
bedDf: Bed pandas dataframe
bedDf: The bed database table as a pandas dataframe
returnMappedDf: Return a mapped gene. Defaults to True
Returns:
gene name or boolean for whether a gene is valid
Union[str, float, bool]:
Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True
Returns boolean for whether a gene is valid if returnMappedDf is False
"""
valid = False
if sum(bedDf["Hugo_Symbol"] == gene) > 0:
Expand Down
39 changes: 37 additions & 2 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,43 @@


def _check_allele_col_validity(df):
"""If maf file has both TSA1 and TSA2,
TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2
"""There are two linked validation rules in this function:
1) If maf file has ALL three of the following columns:
- TUMOR_SEQ_ALLELE1 (TSA1)
- TUMOR_SEQ_ALLELE2 (TSA2)
- REFERENCE ALLELE (REF)
THEN
ALL rows of TSA1 must equal REF
OR
ALL rows of TSA1 must equal TSA2
TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity
regarding which variant (TSA1 vs TSA2) to use. This is
why there cannot be mixed rows where some rows have TSA1 == REF and some rows
have TSA1 == TSA2.
e.g:
VALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | C | A
| T | T | C
VALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | A | A
| T | C | C
INVALID
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2
| C | C | A
| C | A | A
See https://github.com/genome-nexus/annotation-tools/issues/26 for
more background regarding why this validation rule was implemented.
2) There can't be ANY rows where REF == TSA2. This is a missense mutation
flagged as invalid by GN
"""
tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
Expand Down
117 changes: 5 additions & 112 deletions templates/dashboardTemplate.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ title: '`r release`'
suppressMessages(library(synapser))
foo = capture.output(synLogin())
suppressMessages(library(ggplot2))
suppressMessages(library(UpSetR))
suppressMessages(library(RColorBrewer))
suppressMessages(library(jsonlite))
suppressMessages(library(knitr))
Expand Down Expand Up @@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) {
}
}
makePanelList = function(assay, bed) {
return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay])))
}
plotPanelOverlap <- function(bed, assays) {
listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) })
names(listInput) = assays
upset(fromList(listInput),
order.by = "freq",
nsets = length(assays),
nintersects = 30)
}
plotCenterXRace <- function(genieClinData) {
t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE))
t = data.frame(n = rowSums(t),t)
Expand Down Expand Up @@ -267,11 +253,7 @@ if (is.null(this_bed)) {
this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID))
this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles)
assay_infodf = getFileDf("assay_information.txt", releaseFiles)
black_list_variants <- synTableQuery(
"select * from syn18459663 where filter_variant is true",
includeRowIdAndRowVersion = F
)
black_list_variantsdf = black_list_variants$asDataFrame()
# this_cna <- getFileDf("data_CNA.txt", releaseFiles)
#this_fus <- getFileDf("data_fusions.txt", releaseFiles)
Expand Down Expand Up @@ -445,27 +427,6 @@ Files &rarr; Centers &rarr; [Center name] &rarr; Errors &rarr; failed_annotation

View the version comment column in Synapse for your report to find the version associated with this release.

---

## Flagged Mutations
This is a count of how many flagged mutations a center has. Most of these variants are potential
artifacts flagged by manual review of cBioPortal. Please inform Sage Bionetworks about:

* Suggestions for variants that should be part of this list
* Any variant shouldn't be part of this list

```{r blacklist}
blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol,
black_list_variantsdf$HGVSp_Short)
subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ]
subset_mut$blacklist = paste(subset_mut$Hugo_Symbol,
subset_mut$HGVSp_Short)
subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,]
kable(table(subset_mut$blacklist, subset_mut$Center),
caption = "Blacklist variant count")
```


---

## Distribution of Clinical Attributes
Expand Down Expand Up @@ -691,87 +652,19 @@ if (!is.null(assay_infodf) & !is.null(this_bed)) {
num_submitted_genes = colSums(submitted_bool_per_panel)
colnames(number_of_genes) = "num_genes_remapped"
number_of_genes$num_genes_submitted = NA
number_of_genes$num_expected_genes = NA
number_of_genes$num_genes_expected = NA
number_of_genes[names(num_submitted_genes), "num_genes_submitted"] = num_submitted_genes
number_of_genes[assay_infodf$SEQ_ASSAY_ID, "num_expected_genes"] = assay_infodf$number_of_genes
number_of_genes[assay_infodf$SEQ_ASSAY_ID, "num_genes_expected"] = assay_infodf$number_of_genes
different_from_expected = number_of_genes[c(which(number_of_genes$num_genes_remapped != number_of_genes$num_expected_genes),
which(is.na(number_of_genes$num_expected_genes))),]
different_from_expected = number_of_genes[c(which(number_of_genes$num_genes_remapped != number_of_genes$num_genes_expected),
which(is.na(number_of_genes$num_genes_expected))),]
kable(different_from_expected,
caption = "Number of submitted vs expected genes in a bed file")
}
```

---

## Gene Panel Overlaps
This section will show the number of genes that overlap between the myeloid, small and large GENIE panels.

```{r bed_process, include=F}
if (!is.null(this_bed)) {
this_bed <- this_bed[this_bed$Feature_Type == "exon",]
#Make it so that I use include in panel
if (!is.null(this_bed$includeInPanel)) {
this_bed <- this_bed[this_bed$includeInPanel == "True",]
}
noneExistentAssays = this_assays[!this_assays %in% this_bed$SEQ_ASSAY_ID]
if (length(noneExistentAssays) > 0) {
print(paste("These assays do not have bed files associated with them: ",
paste(noneExistentAssays, collapse = ", ")))
}
this_assays = this_assays[this_assays %in% this_bed$SEQ_ASSAY_ID]
myeloid_panels = c("VICC-01-MYELOID","UHN-54-V1","UCHI-ONCOHEME55-V1","CHOP-HEMEP","MSK-IMPACT-HEME-399")
myeloid = this_assays[this_assays %in% myeloid_panels]
normal = this_assays[!this_assays %in% myeloid_panels]
seq_pipeline_ids = unique(
this_bed$SEQ_PIPELINE_ID[this_bed$SEQ_ASSAY_ID %in% normal]
)
smallPanels = c()
largePanels = c()
for (panel in seq_pipeline_ids) {
panelDf = this_bed[this_bed$SEQ_PIPELINE_ID == panel,]
if (length(table(panelDf$Hugo_Symbol)) < 100) {
smallPanels = c(smallPanels, panel)
# Don't add to panel if more than 1500 genes
} else if (length(table(panelDf$Hugo_Symbol)) < 1500) {
largePanels = c(largePanels, panel)
}
}
} else {
largePanels = c()
smallPanels = c()
myeloid = c()
}
```

### Meyloid Gene Panels

```{r myeloid_panel_upset}
if (length(myeloid) > 1) {
plotPanelOverlap(this_bed, myeloid)
}
```

### Small (<100) Gene panels

```{r small_panel_upset, fig.height=12}
if (length(smallPanels) > 1) {
plotPanelOverlap(this_bed, smallPanels)
}
```

### Large ($\geq$ 100) Gene panels

```{r large_panel_upset, fig.height=12}
if (length(largePanels) > 1) {
plotPanelOverlap(this_bed, largePanels)
}
```

---

## Possible Non-center Related Data Issues

This section includes QC issues that are mostly related to the Sage Bionetworks pipeline, Genome Nexus or _maybe_ center related issues.
Expand Down

0 comments on commit add32eb

Please sign in to comment.