From c064b406e548c07e5f44df98ccc149be25730816 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:30:02 -0700 Subject: [PATCH 1/7] [GEN-1237] Add documentation to maf allele validation (#557) * add documentation for _check_allele_col_validity in maf validation * add visual ex * lint --- genie_registry/maf.py | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/genie_registry/maf.py b/genie_registry/maf.py index 3a97b97f..29cbb26d 100644 --- a/genie_registry/maf.py +++ b/genie_registry/maf.py @@ -12,8 +12,43 @@ def _check_allele_col_validity(df): - """If maf file has both TSA1 and TSA2, - TSA1 must equal REF, or TSA1 must equal TSA2, and REF must not equal TSA2 + """There are two linked validation rules in this function: + + 1) If maf file has ALL three of the following columns: + - TUMOR_SEQ_ALLELE1 (TSA1) + - TUMOR_SEQ_ALLELE2 (TSA2) + - REFERENCE_ALLELE (REF) + THEN + ALL rows of TSA1 must equal REF + OR + ALL rows of TSA1 must equal TSA2 + + TSA1 is used by Genome Nexus (GN) to annotate data when it senses there is ambiguity + regarding which variant (TSA1 vs TSA2) to use. This is + why there cannot be mixed rows where some rows have TSA1 == REF and some rows + have TSA1 == TSA2. + + e.g: + VALID + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 + | C | C | A + | T | T | C + + VALID + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 + | C | A | A + | T | C | C + + INVALID + | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 + | C | C | A + | C | A | A + + See https://github.com/genome-nexus/annotation-tools/issues/26 for + more background regarding why this validation rule was implemented. + + 2) There can't be ANY rows where REF == TSA2.
This is a missense mutation + flagged as invalid by GN """ tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2") tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1") From 9e0ca87af5f8a61cb86069b5ed63174c9631f87c Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:21:41 -0700 Subject: [PATCH 2/7] [GEN-1348] Allow in mapping tables (#572) * remove hardcoded mapping tabl synids * update synids --- genie/extract.py | 6 ----- tests/conftest.py | 10 +++++---- tests/test_clinical.py | 8 +++---- tests/test_extract.py | 51 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 60 insertions(+), 15 deletions(-) diff --git a/genie/extract.py b/genie/extract.py index 7ea84225..8ca5831a 100644 --- a/genie/extract.py +++ b/genie/extract.py @@ -279,12 +279,6 @@ def get_genie_config( center_mapping_df.index = center_mapping_df.center # Add center configurations including input/staging synapse ids genie_config["center_config"] = center_mapping_df.to_dict("index") - - genie_config["ethnicity_mapping"] = "syn7434242" - genie_config["race_mapping"] = "syn7434236" - genie_config["sex_mapping"] = "syn7434222" - genie_config["sampletype_mapping"] = "syn7434273" - return genie_config diff --git a/tests/conftest.py b/tests/conftest.py index 6879b915..59a449e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,7 @@ def genie_config(): "releaseFolder": "syn17079016", "assayinfo": "syn18404286", "logs": "syn10155804", + "sv": "syn51663925", "center_config": { "SAGE": { "center": "SAGE", @@ -73,11 +74,12 @@ def genie_config(): }, }, "genie_annotation_pkg": "/path/to/nexus", - "ethnicity_mapping": "syn7434242", - "race_mapping": "syn7434236", - "sex_mapping": "syn7434222", - "sampletype_mapping": "syn7434273", + "ethnicity_mapping": "syn60548943", + "race_mapping": "syn60548944", + "sex_mapping": "syn60548946", + "sampletype_mapping": "syn60548941", "clinical_tier_release_scope": 
"syn8545211", + "clinical_code_to_desc_map": "syn59486337", } return config diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 75248e41..af71b903 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -54,10 +54,10 @@ def table_query_results(*args): ) table_query_results_map = { - ("select * from syn7434222",): createMockTable(sexdf), - ("select * from syn7434236",): createMockTable(no_nan), - ("select * from syn7434242",): createMockTable(no_nan), - ("select * from syn7434273",): createMockTable(no_nan), + ("select * from syn60548946",): createMockTable(sexdf), + ("select * from syn60548944",): createMockTable(no_nan), + ("select * from syn60548943",): createMockTable(no_nan), + ("select * from syn60548941",): createMockTable(no_nan), ( "select fieldName from syn8545211 where patient is True and inClinicalDb is True", ): createMockTable(patientdf), diff --git a/tests/test_extract.py b/tests/test_extract.py index 2d8c5ac2..6fc745f8 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,6 +1,6 @@ """Test genie.extract module""" -from unittest.mock import patch +from unittest.mock import patch, MagicMock import pandas as pd import pytest @@ -72,3 +72,52 @@ def test_none__getoncotreelink(syn, genie_config): oncolink = extract._get_oncotreelink(syn, genie_config) patch_synget.assert_called_once_with(genie_config["oncotreeLink"]) assert oncolink == url + + +def test_that_get_genie_config_has_expected_calls(syn): + mock_project = MagicMock() + mock_project.annotations = {"dbMapping": ["db_syn_id"]} + mock_db_mapping_config = { + "errorTracker": "syn11601244", + "centerMapping": "syn11601248", + "processTracker": "syn11604890", + } + mock_center_config = pd.DataFrame( + { + "center": ["TEST"], + "inputSynId": ["syn11601340"], + "stagingSynId": ["syn11601342"], + "errorsSynId": ["syn53239081"], + "release": [True], + "mutationInCisFilter": ["ON"], + } + ) + expected_genie_config = { + "errorTracker": "syn11601244", + 
"centerMapping": "syn11601248", + "processTracker": "syn11604890", + "center_config": { + "TEST": { + "center": "TEST", + "inputSynId": "syn11601340", + "stagingSynId": "syn11601342", + "errorsSynId": "syn53239081", + "release": True, + "mutationInCisFilter": "ON", + }, + }, + } + + with patch.object(syn, "get", return_value=mock_project) as patch_get, patch.object( + extract, "_get_database_mapping_config", return_value=mock_db_mapping_config + ) as patch_get_db_mapping, patch.object( + extract, "get_syntabledf", return_value=mock_center_config + ) as patch_get_center_mapping: + result = extract.get_genie_config(syn, project_id="syn7208886") + + patch_get.assert_called_once_with("syn7208886") + patch_get_db_mapping.assert_called_once_with(syn=syn, synid="db_syn_id") + patch_get_center_mapping.assert_called_once_with( + syn=syn, query_string=f"SELECT * FROM syn11601248 where release is true" + ) + assert result == expected_genie_config From ad0dc9328af3f30ab8224125affad19dacf7b2dd Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Thu, 11 Jul 2024 15:37:51 -0700 Subject: [PATCH 3/7] Update docstring for cna's validateSymbol function (#575) * update docstring for cna's validateSymbol function * add more clarity --- genie_registry/cna.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/genie_registry/cna.py b/genie_registry/cna.py index 13b793d9..c6bed1ed 100644 --- a/genie_registry/cna.py +++ b/genie_registry/cna.py @@ -1,5 +1,6 @@ import logging import os +from typing import Union import pandas as pd import synapseclient @@ -10,17 +11,37 @@ logger = logging.getLogger(__name__) -def validateSymbol(gene, bedDf, returnMappedDf=True): +def validateSymbol( + gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True +) -> Union[str, float, bool]: """ - Validate gene symbol + Validates the gene symbol against the gene symbol in the bed database. 
+ Note that gene symbols in the bed database have gone through processing and + have been remapped to allowed actual genes if needed. + + The gene is VALID if at least one of these two conditions is met: + 1. The gene exists in the bed database table's Hugo_Symbol column + + 2. The gene exists in the bed database table's ID column. Under this condition, + the gene in the cna file will be REMAPPED temporarily to the bed database + table's Hugo_Symbol value for the purpose of validation. The ID column is the + original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets + mapped to valid possible gene values in the Actual Gene Positions (GRCh37) + database table. See the bed fileformat module's remap_symbols function and + how it gets used in processing for more info on this. + + The validation throws a WARNING if the gene doesn't satisfy + either of the above two conditions Args: gene: Gene name - bedDf: Bed pandas dataframe + bedDf: The bed database table as a pandas dataframe returnMappedDf: Return a mapped gene.
Defaults to True Returns: - gene name or boolean for whether a gene is valid + Union[str, float, bool]: + Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True + Returns boolean for whether a gene is valid if returnMappedDf is False """ valid = False if sum(bedDf["Hugo_Symbol"] == gene) > 0: From f08ac29e4b6b22e7fd9fe36c7e5725a189071d60 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Sun, 14 Jul 2024 23:48:41 -0700 Subject: [PATCH 4/7] add variant_classification as known str col --- genie/process_mutation.py | 1 + tests/test_process_mutation.py | 73 ++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/genie/process_mutation.py b/genie/process_mutation.py index 2d54617d..ed151539 100644 --- a/genie/process_mutation.py +++ b/genie/process_mutation.py @@ -115,6 +115,7 @@ "Exon_Number", "genomic_location_explanation", "Annotation_Status", + "Variant_Classification", ] diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py index b24e6209..e828ff6b 100644 --- a/tests/test_process_mutation.py +++ b/tests/test_process_mutation.py @@ -54,12 +54,79 @@ def test_determine_dtype(self): col_types = process_mutation.determine_dtype("test.csv") assert col_types == self.column_types - def test__convert_to_str_dtype(self): + @pytest.mark.parametrize( + "input_columns_types, known_str_cols, expected_new_column_types", + [ + ( + {"foo": "int64", "bar": "object"}, + ["foo"], + {"foo": "object", "bar": "object"}, + ), + ( + { + "Hugo_Symbol": "object", + "Entrez_Gene_Id": "int64", + "Chromosome": "object", + "Start_Position": "int64", + "End_Position": "int64", + "Reference_Allele": "object", + "Variant_Classification": "float64", + "Annotation_Status": "int64", + }, + process_mutation.KNOWN_STRING_COLS, + { + "Hugo_Symbol": "object", + "Entrez_Gene_Id": "int64", + "Chromosome": "object", + "Start_Position": "int64", + "End_Position": "int64", + 
"Reference_Allele": "object", + "Variant_Classification": "object", + "Annotation_Status": "object", + }, + ), + ( + { + "Hugo_Symbol": "object", + "Entrez_Gene_Id": "int64", + "Chromosome": "object", + "Start_Position": "int64", + "End_Position": "int64", + "Reference_Allele": "object", + "Tumor_Seq_Allele1": "object", + "Tumor_Seq_Allele2": "object", + "Tumor_Sample_Barcode": "object", + "Annotation_Status": "object", + }, + process_mutation.KNOWN_STRING_COLS, + { + "Hugo_Symbol": "object", + "Entrez_Gene_Id": "int64", + "Chromosome": "object", + "Start_Position": "int64", + "End_Position": "int64", + "Reference_Allele": "object", + "Tumor_Seq_Allele1": "object", + "Tumor_Seq_Allele2": "object", + "Tumor_Sample_Barcode": "object", + "Annotation_Status": "object", + }, + ), + ], + ids=[ + "test_int_to_obj", + "test_changes_with_constant", + "test_no_changes_with_constant", + ], + ) + def test__convert_to_str_dtype( + self, input_columns_types, known_str_cols, expected_new_column_types + ): """Tests converting dtypes to str dtypes""" new_column_types = process_mutation._convert_to_str_dtype( - self.column_types, ["foo"] + input_columns_types, known_str_cols ) - assert new_column_types == {"foo": "object", "bar": "object"} + assert new_column_types == expected_new_column_types def test_move_maf_rename(self): """Test moving mafs when maf column headers need to be remapped""" From 8bd546461a7072418e1ef7bcef6a48e1a532b76a Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:00:55 -0700 Subject: [PATCH 5/7] release version 16.4.0 --- genie/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/__init__.py b/genie/__init__.py index f4e1680f..b43a1cf4 100644 --- a/genie/__init__.py +++ b/genie/__init__.py @@ -7,6 +7,6 @@ # create version in __init__.py # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ -__version__ = "16.3.0" +__version__ = "16.4.0" __all__ = 
["__version__"] From 2aabb555450a937c31ed52a7b360a141e72b8c75 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 13 Aug 2024 22:10:29 -0700 Subject: [PATCH 6/7] [GEN-1325] Remove section on flagged mutations (#569) * Remove section on flagged mutations * Remove black list variants table query --- templates/dashboardTemplate.Rmd | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd index bb984c21..8344ec80 100644 --- a/templates/dashboardTemplate.Rmd +++ b/templates/dashboardTemplate.Rmd @@ -260,11 +260,7 @@ if (is.null(this_bed)) { this_assays = as.character(unique(this_samples$SEQ_ASSAY_ID)) this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles) assay_infodf = getFileDf("assay_information.txt", releaseFiles) -black_list_variants <- synTableQuery( - "select * from syn18459663 where filter_variant is true", - includeRowIdAndRowVersion = F -) -black_list_variantsdf = black_list_variants$asDataFrame() + # this_cna <- getFileDf("data_CNA.txt", releaseFiles) #this_fus <- getFileDf("data_fusions.txt", releaseFiles) @@ -438,27 +434,6 @@ Files → Centers → [Center name] → Errors → failed_annotation View the version comment column in Synapse for your report to find the version associated with this release. ---- - -## Flagged Mutations -This is a count of how many flagged mutations a center has. Most of these variants are potential -artifacts flagged by manual review of cBioPortal. 
Please inform Sage Bionetworks about: - -* Suggestions for variants that should be part of this list -* Any variant shouldn't be part of this list - -```{r blacklist} -blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol, - black_list_variantsdf$HGVSp_Short) -subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ] -subset_mut$blacklist = paste(subset_mut$Hugo_Symbol, - subset_mut$HGVSp_Short) -subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,] -kable(table(subset_mut$blacklist, subset_mut$Center), - caption = "Blacklist variant count") -``` - - --- ## Distribution of Clinical Attributes From 238970f5728fbf86ee3298f188c347868532ec07 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 13 Aug 2024 22:17:31 -0700 Subject: [PATCH 7/7] [GEN-1326] Remove panel overlap charts (#570) * Remove panel overlap charts * Remove unused function * remove upset * Remove upsetR --- R/install_packages.R | 1 - templates/dashboardTemplate.Rmd | 82 --------------------------------- 2 files changed, 83 deletions(-) diff --git a/R/install_packages.R b/R/install_packages.R index 33d1a2c1..0878bdd3 100644 --- a/R/install_packages.R +++ b/R/install_packages.R @@ -3,7 +3,6 @@ renv::restore() library(synapser) library(dplyr) library(argparse) -library(UpSetR) library(rmarkdown) library(testthat) library(VariantAnnotation) diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd index 8344ec80..5f5bcd10 100644 --- a/templates/dashboardTemplate.Rmd +++ b/templates/dashboardTemplate.Rmd @@ -25,7 +25,6 @@ title: '`r release`' suppressMessages(library(synapser)) foo = capture.output(synLogin()) suppressMessages(library(ggplot2)) -suppressMessages(library(UpSetR)) suppressMessages(library(RColorBrewer)) suppressMessages(library(jsonlite)) suppressMessages(library(knitr)) @@ -57,19 +56,6 @@ getFileDf <- function(fileName, releaseFiles) { } } -makePanelList = function(assay, bed) { - 
return(unique(as.character(bed$Hugo_Symbol[bed$SEQ_PIPELINE_ID == assay]))) -} - -plotPanelOverlap <- function(bed, assays) { - listInput = lapply(as.list(assays), function(x) { makePanelList(x, bed) }) - names(listInput) = assays - upset(fromList(listInput), - order.by = "freq", - nsets = length(assays), - nintersects = 30) -} - plotCenterXRace <- function(genieClinData) { t = as.data.frame.matrix(table(genieClinData$CENTER,genieClinData$PRIMARY_RACE)) t = data.frame(n = rowSums(t),t) @@ -672,74 +658,6 @@ if (!is.null(assay_infodf) & !is.null(this_bed)) { --- -## Gene Panel Overlaps -This section will show the number of genes that overlap between the myeloid, small and large GENIE panels. - -```{r bed_process, include=F} -if (!is.null(this_bed)) { - this_bed <- this_bed[this_bed$Feature_Type == "exon",] - #Make it so that I use include in panel - if (!is.null(this_bed$includeInPanel)) { - this_bed <- this_bed[this_bed$includeInPanel == "True",] - } - noneExistentAssays = this_assays[!this_assays %in% this_bed$SEQ_ASSAY_ID] - if (length(noneExistentAssays) > 0) { - print(paste("These assays do not have bed files associated with them: ", - paste(noneExistentAssays, collapse = ", "))) - } - this_assays = this_assays[this_assays %in% this_bed$SEQ_ASSAY_ID] - myeloid_panels = c("VICC-01-MYELOID","UHN-54-V1","UCHI-ONCOHEME55-V1","CHOP-HEMEP","MSK-IMPACT-HEME-399") - myeloid = this_assays[this_assays %in% myeloid_panels] - - normal = this_assays[!this_assays %in% myeloid_panels] - seq_pipeline_ids = unique( - this_bed$SEQ_PIPELINE_ID[this_bed$SEQ_ASSAY_ID %in% normal] - ) - smallPanels = c() - largePanels = c() - for (panel in seq_pipeline_ids) { - panelDf = this_bed[this_bed$SEQ_PIPELINE_ID == panel,] - if (length(table(panelDf$Hugo_Symbol)) < 100) { - smallPanels = c(smallPanels, panel) - # Don't add to panel if more than 1500 genes - } else if (length(table(panelDf$Hugo_Symbol)) < 1500) { - largePanels = c(largePanels, panel) - } - } -} else { - largePanels = c() - 
smallPanels = c() - myeloid = c() -} -``` - -### Meyloid Gene Panels - -```{r myeloid_panel_upset} -if (length(myeloid) > 1) { - plotPanelOverlap(this_bed, myeloid) -} - -``` - -### Small (<100) Gene panels - -```{r small_panel_upset, fig.height=12} -if (length(smallPanels) > 1) { - plotPanelOverlap(this_bed, smallPanels) -} -``` - -### Large ($\geq$ 100) Gene panels - -```{r large_panel_upset, fig.height=12} -if (length(largePanels) > 1) { - plotPanelOverlap(this_bed, largePanels) -} -``` - ---- - ## Possible Non-center Related Data Issues This section includes QC issues that are mostly related to the Sage Bionetworks pipeline, Genome Nexus or _maybe_ center related issues.