From eedf846067e37ce01cac6f61fe19a7379e41e918 Mon Sep 17 00:00:00 2001 From: Ryan Corbett Date: Mon, 6 Nov 2023 13:18:13 -0500 Subject: [PATCH] rm address_conflicting_interpretations() function and command --- scripts/02-annotate_variants_CAVATICA_input.R | 41 ------------------- scripts/02-annotate_variants_custom_input.R | 37 ----------------- 2 files changed, 78 deletions(-) diff --git a/scripts/02-annotate_variants_CAVATICA_input.R b/scripts/02-annotate_variants_CAVATICA_input.R index 89ce5df..055474c 100644 --- a/scripts/02-annotate_variants_CAVATICA_input.R +++ b/scripts/02-annotate_variants_CAVATICA_input.R @@ -84,42 +84,6 @@ output_tab_abr_file <- paste0(output_name, ".cavatica_input.annotations_report.a ## allocate more memory capacity Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2) - -address_conflicting_interp <- function(clinvar_anno_vcf_df) { ## if conflicting intrep. take the call with most calls in CLNSIGCONF field - - clinvar_nr <- clinvar_anno_vcf_df %>% - dplyr::filter(Stars == "1NR" & !is.na(Stars)) - - for (i in 1:nrow(clinvar_nr)) { - conf_section <- str_match(clinvar_nr$INFO[i], "CLNSIGCONF\\=.+\\;CLNVC") ## part to parse and count calls - call_names <- c("Pathogenic", "Likely_pathogenic", "Benign", "Likely_benign", "Uncertain_significance") - - P <- (str_match(conf_section, "Pathogenic\\((\\d+)\\)")[, 2]) - LP <- (str_match(conf_section, "Likely_pathogenic\\((\\d+)\\)")[, 2]) - B <- (str_match(conf_section, "Benign\\((\\d+)\\)")[, 2]) - LB <- (str_match(conf_section, "Likely_benign\\((\\d+)\\)")[, 2]) - U <- (str_match(conf_section, "Uncertain_significance\\((\\d+)\\)")[, 2]) - - ## make vector out of possible calls to get max - calls <- c(P, LP, B, LB, U) - - if (length(which(calls == max(calls, na.rm = TRUE))) > 1) { - next - } - - highest_ind <- which.max(calls) - consensus_call <- call_names[highest_ind] - - clinvar_nr[i, ]$final_call_clinvar <- consensus_call - } - - clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>% - left_join(clinvar_nr[, c("vcf_id", "final_call_clinvar")], by = "vcf_id", suffix = c(".orig", ".resolved")) %>% - dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.resolved, final_call_clinvar.orig)) %>% - dplyr::select(-final_call_clinvar.resolved, -final_call_clinvar.orig) %>% - return(clinvar_anno_vcf_df) -} - address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call results_tab_abridged <- results_tab_abridged %>% @@ -162,11 +126,6 @@ clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>% final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2] ) - -## if conflicting intrep. take the call with most calls in CLNSIGCONF field -clinvar_anno_vcf_df <- address_conflicting_interp(clinvar_anno_vcf_df) - - ## get latest calls from variant and submission summary files variant_summary_df <- vroom(input_variant_summary, show_col_types = FALSE) %>% filter(vcf_id %in% clinvar_anno_vcf_df$vcf_id) %>% diff --git a/scripts/02-annotate_variants_custom_input.R b/scripts/02-annotate_variants_custom_input.R index 1ffd364..751be80 100644 --- a/scripts/02-annotate_variants_custom_input.R +++ b/scripts/02-annotate_variants_custom_input.R @@ -111,41 +111,6 @@ address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous return(results_tab_abridged) } -address_conflicting_interp <- function(clinvar_anno_vcf_df) { ## if conflicting intrep. take the call with most calls in CLNSIGCONF field - - clinvar_nr <- clinvar_anno_vcf_df %>% - dplyr::filter(Stars == "1NR" & !is.na(Stars)) - - for (i in 1:nrow(clinvar_nr)) { - conf_section <- str_match(clinvar_nr$INFO[i], "CLNSIGCONF\\=.+\\;CLNVC") ## part to parse and count calls - call_names <- c("Pathogenic", "Likely_pathogenic", "Benign", "Likely_benign", "Uncertain_significance") - - P <- (str_match(conf_section, "Pathogenic\\((\\d+)\\)")[, 2]) - LP <- (str_match(conf_section, "Likely_pathogenic\\((\\d+)\\)")[, 2]) - B <- (str_match(conf_section, "Benign\\((\\d+)\\)")[, 2]) - LB <- (str_match(conf_section, "Likely_benign\\((\\d+)\\)")[, 2]) - U <- (str_match(conf_section, "Uncertain_significance\\((\\d+)\\)")[, 2]) - - ## make vector out of possible calls to get max - calls <- c(P, LP, B, LB, U) - - if (length(which(calls == max(calls, na.rm = TRUE))) > 1) { - next - } - - highest_ind <- which.max(calls) - consensus_call <- call_names[highest_ind] - - clinvar_nr[i, ]$final_call_clinvar <- consensus_call - } - - clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>% - left_join(clinvar_nr[, c("vcf_id", "final_call_clinvar")], by = "vcf_id", suffix = c(".orig", ".resolved")) %>% - dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.resolved, final_call_clinvar.orig)) %>% - dplyr::select(-final_call_clinvar.resolved, -final_call_clinvar.orig) %>% - return(clinvar_anno_vcf_df) -} - ## make vcf dataframe and add vcf_if column vcf_df <- vroom(input_vcf_file, comment = "#", delim = "\t", col_names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"), trim_ws = TRUE, show_col_types = FALSE) %>% mutate( @@ -174,8 +139,6 @@ clinvar_anno_vcf_df <- vroom(input_clinVar_file, comment = "#", delim = "\t", co final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2] ) -clinvar_anno_vcf_df <- address_conflicting_interp(clinvar_anno_vcf_df) - ## store variants without clinvar info clinvar_anti_join_vcf_df <- anti_join(vcf_df, clinvar_anno_vcf_df, by = "vcf_id") %>% dplyr::mutate(