From eedf846067e37ce01cac6f61fe19a7379e41e918 Mon Sep 17 00:00:00 2001
From: Ryan Corbett <corbettr@chop.edu>
Date: Mon, 6 Nov 2023 13:18:13 -0500
Subject: [PATCH] rm address_conflicting_interp() function and its call

---
 scripts/02-annotate_variants_CAVATICA_input.R | 41 -------------------
 scripts/02-annotate_variants_custom_input.R   | 37 -----------------
 2 files changed, 78 deletions(-)

diff --git a/scripts/02-annotate_variants_CAVATICA_input.R b/scripts/02-annotate_variants_CAVATICA_input.R
index 89ce5df..055474c 100644
--- a/scripts/02-annotate_variants_CAVATICA_input.R
+++ b/scripts/02-annotate_variants_CAVATICA_input.R
@@ -84,42 +84,6 @@ output_tab_abr_file <- paste0(output_name, ".cavatica_input.annotations_report.a
 ## allocate more memory capacity
 Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
 
-
-address_conflicting_interp <- function(clinvar_anno_vcf_df) { ## if conflicting intrep. take the call with most calls in CLNSIGCONF field
-
-  clinvar_nr <- clinvar_anno_vcf_df %>%
-    dplyr::filter(Stars == "1NR" & !is.na(Stars))
-
-  for (i in 1:nrow(clinvar_nr)) {
-    conf_section <- str_match(clinvar_nr$INFO[i], "CLNSIGCONF\\=.+\\;CLNVC") ## part to parse and count calls
-    call_names <- c("Pathogenic", "Likely_pathogenic", "Benign", "Likely_benign", "Uncertain_significance")
-
-    P <- (str_match(conf_section, "Pathogenic\\((\\d+)\\)")[, 2])
-    LP <- (str_match(conf_section, "Likely_pathogenic\\((\\d+)\\)")[, 2])
-    B <- (str_match(conf_section, "Benign\\((\\d+)\\)")[, 2])
-    LB <- (str_match(conf_section, "Likely_benign\\((\\d+)\\)")[, 2])
-    U <- (str_match(conf_section, "Uncertain_significance\\((\\d+)\\)")[, 2])
-
-    ## make vector out of possible calls to get max
-    calls <- c(P, LP, B, LB, U)
-
-    if (length(which(calls == max(calls, na.rm = TRUE))) > 1) {
-      next
-    }
-
-    highest_ind <- which.max(calls)
-    consensus_call <- call_names[highest_ind]
-
-    clinvar_nr[i, ]$final_call_clinvar <- consensus_call
-  }
-
-  clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>%
-    left_join(clinvar_nr[, c("vcf_id", "final_call_clinvar")], by = "vcf_id", suffix = c(".orig", ".resolved")) %>%
-    dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.resolved, final_call_clinvar.orig)) %>%
-    dplyr::select(-final_call_clinvar.resolved, -final_call_clinvar.orig) %>%
-    return(clinvar_anno_vcf_df)
-}
-
 address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous calls (non L/LB/P/LP/VUS) by taking the InterVar final call
 
   results_tab_abridged <- results_tab_abridged %>%
@@ -162,11 +126,6 @@ clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>%
     final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2]
   )
 
-
-## if conflicting intrep. take the call with most calls in CLNSIGCONF field
-clinvar_anno_vcf_df <- address_conflicting_interp(clinvar_anno_vcf_df)
-
-
 ## get latest calls from variant and submission summary files
 variant_summary_df <- vroom(input_variant_summary, show_col_types = FALSE) %>%
   filter(vcf_id %in% clinvar_anno_vcf_df$vcf_id) %>%
diff --git a/scripts/02-annotate_variants_custom_input.R b/scripts/02-annotate_variants_custom_input.R
index 1ffd364..751be80 100644
--- a/scripts/02-annotate_variants_custom_input.R
+++ b/scripts/02-annotate_variants_custom_input.R
@@ -111,41 +111,6 @@ address_ambiguous_calls <- function(results_tab_abridged) { ## address ambiguous
   return(results_tab_abridged)
 }
 
-address_conflicting_interp <- function(clinvar_anno_vcf_df) { ## if conflicting intrep. take the call with most calls in CLNSIGCONF field
-
-  clinvar_nr <- clinvar_anno_vcf_df %>%
-    dplyr::filter(Stars == "1NR" & !is.na(Stars))
-
-  for (i in 1:nrow(clinvar_nr)) {
-    conf_section <- str_match(clinvar_nr$INFO[i], "CLNSIGCONF\\=.+\\;CLNVC") ## part to parse and count calls
-    call_names <- c("Pathogenic", "Likely_pathogenic", "Benign", "Likely_benign", "Uncertain_significance")
-
-    P <- (str_match(conf_section, "Pathogenic\\((\\d+)\\)")[, 2])
-    LP <- (str_match(conf_section, "Likely_pathogenic\\((\\d+)\\)")[, 2])
-    B <- (str_match(conf_section, "Benign\\((\\d+)\\)")[, 2])
-    LB <- (str_match(conf_section, "Likely_benign\\((\\d+)\\)")[, 2])
-    U <- (str_match(conf_section, "Uncertain_significance\\((\\d+)\\)")[, 2])
-
-    ## make vector out of possible calls to get max
-    calls <- c(P, LP, B, LB, U)
-
-    if (length(which(calls == max(calls, na.rm = TRUE))) > 1) {
-      next
-    }
-
-    highest_ind <- which.max(calls)
-    consensus_call <- call_names[highest_ind]
-
-    clinvar_nr[i, ]$final_call_clinvar <- consensus_call
-  }
-
-  clinvar_anno_vcf_df <- clinvar_anno_vcf_df %>%
-    left_join(clinvar_nr[, c("vcf_id", "final_call_clinvar")], by = "vcf_id", suffix = c(".orig", ".resolved")) %>%
-    dplyr::mutate(final_call_clinvar = coalesce(final_call_clinvar.resolved, final_call_clinvar.orig)) %>%
-    dplyr::select(-final_call_clinvar.resolved, -final_call_clinvar.orig) %>%
-    return(clinvar_anno_vcf_df)
-}
-
 ## make vcf dataframe and add vcf_if column
 vcf_df <- vroom(input_vcf_file, comment = "#", delim = "\t", col_names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "Sample"), trim_ws = TRUE, show_col_types = FALSE) %>%
   mutate(
@@ -174,8 +139,6 @@ clinvar_anno_vcf_df <- vroom(input_clinVar_file, comment = "#", delim = "\t", co
     final_call_clinvar = str_match(INFO, "CLNSIG\\=(\\w+)([\\|\\/]\\w+)*\\;")[, 2]
   )
 
-clinvar_anno_vcf_df <- address_conflicting_interp(clinvar_anno_vcf_df)
-
 ## store variants without clinvar info
 clinvar_anti_join_vcf_df <- anti_join(vcf_df, clinvar_anno_vcf_df, by = "vcf_id") %>%
   dplyr::mutate(