From 04481d23d7c8cbd2a334788db5e0791d13915720 Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Wed, 16 Mar 2022 15:10:52 -0400 Subject: [PATCH 1/8] modify scripts to generate mosaic variant report --- cre.gemini.variant_impacts.vcf2db.sh | 6 +++++- cre.gemini2txt.vcf2db.sh | 6 +++++- cre.sh | 7 +++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/cre.gemini.variant_impacts.vcf2db.sh b/cre.gemini.variant_impacts.vcf2db.sh index 99deadf..fa5b911 100755 --- a/cre.gemini.variant_impacts.vcf2db.sh +++ b/cre.gemini.variant_impacts.vcf2db.sh @@ -31,10 +31,14 @@ fi #if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"` -if [ ! -z "$callers" ] +if [ ! -z "$callers" ] && [ "$type" != "wes.mosaic" ] then callers="v.callers" caller_filter="and v.callers not in ('freebayes', 'samtools', 'platypus')" +elif [ ! -z "$callers" ] && [ "$type" == "wes.mosaic" ] +then + callers="callers" + caller_filter="" else callers="00" caller_filter="" diff --git a/cre.gemini2txt.vcf2db.sh b/cre.gemini2txt.vcf2db.sh index e5fa5a5..0b7c068 100755 --- a/cre.gemini2txt.vcf2db.sh +++ b/cre.gemini2txt.vcf2db.sh @@ -33,10 +33,14 @@ gemini query -q "select name from samples order by name" $file > samples.txt #if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"` -if [ ! -z "$callers" ] +if [ ! -z "$callers" ] && [ "$type" != "wes.mosaic" ] then callers="callers" caller_filter="and Callers not in ('freebayes', 'samtools', 'platypus')" +elif [ ! -z "$callers" ] && [ "$type" == "wes.mosaic" ] +then + callers="00" + caller_filter="" else callers="00" caller_filter="" diff --git a/cre.sh b/cre.sh index aec59da..741010b 100755 --- a/cre.sh +++ b/cre.sh @@ -7,7 +7,7 @@ # family = [family_id] (=project_id=case_id=folder_name, main result file should be family/family-ensemble.db) # cleanup= [0|1] default = 0 # make_report=[0|1] default = 1, don't make report for WGS analysis first -# type = [ wes.regular (default) | wes.synonymous | wes.fast | rnaseq | wgs | annotate (only for cleaning) | +# type = [ wes.regular (default) | wes.synonymous | wes.mosaic | wes.fast | rnaseq | wgs | annotate (only for cleaning) | # denovo (all rare variants in wgs, proband should have phenotype=2, parents=phenotype1 also sex for parents in gemini.db) ] # max_af = af filter, default = 0.01 # database = path to folder where c4r count files and hgmd.csv are found. @@ -97,6 +97,9 @@ function f_cleanup ln -s ${family}-precalled.db ${family}-ensemble.db ln -s ${family}-precalled-annotated-decomposed.vcf.gz ${family}-ensemble-annotated-decomposed.vcf.gz ln -s ${family}-precalled-annotated-decomposed.vcf.gz.tbi ${family}-ensemble-annotated-decomposed.vcf.gz.tbi + #elif [ "$type" == "wes.mosaic"] + #then + #break # skip the rest of loop else # we don't need gemini databases for individual variant callers rm ${family}-freebayes.db @@ -140,7 +143,7 @@ function f_make_report export depth_threshold=10 fi - if [ "$type" == "wgs" ] || [ "$type" == "rnaseq" ] || [ "$type" == "denovo" ] || [ "$type" == "wes.all" ] + if [ "$type" == "wgs" ] || [ "$type" == "rnaseq" ] || [ "$type" == "denovo" ] || [ "$type" == "wes.all" ] || [ "$type" == "wes.mosaic" ] then export severity_filter=ALL elif [ "$type" == "wes.synonymous" ] From f0d699b3fd16cf3b39cbeb56da2e258f6fc0d59c Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Wed, 4 May 2022 10:18:34 -0400 Subject: [PATCH 2/8] added mosaic caller --- cre.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cre.sh b/cre.sh index 741010b..df90259 100755 --- a/cre.sh +++ b/cre.sh @@ -143,7 +143,7 @@ function f_make_report export depth_threshold=10 fi - if [ "$type" == "wgs" ] || [ "$type" == "rnaseq" ] || [ "$type" == "denovo" ] || [ "$type" == "wes.all" ] || [ "$type" == "wes.mosaic" ] + if [ "$type" == "wgs" ] || [ "$type" == "rnaseq" ] || [ "$type" == "denovo" ] || [ "$type" == "wes.all" ] then export severity_filter=ALL elif [ "$type" == "wes.synonymous" ] From 79393d14e48109b398c8bd734a71a1f4cd69d1c7 Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Tue, 17 May 2022 12:00:05 -0400 Subject: [PATCH 3/8] removed redundant line --- cre.gemini.variant_impacts.vcf2db.sh | 7 ++----- cre.gemini2txt.vcf2db.sh | 4 ---- cre.sh | 3 --- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/cre.gemini.variant_impacts.vcf2db.sh b/cre.gemini.variant_impacts.vcf2db.sh index fa5b911..6cf7a00 100755 --- a/cre.gemini.variant_impacts.vcf2db.sh +++ b/cre.gemini.variant_impacts.vcf2db.sh @@ -30,15 +30,12 @@ else fi #if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus +#else pipeline (mosaic or crg) uses one caller (ie. no "callers" information in gemini database) callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"` -if [ ! -z "$callers" ] && [ "$type" != "wes.mosaic" ] +if [ ! -z "$callers" ] then callers="v.callers" caller_filter="and v.callers not in ('freebayes', 'samtools', 'platypus')" -elif [ ! -z "$callers" ] && [ "$type" == "wes.mosaic" ] -then - callers="callers" - caller_filter="" else callers="00" caller_filter="" diff --git a/cre.gemini2txt.vcf2db.sh b/cre.gemini2txt.vcf2db.sh index 0b7c068..6c2cb28 100755 --- a/cre.gemini2txt.vcf2db.sh +++ b/cre.gemini2txt.vcf2db.sh @@ -37,10 +37,6 @@ if [ ! -z "$callers" ] && [ "$type" != "wes.mosaic" ] then callers="callers" caller_filter="and Callers not in ('freebayes', 'samtools', 'platypus')" -elif [ ! -z "$callers" ] && [ "$type" == "wes.mosaic" ] -then - callers="00" - caller_filter="" else callers="00" caller_filter="" diff --git a/cre.sh b/cre.sh index df90259..fdb039e 100755 --- a/cre.sh +++ b/cre.sh @@ -97,9 +97,6 @@ function f_cleanup ln -s ${family}-precalled.db ${family}-ensemble.db ln -s ${family}-precalled-annotated-decomposed.vcf.gz ${family}-ensemble-annotated-decomposed.vcf.gz ln -s ${family}-precalled-annotated-decomposed.vcf.gz.tbi ${family}-ensemble-annotated-decomposed.vcf.gz.tbi - #elif [ "$type" == "wes.mosaic"] - #then - #break # skip the rest of loop else # we don't need gemini databases for individual variant callers rm ${family}-freebayes.db From 08c5cc1fd0728d7c4332fb7b5a59363d4aedf887 Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Tue, 17 May 2022 12:18:30 -0400 Subject: [PATCH 4/8] add comments --- cre.gemini.variant_impacts.vcf2db.sh | 4 ++-- cre.gemini2txt.vcf2db.sh | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cre.gemini.variant_impacts.vcf2db.sh b/cre.gemini.variant_impacts.vcf2db.sh index 6cf7a00..6418896 100755 --- a/cre.gemini.variant_impacts.vcf2db.sh +++ b/cre.gemini.variant_impacts.vcf2db.sh @@ -30,9 +30,9 @@ else fi #if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus -#else pipeline (mosaic or crg) uses one caller (ie. no "callers" information in gemini database) +#else pipeline is mosaic/crg (uses one caller), do not filter by caller, i.e. no "callers" in the gemini db callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"` -if [ ! -z "$callers" ] +if [ ! -z "$callers" ] #variable $callers is not an empty string, i.e. it exists in the gemini db then callers="v.callers" caller_filter="and v.callers not in ('freebayes', 'samtools', 'platypus')" diff --git a/cre.gemini2txt.vcf2db.sh b/cre.gemini2txt.vcf2db.sh index 6c2cb28..9f7f0c2 100755 --- a/cre.gemini2txt.vcf2db.sh +++ b/cre.gemini2txt.vcf2db.sh @@ -32,8 +32,9 @@ alt_depth=3 gemini query -q "select name from samples order by name" $file > samples.txt #if pipeline is cre, filter out variants only called by one of freebayes, samtools, platypus +#else pipeline is mosaic/crg (uses one caller), do not filter by caller, i.e. no "callers" in the gemini db callers=`gemini db_info $file | grep -w "variants" | grep -w "callers"` -if [ ! -z "$callers" ] && [ "$type" != "wes.mosaic" ] +if [ ! -z "$callers" ] #variable $callers is not an empty string, i.e. it exists in the gemini db then callers="callers" caller_filter="and Callers not in ('freebayes', 'samtools', 'platypus')" From 71931d9a9cdc80b0d0d7cbf0f14f5ef9b67da246 Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Wed, 3 Aug 2022 15:39:58 -0400 Subject: [PATCH 5/8] allows report generation when spliceAI is NA --- cre.vcf2db.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cre.vcf2db.R b/cre.vcf2db.R index 45867fb..37e5e1f 100644 --- a/cre.vcf2db.R +++ b/cre.vcf2db.R @@ -290,7 +290,7 @@ create_report <- function(family, samples, type){ variants <- add_placeholder(variants, "SpliceAI_impact", "") for (i in 1:nrow(variants)){ print(i) - if (variants[i,"SpliceAI_score"] == ""){ + if (is.na(variants[i,"SpliceAI_score"])){ variants[i, "SpliceAI_impact"] <- "NA|NA|NA" variants[i, "SpliceAI_score"] <- 0 } else { @@ -346,7 +346,7 @@ create_report <- function(family, samples, type){ # Column44 = cadd # Column45 = vest3 for (i in 1:nrow(variants)){ - v_vest <- strsplit(variants[i,"Vest3_score"], ",", fixed = T)[[1]] + v_vest <- strsplit(as.character(variants[i,"Vest3_score"]), ",", fixed = T)[[1]] variants[i, "Vest3_score"] <- max(v_vest) } From 4427b5e4946a41dd4eb47c648e4ee20f39d4b1c8 Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Mon, 15 Aug 2022 13:37:09 -0400 Subject: [PATCH 6/8] add mosaic-panel functionality --- cre.vcf2db.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cre.vcf2db.R b/cre.vcf2db.R index 37e5e1f..8141717 100644 --- a/cre.vcf2db.R +++ b/cre.vcf2db.R @@ -47,6 +47,26 @@ genotype2zygocity <- function (genotype_str, ref, alt_depth){ return(result) } +# when no variants are made into report when running WES mosaic-panel pipeline +check_empty <- function(family, type){ + file <- paste0(family, ".variants.txt") + variants <- get_variants_from_file(file) + + if (nrow(variants) == 0){ + variants = data.frame() + variants[1,1] = paste0("No variants are reported for ", family) + + write.csv(variants, paste0(family,".wes.regular.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".wes.mosaic.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".clinical.wes.regular.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".clinical.wes.mosaic.", datetime, ".csv"), row.names = F) + + }else{ + message("Create report ...") + } + +} + # output : family.ensemble.txt create_report <- function(family, samples, type){ file <- paste0(family, ".variants.txt") @@ -891,6 +911,8 @@ samples <- gsub("-", ".", samples) print("Loading tables") load_tables(debug) +print("Check empty tables") +check_empty(family) print("Creating report") create_report(family,samples,type) print("Merging reports") From 08fd91735d128cce505f0ef44f457177cbf44f1c Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Tue, 30 Aug 2022 11:14:18 -0400 Subject: [PATCH 7/8] Fixed bug by making columns character in read.delim() --- cre.vcf2db.R | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/cre.vcf2db.R b/cre.vcf2db.R index 8141717..462b543 100644 --- a/cre.vcf2db.R +++ b/cre.vcf2db.R @@ -11,7 +11,8 @@ add_placeholder <- function(variants, column_name, placeholder){ } get_variants_from_file <- function (filename){ - variants <- read.delim(filename, stringsAsFactors = F) + variants <- read.delim(filename, stringsAsFactors=FALSE, + colClasses = c( "character" )) return(variants) } @@ -56,13 +57,13 @@ check_empty <- function(family, type){ variants = data.frame() variants[1,1] = paste0("No variants are reported for ", family) - write.csv(variants, paste0(family,".wes.regular.", datetime, ".csv"), row.names = F) - write.csv(variants, paste0(family,".wes.mosaic.", datetime, ".csv"), row.names = F) - write.csv(variants, paste0(family,".clinical.wes.regular.", datetime, ".csv"), row.names = F) - write.csv(variants, paste0(family,".clinical.wes.mosaic.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".wes.regular.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".wes.mosaic.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".clinical.wes.regular.", datetime, ".csv"), row.names = F) + write.csv(variants, paste0(family,".clinical.wes.mosaic.", datetime, ".csv"), row.names = F) - }else{ - message("Create report ...") + } else{ + message("Go ahead to create report with at least 1 variant") } } @@ -71,6 +72,7 @@ check_empty <- function(family, type){ create_report <- function(family, samples, type){ file <- paste0(family, ".variants.txt") variants <- get_variants_from_file(file) + #print(paste0("In create_report function, Alt is ", variants$Alt)) impact_file <- paste0(family, ".variant_impacts.txt") impacts <- get_variants_from_file(impact_file) @@ -475,6 +477,7 @@ select_and_write2 <- function(variants, samples, prefix, type) "Number_of_callers", "Old_multiallelic", "UCE_100bp", "UCE_200bp"), noncoding_cols)] variants <- variants[order(variants$Position),] + #print(paste0("In select_and_write2, Alt is ", variants$Alt)) if (type == 'denovo'){ variants <- variants[variants$C4R_WGS_counts < 10,] @@ -493,7 +496,8 @@ fix_column_name <- function(column_name){ # merges ensembl, gatk-haplotype reports merge_reports <- function(family, samples, type){ ensemble_file <- paste0(family, ".create_report.csv") - ensemble <- read.csv(ensemble_file, stringsAsFactors = F) + ensemble <- read.csv(ensemble_file, stringsAsFactors = F, colClasses = c("character")) + #print(paste0("In select_and_write2 from merge_reports function, Alt is ", ensemble$Alt)) ensemble$superindex <- with(ensemble, paste(Position, Ref, Alt, sep = '-')) for (i in 1:nrow(ensemble)){ @@ -509,7 +513,8 @@ merge_reports <- function(family, samples, type){ ensemble_table_file <- paste0(family, ".table") if (file.exists(ensemble_table_file)){ - ensemble_table <- read.delim(ensemble_table_file, stringsAsFactors = F) + ensemble_table <- read.delim(ensemble_table_file, stringsAsFactors = F, colClasses = c("character")) + #print(paste0("In select_and_write2 ens_table_file from merge_reports function, Alt is ", ensemble$Alt)) ensemble_table$superindex <- with(ensemble_table, paste(paste0(CHROM,":",POS), REF, ALT, sep = '-')) ensemble_table[c("CHROM", "POS", "REF", "ALT")] <- NULL for (i in 1:nrow(ensemble_table)){ @@ -744,7 +749,7 @@ parse_ad <- function(ad_cell) { } annotate_w_care4rare <- function(family,samples,type){ - variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F) + variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F, colClasses = c( "character" )) variants$superindex <- with(variants, paste(Position, Ref, Alt, sep='-')) From ccbdab7c943d99dee0761d88c8cd5570efb9c83c Mon Sep 17 00:00:00 2001 From: pamelaxu213 Date: Wed, 31 Aug 2022 15:04:52 -0400 Subject: [PATCH 8/8] remove unnessecary lines --- cre.vcf2db.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cre.vcf2db.R b/cre.vcf2db.R index 462b543..2cf3999 100644 --- a/cre.vcf2db.R +++ b/cre.vcf2db.R @@ -11,8 +11,7 @@ add_placeholder <- function(variants, column_name, placeholder){ } get_variants_from_file <- function (filename){ - variants <- read.delim(filename, stringsAsFactors=FALSE, - colClasses = c( "character" )) + variants <- read.delim(filename, stringsAsFactors=FALSE, colClasses = c("character")) return(variants) } @@ -72,7 +71,6 @@ check_empty <- function(family, type){ create_report <- function(family, samples, type){ file <- paste0(family, ".variants.txt") variants <- get_variants_from_file(file) - #print(paste0("In create_report function, Alt is ", variants$Alt)) impact_file <- paste0(family, ".variant_impacts.txt") impacts <- get_variants_from_file(impact_file) @@ -497,7 +495,6 @@ fix_column_name <- function(column_name){ merge_reports <- function(family, samples, type){ ensemble_file <- paste0(family, ".create_report.csv") ensemble <- read.csv(ensemble_file, stringsAsFactors = F, colClasses = c("character")) - #print(paste0("In select_and_write2 from merge_reports function, Alt is ", ensemble$Alt)) ensemble$superindex <- with(ensemble, paste(Position, Ref, Alt, sep = '-')) for (i in 1:nrow(ensemble)){ @@ -514,7 +511,6 @@ merge_reports <- function(family, samples, type){ ensemble_table_file <- paste0(family, ".table") if (file.exists(ensemble_table_file)){ ensemble_table <- read.delim(ensemble_table_file, stringsAsFactors = F, colClasses = c("character")) - #print(paste0("In select_and_write2 ens_table_file from merge_reports function, Alt is ", ensemble$Alt)) ensemble_table$superindex <- with(ensemble_table, paste(paste0(CHROM,":",POS), REF, ALT, sep = '-')) ensemble_table[c("CHROM", "POS", "REF", "ALT")] <- NULL for (i in 1:nrow(ensemble_table)){ @@ -749,8 +745,7 @@ parse_ad <- function(ad_cell) { } annotate_w_care4rare <- function(family,samples,type){ - variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F, colClasses = c( "character" )) - + variants <- read.csv(paste0(family, ".merge_reports.csv"), stringsAsFactors = F, colClasses = c("character")) variants$superindex <- with(variants, paste(Position, Ref, Alt, sep='-')) if(exists("seen_in_c4r_counts")){