From a0f77effd5beaeb082377abf0749a58fe351be8a Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 9 Jun 2023 14:15:13 +1000 Subject: [PATCH 01/11] Switch to sash inputs --- NAMESPACE | 9 +- R/oncoviral.R | 141 ++++ R/purple.R | 175 +++-- R/rmd.R | 22 +- R/sv.R | 91 +-- R/utils.R | 14 +- README.md | 15 +- inst/cli/canrep.R | 16 +- inst/rmd/umccrise/_navbar.html | 17 +- inst/rmd/umccrise/cancer_report.Rmd | 606 ++++++------------ inst/rmd/umccrise/render.R | 1 - inst/rmd/umccrise/render.sh | 1 - man/cancer_rmd.Rd | 17 +- man/is_vcf_bpi.Rd | 14 - man/purple_cnv_germ_process.Rd | 26 - man/purple_cnv_germ_read.Rd | 21 - man/purple_cnv_som_process.Rd | 3 +- .../test-roxytest-testexamples-purple.R | 28 +- .../testthat/test-roxytest-testexamples-sv.R | 8 +- .../test-roxytest-testexamples-utils.R | 6 +- 20 files changed, 578 insertions(+), 653 deletions(-) create mode 100644 R/oncoviral.R delete mode 100644 man/is_vcf_bpi.Rd delete mode 100644 man/purple_cnv_germ_process.Rd delete mode 100644 man/purple_cnv_germ_read.Rd diff --git a/NAMESPACE b/NAMESPACE index c12c891..26cdef5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,12 @@ # Generated by roxygen2: do not edit by hand +export(abbreviate_effect) export(af_summary) export(cancer_rmd) +export(count_pieces) export(date_log) export(hrd_results_tabs) export(is_vcf) -export(is_vcf_bpi) export(linx_breakend_process) export(linx_breakend_read) export(linx_clusters_process) @@ -32,8 +33,8 @@ export(pkg_exists) export(plot_bnd_sr_pr_tot_hist) export(plot_bnd_sr_pr_tot_lines) export(process_sv) -export(purple_cnv_germ_process) -export(purple_cnv_germ_read) +export(purple_cnv_som_ann_process) +export(purple_cnv_som_ann_read) export(purple_cnv_som_gene_process) export(purple_cnv_som_gene_read) export(purple_cnv_som_process) @@ -47,6 +48,8 @@ export(session_info_kable) export(tsv_is_empty) export(umccrise_read_sv_tsv) export(vcf_is_empty) +export(virusbreakend_summary_read) +export(virusbreakend_vcf_read) export(write_jsongz) export(write_tsvgz) export(write_tsvjsongz) diff --git a/R/oncoviral.R b/R/oncoviral.R new file mode 100644 index 0000000..7c87f0d --- /dev/null +++ b/R/oncoviral.R @@ -0,0 +1,141 @@ +#' Read VIRUSBreakend Summary File +#' +#' Reads the `virusbreakend.vcf.summary.tsv` file. +#' +#' @param x Path to `virusbreakend.vcf.summary.tsv` file. +#' +#' @return List with two elements: +#' * `tab`: Tibble containing data. +#' * `descr`: Description of tibble columns. +#' +#' @examples +#' x <- system.file("extdata/virusbreakend/virusbreakend.vcf.summary.tsv", package = "gpgr") +#' (vb <- virusbreakend_summary_read(x)) +#' @testexamples +#' expect_equal(colnames(vb)[ncol(vb)], "QCStatus") +#' +#' @export +virusbreakend_summary_read <- function(x) { + + nm <- c( + "taxid_genus" = "c", + "name_genus" = "c", + "reads_genus_tree" = "i", + "taxid_species" = "c", + "name_species" = "c", + "reads_species_tree" = "i", + "taxid_assigned" = "c", + "name_assigned" = "c", + "reads_assigned_tree" = "i", + "reads_assigned_direct" = "i", + "reference" = "c", + "reference_taxid" = "c", + "reference_kmer_count" = "i", + "alternate_kmer_count" = "i", + "rname" = "c", + "startpos" = "i", + "endpos" = "i", + "numreads" = "i", + "covbases" = "d", + "coverage" = "d", + "meandepth" = "d", + "meanbaseq" = "d", + "meanmapq" = "d", + "integrations" = "i", + "QCStatus" = "c" + ) + + ctypes <- paste(nm, collapse = "") + virusbreakend_summary <- readr::read_tsv(x, col_types = ctypes) + + if (nrow(virusbreakend_summary) > 0) { + assertthat::assert_that(ncol(virusbreakend_summary) == length(nm)) + assertthat::assert_that(all(colnames(virusbreakend_summary) == names(nm))) + + virusbreakend_summary <- virusbreakend_summary |> + dplyr::select( + Virus="name_assigned", + Length="endpos", + Reads="numreads", + Coverage="coverage", + `Mean depth`="meandepth", + Intergrations="integrations", + QC="QCStatus", + ) + } + + descr <- dplyr::tribble( + ~Column, ~Description, + "Virus", "Assigned NCBI taxonomy name of viral reference", + "Length", "Length of viral contig", + "Reads", "Number of reads mapped to adjusted viral reference", + "Coverage", "Percentage of viral positions with at least one read mapped", + "Mean depth", "Mean alignment depth", + "Intergrations", "Number of detected integration breakpoints", + "QC", "QC status of viral intergrations", + ) + + list( + tab = virusbreakend_summary, + descr = descr + ) +} + +#' Read VIRUSBreakend VCF File +#' +#' Reads the `virusbreakend.vcf` file and selects data to present. +#' +#' @param x Path to `virusbreakend.vcf` file. +#' +#' @return List with two elements: +#' * `tab`: Tibble containing selected data. +#' * `descr`: Description of tibble columns. +#' +#' @examples +#' x <- system.file("extdata/virusbreakend/virusbreakend.vcf", package = "gpgr") +#' (vb <- virusbreakend_vcf_read(x)) +#' @testexamples +#' expect_equal(colnames(vb)[ncol(vb)], "QC") +#' +#' @export +virusbreakend_vcf_read <- function(x) { + + d <- bedr::read.vcf(x, split.info = TRUE, verbose = FALSE) + + if (nrow(d$vcf) > 0) { + virusbreakend_integrations <- tibble::as_tibble(d$vcf) |> + dplyr::select( + Contig="CHROM", + Position="POS", + "Fragment support"="BVF", + "Fragment support (unmapped)"="BUM", + "Softclip read support"="BSC", + Reference="REF", + Alt="ALT", + `Breakend ID`="ID", + `Mate ID`="MATEID", + QC="FILTER", + ) + } else { + virusbreakend_integrations <- tibble::tibble() + } + + descr <- dplyr::tribble( + ~Column, ~Description, + "Contig", "Name of contig", + "Position", "Position of breakend in contig", + "Breakend ID", "ID of integration breakend", + "Mate ID", "ID of integration breakend mate", + "Reference", "Reference allele", + "Alt", "Alternative allele", + "QC", "VCF filter values", + "Fragment support", "Total number of fragments supporting breakend", + "Fragment support (unmapped)", "Number of fragments supporting breakend that have one read unmapped", + "Softclip read support", "Number of softclipped reads supporting breakend" + ) + + list( + tab = virusbreakend_integrations, + description = descr + ) +} diff --git a/R/purple.R b/R/purple.R index a417d42..dfd5bde 100644 --- a/R/purple.R +++ b/R/purple.R @@ -18,13 +18,12 @@ purple_cnv_som_gene_read <- function(x) { nm <- c( "chromosome" = "c", "start" = "i", "end" = "i", "gene" = "c", - "minCopyNumber" = "d", "maxCopyNumber" = "d", - "unused" = "c", "somaticRegions" = "d", "germlineHomDeletionRegions" = "d", - "germlineHetToHomDeletionRegions" = "d", - "transcriptId" = "c", "transcriptVersion" = "c", "chromosomeBand" = "c", + "minCopyNumber" = "d", "maxCopyNumber" = "d", "somaticRegions" = "d", + "transcriptId" = "c", "isCanonical" = "c", "chromosomeBand" = "c", "minRegions" = "d", "minRegionStart" = "i", "minRegionEnd" = "i", "minRegionStartSupport" = "c", "minRegionEndSupport" = "c", - "minRegionMethod" = "c", "minMinorAlleleCopyNumber" = "d" + "minRegionMethod" = "c", "minMinorAlleleCopyNumber" = "d", + "depthWindowCount" = "i" ) ctypes <- paste(nm, collapse = "") @@ -34,7 +33,6 @@ purple_cnv_som_gene_read <- function(x) { purple_cnv_gene } - #' Process PURPLE CNV Gene File for UMCCRISE #' #' Processes the `purple.cnv.gene.tsv` file. Keeps genes that are in the @@ -80,13 +78,12 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { dplyr::filter(.data$gene %in% genes$symbol) |> dplyr::mutate( chromosome = as.factor(.data$chromosome), - transcriptID = paste0(.data$transcriptId, ".", .data$transcriptVersion), + transcriptID = paste0(.data$transcriptId), minRegStartEnd = paste0(.data$minRegionStart, "-", .data$minRegionEnd), minRegSupportStartEndMethod = paste0( .data$minRegionStartSupport, "-", .data$minRegionEndSupport, " (", .data$minRegionMethod, ")" ), - germDelReg = paste0(.data$germlineHomDeletionRegions, "/", .data$germlineHetToHomDeletionRegions), oncogene = .data$gene %in% oncogenes, tsgene = .data$gene %in% tsgenes, onco_or_ts = dplyr::case_when( @@ -101,7 +98,7 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { chrom = "chromosome", "start", "end", chrBand = "chromosomeBand", "onco_or_ts", "transcriptID", minMinorAlleleCN = "minMinorAlleleCopyNumber", - somReg = "somaticRegions", "germDelReg", minReg = "minRegions", + somReg = "somaticRegions", minReg = "minRegions", "minRegStartEnd", "minRegSupportStartEndMethod" ) @@ -115,7 +112,6 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { "transcriptID", "Ensembl transcript ID (dot version)", "minMinorAlleleCN", "Minimum allele ploidy found over the gene exons - useful for identifying LOH events", "somReg (somaticRegions)", "Count of somatic copy number regions this gene spans", - "germDelReg (germlineHomDeletionRegions / germlineHetToHomDeletionRegions)", "Number of regions spanned by this gene that are (homozygously deleted in the germline / both heterozygously deleted in the germline and homozygously deleted in the tumor)", "minReg (minRegions)", "Number of somatic regions inside the gene that share the min copy number", "minRegStartEnd", "Start/End base of the copy number region overlapping the gene with the minimum copy number", "minRegSupportStartEndMethod", "Start/end support of the CN region overlapping the gene with the min CN (plus determination method)" @@ -162,8 +158,7 @@ purple_cnv_som_read <- function(x) { #' Process PURPLE CNV Somatic File for UMCCRISE #' -#' Processes the `purple.cnv.somatic.tsv` file. -#' and selects columns of interest. +#' Processes the `purple.cnv.somatic.tsv` file and selects columns of interest. #' #' @param x Path to `purple.cnv.somatic.tsv` file. #' @@ -230,49 +225,142 @@ purple_cnv_som_process <- function(x) { ) } -#' Read PURPLE CNV Germline File +#' Process an Annotated PURPLE CNV Somatic File #' -#' Reads the `purple.cnv.germline.tsv` file. +#' Processes the annotated and prioritised `purple.cnv.somatic.tsv` file. #' -#' @param x Path to `purple.cnv.germline.tsv` file. +#' @param x Path to annotated and prioritised `purple.cnv.somatic.tsv` file. #' -#' @return The input file as a tibble. +#' @return List with two elements: +#' * `tab`: Tibble containing selected data. +#' * `descr`: Description of tibble columns. #' #' @examples -#' x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -#' (p <- purple_cnv_germ_read(x)) +#' x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") +#' (p <- purple_cnv_som_ann_process(x)) #' @testexamples -#' expect_equal(colnames(p)[ncol(p)], "majorAlleleCopyNumber") +#' expect_equal(colnames(p)[ncol(p)], "annotation") #' #' @export -purple_cnv_germ_read <- function(x) { - # as of PURPLE v2.39, germline and somatic files have same columns. - purple_cnv_germline <- purple_cnv_som_read(x) - purple_cnv_germline +purple_cnv_som_ann_process <- function(x) { + purple_cnv_somatic_ann <- purple_cnv_som_ann_read(x) + + purple_cnv_somatic_ann___d <- purple_cnv_somatic_ann |> + dplyr::mutate( + Chr = as.factor(.data$chromosome), + minorAlleleCopyNumber = round(.data$minorAlleleCopyNumber, 1), + majorAlleleCopyNumber = round(.data$majorAlleleCopyNumber, 1), + `CopyNumber Min+Maj` = paste0(.data$minorAlleleCopyNumber, "+", .data$majorAlleleCopyNumber), + copyNumber = round(.data$copyNumber, 1), + bafAdj = round(.data$baf, 2), + gcContent = round(.data$gcContent, 2), + `Start/End SegSupport` = paste0(.data$segmentStartSupport, "-", .data$segmentEndSupport), + `BAF (count)` = paste0(.data$bafAdj, " (", .data$bafCount, ")"), + `GC (windowCount)` = paste0(.data$gcContent, " (", .data$depthWindowCount, ")"), + TopTier = sv_top_tier, + annotation = simple_ann + ) |> + dplyr::select( + "Chr", + Start = "start", End = "end", Type = "svtype", CN = "copyNumber", + `CN Min+Maj` = "CopyNumber Min+Maj", "Start/End SegSupport", + Method = "method", "BAF (count)", "GC (windowCount)", "TopTier", "annotation" + ) + + abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) + + purple_cnv_somatic_ann <- purple_cnv_somatic_ann___d |> + dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |> + tidyr::unnest(.data$annotation) |> + tidyr::separate( + .data$annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + sep = "\\|", convert = FALSE + ) |> + dplyr::mutate( + ntrx = gpgr::count_pieces(.data$Transcript, "&"), + ngen = gpgr::count_pieces(.data$Genes, "&"), + neff = gpgr::count_pieces(.data$Effect, "&"), + Transcript = .data$Transcript |> stringr::str_replace_all("&", ", "), + Genes = .data$Genes |> stringr::str_replace_all("&", ", "), + Effect = abbreviate_effectv(.data$Effect), + `Tier (Top)` = glue::glue("{Tier} ({TopTier})") + ) |> + dplyr::distinct() |> + dplyr::filter(Detail != 'unprioritized') |> + dplyr::arrange(.data$`Tier (Top)`, .data$Genes, .data$Effect) + + descr <- dplyr::tribble( + ~Column, ~Description, + "Chr/Start/End", "Coordinates of copy number segment", + "Type", "Simple type interpretation of SV.", + "CN", "Fitted absolute copy number of segment adjusted for purity and ploidy", + "CN Min+Maj", "CopyNumber of minor + major allele adjusted for purity", + "Start/End SegSupport", paste0( + "Type of SV support for the CN breakpoint at ", + "start/end of region. Allowed values: ", + "CENTROMERE, TELOMERE, INV, DEL, DUP, BND (translocation), ", + "SGL (single breakend SV support), NONE (no SV support for CN breakpoint), ", + "MULT (multiple SV support at exact breakpoint)" + ), + "Method", paste0( + "Method used to determine the CN of the region. Allowed values: ", + "BAF_WEIGHTED (avg of all depth windows for the region), ", + "STRUCTURAL_VARIANT (inferred using ploidy of flanking SVs), ", + "LONG_ARM (inferred from the long arm), GERMLINE_AMPLIFICATION ", + "(inferred using special logic to handle regions of germline amplification)" + ), + "BAF (count)", "Tumor BAF after adjusted for purity and ploidy (Count of AMBER baf points covered by this segment)", + "GC (windowCount)", "Proportion of segment that is G or C (Count of COBALT windows covered by this segment)", + "TierTop", "Top priority of the event (from simple_sv_annotation: 1 highest, 4 lowest).", + "annotation", "INFO/SIMPLE_ANN: Simplified structural variant annotation: 'SVTYPE | EFFECT | GENE(s) | TRANSCRIPT | PRIORITY (1-4)'" + ) + + list( + tab = purple_cnv_somatic_ann, + descr = descr + ) } -#' Process PURPLE CNV germline File for UMCCRISE +#' Read an Annotated PURPLE CNV Somatic File #' -#' Processes the `purple.cnv.germline.tsv` file. -#' and selects columns of interest. +#' Reads the annotated and prioritised `purple.cnv.somatic.tsv` file. #' -#' @param x Path to `purple.cnv.germline.tsv` file. +#' @param x Path to annotated and prioritised `purple.cnv.somatic.tsv` file. #' -#' @return List with two elements: -#' * `tab`: Tibble with more condensed columns. -#' * `descr`: Description of tibble columns. +#' @return The input file as a tibble. #' #' @examples -#' x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -#' (pp <- purple_cnv_germ_process(x)) +#' x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") +#' (p <- purple_cnv_som_ann_read(x)) #' @testexamples -#' expect_equal(colnames(pp$tab)[ncol(pp$tab)], "GC (windowCount)") +#' expect_equal(colnames(p)[ncol(p)], "simple_ann") #' #' @export -purple_cnv_germ_process <- function(x) { - # as of PURPLE v2.39, germline and somatic files have same columns. - processed_purple_cnv_germline <- purple_cnv_som_process(x) - processed_purple_cnv_germline +purple_cnv_som_ann_read <- function(x) { + nm <- c( + "chromosome" = "c", + "start" = "i", + "end" = "i", + "svtype" = "c", + "baf" = "d", + "bafCount" = "d", + "copyNumber" = "d", + "depthWindowCount" = "i", + "gcContent" = "d", + "majorAlleleCopyNumber" = "d", + "method" = "c", + "minorAlleleCopyNumber" = "d", + "segmentEndSupport" = "c", + "segmentStartSupport" = "c", + "sv_top_tier" = "i", + "simple_ann" = "c" + ) + + ctypes <- paste(nm, collapse = "") + purple_cnv_somatic_ann <- readr::read_tsv(x, col_types = ctypes) + assertthat::assert_that(ncol(purple_cnv_somatic_ann) == length(nm)) + assertthat::assert_that(all(colnames(purple_cnv_somatic_ann) == names(nm))) + purple_cnv_somatic_ann } #' Read PURPLE version file @@ -328,7 +416,8 @@ purple_qc_read <- function(x) { nm <- c( "QCStatus", "Method", "CopyNumberSegments", "UnsupportedCopyNumberSegments", "Purity", "AmberGender", - "CobaltGender", "DeletedGenes", "Contamination", "GermlineAberrations" + "CobaltGender", "DeletedGenes", "Contamination", "GermlineAberrations", + "AmberMeanDepth" ) assertthat::assert_that(all(purple_qc$key == nm)) @@ -350,6 +439,8 @@ purple_qc_read <- function(x) { "Rate of contamination in tumor sample as determined by AMBER.", 16, "GermlineAberrations", glue::glue('{q["GermlineAberrations"]}'), "Can be one or more of: KLINEFELTER, TRISOMY_X/21/13/18/15, XYY, MOSAIC_X.", + 18, "AmberMeanDepth", glue::glue('{q["AmberMeanDepth"]}'), + "Mean depth as determined by AMBER.", ) list( @@ -401,7 +492,9 @@ purple_purity_read <- function(x) { "tmlStatus", "c", "tmbPerMb", "d", "tmbStatus", "c", - "svTumorMutationalBurden", "d" + "svTumorMutationalBurden", "d", + "runMode", "c", + "targeted", "c" ) ctypes <- paste(tab$type, collapse = "") @@ -475,7 +568,7 @@ purple_snv_vcf_read <- function(x) { dplyr::select("ID", "Description") info_cols <- c( - "AF", "PURPLE_AF", "PURPLE_CN", + "PURPLE_AF", "PURPLE_CN", "PURPLE_GERMLINE", "PURPLE_MACN", "PURPLE_VCN", "HMF_HOTSPOT", "KT", "MH", "SUBCL", "TNC" ) @@ -506,7 +599,7 @@ purple_snv_vcf_read <- function(x) { purple_kataegis <- function(x) { d <- purple_snv_vcf_read(x) info_cols <- c( - "KT", "AF", "PURPLE_AF", "PURPLE_CN", + "KT", "PURPLE_AF", "PURPLE_CN", "PURPLE_MACN", "PURPLE_VCN", "SUBCL", "MH", "TNC" ) diff --git a/R/rmd.R b/R/rmd.R index 91ecdf9..5326144 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -67,16 +67,15 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' @param conda_list Path to `conda_pkg_list.txt` file. #' @param img_dir Path to directory containing PURPLE plots. #' @param key_genes Path to UMCCR cancer gene file. -#' @param oncoviral_breakpoints_tsv Path to `oncoviruses/oncoviral_breakpoints.tsv`. -#' @param oncoviral_present_viruses Path to `oncoviruses/present_viruses.txt`. -#' @param purple_germ_cnv Path to `purple.cnv.germline.tsv`. +#' @param virusbreakend_tsv Path to VIRUSBreakend summary file. +#' @param virusbreakend_vcf Path to VIRUSBreakend VCF file. #' @param purple_purity Path to `purple.purity.tsv`. #' @param purple_qc Path to `purple.qc`. #' @param purple_som_cnv Path to `purple.cnv.somatic.tsv`. +#' @param purple_som_cnv_ann Path to annotated and prioritised `purple.cnv.somatic.tsv`. #' @param purple_som_gene_cnv Path to `purple.cnv.gene.tsv`. #' @param purple_som_snv_vcf Path to `purple.somatic.vcf.gz`. #' @param result_outdir Path to directory to write tidy JSON/TSV results. -#' @param somatic_snv_summary Path to `somatic_snv_summary.json` JSON. #' @param somatic_snv_vcf Path to `somatic-PASS.vcf.gz` SNV VCF. #' @param somatic_sv_tsv Path to `manta.tsv` TSV file. #' @param somatic_sv_vcf Path to `manta.vcf.gz` VCF file. @@ -87,11 +86,9 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' @return Path to rendered HTML report. #' @export cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, key_genes, - oncoviral_breakpoints_tsv, oncoviral_present_viruses, - purple_germ_cnv, purple_purity, purple_qc, purple_som_cnv, - purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_summary, - somatic_snv_vcf, somatic_sv_tsv, somatic_sv_vcf, - result_outdir, tumor_name, + virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, + purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, + somatic_snv_vcf, somatic_sv_tsv, somatic_sv_vcf, result_outdir, tumor_name, out_file = NULL, quiet = FALSE) { assertthat::assert_that( dir.exists(img_dir), @@ -122,18 +119,17 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, conda_list = conda_list, img_dir = img_dir_b, key_genes = key_genes, - somatic_snv_summary = somatic_snv_summary, somatic_snv_vcf = somatic_snv_vcf, somatic_sv_tsv = somatic_sv_tsv, somatic_sv_vcf = somatic_sv_vcf, purple_som_gene_cnv = purple_som_gene_cnv, + purple_som_cnv_ann = purple_som_cnv_ann, purple_som_cnv = purple_som_cnv, - purple_germ_cnv = purple_germ_cnv, purple_purity = purple_purity, purple_qc = purple_qc, purple_som_snv_vcf = purple_som_snv_vcf, - oncoviral_present_viruses = oncoviral_present_viruses, - oncoviral_breakpoints_tsv = oncoviral_breakpoints_tsv, + virusbreakend_tsv = virusbreakend_tsv, + virusbreakend_vcf = virusbreakend_vcf, result_outdir = result_outdir, tumor_name = tumor_name ) diff --git a/R/sv.R b/R/sv.R index 53acd85..62e1f91 100644 --- a/R/sv.R +++ b/R/sv.R @@ -73,6 +73,7 @@ split_double_col <- function(d, nms) { #' expect_equal(m, 2) #' expect_error(gpgr:::count_pieces("foo", NA)) #' +#' @export count_pieces <- function(x, sep) { ifelse(nchar(x) == 0, 0, stringr::str_count(x, sep) + 1) } @@ -107,6 +108,7 @@ count_pieces <- function(x, sep) { #' expect_equal(e3, "TFBSDel, TFBSVar") #' expect_equal(e4, "badaboom, bar, foo, StopGain") #' +#' @export abbreviate_effect <- function(effects) { effect_abbrev_nms <- names(gpgr::EFFECT_ABBREVIATIONS) @@ -147,22 +149,18 @@ umccrise_read_sv_tsv <- function(x) { "sample", "Tumor sample name", "c", "chrom", "CHROM column in VCF", "c", "start", "POS column in VCF", "i", - "end", "INFO/END: End position of the variant described in this record", "i", "svtype", "INFO/SVTYPE: Type of structural variant", "c", - "split_read_support", "FORMAT/SR of tumor sample: Split reads for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c", - "paired_support_PE", "FORMAT/PE of tumor sample: ??", "c", - "paired_support_PR", "FORMAT/PR of tumor sample: Spanning paired-read support for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c", - "AF_BPI", "INFO/BPI_AF: AF at each breakpoint (so AF_BPI1,AF_BPI2)", "c", - "somaticscore", "INFO/SOMATICSCORE: Somatic variant quality score", "i", + "SR_alt", "FORMAT/SR of tumor sample", "i", + "PR_alt", "FORMAT/RP of tumor sample", "i", + "SR_ref", "FORMAT/REF of tumor sample", "i", + "PR_ref", "FORMAT/REFPAIR of tumor sample", "i", + "QUAL", "QUAL column in VCF", "f", "tier", "INFO/SV_TOP_TIER (or 4 if missing): Highest priority tier for the effects of a variant entry", "c", "annotation", "INFO/SIMPLE_ANN: Simplified structural variant annotation: 'SVTYPE | EFFECT | GENE(s) | TRANSCRIPT | PRIORITY (1-4)'", "c", "AF_PURPLE", "INFO/PURPLE_AF: AF at each breakend (purity adjusted) (so AF_PURPLE1,AF_PURPLE2)", "c", "CN_PURPLE", "INFO/PURPLE_CN: CN at each breakend (purity adjusted) (so CN_PURPLE1,CN_PURPLE2)", "c", "CN_change_PURPLE", "INFO/PURPLE_CN_CHANGE: change in CN at each breakend (purity adjusted) (so CN_change_PURPLE1,CN_change_PURPLE2)", "c", - "Ploidy_PURPLE", "INFO/PURPLE_PLOIDY: Ploidy of variant (purity adjusted)", "d", "PURPLE_status", "INFERRED if FILTER=INFERRED, or RECOVERED if has INFO/RECOVERED, else blank. INFERRED: Breakend inferred from copy number transition", "c", - "START_BPI", "INFO/BPI_START: BPI adjusted breakend location", "i", - "END_BPI", "INFO/BPI_END: BPI adjusted breakend location", "i", "ID", "ID column in VCF", "c", "MATEID", "INFO/MATEID: ID of mate breakend", "c", "ALT", "ALT column in VCF", "c" @@ -205,6 +203,7 @@ process_sv <- function(x) { melted = NULL )) } + col_descr <- dplyr::tribble( ~Column, ~Description, "nrow", "Row number that connects variants between tables in same tab set.", @@ -213,8 +212,8 @@ process_sv <- function(x) { "Tier", "Priority of the specific event (from simple_sv_annotation: 1 highest, 4 lowest).", "Tier (Top)", "Priority of the specific event (Top tier of all event's annotations): 1 highest, 4 lowest. ", "Chr", "Chromosome.", - "Start", "Start position as inferred by BPI. For PURPLE-inferred SVs we use POS.", - "End", "End position. For BNDs = Chr:Start of the mate.Values are inferred by BPI (PURPLE-inferred SVs do not have an End).", + "Start", "Start position.", + "End", "End position.", "ID", "ID of BND from Manta. For PURPLE-inferred SVs this is PURPLE.", "MATEID", "ID of BND mate from Manta.", "BND_ID", "ID of BND pair simplified. BNDs with the same BND_ID belong to the same translocation event.", @@ -223,40 +222,31 @@ process_sv <- function(x) { "Transcript", "Transcripts involved in the event. DEL/DUP/INS events involving more than 2 transcripts are shown within separate table.", "Effect", "SV effect (based on http://snpeff.sourceforge.net/SnpEff_manual.html#input).", "Detail", "Prioritisation detail (from simple_sv_annotation).", - "Ploidy", "Ploidy of variant from PURPLE (purity adjusted).", "AF_PURPLE", "PURPLE AF at each breakend preceded by their average.", - "AF_BPI", "BPI AF at each breakend preceded by their average.", "CN", "Copy Number at each breakend preceded by their average.", "CNC", "Copy Number Change at each breakend preceded by their average.", "SR_alt", "Number of Split Reads supporting the alt allele, where P(allele|read)>0.999.", "PR_alt", "Number of Paired Reads supporting the alt allele, where P(allele|read)>0.999.", "SR_PR_ref", "Number of Split Reads and Paired Reads supporting the ref allele, where P(allele|read)>0.999.", "SR_PR_sum", "Sum of SR_alt and PR_alt.", - "Type", "Type of structural variant.", - "SScore", "Somatic variant quality score.", + "Type", "Simple type interpretation of SV.", + "Quality", "Variant quality score.", "ntrx", "Number of transcripts for given event.", "ngen", "Number of genes for given event.", "nann", "Number of annotations for given event." ) - cols_to_split <- c("AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE") + cols_to_split <- c("AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE") double_cols <- split_double_col(sv, cols_to_split) unmelted <- sv |> dplyr::select(-dplyr::all_of(c(cols_to_split, "caller", "sample"))) |> - dplyr::bind_cols(double_cols) |> - tidyr::separate(.data$split_read_support, c("SR_ref", "SR_alt"), ",", convert = TRUE) |> - tidyr::separate(.data$paired_support_PR, c("PR_ref", "PR_alt"), ",", convert = TRUE) + dplyr::bind_cols(double_cols) unmelted <- unmelted |> dplyr::mutate( - SR_PR_ref = paste0(.data$SR_ref, ",", .data$PR_ref), - Ploidy = round(as.double(.data$Ploidy_PURPLE), 2), + SR_PR_ref = paste0(.data$SR_ref, ",", .data$PR_alt), chrom = sub("chr", "", .data$chrom), svtype = ifelse(is.na(.data$PURPLE_status), .data$svtype, "PURPLE_inf"), - # when BPI cannot be run, just use start - Start = ifelse(is.na(.data$PURPLE_status), - ifelse(is.na(.data$START_BPI), .data$start, .data$START_BPI), - .data$start - ), + Start = .data$start, nann = count_pieces(.data$annotation, ","), vcfnum = dplyr::row_number(), vcfnum = sprintf(glue::glue("%0{nchar(nrow(unmelted))}d"), .data$vcfnum) @@ -268,16 +258,17 @@ process_sv <- function(x) { # BND IDs # Two BND mates share the same ID up to the last digit (0 or 1) unmelted_bnd1 <- unmelted |> - dplyr::filter(.data$svtype == "BND") |> + dplyr::filter(!.data$svtype %in% c("PURPLE_inf", "SGL")) |> tidyr::separate(.data$ID, into = c("BND_group", "BND_mate"), sep = -1, convert = TRUE, remove = FALSE) |> dplyr::group_by(.data$BND_group) + assertthat::assert_that(all(unmelted_bnd1$BND_mate %in% c("o", "h"))) unmelted_bnd1 <- unmelted_bnd1 |> dplyr::mutate( # index per group 1, 2, 3.. BND_ID = dplyr::cur_group_id(), # turns into 001, 002, 003... if you've got 100+ rows BND_ID = sprintf(glue::glue("%0{nchar(nrow(unmelted_bnd1))}d"), .data$BND_ID), - BND_mate = ifelse(.data$BND_mate == 0, "A", "B") + BND_mate = ifelse(.data$BND_mate == "o", "A", "B") ) |> dplyr::ungroup() @@ -299,7 +290,7 @@ process_sv <- function(x) { ) unmelted_other <- unmelted |> - dplyr::filter(.data$svtype != "BND") + dplyr::filter(.data$svtype %in% c("PURPLE_inf", "SGL")) unmelted_all <- dplyr::bind_rows( @@ -309,24 +300,40 @@ process_sv <- function(x) { dplyr::mutate( Start = base::format(.data$Start, big.mark = ",", trim = TRUE), Start = paste0(.data$chrom, ":", .data$Start), - END_tmp = ifelse(is.na(.data$END_BPI), .data$end, .data$END_BPI), - END_tmp = ifelse(is.na(.data$END_tmp) & .data$svtype == "BND", .data$BND_mate_end, .data$END_tmp), - END_tmp = base::format(.data$END_tmp, big.mark = ",", trim = TRUE), - End = paste0( - ifelse(.data$svtype == "BND", .data$BND_mate_chrom, .data$chrom), - ":", - .data$END_tmp + END_tmp = ifelse(!.data$svtype %in% c("PURPLE_inf", "SGL"), .data$BND_mate_end, NA_character_), + END_tmp = ifelse( + is.na(.data$END_tmp), + NA_character_, + base::format(as.integer(.data$END_tmp), big.mark = ",", trim = TRUE) + ), + End = ifelse( + is.na(.data$END_tmp), + NA_character_, + paste0( + ifelse(.data$svtype == "BND", .data$BND_mate_chrom, .data$chrom), + ":", + .data$END_tmp + ) ) ) |> dplyr::select( - "vcfnum", "nann", + "vcfnum", + "nann", TierTop = "tier", - "Start", "End", + "Start", + "End", Type = "svtype", - "BND_ID", "BND_mate", - "SR_alt", "PR_alt", "SR_PR_sum", "SR_PR_ref", "Ploidy", - "AF_PURPLE", "AF_BPI", CNC = "CN_change_PURPLE", - CN = "CN_PURPLE", SScore = "somaticscore", "annotation" + "BND_ID", + "BND_mate", + "SR_alt", + "PR_alt", + "SR_PR_sum", + "SR_PR_ref", + "AF_PURPLE", + CNC = "CN_change_PURPLE", + CN = "CN_PURPLE", + Quality= "QUAL", + "annotation", ) abbreviate_effectv <- Vectorize(abbreviate_effect) diff --git a/R/utils.R b/R/utils.R index e0b3852..d57901e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,13 +1,3 @@ -#' BPI run on VCF -#' -#' @param x Path to VCF file. -#' @export -is_vcf_bpi <- function(x) { - assertthat::assert_that(is_vcf(x)) - d <- bedr::read.vcf(x, nrows = 0, verbose = FALSE) # nrows 0 parses just the vcf header - "bpiVersion" %in% names(d[["header"]]) -} - #' Does the input file look like a VCF? #' #' Quickly checks that the input file has a 'vcf' or 'vcf.gz' suffix, and that @@ -44,9 +34,11 @@ is_vcf <- function(x) { n_max = 1 ) + # COMMIT NOTE: FORMAT FIELD IS NOT MANDITORY + vcf_cols <- c( "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", - "FILTER", "INFO", "FORMAT" + "FILTER", "INFO" ) d_cols <- colnames(d) diff --git a/README.md b/README.md index 9931b82..e70f982 100644 --- a/README.md +++ b/README.md @@ -120,13 +120,12 @@ export PATH="${gpgr_cli}:${PATH}" SOMATIC_SNV_SUMMARY --somatic_snv_vcf SOMATIC_SNV_VCF --somatic_sv_tsv SOMATIC_SV_TSV --somatic_sv_vcf SOMATIC_SV_VCF --purple_som_gene_cnv PURPLE_SOM_GENE_CNV - --purple_som_cnv PURPLE_SOM_CNV --purple_germ_cnv - PURPLE_GERM_CNV --purple_purity PURPLE_PURITY --purple_qc - PURPLE_QC --purple_som_snv_vcf PURPLE_SOM_SNV_VCF - --oncoviral_present_viruses ONCOVIRAL_PRESENT_VIRUSES - --oncoviral_breakpoints_tsv ONCOVIRAL_BREAKPOINTS_TSV - [--out_file OUT_FILE] [--quiet] --result_outdir - RESULT_OUTDIR --tumor_name TUMOR_NAME + --purple_som_cnv PURPLE_SOM_CNV --purple_purity + PURPLE_PURITY --purple_qc PURPLE_QC --purple_som_snv_vcf + PURPLE_SOM_SNV_VCF --oncoviral_present_viruses + ONCOVIRAL_PRESENT_VIRUSES --oncoviral_breakpoints_tsv + ONCOVIRAL_BREAKPOINTS_TSV [--out_file OUT_FILE] [--quiet] + --result_outdir RESULT_OUTDIR --tumor_name TUMOR_NAME options: -h, --help show this help message and exit @@ -153,8 +152,6 @@ export PATH="${gpgr_cli}:${PATH}" Path to `purple.cnv.gene.tsv`. --purple_som_cnv PURPLE_SOM_CNV Path to `purple.cnv.somatic.tsv`. - --purple_germ_cnv PURPLE_GERM_CNV - Path to `purple.cnv.germline.tsv`. --purple_purity PURPLE_PURITY Path to `purple.purity.tsv`. --purple_qc PURPLE_QC diff --git a/inst/cli/canrep.R b/inst/cli/canrep.R index 520598c..b6bea8e 100644 --- a/inst/cli/canrep.R +++ b/inst/cli/canrep.R @@ -3,21 +3,20 @@ canrep_add_args <- function(subp) { canrep$add_argument("--af_global", help = "Path to `af_tumor.txt` file.", required = TRUE) canrep$add_argument("--af_keygenes", help = "Path to `af_tumor_keygenes.txt` file.", required = TRUE) canrep$add_argument("--batch_name", help = "Name of batch sample.", required = TRUE) - canrep$add_argument("--conda_list", help = "Path to `conda_pkg_list.txt` file.", required = TRUE) + canrep$add_argument("--conda_list", help = "Path to `conda_pkg_list.txt` file.") canrep$add_argument("--img_dir", help = "Path to directory containing PURPLE plots.", required = TRUE) canrep$add_argument("--key_genes", help = "Path to UMCCR cancer gene file.", required = TRUE) - canrep$add_argument("--somatic_snv_summary", help = "Path to `somatic_snv_summary.json`.", required = TRUE) canrep$add_argument("--somatic_snv_vcf", help = "Path to `somatic-PASS.vcf.gz` SNV VCF.", required = TRUE) canrep$add_argument("--somatic_sv_tsv", help = "Path to `manta.tsv` TSV file.", required = TRUE) canrep$add_argument("--somatic_sv_vcf", help = "Path to `manta.vcf.gz` VCF file.", required = TRUE) canrep$add_argument("--purple_som_gene_cnv", help = "Path to `purple.cnv.gene.tsv`.", required = TRUE) + canrep$add_argument("--purple_som_cnv_ann", help = "Path to annotated and prioritised `purple.cnv.somatic.tsv`.", required = TRUE) canrep$add_argument("--purple_som_cnv", help = "Path to `purple.cnv.somatic.tsv`.", required = TRUE) - canrep$add_argument("--purple_germ_cnv", help = "Path to `purple.cnv.germline.tsv`.", required = TRUE) canrep$add_argument("--purple_purity", help = "Path to `purple.purity.tsv`.", required = TRUE) canrep$add_argument("--purple_qc", help = "Path to `purple.qc`.", required = TRUE) canrep$add_argument("--purple_som_snv_vcf", help = "Path to `purple.somatic.vcf.gz`.", required = TRUE) - canrep$add_argument("--oncoviral_present_viruses", help = "Path to `oncoviruses/present_viruses.txt`.", required = TRUE) - canrep$add_argument("--oncoviral_breakpoints_tsv", help = "Path to `oncoviruses/oncoviral_breakpoints.tsv`.", required = TRUE) + canrep$add_argument("--virusbreakend_tsv", help = "Path to VIRUSBreakend summary file.", required = TRUE) + canrep$add_argument("--virusbreakend_vcf", help = "Path to VIRUSBreakend VCF file.", required = TRUE) canrep$add_argument("--out_file", help = "Path to output HTML file (needs '.html' suffix) [def: {tumor_name}_cancer_report.html].") canrep$add_argument("--quiet", help = "Suppress log printing during rendering.", action = "store_true") canrep$add_argument("--result_outdir", help = "Path to directory to write tidy JSON/TSV results.", required = TRUE) @@ -33,18 +32,17 @@ canrep_parse_args <- function(args) { conda_list = args$conda_list, img_dir = args$img_dir, key_genes = args$key_genes, - somatic_snv_summary = args$somatic_snv_summary, somatic_snv_vcf = args$somatic_snv_vcf, somatic_sv_tsv = args$somatic_sv_tsv, somatic_sv_vcf = args$somatic_sv_vcf, purple_som_gene_cnv = args$purple_som_gene_cnv, + purple_som_cnv_ann = args$purple_som_cnv_ann, purple_som_cnv = args$purple_som_cnv, - purple_germ_cnv = args$purple_germ_cnv, purple_purity = args$purple_purity, purple_qc = args$purple_qc, purple_som_snv_vcf = args$purple_som_snv_vcf, - oncoviral_present_viruses = args$oncoviral_present_viruses, - oncoviral_breakpoints_tsv = args$oncoviral_breakpoints_tsv, + virusbreakend_tsv = args$virusbreakend_tsv, + virusbreakend_vcf = args$virusbreakend_vcf, out_file = args$out_file, quiet = args$quiet, result_outdir = args$result_outdir, diff --git a/inst/rmd/umccrise/_navbar.html b/inst/rmd/umccrise/_navbar.html index 9c3a4ed..53533bf 100644 --- a/inst/rmd/umccrise/_navbar.html +++ b/inst/rmd/umccrise/_navbar.html @@ -28,19 +28,10 @@ - -
  • Oncoviruses
  • diff --git a/inst/rmd/umccrise/cancer_report.Rmd b/inst/rmd/umccrise/cancer_report.Rmd index 45d15a2..09bfdd0 100644 --- a/inst/rmd/umccrise/cancer_report.Rmd +++ b/inst/rmd/umccrise/cancer_report.Rmd @@ -16,16 +16,15 @@ params: conda_list: NULL img_dir: NULL key_genes: NULL - oncoviral_breakpoints_tsv: NULL - oncoviral_present_viruses: NULL - purple_germ_cnv: NULL + virusbreakend_tsv: NULL + virusbreakend_vcf: NULL purple_purity: NULL purple_qc: NULL + purple_som_cnv_ann: NULL purple_som_cnv: NULL purple_som_gene_cnv: NULL purple_som_snv_vcf: NULL result_outdir: NULL - somatic_snv_summary: NULL somatic_snv_vcf: NULL somatic_sv_tsv: NULL somatic_sv_vcf: NULL @@ -113,11 +112,10 @@ sv_detail_table <- function(tab) { ```{r hrd_prep, results='hide'} #---- Homologous Recombination Deficiency ----# -chord_sv_df <- sigrap::chord_mantavcf2df(params$somatic_sv_vcf) chord_res <- sigrap::chord_run( vcf.snv = params$somatic_snv_vcf, - df.sv = chord_sv_df, - sv.caller = "manta", + vcf.sv = params$somatic_sv_vcf, + sv.caller = "gridss", sample.name = params$tumor_name, ref.genome = "hg38", verbose = TRUE @@ -225,15 +223,13 @@ purple_qc_summary <- dplyr::bind_rows( #---- PURPLE CNV Tables ----# purple_cnv_som <- gpgr::purple_cnv_som_process(params$purple_som_cnv) purple_cnv_som_gene <- gpgr::purple_cnv_som_gene_process(params$purple_som_gene_cnv, params$key_genes) -purple_cnv_germ <- gpgr::purple_cnv_germ_process(params$purple_germ_cnv) gpgr::write_tsvjsongz(purple_cnv_som$tab, glue("purple/{bnm}-purple_cnv_som"), result_outdir) gpgr::write_tsvjsongz(purple_cnv_som_gene$tab, glue("purple/{bnm}-purple_cnv_som_gene"), result_outdir) -gpgr::write_tsvjsongz(purple_cnv_germ$tab, glue("purple/{bnm}-purple_cnv_germ"), result_outdir) ``` ```{r, results='asis'} -blank_lines(2) +blank_lines(1) ``` ## __Summary__ @@ -241,23 +237,6 @@ blank_lines(2)
    Description -__SNV Summary__ - -Starting from the raw DRAGEN calls, count the number of variants remaining after -each filtration stage. - -- `raw`: raw DRAGEN calls. -- `raw_pass`: PASSed raw. -- `noalt`: in chr1-22, X, Y, M. -- `sage`: including rescued by SAGE. -- `pregnomad`: should be the same as above. -- `gnomad`: in the hypermutated case, total remaining after removing those - with `gnomAD_AF >= 0.01` (that are not in hotspots). -- `anno`: after any additional filters for hypermutated case. -- `filt`: after applying main filters (tumor allele frequency, PCGR tiers etc.). Should be the same as above. -- `filt_pass`: after removing those that failed to pass the thresholds set above. -- `giab`: after removing those that are __not__ in GIAB confident regions. - __PURPLE Summary__ PURPLE outputs a QC status along with a summary @@ -298,28 +277,13 @@ summarise_sigs <- function(mut_sig_contr) { paste(collapse = ", ") } -snv_summary <- params$somatic_snv_summary %>% - jsonlite::read_json() -snvs_max <- 500000 -hypermutated <- with(snv_summary, !is.null(pregnomad) && pregnomad > snvs_max && !is.null(gnomad) && gnomad > snvs_max) -bpi_enabled <- gpgr::is_vcf_bpi(params$somatic_sv_vcf) -snv_summary <- snv_summary %>% - base::unlist() %>% - tibble::enframe() %>% - dplyr::mutate( - value = base::format(value, big.mark = ",", trim = TRUE), - x = glue("{name}: {value}") - ) %>% - dplyr::pull(x) %>% - base::paste(collapse = "; ") - hrd_summary <- list( chord = chord_res[["prediction"]][, "p_hrd", drop = TRUE], hrdetect = hrdetect_res[, "Probability", drop = TRUE] ) cnv_summary <- - list(som = purple_cnv_som$tab, germ = purple_cnv_germ$tab) %>% + list(som = purple_cnv_som$tab) %>% purrr::map(~ dplyr::summarise( .x, Min = round(min(CN), 2), @@ -347,10 +311,6 @@ if (no_sv_found) { qc_summary_all <- dplyr::tribble( ~n, ~variable, ~value, ~details, - 4, "Hypermutated", ifelse(hypermutated, "TRUE", "FALSE"), - ifelse(hypermutated, "More than 500K SNVs detected, so keep only small variants within UMCCR cancer gene regions (Ensembl v107).", ""), - 4, "BPI Enabled", ifelse(bpi_enabled, "TRUE", "FALSE"), - ifelse(bpi_enabled, "", "BPI was skipped, so SV tables will not have AF_BPI/AF_PURPLE values."), 5, "MutSigs (Old)", glue::glue("{summarise_sigs(sigs_snv_2015)}"), "2015 COSMIC (https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt)", 5, "MutSigs (New)", glue::glue("{summarise_sigs(sigs_snv_2020)}"), @@ -362,12 +322,6 @@ qc_summary_all <- dplyr::tribble( "Max: {cnv_summary[['som']]$Max}; ", "N: {cnv_summary[['som']]$N} " ), "", - 21, "CNVs (Germline)", glue::glue( - "Min: {cnv_summary[['germ']]$Min}; ", - "Max: {cnv_summary[['germ']]$Max}; ", - "N: {cnv_summary[['germ']]$N} " - ), "", - 17, "SNVs (Somatic)", snv_summary, "Summary of SNV filters, from raw to final.", 18, "SVs (unmelted)", paste(rev(sv_summary$unmelted), collapse = ", "), "Summary of SVs as presented in raw VCF.", 19, "SVs (melted)", paste(rev(sv_summary$melted), collapse = ", "), @@ -414,16 +368,6 @@ qc_summary_all %>% rows = grepl("TRUE", value) & variable == "Hypermutated" ) ) %>% - gt::tab_style( - style = list( - cell_fill(color = orange), - cell_text(weight = "bold") - ), - locations = cells_body( - columns = value, - rows = grepl("FALSE", value) & variable == "BPI Enabled" - ) - ) %>% gt::tab_style( style = list( cell_text(weight = "bold") @@ -458,6 +402,7 @@ blank_lines(1) ``` ### __Allelic Frequencies__ {#allelic-freqs} + Summarised below are the allele frequencies (AFs) for somatic variants detected genome-wide (__Global__) vs. within the coding sequence of ~1,100 UMCCR cancer genes (__Key Genes CDS__). @@ -598,10 +543,6 @@ mp_plot_bias4 <- MutationalPatterns::plot_strand_bias(strand_bias_rep) (mp_plot_bias1 / mp_plot_bias2) | (mp_plot_bias3 / mp_plot_bias4) ``` -```{r, results='asis'} -blank_lines(1) -``` - ### __SNV__ Signature Contribution {.tabset .tabset-fade #sig-snv-contr}
    @@ -635,7 +576,7 @@ sigs_snv_2015 %>% ``` ```{r, results='asis'} -blank_lines(1) +blank_lines(2) ``` #### 2020 @@ -649,7 +590,7 @@ sigs_snv_2020 %>% ``` ```{r, results='asis'} -blank_lines(1) +blank_lines(2) ``` ### __INDEL/DBS__ Signatures & Contributions {.tabset .tabset-fade #sig-indeldbs} @@ -674,7 +615,7 @@ sigs_indel %>% ``` ```{r, results='asis'} -blank_lines(1) +blank_lines(2) ``` #### DBS @@ -689,10 +630,11 @@ sigs_dbs %>% ``` ```{r, results='asis'} -blank_lines(1) +blank_lines(2) ``` ### __Rainfall Plot__ {#rainfall} + Rainfall plots show the distribution of mutations along the genome, with mutation types indicated with different colors. The y-axis corresponds to the distance of a mutation from the previous mutation, and is log10 transformed. Drop-downs from the plots indicate clusters or @@ -722,6 +664,7 @@ if (will_work) { ``` ### __Kataegis__ + Somatic variants of type C>T and C>G in a TpCpN context are annotated as showing Kataegis (Nik Zainal et al., 2012) if there are three or more mutations of that type, strand and context localised within a @@ -748,7 +691,7 @@ if (nrow(kat$data) > 0) { ``` ```{r purple_plot_somaticrainfall, out.width="70%"} -purple_plot_somaticrainfall <- file.path(img_dir, paste0(bnm, ".somatic.rainfall.png")) +purple_plot_somaticrainfall <- file.path(img_dir, paste0(params$tumor_name, ".somatic.rainfall.png")) if (file.exists(purple_plot_somaticrainfall)) { knitr::include_graphics(purple_plot_somaticrainfall) } else { @@ -821,6 +764,7 @@ blank_lines(1) ``` ## __Circos Plots__ {.tabset .tabset-fade #circos} + Circos plots are generated by PURPLE. The first BAF plot is based on PURPLE data and configuration files. @@ -854,10 +798,9 @@ The first BAF plot is based on PURPLE data and configuration files.
    ```{r circos_baf_plot, out.width="80%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".circos_baf.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".circos_baf.png"))) ``` - ### SNVs/Indels, Total/Minor CN, SVs
    @@ -890,7 +833,7 @@ knitr::include_graphics(file.path(img_dir, paste0(bnm, ".circos_baf.png")))
    ```{r circos_default1, out.width="80%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".circos.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".circos.png"))) ``` ### Allele Ratios, BAF @@ -908,117 +851,14 @@ knitr::include_graphics(file.path(img_dir, paste0(bnm, ".circos.png")))
    ```{r circos_default2, out.width="80%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".input.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".input.png"))) ``` -## __Structural Variants__ {#sv-summary} -Structural variants are inferred with [Manta](https://github.com/illumina/manta), -adjusted using [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator#7-structural-variant-recovery-and-single-breakend-filtering), -and prioritised using [simple_sv_annotation](https://github.com/AstraZeneca-NGS/simple_sv_annotation). -Allele frequencies, copy number changes and ploidy are purity-adjusted. - -
    -SV Prioritisation Process - -The input file corresponds to `umccrised//structural/-manta.tsv`. - -It's generated through the following steps: - -__Step 1: Processing__ - -* __Input__: Manta structural variant calls from bcbio -(`final//-sv-prioritize-manta.vcf.gz` (or -`-manta.vcf.gz` if not prioritised)) -* Remove following annotations from Manta VCF: - 'INFO/SIMPLE_ANN', 'INFO/SV_HIGHEST_TIER', 'FILTER/Intergenic', 'FILTER/MissingAnn', 'FILTER/REJECT' -* Prioritise variants with [simple_sv_annotation](https://github.com/vladsaveliev/simple_sv_annotation)` -* __Output__: `work/{batch}/structural/prioritize/{batch}-manta.vcf.gz` -* Keep PASS variants -* If more than 100,000 variants, keep only variants where `INFO/SV_TOP_TIER <= 3` -* __Output__: `work/{batch}/structural/keep_pass/{batch}-manta.vcf` -* Deal with chromosome capitalisation occurring from SnpEff -* Run BreakPointInspector (BPI) if it was disabled in bcbio -* __Output__: `work/{batch}/structural/maybe_bpi/{batch}-manta.vcf` - -__Step 2: Filtering__ - -* Keep PASS variants (since BPI updates the FILTER column) -* For BND variants require paired reads support (PR) to be higher than split read support (SR) -* Keep all `INFO/SV_TOP_TIER <= 2` variants -* For `INFO/SV_TOP_TIER > 2` variants require split _or_ paired reads support of at least 5x -* For `INFO/SV_TOP_TIER > 2` variants with low allele frequency at any breakpoint (`BPI_AF[0 or 1] < 0.1`), - require SR or PR support of at least 10x -* __Output__: `work/{batch}/structural/filt/{batch}-manta.vcf` - -__Step 3: PURPLE and FFPE conditional__ - -* If the sample _is not_ FFPE: - * Feed above filtered SVs to PURPLE, which outputs `purple.sv.vcf.gz` that contains rescued SVs - * Prioritise variants (again) - * Remove 'INFO/ANN' annotation - * __Output__: `{batch}/structural/{batch}-manta.vcf.gz` -* If the sample _is_ FFPE: - * Just copy `filtered` variants and don't do anything - (i.e. we don't want the rescued SVs from PURPLE since they'll likely be heaps) - (note that PURPLE will still get fed with the `filtered` SVs) - * __Output__: `{batch}/structural/{batch}-manta.vcf.gz` - -__Step 4: TSV final output__ - -* Input: `{batch}/structural/{batch}-manta.vcf.gz` VCF -* Output: `{batch}/structural/{batch}-manta.tsv` TSV - - -```{r manta_tsv_description} -sv_tsv_descr %>% - dplyr::arrange(Column) %>% - dplyr::mutate(Column = kableExtra::cell_spec(Column, bold = TRUE)) %>% - knitr::kable(escape = FALSE, caption = "Description of Manta TSV columns") %>% - kableExtra::kable_paper(c("hover", "striped"), full_width = FALSE, position = "left") %>% - kableExtra::scroll_box(height = "250px") -``` - -__Prioritisation process__ - -1. Annotate with [SnpEff](http://snpeff.sourceforge.net/SnpEff_manual.html) based on Ensembl gene model -2. Subset annotations to [APPRIS principal transcripts](http://appris.bioinfo.cnio.es/#/) -3. Prioritize variants with [simple_sv_annotation](https://github.com/vladsaveliev/simple_sv_annotation) - 1(high)-2(moderate)-3(low)-4(no interest): - * exon loss - - on prioritisation gene list (1) - - other (2) - * gene_fusion - - paired (hits two genes) - - on list of known pairs (1) (curated by [HMF](https://resources.hartwigmedicalfoundation.nl)) - - one gene is a known promiscuous fusion gene (1) (curated by [HMF](https://resources.hartwigmedicalfoundation.nl)) - - on list of FusionCatcher known pairs (2) - - other: - - one or two genes on prioritisation gene list (2) - - neither gene on prioritisation gene list (3) - - unpaired (hits one gene) - - on prioritisation gene list (2) - - others (3) - * upstream or downstream - - on prioritisation gene list genes (2) - e.g. one gene is got into - control of another gene's promoter and get overexpressed (oncogene) or underexpressed (tsgene) - * LoF or HIGH impact in a tumor suppressor - - on prioritisation gene list (2) - - other TS gene (3) - * other (4) - -* Use PURPLE copy number caller to infer more SV calls from copy number transitions (marked as 'PURPLE Inferred') - -```{r effect_abbrev_tab} -gpgr::EFFECT_ABBREVIATIONS %>% - tibble::enframe(value = "abbreviation") %>% - dplyr::arrange(abbreviation) %>% - dplyr::mutate(name = kableExtra::cell_spec(name, bold = TRUE)) %>% - knitr::kable(escape = FALSE) %>% - kableExtra::kable_paper(c("hover", "striped"), full_width = FALSE, position = "left") %>% - kableExtra::scroll_box(height = "250px") +```{r, results='asis'} +blank_lines(1) ``` -
    +## __Structural Variants__
    What do the SV tables show? @@ -1029,17 +869,6 @@ annotations for these genes/transcripts are included in the original SV VCF, but they are extremely hard to display and browse inside a HTML table, due to their size. -Therefore, we first show a 'Map' of the SVs corresponding -to the main columns from the original SV VCF, but we don't show the -`INFO/SIMPLE_ANN` annotation column, which contains the -`Effect`, `Genes`, `Transcript`, and `Detail` values of a given SV. This makes -a bit more sense if you take a look at the `nann` Map column, -which shows the number of annotations corresponding to each VCF row. For example, -`nann = 500` would mean that this variant corresponds to 500 annotations. -These get shown in the next tables in a reshaped format, so that instead of -showing 500 annotations for a single row in a single cell, we split that over -multiple columns and rows. You can connect those tables to the Map by `vcfnum`. - Example SV with -only- 3 annotations: ``` @@ -1055,15 +884,9 @@ DEL | downstream_gene_variant | HFE2 | ENST00000336751_exon_1/4 DEL | intergenic_region | TXNIP-HFE2 | ENSG00000265972-ENSG00000168509 | unprioritized | 4 ``` -- The Map is sorted by `TierTop` then `vcfnum`: - - `TierTop` is the top tier of all annotations for a given variant. - - `vcfnum` is the variant row number in the umccrised VCF. -
    - - -### Summary {.tabset .tabset-fade} +### Summary {.tabset .tabset-fade #sv-summary} #### Plots @@ -1116,43 +939,14 @@ d %>% gt::tab_options(table.align = "left") ``` -```{r, results='asis'} -blank_lines(1) -``` - -### __SV Map__ {#sv-unmelted} - -```{r svtab_raw, eval=!no_sv_found} -svtab_raw <- sv_unmelted %>% - dplyr::select(-annotation) %>% - dplyr::arrange(TierTop, vcfnum) - -svtab_rawdt <- svtab_raw %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) -``` - -```{r svtab_raw_details, eval=!no_sv_found} -details::details(sv_detail_table(svtab_raw), lang = "none", summary = "Column Description") -svtab_rawdt -``` - ```{r, results='asis'} blank_lines(2) ``` -### __Translocations (BNDs)__ {.tabset .tabset-fade #translocations} +### __Breakend Annotations__ {.tabset .tabset-fade #breakend-annotations} ```{r svtab_BND, eval=!no_sv_found} sv_BND <- sv_all %>% - dplyr::filter(Type == "BND") %>% dplyr::mutate( nrow = dplyr::row_number(), nrow = sprintf(glue::glue("%0{nchar(nrow(.))}d"), nrow) @@ -1162,17 +956,17 @@ sv_BND <- sv_all %>% sv_BND_main <- sv_BND %>% dplyr::select( - nrow, vcfnum, `Tier (Top)`, Start, End, + nrow, vcfnum, Type, `Tier (Top)`, Start, End, BND_ID, BND_mate, - Genes, Effect, Detail, SR_alt, PR_alt, SR_PR_sum, Ploidy, AF_PURPLE + Genes, Effect, Detail, SR_alt, PR_alt, SR_PR_sum, AF_PURPLE ) sv_BND_other <- sv_BND %>% - dplyr::select(nrow, AF_BPI, CNC, CN, SR_PR_ref, SScore, ntrx, Transcript) + dplyr::select(nrow, CNC, CN, SR_PR_ref, Quality, ntrx, Transcript) sv_BND_purple <- sv_all %>% dplyr::filter(Type %in% c("PURPLE_inf")) %>% - dplyr::select(`Tier (Top)`, Start, Effect, Detail, Ploidy, CN, CNC) + dplyr::select(`Tier (Top)`, Start, Effect, Detail, CN, CNC) gpgr::write_tsvjsongz( dplyr::bind_cols(sv_BND_main, sv_BND_other %>% dplyr::select(-nrow)), @@ -1240,14 +1034,14 @@ details::details(sv_detail_table(sv_BND_purple), lang = "none", summary = "Colum sv_BND_purpledt ``` -### __DEL/DUP/INS__ {.tabset .tabset-fade #deldupins} +### __CNV annotations__ {.tabset .tabset-fade #cnv-annotations} -```{r svtab_noBND, eval=!no_sv_found} -sv_noBND <- sv_all %>% - dplyr::filter(!Type %in% c("BND", "PURPLE_inf")) %>% +```{r svtab_cnv} +cnv <- gpgr::purple_cnv_som_ann_process(params$purple_som_cnv_ann) + +cnv_temp <- cnv$tab %>% dplyr::mutate( nrow = dplyr::row_number(), - nrow = sprintf(glue::glue("%0{nchar(nrow(.))}d"), nrow), Genes_original = Genes, Transcript_original = Transcript ) @@ -1255,7 +1049,7 @@ sv_noBND <- sv_all %>% max_genes <- 2 max_transcripts <- 2 -sv_noBND_main <- sv_noBND %>% +cnv_main <- cnv_temp %>% dplyr::mutate( Genes = ifelse(ngen > max_genes, glue::glue("Many Genes ({ngen})"), @@ -1267,130 +1061,102 @@ sv_noBND_main <- sv_noBND %>% ) ) %>% dplyr::select( - nrow, vcfnum, `Tier (Top)`, Type, Start, End, Effect, - Genes, Transcript, Detail, SR_alt, PR_alt, SR_PR_sum, AF_PURPLE + nrow, `Tier (Top)`, Type, Chr, Start, End, Effect, + Genes, Transcript, Detail, CN, `CN Min+Maj`, `Start/End SegSupport`, Method, + `BAF (count)`, `GC (windowCount)` ) -gpgr::write_tsvjsongz(sv_noBND_main, glue("sv/{bnm}-05_sv_noBND_main"), result_outdir) - -sv_noBND_other <- sv_noBND %>% - dplyr::select(vcfnum, Ploidy, AF_BPI, CNC, CN, SR_PR_ref, SScore) %>% - dplyr::distinct() - -gpgr::write_tsvjsongz(sv_noBND_other, glue("sv/{bnm}-06_sv_noBND_other"), result_outdir) +gpgr::write_tsvjsongz(cnv_main, glue("sv/{bnm}-01_cnv_main"), result_outdir) -sv_noBND_manygenes <- sv_noBND %>% +cnv_main_manygenes <- cnv_temp %>% dplyr::filter(ngen > max_genes) %>% - dplyr::select(nrow, vcfnum, `Tier (Top)`, Type, Start, End, + dplyr::select(nrow, `Tier (Top)`, Type, Start, End, Effect, ngen, Genes = Genes_original ) -gpgr::write_tsvjsongz(sv_noBND_manygenes, glue("sv/{bnm}-07_sv_noBND_manygenes"), result_outdir) +gpgr::write_tsvjsongz(cnv_main_manygenes, glue("sv/{bnm}-02_cnv_main_manygenes"), result_outdir) -sv_noBND_manytx <- sv_noBND %>% +cnv_main_manytx <- cnv_temp %>% dplyr::filter(ntrx > max_transcripts) %>% - dplyr::select(nrow, vcfnum, `Tier (Top)`, Type, Start, End, + dplyr::select(nrow, `Tier (Top)`, Type, Start, End, Effect, ntrx, Transcript = Transcript_original ) -gpgr::write_tsvjsongz(sv_noBND_manytx, glue("sv/{bnm}-08_sv_noBND_manytranscripts"), result_outdir) +gpgr::write_tsvjsongz(cnv_main_manytx, glue("sv/{bnm}-03_cnv_main_manytx"), result_outdir) -sv_noBND_maindt <- sv_noBND_main %>% +cnv_maindt <- cnv_main %>% DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", - rownames = FALSE, extensions = c("Buttons"), + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), options = list( - pageLength = 20, - autoWidth = FALSE, + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, buttons = c("csv", "excel"), dom = "Blfrtip" ) - ) - -sv_noBND_otherdt <- sv_noBND_other %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Buttons"), - options = list( - pageLength = 20, - autoWidth = FALSE, - buttons = c("csv", "excel"), - dom = "Blfrtip" - ) - ) + ) |> + DT::formatCurrency(~ Start + End, currency = "", interval = 3, mark = ",", digits = 0) -sv_noBND_manygenesdt <- sv_noBND_manygenes %>% +cnv_main_manygenesdt <- cnv_main_manygenes %>% DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", - rownames = FALSE, extensions = c("Buttons", "KeyTable"), + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), options = list( - pageLength = 10, - autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), - dom = "Blfrtip" + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" ) - ) + ) |> + DT::formatCurrency(~ Start + End + ngen, currency = "", interval = 3, mark = ",", digits = 0) -sv_noBND_manytxdt <- sv_noBND_manytx %>% +cnv_main_manytxdt <- cnv_main_manytx %>% DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", - rownames = FALSE, extensions = c("Buttons", "KeyTable"), + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), options = list( - pageLength = 10, - autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), - dom = "Blfrtip" + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" ) - ) + ) |> + DT::formatCurrency(~ Start + End, currency = "", interval = 3, mark = ",", digits = 0) ``` #### Main Columns -```{r sv_noBND_main, eval=!no_sv_found} -details::details(sv_detail_table(sv_noBND_main), lang = "none", summary = "Column Description") -sv_noBND_maindt +```{r cnv_main} +details::details(sv_detail_table(cnv_main), lang = "none", summary = "Column Description") +cnv_maindt ``` ```{r, results='asis'} blank_lines(2) ``` -#### Other Columns +#### Many Genes -```{r sv_noBND2_other, eval=!no_sv_found} -details::details(sv_detail_table(sv_noBND_other), lang = "none", summary = "Column Description") -sv_noBND_otherdt +```{r cnv_main_manygenes} +details::details(sv_detail_table(cnv_main_manygenes), lang = "none", summary = "Column Description") +cnv_main_manygenesdt ``` -#### Many Genes - -```{r sv_noBND_manygenes, eval=!no_sv_found} -details::details(sv_detail_table(sv_noBND_manygenes), lang = "none", summary = "Column Description") -sv_noBND_manygenesdt +```{r, results='asis'} +blank_lines(2) ``` #### Many Transcripts -```{r sv_noBND_manytx, eval=!no_sv_found} -details::details(sv_detail_table(sv_noBND_manytx), lang = "none", summary = "Column Description") -sv_noBND_manytxdt +```{r cnv_main_manytx} +details::details(sv_detail_table(cnv_main_manytx), lang = "none", summary = "Column Description") +cnv_main_manytxdt ``` -## __Copy Number Variants__ -The purity and ploidy estimator -[PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator) -is used to generate a copy number profile for the somatic sample. - ```{r, results='asis'} -blank_lines(1) +blank_lines(2) ``` -### UMCCR __Gene Somatic__ CNV Calls {#cnv-gene} +### UMCCR __Gene Somatic__ CNV Calls {#gene-panel-cnv}
    Description @@ -1429,7 +1195,7 @@ purple_cnv_som_gene$tab %>% blank_lines(2) ``` -### Genome-wide __Somatic__ CNV Segments {#cnv-segs} +### Genome-wide __Somatic__ CNV Segments {#cnv-segments}
    Description @@ -1471,36 +1237,18 @@ purple_cnv_som$tab %>% blank_lines(2) ``` -### Genome-wide __Germline__ CNV Segments - -```{r purple_cnv_germ_tab} -purple_cnv_germ$tab %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = TRUE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Bfrtip" - ) - ) %>% - DT::formatCurrency(~ Start + End, currency = "", interval = 3, mark = ",", digits = 0) -``` - -```{r, results='asis'} -blank_lines(2) -``` - ### __PURPLE Charts__ {.tabset .tabset-fade #purple-charts} + PURPLE generates charts for summarising tumor sample characteristics. Description is from the [PURPLE docs](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator#charts). #### Purity/ploidy + The following 'sunrise' chart shows the range of scores of all examined solutions of purity and ploidy. Crosshairs identify the best purity / ploidy solution. ```{r purple_plot_purityrange, out.width="40%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".purity.range.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".purity.range.png"))) ``` ```{r, results='asis'} @@ -1508,6 +1256,7 @@ blank_lines(2) ``` #### Copy number / Minor allele ploidy + The following figures show the AMBER BAF count weighted distribution of copy number and minor allele ploidy throughout the fitted segments. Copy numbers are broken down by colour into their respective minor @@ -1516,20 +1265,22 @@ down by copy number. ```{r purple_plot_copynumber, out.width="40%"} knitr::include_graphics(c( - file.path(img_dir, paste0(bnm, ".copynumber.png")), - file.path(img_dir, paste0(bnm, ".map.png")) + file.path(img_dir, paste0(params$tumor_name, ".copynumber.png")), + file.path(img_dir, paste0(params$tumor_name, ".map.png")) )) ``` #### Ploidy by CN + If a somatic variant VCF has been supplied, a figure will be produced showing the somatic variant ploidy broken down by copy number. ```{r purple_plot_somatic, out.width="40%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".somatic.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".somatic.png"))) ``` #### Clonality + The following diagram illustrates the clonality model of a typical sample. The top figure shows the histogram of somatic ploidy for all SNVs and INDELs in blue. @@ -1541,67 +1292,82 @@ We can determine the likelihood of a variant being subclonal at any given ploidy as shown in the bottom half of the figure. ```{r purple_plot_somaticclonality, out.width="50%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".somatic.clonality.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".somatic.clonality.png"))) ``` #### Segment + The contribution of each fitted segment to the final score of the best fit is shown in the following figure. Each segment is divided into its major and minor allele ploidy. The area of each circle shows the weight (AMBER baf count) of each segment. ```{r purple_plot_segment, out.width="40%"} -knitr::include_graphics(file.path(img_dir, paste0(bnm, ".segment.png"))) +knitr::include_graphics(file.path(img_dir, paste0(params$tumor_name, ".segment.png"))) ``` -## __Oncoviruses__ {.tabset .tabset-fade #oncoviruses} +## __Oncoviruses__ {#oncoviruses} -Oncoviruses and their integration sites. Viral sequences are obtained from the -GDC database. -Host genes are reported if the integration site falls on a gene or at least before 100kbp of the gene start. +### __Viral content__ -```{r oncoviruses_table} -present_viruses <- params$oncoviral_present_viruses -breakpoints_tsv <- params$oncoviral_breakpoints_tsv +```{r oncoviral_summary} +vb_summary <- gpgr::virusbreakend_summary_read(params$virusbreakend_tsv) -if (!is.na(breakpoints_tsv) && breakpoints_tsv != "" && file.exists(breakpoints_tsv)) { - breakpoints <- readr::read_tsv(breakpoints_tsv, col_names = T, col_types = "cciicicccc") - breakpoints %>% - dplyr::mutate(Coord = ifelse(is.na(end), start, str_c(start, "-", end))) %>% - dplyr::select( - Contig = contig, Coord, `Read support` = PAIR_COUNT, - `Disrupted genes` = DisruptedGenes, `Upstream genes` = UpstreamGenes, - `SV type` = svtype, ID, MATEID - ) %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = TRUE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Bfrtip" - ) - ) %>% - DT::formatCurrency(~Coord, currency = "", interval = 3, mark = ",", digits = 0) +details::details(knitr::kable(vb_summary$descr), lang = "none", summary = "Column Description") + +if (nrow(vb_summary$tab) > 0) { + DT::datatable(vb_summary$tab, + filter = list(position = "top", clear = FALSE, plain = TRUE), + class = "cell-border display compact", + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), + options = list( + scroller = TRUE, scrollY = 200, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" + ) + ) |> + DT::formatCurrency(~ Length + Reads, currency = "", interval = 3, mark = ",", digits = 0) } else { - if (!is.na(present_viruses) && present_viruses != "" && file.exists(present_viruses)) { - viruses <- readr::read_file(present_viruses) - if (str_length(viruses) > 0) { - cat(str_c( - "Detected significant traces of content of the following viruses: ", - viruses, - ", however no evidence of viral integration into the host genome is observed." - )) - } else { - cat("No oncoviral content detected in this sample.") - } - } else { - cat("Oncoviral detection was disabled in the umccrise run with `-E oncoviruses`.") - } + cat("No oncoviral content detected.") } ``` ```{r, results='asis'} -blank_lines(1) +blank_lines(2) +``` + +### __Integration sites__ + +```{r oncoviral_integrations} +vb_integrations <- gpgr::virusbreakend_vcf_read(params$virusbreakend_vcf) + +details::details(knitr::kable(vb_integrations$descr), lang = "none", summary = "Column Description") + +if (nrow(vb_integrations$tab) > 0) { + + format_common_cols <- c( + "Position", + "Fragment support", + "Fragment support (unmapped)", + "Softclip read support" + ) + + DT::datatable(vb_integrations$tab, + filter = list(position = "top", clear = FALSE, plain = TRUE), + class = "cell-border display compact", + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), + options = list( + scroller = TRUE, scrollY = 200, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" + ) + ) |> + DT::formatCurrency(format_common_cols, currency = "", interval = 3, mark = ",", digits = 0) + +} else { + cat("No oncoviral integrations detected.") +} +``` + +```{r, results='asis'} +blank_lines(2) ``` ## __Addendum__ {.tabset .tabset-fade #addendum} @@ -1611,43 +1377,47 @@ blank_lines(1) ### Conda Pkgs Main ```{r conda_list_main} -conda_pkgs <- - readr::read_table(params$conda_list, - col_names = c("env", "name", "version", "build", "channel"), - col_types = "ccccc" - ) %>% - dplyr::mutate( - channel = ifelse(is.na(channel), "MISSING", channel), - env = sub("^env", "umccrise", env), - env = sub("^umccrise_", "", env) +if (is.null(params$conda_list)) { + cat("No conda package list provided.") +} else { + conda_pkgs <- + readr::read_table(params$conda_list, + col_names = c("env", "name", "version", "build", "channel"), + col_types = "ccccc" + ) %>% + dplyr::mutate( + channel = ifelse(is.na(channel), "MISSING", channel), + env = sub("^env", "umccrise", env), + env = sub("^umccrise_", "", env) + ) %>% + dplyr::arrange(name) %>% + dplyr::group_by(name, version, build, channel) %>% + dplyr::summarise(envs = paste(env, collapse = ", "), .groups = "drop_last") %>% + dplyr::ungroup() + + # main pkgs + tibble::tibble( + name = c( + "umccrise", "r-gpgr", "r-sigrap", "r-base", "pcgr", "cpsr", "cacao", + "bioconductor-mutationalpatterns", "r-signature.tools.lib", "r-chord", + "hmftools-purple", "hmftools-amber", "hmftools-cobalt", "hmftools-sage", + "break-point-inspector", "multiqc", "multiqc-bcbio", "reference-data", + "bed_annotation", "vcf-stuff", "snakemake", "dvc", "ensembl-vep", + "samtools", "bcftools", "htslib", "ngs_utils", "python", "pandoc" + ) ) %>% - dplyr::arrange(name) %>% - dplyr::group_by(name, version, build, channel) %>% - dplyr::summarise(envs = paste(env, collapse = ", "), .groups = "drop_last") %>% - dplyr::ungroup() - -# main pkgs -tibble::tibble( - name = c( - "umccrise", "r-gpgr", "r-sigrap", "r-base", "pcgr", "cpsr", "cacao", - "bioconductor-mutationalpatterns", "r-signature.tools.lib", "r-chord", - "hmftools-purple", "hmftools-amber", "hmftools-cobalt", "hmftools-sage", - "break-point-inspector", "multiqc", "multiqc-bcbio", "reference-data", - "bed_annotation", "vcf-stuff", "snakemake", "dvc", "ensembl-vep", - "samtools", "bcftools", "htslib", "ngs_utils", "python", "pandoc" - ) -) %>% - dplyr::left_join(conda_pkgs, by = "name") %>% - knitr::kable(format = "html") %>% - kableExtra::kable_paper(c("hover", "striped"), full_width = TRUE, position = "left") %>% - kableExtra::column_spec(1, bold = TRUE) %>% - kableExtra::scroll_box(height = "300px") + dplyr::left_join(conda_pkgs, by = "name") %>% + knitr::kable(format = "html") %>% + kableExtra::kable_paper(c("hover", "striped"), full_width = TRUE, position = "left") %>% + kableExtra::column_spec(1, bold = TRUE) %>% + kableExtra::scroll_box(height = "300px") +} ``` ### Report Inputs ```{r report_inputs} -report_inputs <- dplyr::tibble(key = names(params), value = unlist(params)) +report_inputs <- dplyr::tibble(key = names(params), value = params) gpgr::write_tsvjsongz(report_inputs, glue("{bnm}-report_inputs"), result_outdir) report_inputs %>% @@ -1660,13 +1430,17 @@ report_inputs %>% ### Conda Pkgs All ```{r conda_list_all} -conda_pkgs %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - rownames = FALSE, extensions = c("Scroller", "Buttons"), - options = list( - scroller = TRUE, scrollX = TRUE, scrollY = 400, - buttons = c("csv"), dom = "Bfrtip" +if (is.null(params$conda_list)) { + cat("No conda package list provided.") +} else { + conda_pkgs %>% + DT::datatable( + filter = list(position = "top", clear = FALSE, plain = TRUE), + rownames = FALSE, extensions = c("Scroller", "Buttons"), + options = list( + scroller = TRUE, scrollX = TRUE, scrollY = 400, + buttons = c("csv"), dom = "Bfrtip" + ) ) - ) +} ``` diff --git a/inst/rmd/umccrise/render.R b/inst/rmd/umccrise/render.R index fd790e4..b482b5c 100644 --- a/inst/rmd/umccrise/render.R +++ b/inst/rmd/umccrise/render.R @@ -22,7 +22,6 @@ params <- list( oncoviral_breakpoints_tsv = glue::glue("{umccrised_dir}/work/{batch_name}/oncoviruses/oncoviral_breakpoints.tsv"), oncoviral_present_viruses = glue::glue("{umccrised_dir}/work/{batch_name}/oncoviruses/present_viruses.txt"), out_file = here::here("nogit/umccrised_data/SBJ02269/foo_cancer_report.html"), - purple_germ_cnv = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.cnv.germline.tsv"), purple_purity = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.purity.tsv"), purple_qc = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.qc"), purple_som_cnv = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.cnv.somatic.tsv"), diff --git a/inst/rmd/umccrise/render.sh b/inst/rmd/umccrise/render.sh index 03bbc0f..9d608bf 100644 --- a/inst/rmd/umccrise/render.sh +++ b/inst/rmd/umccrise/render.sh @@ -24,7 +24,6 @@ $gpgr_cli canrep \ --somatic_sv_vcf "${umccrised_dir}/${batch_name}/structural/${batch_name}-manta.vcf.gz" \ --purple_som_gene_cnv "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.cnv.gene.tsv" \ --purple_som_cnv "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.cnv.somatic.tsv" \ - --purple_germ_cnv "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.cnv.germline.tsv" \ --purple_purity "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.purity.tsv" \ --purple_qc "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.qc" \ --purple_som_snv_vcf "${umccrised_dir}/work/${batch_name}/purple/${batch_name}.purple.somatic.vcf.gz" \ diff --git a/man/cancer_rmd.Rd b/man/cancer_rmd.Rd index 683eeab..9f1e309 100644 --- a/man/cancer_rmd.Rd +++ b/man/cancer_rmd.Rd @@ -11,15 +11,14 @@ cancer_rmd( conda_list, img_dir, key_genes, - oncoviral_breakpoints_tsv, - oncoviral_present_viruses, - purple_germ_cnv, + virusbreakend_tsv, + virusbreakend_vcf, purple_purity, purple_qc, + purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, - somatic_snv_summary, somatic_snv_vcf, somatic_sv_tsv, somatic_sv_vcf, @@ -42,24 +41,22 @@ cancer_rmd( \item{key_genes}{Path to UMCCR cancer gene file.} -\item{oncoviral_breakpoints_tsv}{Path to \code{oncoviruses/oncoviral_breakpoints.tsv}.} +\item{virusbreakend_tsv}{Path to VIRUSBreakend summary file.} -\item{oncoviral_present_viruses}{Path to \code{oncoviruses/present_viruses.txt}.} - -\item{purple_germ_cnv}{Path to \code{purple.cnv.germline.tsv}.} +\item{virusbreakend_vcf}{Path to VIRUSBreakend VCF file.} \item{purple_purity}{Path to \code{purple.purity.tsv}.} \item{purple_qc}{Path to \code{purple.qc}.} +\item{purple_som_cnv_ann}{Path to annotated and prioritised \code{purple.cnv.somatic.tsv}.} + \item{purple_som_cnv}{Path to \code{purple.cnv.somatic.tsv}.} \item{purple_som_gene_cnv}{Path to \code{purple.cnv.gene.tsv}.} \item{purple_som_snv_vcf}{Path to \code{purple.somatic.vcf.gz}.} -\item{somatic_snv_summary}{Path to \code{somatic_snv_summary.json} JSON.} - \item{somatic_snv_vcf}{Path to \code{somatic-PASS.vcf.gz} SNV VCF.} \item{somatic_sv_tsv}{Path to \code{manta.tsv} TSV file.} diff --git a/man/is_vcf_bpi.Rd b/man/is_vcf_bpi.Rd deleted file mode 100644 index b6ef2a6..0000000 --- a/man/is_vcf_bpi.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{is_vcf_bpi} -\alias{is_vcf_bpi} -\title{BPI run on VCF} -\usage{ -is_vcf_bpi(x) -} -\arguments{ -\item{x}{Path to VCF file.} -} -\description{ -BPI run on VCF -} diff --git a/man/purple_cnv_germ_process.Rd b/man/purple_cnv_germ_process.Rd deleted file mode 100644 index 73658b9..0000000 --- a/man/purple_cnv_germ_process.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/purple.R -\name{purple_cnv_germ_process} -\alias{purple_cnv_germ_process} -\title{Process PURPLE CNV germline File for UMCCRISE} -\usage{ -purple_cnv_germ_process(x) -} -\arguments{ -\item{x}{Path to \code{purple.cnv.germline.tsv} file.} -} -\value{ -List with two elements: -\itemize{ -\item \code{tab}: Tibble with more condensed columns. -\item \code{descr}: Description of tibble columns. -} -} -\description{ -Processes the \code{purple.cnv.germline.tsv} file. -and selects columns of interest. -} -\examples{ -x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -(pp <- purple_cnv_germ_process(x)) -} diff --git a/man/purple_cnv_germ_read.Rd b/man/purple_cnv_germ_read.Rd deleted file mode 100644 index 82e82f7..0000000 --- a/man/purple_cnv_germ_read.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/purple.R -\name{purple_cnv_germ_read} -\alias{purple_cnv_germ_read} -\title{Read PURPLE CNV Germline File} -\usage{ -purple_cnv_germ_read(x) -} -\arguments{ -\item{x}{Path to \code{purple.cnv.germline.tsv} file.} -} -\value{ -The input file as a tibble. -} -\description{ -Reads the \code{purple.cnv.germline.tsv} file. -} -\examples{ -x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -(p <- purple_cnv_germ_read(x)) -} diff --git a/man/purple_cnv_som_process.Rd b/man/purple_cnv_som_process.Rd index b90fcb8..851e122 100644 --- a/man/purple_cnv_som_process.Rd +++ b/man/purple_cnv_som_process.Rd @@ -17,8 +17,7 @@ List with two elements: } } \description{ -Processes the \code{purple.cnv.somatic.tsv} file. -and selects columns of interest. +Processes the \code{purple.cnv.somatic.tsv} file and selects columns of interest. } \examples{ x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") diff --git a/tests/testthat/test-roxytest-testexamples-purple.R b/tests/testthat/test-roxytest-testexamples-purple.R index 6af5c55..2f1bbcb 100644 --- a/tests/testthat/test-roxytest-testexamples-purple.R +++ b/tests/testthat/test-roxytest-testexamples-purple.R @@ -10,7 +10,7 @@ test_that("Function purple_cnv_som_gene_read() @ L18", { }) -test_that("Function purple_cnv_som_gene_process() @ L62", { +test_that("Function purple_cnv_som_gene_process() @ L60", { x <- system.file("extdata/purple/purple.cnv.gene.tsv", package = "gpgr") g <- system.file("extdata/ref/umccr_cancer_genes_2019-03-20.tsv", package = "gpgr") @@ -19,7 +19,7 @@ test_that("Function purple_cnv_som_gene_process() @ L62", { }) -test_that("Function purple_cnv_som_read() @ L147", { +test_that("Function purple_cnv_som_read() @ L143", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (p <- purple_cnv_som_read(x)) @@ -27,7 +27,7 @@ test_that("Function purple_cnv_som_read() @ L147", { }) -test_that("Function purple_cnv_som_process() @ L181", { +test_that("Function purple_cnv_som_process() @ L176", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (pp <- purple_cnv_som_process(x)) @@ -35,23 +35,23 @@ test_that("Function purple_cnv_som_process() @ L181", { }) -test_that("Function purple_cnv_germ_read() @ L248", { +test_that("Function purple_cnv_som_ann_process() @ L245", { - x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") - (p <- purple_cnv_germ_read(x)) - expect_equal(colnames(p)[ncol(p)], "majorAlleleCopyNumber") + x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") + (p <- purple_cnv_som_ann_process(x)) + expect_equal(colnames(p)[ncol(p)], "annotation") }) -test_that("Function purple_cnv_germ_process() @ L272", { +test_that("Function purple_cnv_som_ann_read() @ L339", { - x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") - (pp <- purple_cnv_germ_process(x)) - expect_equal(colnames(pp$tab)[ncol(pp$tab)], "GC (windowCount)") + x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") + (p <- purple_cnv_som_ann_read(x)) + expect_equal(colnames(p)[ncol(p)], "simple_ann") }) -test_that("Function purple_version_read() @ L297", { +test_that("Function purple_version_read() @ L385", { x <- system.file("extdata/purple/purple.version", package = "gpgr") (v <- purple_version_read(x)) @@ -61,7 +61,7 @@ test_that("Function purple_version_read() @ L297", { }) -test_that("Function purple_qc_read() @ L323", { +test_that("Function purple_qc_read() @ L411", { x <- system.file("extdata/purple/purple.qc", package = "gpgr") (q <- purple_qc_read(x)) @@ -69,7 +69,7 @@ test_that("Function purple_qc_read() @ L323", { }) -test_that("Function purple_purity_read() @ L378", { +test_that("Function purple_purity_read() @ L469", { x <- system.file("extdata/purple/purple.purity.tsv", package = "gpgr") (p <- purple_purity_read(x)) diff --git a/tests/testthat/test-roxytest-testexamples-sv.R b/tests/testthat/test-roxytest-testexamples-sv.R index 782d5af..e287ced 100644 --- a/tests/testthat/test-roxytest-testexamples-sv.R +++ b/tests/testthat/test-roxytest-testexamples-sv.R @@ -20,7 +20,7 @@ test_that("Function split_double_col() @ L27", { }) -test_that("Function count_pieces() @ L76", { +test_that("Function count_pieces() @ L77", { (a <- gpgr:::count_pieces("foo,bar,baz", sep = ",")) (b <- gpgr:::count_pieces("foo", sep = ",")) @@ -34,7 +34,7 @@ test_that("Function count_pieces() @ L76", { }) -test_that("Function abbreviate_effect() @ L110", { +test_that("Function abbreviate_effect() @ L112", { (e1 <- gpgr:::abbreviate_effect("3_prime_UTR_truncation&start_lost&splice_region_variant")) (e2 <- gpgr:::abbreviate_effect("duplication&foo&gene_fusion&BOOM&intron_variant")) @@ -47,7 +47,7 @@ test_that("Function abbreviate_effect() @ L110", { }) -test_that("Function umccrise_read_sv_tsv() @ L141", { +test_that("Function umccrise_read_sv_tsv() @ L143", { x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") (sv <- umccrise_read_sv_tsv(x)$data) @@ -55,7 +55,7 @@ test_that("Function umccrise_read_sv_tsv() @ L141", { }) -test_that("Function process_sv() @ L197", { +test_that("Function process_sv() @ L195", { x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") (sv <- process_sv(x)) diff --git a/tests/testthat/test-roxytest-testexamples-utils.R b/tests/testthat/test-roxytest-testexamples-utils.R index fb93447..26990c1 100644 --- a/tests/testthat/test-roxytest-testexamples-utils.R +++ b/tests/testthat/test-roxytest-testexamples-utils.R @@ -2,7 +2,7 @@ # File R/utils.R: @testexamples -test_that("Function is_vcf() @ L31", { +test_that("Function is_vcf() @ L21", { x <- system.file("extdata/umccrise/snv/somatic-ensemble-PASS.vcf.gz", package = "gpgr") (y <- is_vcf(x)) @@ -14,7 +14,7 @@ test_that("Function is_vcf() @ L31", { }) -test_that("Function vcf_is_empty() @ L105", { +test_that("Function vcf_is_empty() @ L97", { x <- system.file("extdata/umccrise/snv/somatic-ensemble-PASS.vcf.gz", package = "gpgr") (y <- vcf_is_empty(x)) @@ -41,7 +41,7 @@ test_that("Function vcf_is_empty() @ L105", { }) -test_that("Function tsv_is_empty() @ L141", { +test_that("Function tsv_is_empty() @ L133", { tmp1 <- tempfile() From e60051329e01eb507ce2fa3687cbc7c34b1a68b5 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 27 Oct 2023 15:39:42 +1100 Subject: [PATCH 02/11] First pass on processing new SV, CNV annotations --- NAMESPACE | 5 +- R/purple.R | 601 +++++++++++++----- R/rmd.R | 6 +- R/sv.R | 417 ++++++------ inst/cli/canrep.R | 2 + inst/rmd/umccrise/_navbar.html | 8 +- inst/rmd/umccrise/cancer_report.Rmd | 349 +++++----- man/cancer_rmd.Rd | 3 + man/process_sv.Rd | 21 - man/purple_version_read.Rd | 25 - man/umccrise_read_sv_tsv.Rd | 21 - man/virusbreakend_summary_read.Rd | 25 + man/virusbreakend_vcf_read.Rd | 25 + .../test-roxytest-testexamples-oncoviral.R | 19 + .../test-roxytest-testexamples-purple.R | 34 +- .../testthat/test-roxytest-testexamples-sv.R | 18 - 16 files changed, 924 insertions(+), 655 deletions(-) delete mode 100644 man/process_sv.Rd delete mode 100644 man/purple_version_read.Rd delete mode 100644 man/umccrise_read_sv_tsv.Rd create mode 100644 man/virusbreakend_summary_read.Rd create mode 100644 man/virusbreakend_vcf_read.Rd create mode 100644 tests/testthat/test-roxytest-testexamples-oncoviral.R diff --git a/NAMESPACE b/NAMESPACE index 26cdef5..a4f699f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,9 +32,8 @@ export(mkdir) export(pkg_exists) export(plot_bnd_sr_pr_tot_hist) export(plot_bnd_sr_pr_tot_lines) +export(process_cnv_tsv) export(process_sv) -export(purple_cnv_som_ann_process) -export(purple_cnv_som_ann_read) export(purple_cnv_som_gene_process) export(purple_cnv_som_gene_read) export(purple_cnv_som_process) @@ -43,10 +42,8 @@ export(purple_kataegis) export(purple_purity_read) export(purple_qc_read) export(purple_snv_vcf_read) -export(purple_version_read) export(session_info_kable) export(tsv_is_empty) -export(umccrise_read_sv_tsv) export(vcf_is_empty) export(virusbreakend_summary_read) export(virusbreakend_vcf_read) diff --git a/R/purple.R b/R/purple.R index dfd5bde..eb1fca2 100644 --- a/R/purple.R +++ b/R/purple.R @@ -123,6 +123,440 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { ) } +sash_read_cnv_tsv <- function(x) { + + nm <- c( + "chromosome" = "c", + "start" = "i", + "end" = "i", + "svtype" = "c", + "baf" = "d", + "bafCount" = "d", + "copyNumber" = "d", + "depthWindowCount" = "i", + "gcContent" = "d", + "majorAlleleCopyNumber" = "d", + "method" = "c", + "minorAlleleCopyNumber" = "d", + "segmentEndSupport" = "c", + "segmentStartSupport" = "c", + "sv_top_tier" = "i", + "simple_ann" = "c" + ) + + ctypes <- paste(nm, collapse = "") + d <- readr::read_tsv(x, col_types = ctypes) + assertthat::assert_that(ncol(d) == length(nm)) + assertthat::assert_that(all(colnames(d) == names(nm))) + d +} + +split_chromosome_annotations <- function(x) { + x.grouped <- x |> + dplyr::group_by( + record_type=ifelse(grepl('chrom_[0-9XY]+', Detail), 'chromosome', 'other') + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(record_type) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + chromosome <- purrr::pluck(x.split, 'chromosome') |> + dplyr::select( + "Event ID" = "event_id", + "Tier (top)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Chromosome" = "Detail", + "PURPLE CN" = "copyNumber", + "PURPLE CN Min+Maj", + ) + + list( + chromosome=chromosome, + other=purrr::pluck(x.split, 'other') + ) +} + +split_fusion_annotations <- function(x) { + + fusion_annotations <- c( + 'BidFusG', + 'FusG', + 'bidirectional_gene_fusion', + 'gene_fusion' + ) + + x.grouped <- x |> + dplyr::group_by( + record_type=ifelse(Effect %in% fusion_annotations, 'fusion', 'other') + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(record_type) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + fusions <- purrr::pluck(x.split, 'fusion') |> + dplyr::select( + "Event ID" = "event_id", + "Annotation ID" = "annotation_id", + "Tier (top)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Genes", + "Transcripts" = "Transcript", + "Detail", + "PURPLE CN" = "copyNumber", + "PURPLE CN Min+Maj", + ) + + list( + fusions=fusions, + other=purrr::pluck(x.split, 'other') + ) +} + +count_event_genes <- function(.data) { + .data |> + dplyr::pull(Genes) |> + strsplit('&') |> + unlist() |> + unique() |> + length() +} + +set_event_type <- function(x) { + x |> + dplyr::group_by(event_id) |> + dplyr::mutate( + `Gene count (event)` = count_event_genes(dplyr::pick(Genes)), + event_category = dplyr::case_when( + `Gene count (event)` == 0 ~ 'none', + `Gene count (event)` == 1 ~ 'small', + `Gene count (event)` >= 2 & `Gene count (event)` <= 3 ~ 'medium', + .default = 'large', + ) + ) |> + dplyr::ungroup() +} + +set_many_transcripts_cnv <- function(x) { + # Set many transcripts + x.tmp <- x |> + dplyr::rowwise() |> + dplyr::mutate( + transcript_count = strsplit(Transcript, '&') |> unlist() |> unique() |> length() + ) |> + dplyr::ungroup() |> + dplyr::mutate( + many_transcripts = ifelse(transcript_count > 2, 'many_transcripts', 'few_transcripts'), + many_transcripts = ifelse(event_category %in% c('small', 'medium'), many_transcripts, NA), + ) |> + # Sort rows + dplyr::arrange(`Tier (top)`, Genes, Effect) + + # Build the many transcripts table + mt <- x.tmp |> + dplyr::filter(many_transcripts == 'many_transcripts') |> + # Remove unneeded columns and rename others + dplyr::select( + "Event ID" = "event_id", + "Annotation ID" = "annotation_id", + "Tier (top)", + "Start" = "start", + "End" = "end", + "Transcript count" = "transcript_count", + "Transcripts" = "Transcript", + ) + + # Clear transcript where appropriate + x.ready <- x.tmp |> + dplyr::mutate( + Transcript = ifelse( + many_transcripts == 'few_transcripts' | is.na(many_transcripts), + Transcript, + paste0('Many transcripts (', transcript_count, ')') + ) + ) |> + dplyr::select(c(-many_transcripts, -transcript_count)) + + list( + many_transcripts=mt, + cnv=x.ready + ) +} + +split_priority <- function(x) { + x.grouped <- x |> + dplyr::group_by( + prioritised = ifelse(Tier < 4, 'prioritised', 'unprioritised') + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(prioritised) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + prioritised = purrr::pluck(x.split, 'prioritised'), + unprioritised = purrr::pluck(x.split, 'unprioritised', .default=x[NULL, ]) |> dplyr::select(-Transcript) + ) +} + +split_event_type <- function(x) { + + x.grouped <- x |> + dplyr::group_by(event_category) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(event_category) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + none = purrr::pluck(x.split, 'none'), + small = purrr::pluck(x.split, 'small'), + medium = purrr::pluck(x.split, 'medium'), + large = purrr::pluck(x.split, 'large') + ) +} + +collapse_annotations <- function(x) { + x |> + dplyr::summarise( + `Gene count (effect)` = length(unique(unlist(strsplit(Genes, '&')))), + Genes = paste0(sort(unique(unlist(strsplit(Genes, '&')))), collapse=','), + Detail = paste0(sort(unique(Detail)), collapse=','), + `Tier (highest)` = min(Tier), + `Tier (lowest)` = max(Tier), + ) |> + dplyr::ungroup() +} + +set_many_genes <- function(x) { + # Set many genes + x.tmp <- x |> + dplyr::rowwise() |> + dplyr::mutate( + gene_count = strsplit(Genes, c(',|&')) |> unlist() |> unique() |> length() + ) |> + dplyr::ungroup() |> + dplyr::mutate( + many_genes = ifelse(gene_count > 2, 'many_genes', 'few_genes'), + ) |> + # Sort rows + dplyr::arrange(`Tier (top)`, Genes, Effect) + + # Build the many genes table + mt <- x.tmp |> + dplyr::filter(many_genes == 'many_genes') |> + # Remove unneeded columns and rename others + dplyr::select( + "Event ID" = "event_id", + "Tier (top)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Gene count (event)", + "Gene count (effect)", + "Genes", + ) + + # Clear genes where appropriate + x.ready <- x.tmp |> + dplyr::mutate( + Genes = ifelse( + many_genes == 'few_genes' | is.na(many_genes), + Genes, + paste0('Many genes (', gene_count, ')') + ) + ) |> + dplyr::select(c(-many_genes, -gene_count)) + + list( + many_genes=mt, + cnv=x.ready + ) +} + +#' @export +process_cnv_tsv <- function(x) { + # Read input + cnv.input <- sash_read_cnv_tsv(x) + + # Prepare input + cnv.ready <- cnv.input |> + dplyr::mutate( + chrom_simple = stringr::str_remove(chromosome, 'chr'), + start = paste(chrom_simple, base::format(start, big.mark = ",", trim = TRUE), sep=":"), + end = paste(chrom_simple, base::format(end, big.mark = ",", trim = TRUE), sep=":"), + ) |> + dplyr::select(-c( + chromosome, + chrom_simple, + )) + + # Melt annotations + cnv.tmp <- cnv.ready |> + dplyr::mutate(event_id = dplyr::row_number()) |> + # Split into individual annotations + dplyr::mutate(annotation = strsplit(simple_ann, ",")) |> + # Convert annotation fields into columns + tidyr::unnest(annotation) |> + tidyr::separate( + annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + sep = "\\|", convert = FALSE + ) |> + # Create new columns and modify existing ones + dplyr::mutate( + copyNumber = as.numeric(copyNumber) |> round(2) %>% sprintf("%.2f", .), + minorAlleleCopyNumber = as.numeric(minorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), + majorAlleleCopyNumber = as.numeric(majorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), + 'PURPLE CN Min+Maj' = paste0(minorAlleleCopyNumber, "+", majorAlleleCopyNumber), + "Tier (top)" = paste0(Tier, " (", sv_top_tier, ")"), + ) |> + # Set unique annotation ID + dplyr::mutate(annotation_id = dplyr::row_number()) |> + # Remove unused columns + dplyr::select(-c( + baf, + bafCount, + depthWindowCount, + Event, + gcContent, + majorAlleleCopyNumber, + method, + minorAlleleCopyNumber, + segmentEndSupport, + segmentStartSupport, + simple_ann, + sv_top_tier, + )) |> + # Sort rows + dplyr::arrange(`Tier (top)`, Genes, Effect) + + # Drop specific annotations + cnv.tmp <- cnv.tmp |> + dplyr::filter(Effect != 'intergenic_region') + + # Abbreviate effects + abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) + cnv.tmp$Effect <- abbreviate_effectv(cnv.tmp$Effect) + + # Split chromosome annotations + cnv.chromosome_split <- split_chromosome_annotations(cnv.tmp) + cnv.chromosome <- cnv.chromosome_split$chromosome + cnv.tmp <- cnv.chromosome_split$other + + # Split fusion annotations + cnv.fusion_split <- split_fusion_annotations(cnv.tmp) + cnv.fusion <- cnv.fusion_split$fusion + cnv.tmp <- cnv.fusion_split$other + + # Set event type based on the number of genes impacted + cnv.tmp <- set_event_type(cnv.tmp) + + # Set and create many transcripts table + cnv.many_transcripts_data <- set_many_transcripts_cnv(cnv.tmp) + cnv.many_transcripts <- purrr::pluck(cnv.many_transcripts_data, 'many_transcripts') + cnv.tmp <- purrr::pluck(cnv.many_transcripts_data, 'cnv') + + # Split into prioritised and unprioritised + cnv.priority_split <- split_priority(cnv.tmp) + cnv.unprioritised.tmp <- cnv.priority_split$unprioritised + cnv.prioritised.tmp <- cnv.priority_split$prioritised + + # Split into event type assigned above + cnv.event_type_split <- split_event_type(cnv.prioritised.tmp) + cnv.small.tmp <- cnv.event_type_split$small + cnv.medium.tmp <- cnv.event_type_split$medium + cnv.large.tmp <- cnv.event_type_split$large + + # Temporary set dplyr summarise warning off + summarise_info_opt <- getOption('dplyr.summarise.inform') + options(dplyr.summarise.inform = FALSE) + + # Collapse large event type annotations into single records, drop transcripts + cnv.large.tmp <- cnv.large.tmp |> + dplyr::group_by(dplyr::across(c(-Genes, -Transcript, -Tier, -Detail, -annotation_id))) |> + collapse_annotations() + + cnv.large.many_genes_data <- set_many_genes(cnv.large.tmp) + cnv.large.many_genes <- cnv.large.many_genes_data$many_genes + cnv.large.tmp <- cnv.large.many_genes_data$cnv + + # Select then collapse only large event type annotations into single records + cnv.unprioritised.tmp <- cnv.unprioritised.tmp |> + dplyr::filter(event_category == 'large') |> + dplyr::group_by(dplyr::across(c(-Genes, -Tier, -Detail, -annotation_id))) |> + collapse_annotations() |> + dplyr::ungroup() |> + dplyr::bind_rows( + cnv.unprioritised.tmp |> dplyr::filter(event_category != 'large') + ) + cnv.unprioritised.many_genes_data <- set_many_genes(cnv.unprioritised.tmp) + cnv.unprioritised.many_genes <- cnv.unprioritised.many_genes_data$many_genes + cnv.unprioritised.tmp <- cnv.unprioritised.many_genes_data$cnv + + # Restore dplyr summarise warning setting + options(dplyr.summarise.inform = summarise_info_opt) + + # Select columns for remaining tables + columns <- c( + "Event ID" = "event_id", + "Tier (top)", + "Tier lowest (effect)", + "Tier lowest (effect)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Gene count (event)", + "Gene count (effect)", + "Genes", + "Transcripts" = "Transcript", + "Detail", + "PURPLE CN" = "copyNumber", + "PURPLE CN Min+Maj" + ) + + cnv.small <- cnv.small.tmp |> dplyr::select(tidyselect::any_of(columns)) + cnv.medium <- cnv.medium.tmp |> dplyr::select(tidyselect::any_of(columns)) + cnv.large <- cnv.large.tmp |> dplyr::select(tidyselect::any_of(columns)) + cnv.unprioritised <- cnv.unprioritised.tmp |> dplyr::select(tidyselect::any_of(columns)) + + list( + small = cnv.small, + medium = cnv.medium, + large = cnv.large, + many_genes_prioritised = cnv.large.many_genes, + many_transcripts = cnv.many_transcripts, + unprioritised = cnv.unprioritised, + many_genes_unprioritised = cnv.unprioritised.many_genes, + fusion = cnv.fusion, + chromsome = cnv.chromosome + ) +} + #' Read PURPLE CNV Somatic File #' #' Reads the `purple.cnv.somatic.tsv` file, which contains the copy number @@ -225,173 +659,6 @@ purple_cnv_som_process <- function(x) { ) } -#' Process an Annotated PURPLE CNV Somatic File -#' -#' Processes the annotated and prioritised `purple.cnv.somatic.tsv` file. -#' -#' @param x Path to annotated and prioritised `purple.cnv.somatic.tsv` file. -#' -#' @return List with two elements: -#' * `tab`: Tibble containing selected data. -#' * `descr`: Description of tibble columns. -#' -#' @examples -#' x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") -#' (p <- purple_cnv_som_ann_process(x)) -#' @testexamples -#' expect_equal(colnames(p)[ncol(p)], "annotation") -#' -#' @export -purple_cnv_som_ann_process <- function(x) { - purple_cnv_somatic_ann <- purple_cnv_som_ann_read(x) - - purple_cnv_somatic_ann___d <- purple_cnv_somatic_ann |> - dplyr::mutate( - Chr = as.factor(.data$chromosome), - minorAlleleCopyNumber = round(.data$minorAlleleCopyNumber, 1), - majorAlleleCopyNumber = round(.data$majorAlleleCopyNumber, 1), - `CopyNumber Min+Maj` = paste0(.data$minorAlleleCopyNumber, "+", .data$majorAlleleCopyNumber), - copyNumber = round(.data$copyNumber, 1), - bafAdj = round(.data$baf, 2), - gcContent = round(.data$gcContent, 2), - `Start/End SegSupport` = paste0(.data$segmentStartSupport, "-", .data$segmentEndSupport), - `BAF (count)` = paste0(.data$bafAdj, " (", .data$bafCount, ")"), - `GC (windowCount)` = paste0(.data$gcContent, " (", .data$depthWindowCount, ")"), - TopTier = sv_top_tier, - annotation = simple_ann - ) |> - dplyr::select( - "Chr", - Start = "start", End = "end", Type = "svtype", CN = "copyNumber", - `CN Min+Maj` = "CopyNumber Min+Maj", "Start/End SegSupport", - Method = "method", "BAF (count)", "GC (windowCount)", "TopTier", "annotation" - ) - - abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) - - purple_cnv_somatic_ann <- purple_cnv_somatic_ann___d |> - dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |> - tidyr::unnest(.data$annotation) |> - tidyr::separate( - .data$annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), - sep = "\\|", convert = FALSE - ) |> - dplyr::mutate( - ntrx = gpgr::count_pieces(.data$Transcript, "&"), - ngen = gpgr::count_pieces(.data$Genes, "&"), - neff = gpgr::count_pieces(.data$Effect, "&"), - Transcript = .data$Transcript |> stringr::str_replace_all("&", ", "), - Genes = .data$Genes |> stringr::str_replace_all("&", ", "), - Effect = abbreviate_effectv(.data$Effect), - `Tier (Top)` = glue::glue("{Tier} ({TopTier})") - ) |> - dplyr::distinct() |> - dplyr::filter(Detail != 'unprioritized') |> - dplyr::arrange(.data$`Tier (Top)`, .data$Genes, .data$Effect) - - descr <- dplyr::tribble( - ~Column, ~Description, - "Chr/Start/End", "Coordinates of copy number segment", - "Type", "Simple type interpretation of SV.", - "CN", "Fitted absolute copy number of segment adjusted for purity and ploidy", - "CN Min+Maj", "CopyNumber of minor + major allele adjusted for purity", - "Start/End SegSupport", paste0( - "Type of SV support for the CN breakpoint at ", - "start/end of region. Allowed values: ", - "CENTROMERE, TELOMERE, INV, DEL, DUP, BND (translocation), ", - "SGL (single breakend SV support), NONE (no SV support for CN breakpoint), ", - "MULT (multiple SV support at exact breakpoint)" - ), - "Method", paste0( - "Method used to determine the CN of the region. Allowed values: ", - "BAF_WEIGHTED (avg of all depth windows for the region), ", - "STRUCTURAL_VARIANT (inferred using ploidy of flanking SVs), ", - "LONG_ARM (inferred from the long arm), GERMLINE_AMPLIFICATION ", - "(inferred using special logic to handle regions of germline amplification)" - ), - "BAF (count)", "Tumor BAF after adjusted for purity and ploidy (Count of AMBER baf points covered by this segment)", - "GC (windowCount)", "Proportion of segment that is G or C (Count of COBALT windows covered by this segment)", - "TierTop", "Top priority of the event (from simple_sv_annotation: 1 highest, 4 lowest).", - "annotation", "INFO/SIMPLE_ANN: Simplified structural variant annotation: 'SVTYPE | EFFECT | GENE(s) | TRANSCRIPT | PRIORITY (1-4)'" - ) - - list( - tab = purple_cnv_somatic_ann, - descr = descr - ) -} - -#' Read an Annotated PURPLE CNV Somatic File -#' -#' Reads the annotated and prioritised `purple.cnv.somatic.tsv` file. -#' -#' @param x Path to annotated and prioritised `purple.cnv.somatic.tsv` file. -#' -#' @return The input file as a tibble. -#' -#' @examples -#' x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") -#' (p <- purple_cnv_som_ann_read(x)) -#' @testexamples -#' expect_equal(colnames(p)[ncol(p)], "simple_ann") -#' -#' @export -purple_cnv_som_ann_read <- function(x) { - nm <- c( - "chromosome" = "c", - "start" = "i", - "end" = "i", - "svtype" = "c", - "baf" = "d", - "bafCount" = "d", - "copyNumber" = "d", - "depthWindowCount" = "i", - "gcContent" = "d", - "majorAlleleCopyNumber" = "d", - "method" = "c", - "minorAlleleCopyNumber" = "d", - "segmentEndSupport" = "c", - "segmentStartSupport" = "c", - "sv_top_tier" = "i", - "simple_ann" = "c" - ) - - ctypes <- paste(nm, collapse = "") - purple_cnv_somatic_ann <- readr::read_tsv(x, col_types = ctypes) - assertthat::assert_that(ncol(purple_cnv_somatic_ann) == length(nm)) - assertthat::assert_that(all(colnames(purple_cnv_somatic_ann) == names(nm))) - purple_cnv_somatic_ann -} - -#' Read PURPLE version file -#' -#' Reads the `purple.version` file containing the PURPLE version and build date. -#' -#' @param x Path to the `purple.version` file. -#' -#' @return A list with the elements: -#' * `version`: The version of PURPLE used. -#' * `build_date`: The build date of the PURPLE version. -#' -#' @examples -#' x <- system.file("extdata/purple/purple.version", package = "gpgr") -#' (v <- purple_version_read(x)) -#' @testexamples -#' expect_equal(length(v), 2) -#' expect_equal(names(v), c("version", "build_date")) -#' expect_equal(v$version, "2.51") -#' -#' @export -purple_version_read <- function(x) { - tab <- readr::read_delim(x, delim = "=", col_names = c("key", "value"), col_types = "cc") - assertthat::assert_that(nrow(tab) == 2, all(tab$key == c("version", "build.date"))) - - list( - version = tab$value[tab$key == "version"], - build_date = tab$value[tab$key == "build.date"] - ) -} - #' Read PURPLE QC file #' #' Reads the `purple.qc` file. diff --git a/R/rmd.R b/R/rmd.R index 5326144..7966dd5 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -77,6 +77,7 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' @param purple_som_snv_vcf Path to `purple.somatic.vcf.gz`. #' @param result_outdir Path to directory to write tidy JSON/TSV results. #' @param somatic_snv_vcf Path to `somatic-PASS.vcf.gz` SNV VCF. +#' @param somatic_snv_summary Path to `somatic_snv_summary.json` JSON. #' @param somatic_sv_tsv Path to `manta.tsv` TSV file. #' @param somatic_sv_vcf Path to `manta.vcf.gz` VCF file. #' @param tumor_name Name of tumor sample. @@ -88,8 +89,8 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, key_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, - somatic_snv_vcf, somatic_sv_tsv, somatic_sv_vcf, result_outdir, tumor_name, - out_file = NULL, quiet = FALSE) { + somatic_snv_vcf, somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, + result_outdir, tumor_name, out_file = NULL, quiet = FALSE) { assertthat::assert_that( dir.exists(img_dir), quiet %in% c(FALSE, TRUE) @@ -120,6 +121,7 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, img_dir = img_dir_b, key_genes = key_genes, somatic_snv_vcf = somatic_snv_vcf, + somatic_snv_summary = somatic_snv_summary, somatic_sv_tsv = somatic_sv_tsv, somatic_sv_vcf = somatic_sv_vcf, purple_som_gene_cnv = purple_som_gene_cnv, diff --git a/R/sv.R b/R/sv.R index 62e1f91..2f11649 100644 --- a/R/sv.R +++ b/R/sv.R @@ -125,28 +125,10 @@ abbreviate_effect <- function(effects) { paste(collapse = ", ") } -#' Read SV TSV -#' -#' Reads the Manta TSV file output by umccrise. -#' -#' @param x Path to `manta.tsv` output by umccrise. -#' -#' @return A tibble corresponding to the input TSV file. -#' -#' @examples -#' x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -#' (sv <- umccrise_read_sv_tsv(x)$data) -#' @testexamples -#' expect_equal(colnames(sv)[ncol(sv)], "ALT") -#' -#' @export -umccrise_read_sv_tsv <- function(x) { +sash_read_sv_tsv <- function(x) { - # tsv column names, description and types tab <- dplyr::tribble( ~Column, ~Description, ~Type, - "caller", "Manta SV caller", "c", - "sample", "Tumor sample name", "c", "chrom", "CHROM column in VCF", "c", "start", "POS column in VCF", "i", "svtype", "INFO/SVTYPE: Type of structural variant", "c", @@ -167,201 +149,272 @@ umccrise_read_sv_tsv <- function(x) { ) ctypes <- paste(tab$Type, collapse = "") - somatic_sv_tsv <- readr::read_tsv(x, col_names = TRUE, col_types = ctypes) - assertthat::assert_that(ncol(somatic_sv_tsv) == nrow(tab)) - assertthat::assert_that(all(colnames(somatic_sv_tsv) == tab$Column)) + d <- readr::read_tsv(x, col_names = TRUE, col_types = ctypes) + assertthat::assert_that(ncol(d) == nrow(tab)) + assertthat::assert_that(all(colnames(d) == tab$Column)) list( - data = somatic_sv_tsv, + data = d, descr = tab ) } -#' Process Structural Variants -#' -#' Processes the Manta TSV file output by umccrise. -#' -#' @param x Path to `manta.tsv` output by umccrise. -#' @return A list with melted/unmelted tibbles (these are NULL if TSV file was empty). -#' -#' @examples -#' x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -#' (sv <- process_sv(x)) -#' @testexamples -#' expect_true(inherits(sv, "list")) -#' expect_equal(length(sv), 4) -#' expect_equal(names(sv), c("unmelted", "melted", "tsv_descr", "col_descr")) -#' -#' @export -process_sv <- function(x) { - sv <- umccrise_read_sv_tsv(x) - tsv_descr <- sv$descr - sv <- sv$data - - if (nrow(sv) == 0) { - return(list( - unmelted = NULL, - melted = NULL - )) - } - col_descr <- dplyr::tribble( - ~Column, ~Description, - "nrow", "Row number that connects variants between tables in same tab set.", - "vcfnum", "Original event row number that connects variants to events.", - "TierTop", "Top priority of the event (from simple_sv_annotation: 1 highest, 4 lowest).", - "Tier", "Priority of the specific event (from simple_sv_annotation: 1 highest, 4 lowest).", - "Tier (Top)", "Priority of the specific event (Top tier of all event's annotations): 1 highest, 4 lowest. ", - "Chr", "Chromosome.", - "Start", "Start position.", - "End", "End position.", - "ID", "ID of BND from Manta. For PURPLE-inferred SVs this is PURPLE.", - "MATEID", "ID of BND mate from Manta.", - "BND_ID", "ID of BND pair simplified. BNDs with the same BND_ID belong to the same translocation event.", - "BND_mate", "A or B depending on whether the first or second mate of the BND pair.", - "Genes", "Genes involved in the event. DEL/DUP/INS events involving more than 2 genes are shown within separate table.", - "Transcript", "Transcripts involved in the event. DEL/DUP/INS events involving more than 2 transcripts are shown within separate table.", - "Effect", "SV effect (based on http://snpeff.sourceforge.net/SnpEff_manual.html#input).", - "Detail", "Prioritisation detail (from simple_sv_annotation).", - "AF_PURPLE", "PURPLE AF at each breakend preceded by their average.", - "CN", "Copy Number at each breakend preceded by their average.", - "CNC", "Copy Number Change at each breakend preceded by their average.", - "SR_alt", "Number of Split Reads supporting the alt allele, where P(allele|read)>0.999.", - "PR_alt", "Number of Paired Reads supporting the alt allele, where P(allele|read)>0.999.", - "SR_PR_ref", "Number of Split Reads and Paired Reads supporting the ref allele, where P(allele|read)>0.999.", - "SR_PR_sum", "Sum of SR_alt and PR_alt.", - "Type", "Simple type interpretation of SV.", - "Quality", "Variant quality score.", - "ntrx", "Number of transcripts for given event.", - "ngen", "Number of genes for given event.", - "nann", "Number of annotations for given event." +split_svs <- function(x) { + list( + purple_inferred = dplyr::filter(x, Type == "PURPLE_inf"), + sgl = dplyr::filter(x, Type == "SGL"), + bps = dplyr::filter(x, ! Type %in% c("PURPLE_inf", "SGL")) ) +} - cols_to_split <- c("AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE") - double_cols <- split_double_col(sv, cols_to_split) - unmelted <- sv |> - dplyr::select(-dplyr::all_of(c(cols_to_split, "caller", "sample"))) |> - dplyr::bind_cols(double_cols) - unmelted <- unmelted |> + +join_breakpoint_entries <- function(x) { + # Group by GRIDSS identifier (clipping trailing h/o [h: High, o: lOwer]) + bps <- x |> + tidyr::separate(ID, into = c("BND_group", "BND_mate"), sep = -1, convert = TRUE, remove = FALSE)|> + dplyr::group_by(BND_group) + + # Set a sequential breakpoint identifier + bps_groups <- bps |> dplyr::n_groups() + bps |> dplyr::mutate( - SR_PR_ref = paste0(.data$SR_ref, ",", .data$PR_alt), - chrom = sub("chr", "", .data$chrom), - svtype = ifelse(is.na(.data$PURPLE_status), .data$svtype, "PURPLE_inf"), - Start = .data$start, - nann = count_pieces(.data$annotation, ","), - vcfnum = dplyr::row_number(), - vcfnum = sprintf(glue::glue("%0{nchar(nrow(unmelted))}d"), .data$vcfnum) + # Assign a unique ID based on current group + BND_ID = sprintf(paste0("%0", nchar(bps_groups), "d"), dplyr::cur_group_id()), + BND_mate = ifelse(BND_mate == "o", "A", "B"), ) |> - dplyr::rowwise() |> - dplyr::mutate(SR_PR_sum = sum(.data$SR_alt, .data$PR_alt, na.rm = TRUE)) |> - dplyr::ungroup() - - # BND IDs - # Two BND mates share the same ID up to the last digit (0 or 1) - unmelted_bnd1 <- unmelted |> - dplyr::filter(!.data$svtype %in% c("PURPLE_inf", "SGL")) |> - tidyr::separate(.data$ID, into = c("BND_group", "BND_mate"), sep = -1, convert = TRUE, remove = FALSE) |> - dplyr::group_by(.data$BND_group) - assertthat::assert_that(all(unmelted_bnd1$BND_mate %in% c("o", "h"))) - unmelted_bnd1 <- unmelted_bnd1 |> + dplyr::ungroup() |> dplyr::mutate( - # index per group 1, 2, 3.. - BND_ID = dplyr::cur_group_id(), - # turns into 001, 002, 003... if you've got 100+ rows - BND_ID = sprintf(glue::glue("%0{nchar(nrow(unmelted_bnd1))}d"), .data$BND_ID), - BND_mate = ifelse(.data$BND_mate == "o", "A", "B") + end_position = sub("^.*:(\\d+).*$", "\\1", ALT) |> as.numeric() |> base::format(big.mark = ",", trim = TRUE), + end_chrom = sub("^.*chr(.*):.*$", "\\1", ALT), + end = paste0(end_chrom, ":", end_position), ) |> - dplyr::ungroup() + dplyr::select(-c(end_position, end_chrom)) +} + + +remove_gene_fusion_dups <- function(.data, columns) { + # Order elements of multi-entry effect values for reliable comparison + v.groups <- c("frameshift_variant&gene_fusion", "gene_fusion") + v.effects_ordered <- sapply(.data$Effect, function(s) { + c <- stringr::str_split(s, '&') |> unlist() + paste0(sort(c), collapse='&') + }) + + if (all(v.groups %in% v.effects_ordered)) { + .data |> dplyr::filter(Effect != "gene_fusion") + } else { + .data + } +} - # Grab each BND mate's chrom - # Orphan mates have that info in the ALT field - match_id2mateid <- match(unmelted_bnd1$ID, unmelted_bnd1$MATEID) - unmelted_bnd2 <- unmelted_bnd1[match_id2mateid, c("chrom")] |> - dplyr::rename(BND_mate_chrom = .data$chrom) - unmelted_bnd <- dplyr::bind_cols(unmelted_bnd1, unmelted_bnd2) |> +set_many_transcripts_sv <- function(x) { + # Set many transcripts + x.tmp <- x |> + dplyr::rowwise() |> + dplyr::mutate( + transcript_count = strsplit(Transcripts, "&") |> unlist() |> unique() |> length() + ) |> + dplyr::ungroup() |> dplyr::mutate( - BND_mate_chrom = ifelse( - is.na(.data$BND_mate_chrom), - sub(".*chr(.*):.*", "orphan_\\1", .data$ALT), - .data$BND_mate_chrom - ), - BND_mate_end = sub(".*chr(.*):(\\d+).*", "\\2", .data$ALT), - BND_mate_end = as.numeric(.data$BND_mate_end) + many_transcripts = ifelse(transcript_count > 2, "many_transcripts", "few_transcripts") ) - unmelted_other <- unmelted |> - dplyr::filter(.data$svtype %in% c("PURPLE_inf", "SGL")) + # Build the many transcripts table + mt <- x.tmp |> + dplyr::filter(many_transcripts == "many_transcripts") |> + # Remove unneeded columns and rename others + dplyr::select( + "Record ID", + "Annotation ID", + "Tier (top)", + "Start", + "End", + "Transcript count" = "transcript_count", + "Transcripts", + ) - unmelted_all <- - dplyr::bind_rows( - unmelted_bnd, - unmelted_other - ) |> + # Clear transcript where appropriate + x.ready <- x.tmp |> dplyr::mutate( - Start = base::format(.data$Start, big.mark = ",", trim = TRUE), - Start = paste0(.data$chrom, ":", .data$Start), - END_tmp = ifelse(!.data$svtype %in% c("PURPLE_inf", "SGL"), .data$BND_mate_end, NA_character_), - END_tmp = ifelse( - is.na(.data$END_tmp), - NA_character_, - base::format(as.integer(.data$END_tmp), big.mark = ",", trim = TRUE) - ), - End = ifelse( - is.na(.data$END_tmp), - NA_character_, - paste0( - ifelse(.data$svtype == "BND", .data$BND_mate_chrom, .data$chrom), - ":", - .data$END_tmp - ) + Transcripts = ifelse( + many_transcripts == "few_transcripts" | is.na(many_transcripts), + Transcripts, + paste0("Many transcripts (", transcript_count, ")") ) ) |> + dplyr::select(c(-many_transcripts, -transcript_count)) + + list( + many_transcripts = mt, + sv = x.ready + ) +} + + +#' @export +process_sv <- function(x) { + # Read input and set column information + sv.input <- sash_read_sv_tsv(x) + + # Early return if no data to process + if (nrow(sv.input$data) == 0) { + return() + } + + # Prepare input + sv.ready <- sv.input$data |> + dplyr::mutate( + "annotation_count" = count_pieces(annotation, ","), + "Top Tier"= tier, + "SR_PR_ref" = paste0(SR_ref, ",", PR_ref), + "SR_PR_sum" = SR_alt + PR_alt, + start = paste(chrom, base::format(start, big.mark = ",", trim = TRUE), sep=":"), + Type = ifelse(is.na(PURPLE_status), svtype, "PURPLE_inf"), + "Record ID" = dplyr::row_number(), + ) |> + dplyr::select(-c( + chrom, + PURPLE_status, + tier, + svtype, + )) + + # Split SVs by PURPLE inferred, single breakends, and breakpoints + sv.split <- split_svs(sv.ready) + + # Combine joined breakpoints and single breakends + sv.bps <- join_breakpoint_entries(sv.split$bps) + sv.tmp <- dplyr::bind_rows(sv.split$sgl, sv.bps) + + # Prepare PURPLE inf, add event id continue from assignment above + sv.purple_inf_full <- purrr::pluck(sv.split$purple_inferred) |> + dplyr::mutate( + CN_PURPLE = as.numeric(CN_PURPLE) |> round(2) %>% sprintf("%.2f", .), + CN_change_PURPLE = as.numeric(CN_change_PURPLE) |> round(3) %>% sprintf("%.3f", .), + ) |> + dplyr::rename( + "PURPLE CN" = CN_PURPLE, + "PURPLE CN Change" = CN_change_PURPLE, + "VCF ID" = ID, + ) + + # Select columns for dedicated PURPLE inf table + sv.purple_inf <- sv.purple_inf_full |> dplyr::select( - "vcfnum", - "nann", - TierTop = "tier", - "Start", - "End", - Type = "svtype", - "BND_ID", - "BND_mate", + "Record ID", + "VCF ID", + "Position" = "start", + "PURPLE CN", + "PURPLE CN Change", + ) + + # Format some columns + cols_to_split <- c("AF_PURPLE", "CN_PURPLE") + double_cols <- split_double_col(sv.tmp, cols_to_split) + sv.tmp <- sv.tmp |> + dplyr::select(-c(cols_to_split)) |> + dplyr::bind_cols(double_cols) + + # Format a table for to be used as the SV Map + sv.unmelted <- dplyr::bind_rows(sv.tmp, sv.purple_inf_full) + sv.map <- sv.unmelted |> + dplyr::select( + "Record ID", + "Annotations" = "annotation_count", + "Top Tier", + "Start" = "start", + "End" = "end", + "Type", + "Breakend ID" = "BND_ID", + "Breakend Mate" = "BND_mate", "SR_alt", "PR_alt", "SR_PR_sum", "SR_PR_ref", - "AF_PURPLE", - CNC = "CN_change_PURPLE", - CN = "CN_PURPLE", - Quality= "QUAL", - "annotation", - ) - - abbreviate_effectv <- Vectorize(abbreviate_effect) + "PURPLE AF" = "AF_PURPLE", + "PURPLE CN" = "CN_PURPLE", + ) |> + dplyr::arrange(`Record ID`) - melted <- unmelted_all |> - dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |> - tidyr::unnest(.data$annotation) |> + # Melt annotations + sv.melted_all <- sv.unmelted |> + # Split into individual annotations + dplyr::mutate(annotation = strsplit(annotation, ",")) |> + # Convert annotation fields into columns + tidyr::unnest(annotation) |> tidyr::separate( - .data$annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), sep = "\\|", convert = FALSE ) |> + # Remove gene_fusion annotations for variants where frameshift_variant&gene_fusion already exist + dplyr::group_by(across(-Effect)) |> + dplyr::group_modify(remove_gene_fusion_dups) |> + dplyr::ungroup() |> + # Set unique annotation ID, remove unused columns + dplyr::mutate(annotation_id = dplyr::row_number()) |> + dplyr::select(c(-Event, -ALT)) |> + # Create new tier column dplyr::mutate( - ntrx = count_pieces(.data$Transcript, "&"), - ngen = count_pieces(.data$Genes, "&"), - neff = count_pieces(.data$Effect, "&"), - Transcript = .data$Transcript |> stringr::str_replace_all("&", ", "), - Genes = .data$Genes |> stringr::str_replace_all("&", ", "), - Effect = abbreviate_effectv(.data$Effect), - `Tier (Top)` = glue::glue("{Tier} ({TierTop})") + "Tier (top)" = paste0(Tier, " (", `Top Tier`, ")"), ) |> - dplyr::distinct() |> - dplyr::arrange(.data$`Tier (Top)`, .data$Genes, .data$Effect) + # Sort rows + dplyr::arrange(`Tier (top)`, `Record ID`, Genes, Effect) + + # Abbreviate effects + abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) + sv.melted_all$Effect <- abbreviate_effectv(sv.melted_all$Effect) + + # Exclude PURPLE inferred + sv.melted <- sv.melted_all |> + dplyr::filter(Type != "PURPLE_inf") + + # Select columns + sv.tmp <- sv.melted |> + dplyr::select( + "Record ID", + "Annotation ID" = "annotation_id", + "Tier (top)", + "Start" = "start", + "End" = "end", + "Breakend ID" = "BND_ID", + "Breakend Mate" = "BND_mate", + "Effect", + "Genes", + "Transcripts" = "Transcript", + "Effect", + "Detail", + "SR_alt", + "PR_alt", + "SR_PR_sum", + "PURPLE AF" = "AF_PURPLE", + "PURPLE CN" = "CN_PURPLE", + # Removed after ops + "Tier", + ) + + # Split into priority groups, remove Tier column once split + sv.prioritised.tmp <- dplyr::filter(sv.tmp, Tier <= 3) |> dplyr::select(-Tier) + sv.unprioritised.tmp = dplyr::filter(sv.tmp, Tier >= 4) |> dplyr::select(-Tier) + + # Set many transcripts + sv.many_transcript_data.prioritised <- set_many_transcripts_sv(sv.prioritised.tmp) + sv.prioritised.many_transcripts <- purrr::pluck(sv.many_transcript_data.prioritised, "many_transcripts") + sv.prioritised <- purrr::pluck(sv.many_transcript_data.prioritised, "sv") + + sv.many_transcript_data.unprioritised <- set_many_transcripts_sv(sv.unprioritised.tmp) + sv.unprioritised.many_transcripts <- purrr::pluck(sv.many_transcript_data.unprioritised, "many_transcripts") + sv.unprioritised <- purrr::pluck(sv.many_transcript_data.unprioritised, "sv") + # Split into priority groups list( - unmelted = unmelted_all, - melted = melted, - tsv_descr = tsv_descr, - col_descr = col_descr + map = sv.map, + map_melted = sv.melted_all, + prioritised = sv.prioritised, + many_transcripts_prioritised = sv.prioritised.many_transcripts, + unprioritised = sv.unprioritised, + many_transcripts_unprioritised = sv.unprioritised.many_transcripts, + purple_inferred = sv.purple_inf ) } @@ -387,7 +440,7 @@ plot_bnd_sr_pr_tot_lines <- function(d, assertthat::assert_that(all(c("Type", "SR_alt", "PR_alt") %in% colnames(d))) dplot <- d |> dplyr::filter(.data$Type == "BND") |> - dplyr::select(SR = "SR_alt", PR = "PR_alt", Tier = "TierTop", "BND_ID") |> + dplyr::select(SR = "SR_alt", PR = "PR_alt", Tier = "Top Tier", "Breakend ID") |> dplyr::distinct() |> dplyr::mutate( PR = ifelse(is.na(.data$PR), 0, .data$PR), @@ -450,7 +503,7 @@ plot_bnd_sr_pr_tot_hist <- function(d, assertthat::assert_that(all(c("Type", "SR_alt", "PR_alt") %in% colnames(d))) dplot <- d |> dplyr::filter(.data$Type == "BND") |> - dplyr::select(SR = "SR_alt", PR = "PR_alt", "BND_ID") |> + dplyr::select(SR = "SR_alt", PR = "PR_alt", "Breakend ID") |> dplyr::distinct() |> dplyr::mutate( PR = ifelse(is.na(.data$PR), 0, .data$PR), diff --git a/inst/cli/canrep.R b/inst/cli/canrep.R index b6bea8e..7e87a87 100644 --- a/inst/cli/canrep.R +++ b/inst/cli/canrep.R @@ -7,6 +7,7 @@ canrep_add_args <- function(subp) { canrep$add_argument("--img_dir", help = "Path to directory containing PURPLE plots.", required = TRUE) canrep$add_argument("--key_genes", help = "Path to UMCCR cancer gene file.", required = TRUE) canrep$add_argument("--somatic_snv_vcf", help = "Path to `somatic-PASS.vcf.gz` SNV VCF.", required = TRUE) + canrep$add_argument("--somatic_snv_summary", help = "Path to `somatic_snv_summary.json` JSON.", required = TRUE) canrep$add_argument("--somatic_sv_tsv", help = "Path to `manta.tsv` TSV file.", required = TRUE) canrep$add_argument("--somatic_sv_vcf", help = "Path to `manta.vcf.gz` VCF file.", required = TRUE) canrep$add_argument("--purple_som_gene_cnv", help = "Path to `purple.cnv.gene.tsv`.", required = TRUE) @@ -33,6 +34,7 @@ canrep_parse_args <- function(args) { img_dir = args$img_dir, key_genes = args$key_genes, somatic_snv_vcf = args$somatic_snv_vcf, + somatic_snv_summary = args$somatic_snv_summary, somatic_sv_tsv = args$somatic_sv_tsv, somatic_sv_vcf = args$somatic_sv_vcf, purple_som_gene_cnv = args$purple_som_gene_cnv, diff --git a/inst/rmd/umccrise/_navbar.html b/inst/rmd/umccrise/_navbar.html index 53533bf..987d2b8 100644 --- a/inst/rmd/umccrise/_navbar.html +++ b/inst/rmd/umccrise/_navbar.html @@ -27,13 +27,15 @@ SVs +
  • PURPLE Charts
  • Oncoviruses
  • Addendum
  • diff --git a/inst/rmd/umccrise/cancer_report.Rmd b/inst/rmd/umccrise/cancer_report.Rmd index 09bfdd0..badc7e0 100644 --- a/inst/rmd/umccrise/cancer_report.Rmd +++ b/inst/rmd/umccrise/cancer_report.Rmd @@ -26,6 +26,7 @@ params: purple_som_snv_vcf: NULL result_outdir: NULL somatic_snv_vcf: NULL + somatic_snv_summary: NULL somatic_sv_tsv: NULL somatic_sv_vcf: NULL title: "UMCCR Cancer Report" @@ -184,32 +185,10 @@ gpgr::write_tsvjsongz(sigs_indel, glue("sigs/{bnm}-indel"), result_outdir) #---- Structural Variants ----# sv_path <- params$somatic_sv_tsv sv <- NULL -sv_unmelted <- NULL -sv_all <- NULL -no_sv_found <- TRUE -sv_tsv_descr <- NULL -sv_col_descr <- NULL -if (sv_path == "NA") { - sv_unmelted <- tibble(WARNING = "Structural variants were not called") - sv_all <- tibble(WARNING = "Structural variants were not called") -} else { - no_sv_found <- gpgr::tsv_is_empty(sv_path) - if (no_sv_found) { - sv_unmelted <- tibble(WARNING = "THERE WERE 0 SVs PRIORITISED!") - sv_all <- tibble(WARNING = "THERE WERE 0 SVs PRIORITISED!") - sv_tsv_descr <- tibble(Column = "THERE WERE 0 SVs PRIORITISED!") - sv_col_descr <- tibble(Column = "THERE WERE 0 SVs PRIORITISED!") - } else { - sv <- gpgr::process_sv(sv_path) - sv_unmelted <- sv$unmelted - sv_all <- sv$melted - sv_tsv_descr <- sv$tsv_descr - sv_col_descr <- sv$col_descr - } +no_sv_found <- gpgr::tsv_is_empty(sv_path) +if (!no_sv_found) { + sv <- gpgr::process_sv(sv_path) } - -gpgr::write_tsvjsongz(sv_unmelted, glue("sv/{bnm}-01_sv_unmelted"), result_outdir) -gpgr::write_tsvjsongz(sv_all, glue("sv/{bnm}-02_sv_melted"), result_outdir) ``` ```{r purple_prep} @@ -277,6 +256,25 @@ summarise_sigs <- function(mut_sig_contr) { paste(collapse = ", ") } +snv_summary_counts <- params$somatic_snv_summary |> + jsonlite::read_json() + +# NOTE(SW): hypermutated status is determine indirectly by different counts between 'sage' and +# 'annotated' since this indicates that the bolt hypermutated logic evaluated as true and applied +# variant selection + +hypermutated <- snv_summary_counts$sage != snv_summary_counts$annotated + +snv_summary <- snv_summary_counts |> + unlist() |> + tibble::enframe() |> + dplyr::mutate( + value = format(value, big.mark = ",", trim = TRUE), + x = glue::glue("{name}: {value}"), + ) |> + dplyr::pull(x) |> + paste(collapse = "; ") + hrd_summary <- list( chord = chord_res[["prediction"]][, "p_hrd", drop = TRUE], hrdetect = hrdetect_res[, "Probability", drop = TRUE] @@ -296,8 +294,8 @@ if (no_sv_found) { } else { sv_summary <- list( - unmelted = sv_unmelted %>% dplyr::select(Type), - melted = sv_all %>% dplyr::select(Type) + unmelted = sv$map %>% dplyr::select(Type), + melted = sv$map_melted %>% dplyr::select(Type) ) %>% purrr::map(function(x) addmargins(table(x, useNA = "ifany"))) %>% do.call("rbind", .) %>% @@ -311,6 +309,11 @@ if (no_sv_found) { qc_summary_all <- dplyr::tribble( ~n, ~variable, ~value, ~details, + 4, "Hypermutated", ifelse(hypermutated, "TRUE", "FALSE"), + ifelse(hypermutated, glue::glue( + "More than 500K SNVs detected (including non-PASS), priority regions and/or variants ", + "selected for processing.", + ), ""), 5, "MutSigs (Old)", glue::glue("{summarise_sigs(sigs_snv_2015)}"), "2015 COSMIC (https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt)", 5, "MutSigs (New)", glue::glue("{summarise_sigs(sigs_snv_2020)}"), @@ -322,6 +325,7 @@ qc_summary_all <- dplyr::tribble( "Max: {cnv_summary[['som']]$Max}; ", "N: {cnv_summary[['som']]$N} " ), "", + 17, "SNVs (Somatic)", snv_summary, "Summary SNV counts across main processes, in order", 18, "SVs (unmelted)", paste(rev(sv_summary$unmelted), collapse = ", "), "Summary of SVs as presented in raw VCF.", 19, "SVs (melted)", paste(rev(sv_summary$melted), collapse = ", "), @@ -886,13 +890,27 @@ DEL | intergenic_region | TXNIP-HFE2 | ENSG00000265972-ENSG00000168509
    +```{r} +sv.dt <- sv |> purrr::map(function(x) { + x |> DT::datatable( + filter = list(position = "top", clear = FALSE, plain = TRUE), + class = "cell-border display compact", + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), + options = list( + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" + ) + ) +}) +``` + ### Summary {.tabset .tabset-fade #sv-summary} #### Plots ```{r plot_bnd_sr_pr_tot_lines, fig.width=12, fig.height=12, out.width="90%"} -p1 <- gpgr:::plot_bnd_sr_pr_tot_lines(sv_unmelted) -p2 <- gpgr::plot_bnd_sr_pr_tot_hist(sv_unmelted) + ggplot2::theme(legend.position = "none") +p1 <- gpgr:::plot_bnd_sr_pr_tot_lines(sv$map) +p2 <- gpgr::plot_bnd_sr_pr_tot_hist(sv$map) + ggplot2::theme(legend.position = "none") ((p1$p_all | p2) / p1$p_tier) ``` @@ -901,17 +919,17 @@ p2 <- gpgr::plot_bnd_sr_pr_tot_hist(sv_unmelted) + ggplot2::theme(legend.positio ```{r sv_summary, eval=!no_sv_found} d <- list( - unmelted = sv_unmelted %>% dplyr::select(Tier = TierTop, Type), - melted = sv_all %>% dplyr::select(Tier, Type) + unmelted = sv$map %>% dplyr::select("Top Tier", Type), + melted = sv$map_melted %>% dplyr::select("Top Tier", Type) ) %>% dplyr::bind_rows(.id = "group") %>% dplyr::mutate(group = factor(group, levels = c("unmelted", "melted"))) %>% dplyr::group_by(group) %>% - dplyr::count(Tier, Type) %>% + dplyr::count(`Top Tier`, Type) %>% tidyr::pivot_wider(names_from = Type, values_from = n) %>% dplyr::ungroup() %>% dplyr::rowwise() %>% - dplyr::mutate(tot = sum(dplyr::c_across(!(c(group, Tier))), na.rm = TRUE)) + dplyr::mutate(tot = sum(dplyr::c_across(!(c(group, `Top Tier`))), na.rm = TRUE)) d %>% gt::gt( @@ -943,213 +961,180 @@ d %>% blank_lines(2) ``` -### __Breakend Annotations__ {.tabset .tabset-fade #breakend-annotations} +### __SV Map__ {.tabset .tabset-fade} -```{r svtab_BND, eval=!no_sv_found} -sv_BND <- sv_all %>% - dplyr::mutate( - nrow = dplyr::row_number(), - nrow = sprintf(glue::glue("%0{nchar(nrow(.))}d"), nrow) - ) %>% - dplyr::select(nrow, dplyr::everything()) +```{r} +sv.dt$map +``` +```{r, results='asis'} +blank_lines(2) +``` -sv_BND_main <- sv_BND %>% - dplyr::select( - nrow, vcfnum, Type, `Tier (Top)`, Start, End, - BND_ID, BND_mate, - Genes, Effect, Detail, SR_alt, PR_alt, SR_PR_sum, AF_PURPLE - ) +### __Breakpoints and Breakends__ {.tabset .tabset-fade} -sv_BND_other <- sv_BND %>% - dplyr::select(nrow, CNC, CN, SR_PR_ref, Quality, ntrx, Transcript) +#### Main {.tabset} -sv_BND_purple <- sv_all %>% - dplyr::filter(Type %in% c("PURPLE_inf")) %>% - dplyr::select(`Tier (Top)`, Start, Effect, Detail, CN, CNC) +##### Prioritised -gpgr::write_tsvjsongz( - dplyr::bind_cols(sv_BND_main, sv_BND_other %>% dplyr::select(-nrow)), - glue("sv/{bnm}-03_sv_BND_main"), result_outdir -) -gpgr::write_tsvjsongz(sv_BND_purple, glue("sv/{bnm}-04_sv_BND_purpleinf"), result_outdir) +```{r} +sv.dt$prioritised +``` -sv_BND_maindt <- sv_BND_main %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) +```{r, results='asis'} +blank_lines(2) +``` -sv_BND_otherdt <- sv_BND_other %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, keys = TRUE, autoWidth = FALSE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) +##### Many transcripts -sv_BND_purpledt <- sv_BND_purple %>% - DT::datatable( +```{r} +sv.dt$many_transcripts_prioritised +``` + +```{r, results='asis'} +blank_lines(2) +``` + +#### Other {.tabset} + +##### Unprioritised + +```{r} +sv.dt$unprioritised +``` + +```{r, results='asis'} +blank_lines(2) +``` + +##### Many transcripts + +```{r} +sv.dt$many_transcripts_unprioritised +``` + +```{r, results='asis'} +blank_lines(2) +``` + +##### PURPLE inferred + +```{r} +sv.dt$purple_inferred +``` + +```{r, results='asis'} +blank_lines(2) +``` + +### __Copy number variants__ {.tabset .tabset-fade} + +```{r} +cnv <- gpgr::process_cnv_tsv(params$purple_som_cnv_ann) +``` + +```{r} +cnv.dt <- cnv |> purrr::map(function(x) { + x |> DT::datatable( filter = list(position = "top", clear = FALSE, plain = TRUE), class = "cell-border display compact", rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = TRUE, - buttons = c("csv", "excel"), - dom = "Blfrtip" + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" ) ) +}) ``` -#### Main Columns +#### Main {.tabset} -```{r sv_BND_main, eval=!no_sv_found} -details::details(sv_detail_table(sv_BND_main), lang = "none", summary = "Column Description") -sv_BND_maindt +##### Small + +```{r} +cnv.dt$small ``` ```{r, results='asis'} blank_lines(2) ``` -#### Other Columns +##### Medium -```{r sv_BND_otherdt, eval=!no_sv_found} -details::details(sv_detail_table(sv_BND_other), lang = "none", summary = "Column Description") -sv_BND_otherdt +```{r} +cnv.dt$medium ``` -#### PURPLE Inferred - -```{r sv_BND_purple, eval=!no_sv_found} -details::details(sv_detail_table(sv_BND_purple), lang = "none", summary = "Column Description") -sv_BND_purpledt +```{r, results='asis'} +blank_lines(2) ``` -### __CNV annotations__ {.tabset .tabset-fade #cnv-annotations} +##### Large -```{r svtab_cnv} -cnv <- gpgr::purple_cnv_som_ann_process(params$purple_som_cnv_ann) +```{r} +cnv.dt$large +``` -cnv_temp <- cnv$tab %>% - dplyr::mutate( - nrow = dplyr::row_number(), - Genes_original = Genes, - Transcript_original = Transcript - ) +```{r, results='asis'} +blank_lines(2) +``` -max_genes <- 2 -max_transcripts <- 2 +##### Many genes -cnv_main <- cnv_temp %>% - dplyr::mutate( - Genes = ifelse(ngen > max_genes, - glue::glue("Many Genes ({ngen})"), - Genes - ), - Transcript = ifelse(ntrx > max_transcripts, - glue::glue("Many Transcripts ({ntrx})"), - Transcript - ) - ) %>% - dplyr::select( - nrow, `Tier (Top)`, Type, Chr, Start, End, Effect, - Genes, Transcript, Detail, CN, `CN Min+Maj`, `Start/End SegSupport`, Method, - `BAF (count)`, `GC (windowCount)` - ) +```{r} +cnv.dt$many_genes_prioritised +``` -gpgr::write_tsvjsongz(cnv_main, glue("sv/{bnm}-01_cnv_main"), result_outdir) +```{r, results='asis'} +blank_lines(2) +``` -cnv_main_manygenes <- cnv_temp %>% - dplyr::filter(ngen > max_genes) %>% - dplyr::select(nrow, `Tier (Top)`, Type, Start, End, - Effect, ngen, - Genes = Genes_original - ) +##### Many transcripts -gpgr::write_tsvjsongz(cnv_main_manygenes, glue("sv/{bnm}-02_cnv_main_manygenes"), result_outdir) +```{r} +cnv.dt$many_transcripts +``` -cnv_main_manytx <- cnv_temp %>% - dplyr::filter(ntrx > max_transcripts) %>% - dplyr::select(nrow, `Tier (Top)`, Type, Start, End, - Effect, ntrx, - Transcript = Transcript_original - ) +```{r, results='asis'} +blank_lines(2) +``` -gpgr::write_tsvjsongz(cnv_main_manytx, glue("sv/{bnm}-03_cnv_main_manytx"), result_outdir) +#### Other {.tabset} -cnv_maindt <- cnv_main %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) |> - DT::formatCurrency(~ Start + End, currency = "", interval = 3, mark = ",", digits = 0) +##### Unprioritised -cnv_main_manygenesdt <- cnv_main_manygenes %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) |> - DT::formatCurrency(~ Start + End + ngen, currency = "", interval = 3, mark = ",", digits = 0) +```{r} +cnv.dt$unprioritised +``` -cnv_main_manytxdt <- cnv_main_manytx %>% - DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" - ) - ) |> - DT::formatCurrency(~ Start + End, currency = "", interval = 3, mark = ",", digits = 0) +```{r, results='asis'} +blank_lines(2) ``` -#### Main Columns +##### Many genes -```{r cnv_main} -details::details(sv_detail_table(cnv_main), lang = "none", summary = "Column Description") -cnv_maindt +```{r} +cnv.dt$many_genes_unprioritised ``` ```{r, results='asis'} blank_lines(2) ``` -#### Many Genes +##### Fusions -```{r cnv_main_manygenes} -details::details(sv_detail_table(cnv_main_manygenes), lang = "none", summary = "Column Description") -cnv_main_manygenesdt +```{r} +cnv.dt$fusion ``` ```{r, results='asis'} blank_lines(2) ``` -#### Many Transcripts +##### Chromosome -```{r cnv_main_manytx} -details::details(sv_detail_table(cnv_main_manytx), lang = "none", summary = "Column Description") -cnv_main_manytxdt +```{r} +cnv.dt$chromsome ``` ```{r, results='asis'} diff --git a/man/cancer_rmd.Rd b/man/cancer_rmd.Rd index 9f1e309..a27128f 100644 --- a/man/cancer_rmd.Rd +++ b/man/cancer_rmd.Rd @@ -20,6 +20,7 @@ cancer_rmd( purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_vcf, + somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, result_outdir, @@ -59,6 +60,8 @@ cancer_rmd( \item{somatic_snv_vcf}{Path to \code{somatic-PASS.vcf.gz} SNV VCF.} +\item{somatic_snv_summary}{Path to \code{somatic_snv_summary.json} JSON.} + \item{somatic_sv_tsv}{Path to \code{manta.tsv} TSV file.} \item{somatic_sv_vcf}{Path to \code{manta.vcf.gz} VCF file.} diff --git a/man/process_sv.Rd b/man/process_sv.Rd deleted file mode 100644 index ef2ba77..0000000 --- a/man/process_sv.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sv.R -\name{process_sv} -\alias{process_sv} -\title{Process Structural Variants} -\usage{ -process_sv(x) -} -\arguments{ -\item{x}{Path to \code{manta.tsv} output by umccrise.} -} -\value{ -A list with melted/unmelted tibbles (these are NULL if TSV file was empty). -} -\description{ -Processes the Manta TSV file output by umccrise. -} -\examples{ -x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -(sv <- process_sv(x)) -} diff --git a/man/purple_version_read.Rd b/man/purple_version_read.Rd deleted file mode 100644 index afc364c..0000000 --- a/man/purple_version_read.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/purple.R -\name{purple_version_read} -\alias{purple_version_read} -\title{Read PURPLE version file} -\usage{ -purple_version_read(x) -} -\arguments{ -\item{x}{Path to the \code{purple.version} file.} -} -\value{ -A list with the elements: -\itemize{ -\item \code{version}: The version of PURPLE used. -\item \code{build_date}: The build date of the PURPLE version. -} -} -\description{ -Reads the \code{purple.version} file containing the PURPLE version and build date. -} -\examples{ -x <- system.file("extdata/purple/purple.version", package = "gpgr") -(v <- purple_version_read(x)) -} diff --git a/man/umccrise_read_sv_tsv.Rd b/man/umccrise_read_sv_tsv.Rd deleted file mode 100644 index 5f7a455..0000000 --- a/man/umccrise_read_sv_tsv.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sv.R -\name{umccrise_read_sv_tsv} -\alias{umccrise_read_sv_tsv} -\title{Read SV TSV} -\usage{ -umccrise_read_sv_tsv(x) -} -\arguments{ -\item{x}{Path to \code{manta.tsv} output by umccrise.} -} -\value{ -A tibble corresponding to the input TSV file. -} -\description{ -Reads the Manta TSV file output by umccrise. -} -\examples{ -x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -(sv <- umccrise_read_sv_tsv(x)$data) -} diff --git a/man/virusbreakend_summary_read.Rd b/man/virusbreakend_summary_read.Rd new file mode 100644 index 0000000..e78287b --- /dev/null +++ b/man/virusbreakend_summary_read.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oncoviral.R +\name{virusbreakend_summary_read} +\alias{virusbreakend_summary_read} +\title{Read VIRUSBreakend Summary File} +\usage{ +virusbreakend_summary_read(x) +} +\arguments{ +\item{x}{Path to \code{virusbreakend.vcf.summary.tsv} file.} +} +\value{ +List with two elements: +\itemize{ +\item \code{tab}: Tibble containing data. +\item \code{descr}: Description of tibble columns. +} +} +\description{ +Reads the \code{virusbreakend.vcf.summary.tsv} file. +} +\examples{ +x <- system.file("extdata/virusbreakend/virusbreakend.vcf.summary.tsv", package = "gpgr") +(vb <- virusbreakend_summary_read(x)) +} diff --git a/man/virusbreakend_vcf_read.Rd b/man/virusbreakend_vcf_read.Rd new file mode 100644 index 0000000..56b8d09 --- /dev/null +++ b/man/virusbreakend_vcf_read.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oncoviral.R +\name{virusbreakend_vcf_read} +\alias{virusbreakend_vcf_read} +\title{Read VIRUSBreakend VCF File} +\usage{ +virusbreakend_vcf_read(x) +} +\arguments{ +\item{x}{Path to \code{virusbreakend.vcf} file.} +} +\value{ +List with two elements: +\itemize{ +\item \code{tab}: Tibble containing selected data. +\item \code{descr}: Description of tibble columns. +} +} +\description{ +Reads the \code{virusbreakend.vcf} file and selects data to present. +} +\examples{ +x <- system.file("extdata/virusbreakend/virusbreakend.vcf", package = "gpgr") +(vb <- virusbreakend_vcf_read(x)) +} diff --git a/tests/testthat/test-roxytest-testexamples-oncoviral.R b/tests/testthat/test-roxytest-testexamples-oncoviral.R new file mode 100644 index 0000000..cad645a --- /dev/null +++ b/tests/testthat/test-roxytest-testexamples-oncoviral.R @@ -0,0 +1,19 @@ +# Generated by roxytest: do not edit by hand! + +# File R/oncoviral.R: @testexamples + +test_that("Function virusbreakend_summary_read() @ L18", { + + x <- system.file("extdata/virusbreakend/virusbreakend.vcf.summary.tsv", package = "gpgr") + (vb <- virusbreakend_summary_read(x)) + expect_equal(colnames(vb)[ncol(vb)], "QCStatus") +}) + + +test_that("Function virusbreakend_vcf_read() @ L101", { + + x <- system.file("extdata/virusbreakend/virusbreakend.vcf", package = "gpgr") + (vb <- virusbreakend_vcf_read(x)) + expect_equal(colnames(vb)[ncol(vb)], "QC") +}) + diff --git a/tests/testthat/test-roxytest-testexamples-purple.R b/tests/testthat/test-roxytest-testexamples-purple.R index 2f1bbcb..42cf1b3 100644 --- a/tests/testthat/test-roxytest-testexamples-purple.R +++ b/tests/testthat/test-roxytest-testexamples-purple.R @@ -19,7 +19,7 @@ test_that("Function purple_cnv_som_gene_process() @ L60", { }) -test_that("Function purple_cnv_som_read() @ L143", { +test_that("Function purple_cnv_som_read() @ L577", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (p <- purple_cnv_som_read(x)) @@ -27,7 +27,7 @@ test_that("Function purple_cnv_som_read() @ L143", { }) -test_that("Function purple_cnv_som_process() @ L176", { +test_that("Function purple_cnv_som_process() @ L610", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (pp <- purple_cnv_som_process(x)) @@ -35,33 +35,7 @@ test_that("Function purple_cnv_som_process() @ L176", { }) -test_that("Function purple_cnv_som_ann_process() @ L245", { - - x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") - (p <- purple_cnv_som_ann_process(x)) - expect_equal(colnames(p)[ncol(p)], "annotation") -}) - - -test_that("Function purple_cnv_som_ann_read() @ L339", { - - x <- system.file("extdata/sash/purple.cnv.gene.annotated.tsv", package = "gpgr") - (p <- purple_cnv_som_ann_read(x)) - expect_equal(colnames(p)[ncol(p)], "simple_ann") -}) - - -test_that("Function purple_version_read() @ L385", { - - x <- system.file("extdata/purple/purple.version", package = "gpgr") - (v <- purple_version_read(x)) - expect_equal(length(v), 2) - expect_equal(names(v), c("version", "build_date")) - expect_equal(v$version, "2.51") -}) - - -test_that("Function purple_qc_read() @ L411", { +test_that("Function purple_qc_read() @ L678", { x <- system.file("extdata/purple/purple.qc", package = "gpgr") (q <- purple_qc_read(x)) @@ -69,7 +43,7 @@ test_that("Function purple_qc_read() @ L411", { }) -test_that("Function purple_purity_read() @ L469", { +test_that("Function purple_purity_read() @ L736", { x <- system.file("extdata/purple/purple.purity.tsv", package = "gpgr") (p <- purple_purity_read(x)) diff --git a/tests/testthat/test-roxytest-testexamples-sv.R b/tests/testthat/test-roxytest-testexamples-sv.R index e287ced..2ec89e0 100644 --- a/tests/testthat/test-roxytest-testexamples-sv.R +++ b/tests/testthat/test-roxytest-testexamples-sv.R @@ -46,21 +46,3 @@ test_that("Function abbreviate_effect() @ L112", { expect_equal(e4, "badaboom, bar, foo, StopGain") }) - -test_that("Function umccrise_read_sv_tsv() @ L143", { - - x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") - (sv <- umccrise_read_sv_tsv(x)$data) - expect_equal(colnames(sv)[ncol(sv)], "ALT") -}) - - -test_that("Function process_sv() @ L195", { - - x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") - (sv <- process_sv(x)) - expect_true(inherits(sv, "list")) - expect_equal(length(sv), 4) - expect_equal(names(sv), c("unmelted", "melted", "tsv_descr", "col_descr")) -}) - From 57a9d241043a5e57fb7e7231f5557c570275f572 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 13 Nov 2023 09:30:07 +1100 Subject: [PATCH 03/11] Adjust SV/CNV tables, annotate with OncoKB db --- NAMESPACE | 2 + R/oncokb.R | 24 + R/purple.R | 438 +++++++----------- R/rmd.R | 4 +- R/sv.R | 167 ++++--- inst/cli/canrep.R | 2 + inst/rmd/umccrise/cancer_report.Rmd | 175 +++---- man/cancer_rmd.Rd | 3 + .../test-roxytest-testexamples-purple.R | 8 +- 9 files changed, 351 insertions(+), 472 deletions(-) create mode 100644 R/oncokb.R diff --git a/NAMESPACE b/NAMESPACE index a4f699f..d661d49 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(af_summary) export(cancer_rmd) export(count_pieces) export(date_log) +export(get_oncokb_genes) export(hrd_results_tabs) export(is_vcf) export(linx_breakend_process) @@ -42,6 +43,7 @@ export(purple_kataegis) export(purple_purity_read) export(purple_qc_read) export(purple_snv_vcf_read) +export(read_oncokb) export(session_info_kable) export(tsv_is_empty) export(vcf_is_empty) diff --git a/R/oncokb.R b/R/oncokb.R new file mode 100644 index 0000000..03954aa --- /dev/null +++ b/R/oncokb.R @@ -0,0 +1,24 @@ +#' @export +read_oncokb <- function(x) { + readr::read_tsv(x) |> + dplyr::filter( + `OncoKB Annotated` == "Yes" + ) |> + dplyr::pull("Hugo Symbol") +} + +#' @export +get_oncokb_genes <- function(x, oncokb_genes) { + delimiters <- " ,&-" + delimiter_re <- paste0("[", delimiters, "]") + + oncokb_genes |> + # Create regexes for each match, utilising delimiters for boundaries. Handles most cases where a gene symbol contains the '-' delimiter + purrr::map(function(n) paste0("(?<=^|", delimiter_re, ")", n, "(?=", delimiter_re, "|$)")) |> + # Loop with nm iterations through regex and gene symbols + purrr::map(function(n) stringr::str_detect(x, n) |> tibble::as_tibble_row(.name_repair="unique_quiet")) |> + # Combine as tibble to access dplyr::summarise and compile list of detected OncoKB gene symbols for each effect + dplyr::bind_rows() |> + dplyr::summarise(dplyr::across(dplyr::everything(), function(v) { paste0(sort(oncokb_genes[v]), collapse=", ") })) |> + unlist() +} diff --git a/R/purple.R b/R/purple.R index eb1fca2..51eec0e 100644 --- a/R/purple.R +++ b/R/purple.R @@ -151,250 +151,174 @@ sash_read_cnv_tsv <- function(x) { d } -split_chromosome_annotations <- function(x) { - x.grouped <- x |> - dplyr::group_by( - record_type=ifelse(grepl('chrom_[0-9XY]+', Detail), 'chromosome', 'other') - ) - - keys <- x.grouped |> - dplyr::group_keys() |> - dplyr::pull(record_type) - - x.split <- x.grouped |> - dplyr::group_split(.keep=FALSE) |> - purrr::set_names(keys) - - chromosome <- purrr::pluck(x.split, 'chromosome') |> - dplyr::select( - "Event ID" = "event_id", - "Tier (top)", - "Start" = "start", - "End" = "end", - "SV Type" = "svtype", - "Effect", - "Chromosome" = "Detail", - "PURPLE CN" = "copyNumber", - "PURPLE CN Min+Maj", - ) +filter_and_split_annotations_cnv <- function(x) { - list( - chromosome=chromosome, - other=purrr::pluck(x.split, 'other') - ) -} - -split_fusion_annotations <- function(x) { - - fusion_annotations <- c( - 'BidFusG', - 'FusG', - 'bidirectional_gene_fusion', - 'gene_fusion' + filter_conditions <- list( + # Chromosome effects + stringr::str_starts(x$Detail, "chrom_"), + # Fusion effects + x$Effect %in% c("BidFusG", "FusG") ) x.grouped <- x |> dplyr::group_by( - record_type=ifelse(Effect %in% fusion_annotations, 'fusion', 'other') + filter=ifelse(purrr::reduce(filter_conditions, `|`), "filter", "retain") ) keys <- x.grouped |> dplyr::group_keys() |> - dplyr::pull(record_type) + dplyr::pull(filter) x.split <- x.grouped |> dplyr::group_split(.keep=FALSE) |> purrr::set_names(keys) - fusions <- purrr::pluck(x.split, 'fusion') |> - dplyr::select( - "Event ID" = "event_id", - "Annotation ID" = "annotation_id", - "Tier (top)", - "Start" = "start", - "End" = "end", - "SV Type" = "svtype", - "Effect", - "Genes", - "Transcripts" = "Transcript", - "Detail", - "PURPLE CN" = "copyNumber", - "PURPLE CN Min+Maj", - ) - list( - fusions=fusions, - other=purrr::pluck(x.split, 'other') + retained = purrr::pluck(x.split, "retain"), + filtered = purrr::pluck(x.split, "filter") |> dplyr::arrange(Tier, `Event ID`) ) } -count_event_genes <- function(.data) { - .data |> - dplyr::pull(Genes) |> - strsplit('&') |> - unlist() |> - unique() |> - length() -} +collapse_effect_group <- function(x) { + x.tmp <- dplyr::first(x) + genes <- x.tmp |> + dplyr::pull(Genes.unique) |> + unlist() -set_event_type <- function(x) { - x |> - dplyr::group_by(event_id) |> + x.tmp |> dplyr::mutate( - `Gene count (event)` = count_event_genes(dplyr::pick(Genes)), - event_category = dplyr::case_when( - `Gene count (event)` == 0 ~ 'none', - `Gene count (event)` == 1 ~ 'small', - `Gene count (event)` >= 2 & `Gene count (event)` <= 3 ~ 'medium', - .default = 'large', - ) - ) |> - dplyr::ungroup() + Genes = paste0(genes, collapse=", "), + Transcripts = "", + Detail = paste0(unique(x$Detail) |> sort(), collapse=", "), + Tier = min(x$Tier), + `Annotation ID` = NA, + ) } -set_many_transcripts_cnv <- function(x) { - # Set many transcripts - x.tmp <- x |> - dplyr::rowwise() |> +set_many_genes_cnv <- function(x) { + # Count genes and set eligibility for collapsing + collapse_effects <- c("DelG", "Dup", "DelTx", "UpstreamGV", "DnstreamGV", "IntergenReg") + x.counts <- x |> + dplyr::group_by(`Event ID`, Effect) |> dplyr::mutate( - transcript_count = strsplit(Transcript, '&') |> unlist() |> unique() |> length() + Genes.unique = Genes |> stringr::str_split(", ") |> unlist() |> unique() |> list(), + `Gene count (effect group)` = Genes.unique |> dplyr::first() |> length(), + collapse = `Gene count (effect group)` > 2 & Effect %in% collapse_effects, ) |> + dplyr::select(-`Gene count (effect group)`) + + # Collapse target groups + x.collapsed <- x.counts |> + dplyr::filter(collapse) |> + dplyr::select(-collapse) |> + dplyr::group_modify(~ collapse_effect_group(.x)) |> + # Update top tier, referring to complete data set within each group + dplyr::mutate( + `Tier (top)` = paste0(Tier, " (", min(x$Tier[x$`Event ID` == `Event ID`]), ")"), + ) |> + # Create new annotation ID dplyr::ungroup() |> dplyr::mutate( - many_transcripts = ifelse(transcript_count > 2, 'many_transcripts', 'few_transcripts'), - many_transcripts = ifelse(event_category %in% c('small', 'medium'), many_transcripts, NA), + `Annotation ID` = paste0(dplyr::row_number(), ".merged"), + ) + + # Bind collapsed rows with uncollapsed rows, then count genes per entry + x.tmp <- x.counts |> + dplyr::filter(!collapse) |> + dplyr::bind_rows(x.collapsed) |> + dplyr::select(-c(collapse, Genes.unique, Tier)) |> + dplyr::rowwise() |> + dplyr::mutate( + `Gene count` = Genes |> stringr::str_split(", ") |> unlist() |> unique() |> length(), ) |> + dplyr::ungroup() |> # Sort rows - dplyr::arrange(`Tier (top)`, Genes, Effect) + dplyr::arrange(`Tier (top)`, `Event ID`) - # Build the many transcripts table - mt <- x.tmp |> - dplyr::filter(many_transcripts == 'many_transcripts') |> + # Set many genes + x.tmp <- x.tmp |> + dplyr::mutate( + many_genes = ifelse(`Gene count` > 2, "many_genes", "few_genes"), + ) + + # Build the many genes table + x.many_genes <- x.tmp |> + dplyr::filter(many_genes == "many_genes") |> # Remove unneeded columns and rename others dplyr::select( - "Event ID" = "event_id", - "Annotation ID" = "annotation_id", + "Event ID", + "Annotation ID", "Tier (top)", - "Start" = "start", - "End" = "end", - "Transcript count" = "transcript_count", - "Transcripts" = "Transcript", + "Start", + "End", + "SV Type", + "Effect", + "Gene count", + "Genes", ) - # Clear transcript where appropriate + # Clear genes and transcripts where appropriate x.ready <- x.tmp |> dplyr::mutate( - Transcript = ifelse( - many_transcripts == 'few_transcripts' | is.na(many_transcripts), - Transcript, - paste0('Many transcripts (', transcript_count, ')') - ) + Genes = ifelse( + many_genes == "few_genes" | is.na(many_genes), + Genes, + paste0("Many genes (", `Gene count`, ")") + ), + Transcripts = ifelse(many_genes == "few_genes" | is.na(many_genes), Transcripts, ""), ) |> - dplyr::select(c(-many_transcripts, -transcript_count)) - - list( - many_transcripts=mt, - cnv=x.ready - ) -} - -split_priority <- function(x) { - x.grouped <- x |> - dplyr::group_by( - prioritised = ifelse(Tier < 4, 'prioritised', 'unprioritised') - ) - - keys <- x.grouped |> - dplyr::group_keys() |> - dplyr::pull(prioritised) - - x.split <- x.grouped |> - dplyr::group_split(.keep=FALSE) |> - purrr::set_names(keys) - - list( - prioritised = purrr::pluck(x.split, 'prioritised'), - unprioritised = purrr::pluck(x.split, 'unprioritised', .default=x[NULL, ]) |> dplyr::select(-Transcript) - ) -} - -split_event_type <- function(x) { - - x.grouped <- x |> - dplyr::group_by(event_category) - - keys <- x.grouped |> - dplyr::group_keys() |> - dplyr::pull(event_category) - - x.split <- x.grouped |> - dplyr::group_split(.keep=FALSE) |> - purrr::set_names(keys) + dplyr::select(-c(many_genes, `Gene count`)) list( - none = purrr::pluck(x.split, 'none'), - small = purrr::pluck(x.split, 'small'), - medium = purrr::pluck(x.split, 'medium'), - large = purrr::pluck(x.split, 'large') + ready = x.ready, + many_genes = x.many_genes ) } -collapse_annotations <- function(x) { - x |> - dplyr::summarise( - `Gene count (effect)` = length(unique(unlist(strsplit(Genes, '&')))), - Genes = paste0(sort(unique(unlist(strsplit(Genes, '&')))), collapse=','), - Detail = paste0(sort(unique(Detail)), collapse=','), - `Tier (highest)` = min(Tier), - `Tier (lowest)` = max(Tier), - ) |> - dplyr::ungroup() -} - -set_many_genes <- function(x) { - # Set many genes +set_many_transcripts_cnv <- function(x) { + # Set many transcripts x.tmp <- x |> dplyr::rowwise() |> dplyr::mutate( - gene_count = strsplit(Genes, c(',|&')) |> unlist() |> unique() |> length() + `Transcript count` = stringr::str_split(Transcripts, ", ") |> unlist() |> unique() |> length() ) |> dplyr::ungroup() |> dplyr::mutate( - many_genes = ifelse(gene_count > 2, 'many_genes', 'few_genes'), + many_transcripts = ifelse(`Transcript count` > 2, "many_transcripts", "few_transcripts"), ) |> # Sort rows - dplyr::arrange(`Tier (top)`, Genes, Effect) + dplyr::arrange(`Tier (top)`, `Event ID`) - # Build the many genes table - mt <- x.tmp |> - dplyr::filter(many_genes == 'many_genes') |> + # Build the many transcripts table + x.many_transcripts <- x.tmp |> + dplyr::filter(many_transcripts == "many_transcripts") |> # Remove unneeded columns and rename others dplyr::select( - "Event ID" = "event_id", + "Event ID", + "Annotation ID", "Tier (top)", - "Start" = "start", - "End" = "end", - "SV Type" = "svtype", + "Start", + "End", + "SV Type", "Effect", - "Gene count (event)", - "Gene count (effect)", - "Genes", + "Transcript count", + "Transcripts", ) - # Clear genes where appropriate + # Clear transcripts where appropriate x.ready <- x.tmp |> dplyr::mutate( - Genes = ifelse( - many_genes == 'few_genes' | is.na(many_genes), - Genes, - paste0('Many genes (', gene_count, ')') + Transcripts = ifelse( + many_transcripts == "few_transcripts" | is.na(many_transcripts), + Transcripts, + paste0("Many transcripts (", `Transcript count`, ")") ) ) |> - dplyr::select(c(-many_genes, -gene_count)) + dplyr::select(-c(many_transcripts, `Transcript count`)) list( - many_genes=mt, - cnv=x.ready + ready = x.ready, + many_transcripts = x.many_transcripts ) } @@ -406,7 +330,7 @@ process_cnv_tsv <- function(x) { # Prepare input cnv.ready <- cnv.input |> dplyr::mutate( - chrom_simple = stringr::str_remove(chromosome, 'chr'), + chrom_simple = stringr::str_remove(chromosome, "chr"), start = paste(chrom_simple, base::format(start, big.mark = ",", trim = TRUE), sep=":"), end = paste(chrom_simple, base::format(end, big.mark = ",", trim = TRUE), sep=":"), ) |> @@ -417,13 +341,13 @@ process_cnv_tsv <- function(x) { # Melt annotations cnv.tmp <- cnv.ready |> - dplyr::mutate(event_id = dplyr::row_number()) |> + dplyr::mutate(`Event ID` = dplyr::row_number()) |> # Split into individual annotations dplyr::mutate(annotation = strsplit(simple_ann, ",")) |> # Convert annotation fields into columns tidyr::unnest(annotation) |> tidyr::separate( - annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + annotation, c("Event", "Effect", "Genes", "Transcripts", "Detail", "Tier"), sep = "\\|", convert = FALSE ) |> # Create new columns and modify existing ones @@ -431,11 +355,10 @@ process_cnv_tsv <- function(x) { copyNumber = as.numeric(copyNumber) |> round(2) %>% sprintf("%.2f", .), minorAlleleCopyNumber = as.numeric(minorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), majorAlleleCopyNumber = as.numeric(majorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), - 'PURPLE CN Min+Maj' = paste0(minorAlleleCopyNumber, "+", majorAlleleCopyNumber), - "Tier (top)" = paste0(Tier, " (", sv_top_tier, ")"), + "PURPLE CN Min+Maj" = paste0(minorAlleleCopyNumber, "+", majorAlleleCopyNumber), + "Genes" = stringr::str_replace_all(Genes, "&", ", "), + "Transcripts" = stringr::str_replace_all(Transcripts, "&", ", "), ) |> - # Set unique annotation ID - dplyr::mutate(annotation_id = dplyr::row_number()) |> # Remove unused columns dplyr::select(-c( baf, @@ -448,112 +371,61 @@ process_cnv_tsv <- function(x) { minorAlleleCopyNumber, segmentEndSupport, segmentStartSupport, - simple_ann, sv_top_tier, - )) |> - # Sort rows - dplyr::arrange(`Tier (top)`, Genes, Effect) - - # Drop specific annotations - cnv.tmp <- cnv.tmp |> - dplyr::filter(Effect != 'intergenic_region') + simple_ann, + )) # Abbreviate effects - abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) + abbreviate_effectv <- Vectorize(abbreviate_effect) cnv.tmp$Effect <- abbreviate_effectv(cnv.tmp$Effect) - # Split chromosome annotations - cnv.chromosome_split <- split_chromosome_annotations(cnv.tmp) - cnv.chromosome <- cnv.chromosome_split$chromosome - cnv.tmp <- cnv.chromosome_split$other + # Filter unwanted annotations + cnv.annotations.split <- filter_and_split_annotations_cnv(cnv.tmp) - # Split fusion annotations - cnv.fusion_split <- split_fusion_annotations(cnv.tmp) - cnv.fusion <- cnv.fusion_split$fusion - cnv.tmp <- cnv.fusion_split$other - - # Set event type based on the number of genes impacted - cnv.tmp <- set_event_type(cnv.tmp) - - # Set and create many transcripts table - cnv.many_transcripts_data <- set_many_transcripts_cnv(cnv.tmp) - cnv.many_transcripts <- purrr::pluck(cnv.many_transcripts_data, 'many_transcripts') - cnv.tmp <- purrr::pluck(cnv.many_transcripts_data, 'cnv') - - # Split into prioritised and unprioritised - cnv.priority_split <- split_priority(cnv.tmp) - cnv.unprioritised.tmp <- cnv.priority_split$unprioritised - cnv.prioritised.tmp <- cnv.priority_split$prioritised - - # Split into event type assigned above - cnv.event_type_split <- split_event_type(cnv.prioritised.tmp) - cnv.small.tmp <- cnv.event_type_split$small - cnv.medium.tmp <- cnv.event_type_split$medium - cnv.large.tmp <- cnv.event_type_split$large - - # Temporary set dplyr summarise warning off - summarise_info_opt <- getOption('dplyr.summarise.inform') - options(dplyr.summarise.inform = FALSE) - - # Collapse large event type annotations into single records, drop transcripts - cnv.large.tmp <- cnv.large.tmp |> - dplyr::group_by(dplyr::across(c(-Genes, -Transcript, -Tier, -Detail, -annotation_id))) |> - collapse_annotations() - - cnv.large.many_genes_data <- set_many_genes(cnv.large.tmp) - cnv.large.many_genes <- cnv.large.many_genes_data$many_genes - cnv.large.tmp <- cnv.large.many_genes_data$cnv - - # Select then collapse only large event type annotations into single records - cnv.unprioritised.tmp <- cnv.unprioritised.tmp |> - dplyr::filter(event_category == 'large') |> - dplyr::group_by(dplyr::across(c(-Genes, -Tier, -Detail, -annotation_id))) |> - collapse_annotations() |> + # Complete processing + cnv.tmp <- cnv.annotations.split$retained |> + # Reset sv_top_tier after removing annotations + dplyr::group_by(`Event ID`) |> + dplyr::mutate( + sv_top_tier = min(Tier), + "Tier (top)" = paste0(Tier, " (", sv_top_tier, ")"), + ) |> dplyr::ungroup() |> - dplyr::bind_rows( - cnv.unprioritised.tmp |> dplyr::filter(event_category != 'large') - ) - cnv.unprioritised.many_genes_data <- set_many_genes(cnv.unprioritised.tmp) - cnv.unprioritised.many_genes <- cnv.unprioritised.many_genes_data$many_genes - cnv.unprioritised.tmp <- cnv.unprioritised.many_genes_data$cnv - - # Restore dplyr summarise warning setting - options(dplyr.summarise.inform = summarise_info_opt) - - # Select columns for remaining tables - columns <- c( - "Event ID" = "event_id", - "Tier (top)", - "Tier lowest (effect)", - "Tier lowest (effect)", - "Start" = "start", - "End" = "end", - "SV Type" = "svtype", - "Effect", - "Gene count (event)", - "Gene count (effect)", - "Genes", - "Transcripts" = "Transcript", - "Detail", - "PURPLE CN" = "copyNumber", - "PURPLE CN Min+Maj" + # Set unique annotation ID + dplyr::mutate(`Annotation ID` = as.character(dplyr::row_number())) |> + # Sort rows + dplyr::arrange(`Tier (top)`, `Event ID`) + + # Set column names + column_selector <- c( + "Event ID", + "Annotation ID", + "Tier (top)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Genes", + "Transcripts", + "Detail", + "PURPLE CN" = "copyNumber", + "PURPLE CN Min+Maj" ) + cnv.tmp <- dplyr::select(cnv.tmp, tidyselect::all_of(c(column_selector, "Tier"))) + cnv.filtered <- dplyr::select(cnv.annotations.split$filtered, tidyselect::any_of(column_selector)) - cnv.small <- cnv.small.tmp |> dplyr::select(tidyselect::any_of(columns)) - cnv.medium <- cnv.medium.tmp |> dplyr::select(tidyselect::any_of(columns)) - cnv.large <- cnv.large.tmp |> dplyr::select(tidyselect::any_of(columns)) - cnv.unprioritised <- cnv.unprioritised.tmp |> dplyr::select(tidyselect::any_of(columns)) + # Collapse selected annotations and set many genes + cnv.many_genes_data <- set_many_genes_cnv(cnv.tmp) + cnv.tmp <- cnv.many_genes_data$ready + + # Set many transcripts + cnv.many_transcripts_data <- set_many_transcripts_cnv(cnv.tmp) list( - small = cnv.small, - medium = cnv.medium, - large = cnv.large, - many_genes_prioritised = cnv.large.many_genes, - many_transcripts = cnv.many_transcripts, - unprioritised = cnv.unprioritised, - many_genes_unprioritised = cnv.unprioritised.many_genes, - fusion = cnv.fusion, - chromsome = cnv.chromosome + annotations = cnv.many_transcripts_data$ready, + filtered = cnv.filtered, + many_genes = cnv.many_genes_data$many_genes, + many_transcripts = cnv.many_transcripts_data$many_transcripts ) } diff --git a/R/rmd.R b/R/rmd.R index 7966dd5..8b500fc 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -67,6 +67,7 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' @param conda_list Path to `conda_pkg_list.txt` file. #' @param img_dir Path to directory containing PURPLE plots. #' @param key_genes Path to UMCCR cancer gene file. +#' @param oncokb_genes Path to OncoKB database file. #' @param virusbreakend_tsv Path to VIRUSBreakend summary file. #' @param virusbreakend_vcf Path to VIRUSBreakend VCF file. #' @param purple_purity Path to `purple.purity.tsv`. @@ -87,7 +88,7 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' @return Path to rendered HTML report. #' @export cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, key_genes, - virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, + oncokb_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_vcf, somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, result_outdir, tumor_name, out_file = NULL, quiet = FALSE) { @@ -120,6 +121,7 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, conda_list = conda_list, img_dir = img_dir_b, key_genes = key_genes, + oncokb_genes = oncokb_genes, somatic_snv_vcf = somatic_snv_vcf, somatic_snv_summary = somatic_snv_summary, somatic_sv_tsv = somatic_sv_tsv, diff --git a/R/sv.R b/R/sv.R index 2f11649..71bbec8 100644 --- a/R/sv.R +++ b/R/sv.R @@ -158,16 +158,28 @@ sash_read_sv_tsv <- function(x) { ) } - split_svs <- function(x) { + bps_types <- c("BND", "DEL", "DUP"," INS", "INV") + + x.grouped <- x |> + dplyr::group_by( + record_type=ifelse(Type %in% bps_types, "bps", "other") + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(record_type) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + list( - purple_inferred = dplyr::filter(x, Type == "PURPLE_inf"), - sgl = dplyr::filter(x, Type == "SGL"), - bps = dplyr::filter(x, ! Type %in% c("PURPLE_inf", "SGL")) + bps = purrr::pluck(x.split, "bps"), + other = purrr::pluck(x.split, "other") ) } - join_breakpoint_entries <- function(x) { # Group by GRIDSS identifier (clipping trailing h/o [h: High, o: lOwer]) bps <- x |> @@ -191,13 +203,12 @@ join_breakpoint_entries <- function(x) { dplyr::select(-c(end_position, end_chrom)) } - remove_gene_fusion_dups <- function(.data, columns) { # Order elements of multi-entry effect values for reliable comparison v.groups <- c("frameshift_variant&gene_fusion", "gene_fusion") v.effects_ordered <- sapply(.data$Effect, function(s) { - c <- stringr::str_split(s, '&') |> unlist() - paste0(sort(c), collapse='&') + c <- stringr::str_split(s, "&") |> unlist() + paste0(sort(c), collapse="&") }) if (all(v.groups %in% v.effects_ordered)) { @@ -207,17 +218,49 @@ remove_gene_fusion_dups <- function(.data, columns) { } } +filter_and_split_annotations_sv <- function(x) { + + filter_conditions <- list( + # Empty Gene field + x$Genes == "", + # Only Ensembl identifiers in the Gene field + stringr::str_split(x$Genes, "[&-]|, ") |> purrr::map_lgl(\(x) stringr::str_starts(x, "ENSG") |> all()), + # Fusions that do not involve genes + x$Effect == "Fus", + # PURPLE inferred SVs + x$Type == "PURPLE_inf" + ) + + x.grouped <- x |> + dplyr::group_by( + filter=ifelse(purrr::reduce(filter_conditions, `|`), "filter", "retain") + ) |> + dplyr::select(-c(Type, `Top Tier`)) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(filter) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + retained = purrr::pluck(x.split, "retain"), + filtered = purrr::pluck(x.split, "filter") + ) +} set_many_transcripts_sv <- function(x) { # Set many transcripts x.tmp <- x |> dplyr::rowwise() |> dplyr::mutate( - transcript_count = strsplit(Transcripts, "&") |> unlist() |> unique() |> length() + `Transcript count` = stringr::str_split(Transcripts, ", ") |> unlist() |> unique() |> length() ) |> dplyr::ungroup() |> dplyr::mutate( - many_transcripts = ifelse(transcript_count > 2, "many_transcripts", "few_transcripts") + many_transcripts = ifelse(`Transcript count` > 2, "many_transcripts", "few_transcripts") ) # Build the many transcripts table @@ -230,7 +273,7 @@ set_many_transcripts_sv <- function(x) { "Tier (top)", "Start", "End", - "Transcript count" = "transcript_count", + "Transcript count", "Transcripts", ) @@ -240,10 +283,10 @@ set_many_transcripts_sv <- function(x) { Transcripts = ifelse( many_transcripts == "few_transcripts" | is.na(many_transcripts), Transcripts, - paste0("Many transcripts (", transcript_count, ")") + paste0("Many transcripts (", `Transcript count`, ")") ) ) |> - dplyr::select(c(-many_transcripts, -transcript_count)) + dplyr::select(-c(many_transcripts, `Transcript count`)) list( many_transcripts = mt, @@ -251,7 +294,6 @@ set_many_transcripts_sv <- function(x) { ) } - #' @export process_sv <- function(x) { # Read input and set column information @@ -266,9 +308,14 @@ process_sv <- function(x) { sv.ready <- sv.input$data |> dplyr::mutate( "annotation_count" = count_pieces(annotation, ","), - "Top Tier"= tier, + "Top Tier" = tier, "SR_PR_ref" = paste0(SR_ref, ",", PR_ref), - "SR_PR_sum" = SR_alt + PR_alt, + "SR_PR_sum" = dplyr::case_when( + is.na(SR_alt) & is.na(PR_alt) ~ NA, + is.na(SR_alt) ~ PR_alt, + is.na(PR_alt) ~ SR_alt, + .default = SR_alt + PR_alt, + ), start = paste(chrom, base::format(start, big.mark = ",", trim = TRUE), sep=":"), Type = ifelse(is.na(PURPLE_status), svtype, "PURPLE_inf"), "Record ID" = dplyr::row_number(), @@ -280,34 +327,12 @@ process_sv <- function(x) { svtype, )) - # Split SVs by PURPLE inferred, single breakends, and breakpoints + # Split out breakpoints for merging sv.split <- split_svs(sv.ready) - # Combine joined breakpoints and single breakends + # Complete breakpoint records with corresponding mate, then combine with non-breakend records sv.bps <- join_breakpoint_entries(sv.split$bps) - sv.tmp <- dplyr::bind_rows(sv.split$sgl, sv.bps) - - # Prepare PURPLE inf, add event id continue from assignment above - sv.purple_inf_full <- purrr::pluck(sv.split$purple_inferred) |> - dplyr::mutate( - CN_PURPLE = as.numeric(CN_PURPLE) |> round(2) %>% sprintf("%.2f", .), - CN_change_PURPLE = as.numeric(CN_change_PURPLE) |> round(3) %>% sprintf("%.3f", .), - ) |> - dplyr::rename( - "PURPLE CN" = CN_PURPLE, - "PURPLE CN Change" = CN_change_PURPLE, - "VCF ID" = ID, - ) - - # Select columns for dedicated PURPLE inf table - sv.purple_inf <- sv.purple_inf_full |> - dplyr::select( - "Record ID", - "VCF ID", - "Position" = "start", - "PURPLE CN", - "PURPLE CN Change", - ) + sv.tmp <- dplyr::bind_rows(sv.bps, sv.split$other) # Format some columns cols_to_split <- c("AF_PURPLE", "CN_PURPLE") @@ -317,8 +342,7 @@ process_sv <- function(x) { dplyr::bind_cols(double_cols) # Format a table for to be used as the SV Map - sv.unmelted <- dplyr::bind_rows(sv.tmp, sv.purple_inf_full) - sv.map <- sv.unmelted |> + sv.map <- sv.tmp |> dplyr::select( "Record ID", "Annotations" = "annotation_count", @@ -338,42 +362,40 @@ process_sv <- function(x) { dplyr::arrange(`Record ID`) # Melt annotations - sv.melted_all <- sv.unmelted |> + sv.melted_all <- sv.tmp |> # Split into individual annotations dplyr::mutate(annotation = strsplit(annotation, ",")) |> # Convert annotation fields into columns tidyr::unnest(annotation) |> tidyr::separate( - annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + annotation, c("Event", "Effect", "Genes", "Transcripts", "Detail", "Tier"), sep = "\\|", convert = FALSE ) |> # Remove gene_fusion annotations for variants where frameshift_variant&gene_fusion already exist dplyr::group_by(across(-Effect)) |> dplyr::group_modify(remove_gene_fusion_dups) |> dplyr::ungroup() |> - # Set unique annotation ID, remove unused columns - dplyr::mutate(annotation_id = dplyr::row_number()) |> + # Remove unused columns dplyr::select(c(-Event, -ALT)) |> - # Create new tier column + # Create columns, modify others dplyr::mutate( + "Annotation ID" = dplyr::row_number(), "Tier (top)" = paste0(Tier, " (", `Top Tier`, ")"), + "Genes" = stringr::str_replace_all(Genes, "&", ", "), + "Transcripts" = stringr::str_replace_all(Transcripts, "&", ", "), ) |> # Sort rows dplyr::arrange(`Tier (top)`, `Record ID`, Genes, Effect) # Abbreviate effects - abbreviate_effectv <- Vectorize(gpgr::abbreviate_effect) + abbreviate_effectv <- Vectorize(abbreviate_effect) sv.melted_all$Effect <- abbreviate_effectv(sv.melted_all$Effect) - # Exclude PURPLE inferred - sv.melted <- sv.melted_all |> - dplyr::filter(Type != "PURPLE_inf") - - # Select columns - sv.tmp <- sv.melted |> + # Select and rename columns + sv.melted_all <- sv.melted_all |> dplyr::select( "Record ID", - "Annotation ID" = "annotation_id", + "Annotation ID", "Tier (top)", "Start" = "start", "End" = "end", @@ -381,7 +403,7 @@ process_sv <- function(x) { "Breakend Mate" = "BND_mate", "Effect", "Genes", - "Transcripts" = "Transcript", + "Transcripts", "Effect", "Detail", "SR_alt", @@ -389,32 +411,25 @@ process_sv <- function(x) { "SR_PR_sum", "PURPLE AF" = "AF_PURPLE", "PURPLE CN" = "CN_PURPLE", - # Removed after ops - "Tier", + # Dropped after ops for non-map outputs + "Top Tier", + "Type", ) - # Split into priority groups, remove Tier column once split - sv.prioritised.tmp <- dplyr::filter(sv.tmp, Tier <= 3) |> dplyr::select(-Tier) - sv.unprioritised.tmp = dplyr::filter(sv.tmp, Tier >= 4) |> dplyr::select(-Tier) - - # Set many transcripts - sv.many_transcript_data.prioritised <- set_many_transcripts_sv(sv.prioritised.tmp) - sv.prioritised.many_transcripts <- purrr::pluck(sv.many_transcript_data.prioritised, "many_transcripts") - sv.prioritised <- purrr::pluck(sv.many_transcript_data.prioritised, "sv") + # Create and set many transcript values + sv.annotations.many_transcript_data <- set_many_transcripts_sv(sv.melted_all) + sv.annotations.many_transcripts <- purrr::pluck(sv.annotations.many_transcript_data, "many_transcripts") + sv.annotations <- purrr::pluck(sv.annotations.many_transcript_data, "sv") - sv.many_transcript_data.unprioritised <- set_many_transcripts_sv(sv.unprioritised.tmp) - sv.unprioritised.many_transcripts <- purrr::pluck(sv.many_transcript_data.unprioritised, "many_transcripts") - sv.unprioritised <- purrr::pluck(sv.many_transcript_data.unprioritised, "sv") + # Filter unwanted annotations + sv.annotations.split <- filter_and_split_annotations_sv(sv.annotations) - # Split into priority groups list( map = sv.map, map_melted = sv.melted_all, - prioritised = sv.prioritised, - many_transcripts_prioritised = sv.prioritised.many_transcripts, - unprioritised = sv.unprioritised, - many_transcripts_unprioritised = sv.unprioritised.many_transcripts, - purple_inferred = sv.purple_inf + annotations = sv.annotations.split$retained, + annotations_filtered = sv.annotations.split$filtered, + many_transcripts = sv.annotations.many_transcripts ) } diff --git a/inst/cli/canrep.R b/inst/cli/canrep.R index 7e87a87..f0f0128 100644 --- a/inst/cli/canrep.R +++ b/inst/cli/canrep.R @@ -6,6 +6,7 @@ canrep_add_args <- function(subp) { canrep$add_argument("--conda_list", help = "Path to `conda_pkg_list.txt` file.") canrep$add_argument("--img_dir", help = "Path to directory containing PURPLE plots.", required = TRUE) canrep$add_argument("--key_genes", help = "Path to UMCCR cancer gene file.", required = TRUE) + canrep$add_argument("--oncokb_genes", help = "Path to OncoKB database file.", required = TRUE) canrep$add_argument("--somatic_snv_vcf", help = "Path to `somatic-PASS.vcf.gz` SNV VCF.", required = TRUE) canrep$add_argument("--somatic_snv_summary", help = "Path to `somatic_snv_summary.json` JSON.", required = TRUE) canrep$add_argument("--somatic_sv_tsv", help = "Path to `manta.tsv` TSV file.", required = TRUE) @@ -33,6 +34,7 @@ canrep_parse_args <- function(args) { conda_list = args$conda_list, img_dir = args$img_dir, key_genes = args$key_genes, + oncokb_genes = args$oncokb_genes, somatic_snv_vcf = args$somatic_snv_vcf, somatic_snv_summary = args$somatic_snv_summary, somatic_sv_tsv = args$somatic_sv_tsv, diff --git a/inst/rmd/umccrise/cancer_report.Rmd b/inst/rmd/umccrise/cancer_report.Rmd index badc7e0..d8bae71 100644 --- a/inst/rmd/umccrise/cancer_report.Rmd +++ b/inst/rmd/umccrise/cancer_report.Rmd @@ -16,6 +16,7 @@ params: conda_list: NULL img_dir: NULL key_genes: NULL + oncokb_genes: NULL virusbreakend_tsv: NULL virusbreakend_vcf: NULL purple_purity: NULL @@ -110,7 +111,6 @@ sv_detail_table <- function(tab) { } ``` - ```{r hrd_prep, results='hide'} #---- Homologous Recombination Deficiency ----# chord_res <- sigrap::chord_run( @@ -182,13 +182,54 @@ gpgr::write_tsvjsongz(sigs_indel, glue("sigs/{bnm}-indel"), result_outdir) ``` ```{r sv_prep} +#---- Misc ----# +oncokb_genes <- gpgr::read_oncokb(params$oncokb_genes) + #---- Structural Variants ----# sv_path <- params$somatic_sv_tsv sv <- NULL no_sv_found <- gpgr::tsv_is_empty(sv_path) if (!no_sv_found) { sv <- gpgr::process_sv(sv_path) + + # Set OncoKB genes + sv$annotations <- sv$annotations |> + dplyr::mutate( + "Genes (OncoKB)" = gpgr::get_oncokb_genes(Genes, oncokb_genes), + .after = "Genes", + ) } + +#---- Copy Number Variants ----# +cnv <- gpgr::process_cnv_tsv(params$purple_som_cnv_ann) + +# Set OncoKB genes +cnv$annotations <- cnv$annotations |> + dplyr::mutate( + "Genes (OncoKB)" = get_oncokb_genes(Genes, oncokb_genes), + .after = "Genes", + ) + +# Set OncoKB genes for Many Genes table and transfer to the main table where appropriate +cnv$many_genes <- cnv$many_genes |> + dplyr::mutate( + `Genes (OncoKB)` = get_oncokb_genes(Genes, oncokb_genes), + .before = "Genes", + ) |> + # Create transfer column, set entries with more than n genes as 'Many genes (m)' + dplyr::rowwise() |> + dplyr::mutate( + gene.oncokb.count = strsplit(`Genes (OncoKB)`, ", ") |> unlist() |> unique() |> length(), + gene.oncokb.tx = ifelse(gene.oncokb.count > 2, paste0("Many genes (", gene.oncokb.count, ")"), `Genes (OncoKB)`), + ) + +# Transfer selected OncoKB gene entries +v.selector <- match(cnv$many_genes$`Annotation ID`, cnv$annotations$`Annotation ID`) +cnv$annotations$`Genes (OncoKB)`[v.selector] <- cnv$many_genes$gene.oncokb.tx + +# Clean up columns +cnv$many_genes <- cnv$many_genes |> + dplyr::select(-c(gene.oncokb.count, gene.oncokb.tx)) ``` ```{r purple_prep} @@ -292,8 +333,7 @@ cnv_summary <- if (no_sv_found) { sv_summary <- tibble(unmelted = c(0), melted = c(0)) } else { - sv_summary <- - list( + sv_summary <- list( unmelted = sv$map %>% dplyr::select(Type), melted = sv$map_melted %>% dplyr::select(Type) ) %>% @@ -891,17 +931,18 @@ DEL | intergenic_region | TXNIP-HFE2 | ENSG00000265972-ENSG00000168509
    ```{r} -sv.dt <- sv |> purrr::map(function(x) { - x |> DT::datatable( - filter = list(position = "top", clear = FALSE, plain = TRUE), - class = "cell-border display compact", - rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), - options = list( - scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, - buttons = c("csv", "excel"), dom = "Blfrtip" +sv.dt <- sv |> + purrr::map(function(x) { + x |> DT::datatable( + filter = list(position = "top", clear = FALSE, plain = TRUE), + class = "cell-border display compact", + rownames = FALSE, extensions = c("Scroller", "Buttons", "KeyTable"), + options = list( + scroller = TRUE, scrollY = 400, scrollX = TRUE, autoWidth = FALSE, keys = TRUE, + buttons = c("csv", "excel"), dom = "Blfrtip" + ) ) - ) -}) + }) ``` ### Summary {.tabset .tabset-fade #sv-summary} @@ -973,54 +1014,30 @@ blank_lines(2) ### __Breakpoints and Breakends__ {.tabset .tabset-fade} -#### Main {.tabset} - -##### Prioritised - -```{r} -sv.dt$prioritised -``` - -```{r, results='asis'} -blank_lines(2) -``` - -##### Many transcripts - -```{r} -sv.dt$many_transcripts_prioritised -``` - -```{r, results='asis'} -blank_lines(2) -``` - -#### Other {.tabset} - -##### Unprioritised +#### Prioritised ```{r} -sv.dt$unprioritised +sv.dt$annotations ``` ```{r, results='asis'} blank_lines(2) ``` -##### Many transcripts +#### Filtered ```{r} -sv.dt$many_transcripts_unprioritised +sv.dt$annotations_filtered ``` ```{r, results='asis'} blank_lines(2) ``` -##### PURPLE inferred +#### Many Transcripts ```{r} -sv.dt$purple_inferred +sv.dt$many_transcripts ``` ```{r, results='asis'} @@ -1029,10 +1046,6 @@ blank_lines(2) ### __Copy number variants__ {.tabset .tabset-fade} -```{r} -cnv <- gpgr::process_cnv_tsv(params$purple_som_cnv_ann) -``` - ```{r} cnv.dt <- cnv |> purrr::map(function(x) { x |> DT::datatable( @@ -1047,49 +1060,37 @@ cnv.dt <- cnv |> purrr::map(function(x) { }) ``` -#### Main {.tabset} - -##### Small +#### Prioritised ```{r} -cnv.dt$small +cnv.dt$annotations ``` ```{r, results='asis'} blank_lines(2) ``` -##### Medium +#### Filtered ```{r} -cnv.dt$medium +cnv.dt$filtered ``` ```{r, results='asis'} blank_lines(2) ``` -##### Large +#### Many Genes ```{r} -cnv.dt$large +cnv.dt$many_genes ``` ```{r, results='asis'} blank_lines(2) ``` -##### Many genes - -```{r} -cnv.dt$many_genes_prioritised -``` - -```{r, results='asis'} -blank_lines(2) -``` - -##### Many transcripts +#### Many Transcripts ```{r} cnv.dt$many_transcripts @@ -1099,48 +1100,6 @@ cnv.dt$many_transcripts blank_lines(2) ``` -#### Other {.tabset} - -##### Unprioritised - -```{r} -cnv.dt$unprioritised -``` - -```{r, results='asis'} -blank_lines(2) -``` - -##### Many genes - -```{r} -cnv.dt$many_genes_unprioritised -``` - -```{r, results='asis'} -blank_lines(2) -``` - -##### Fusions - -```{r} -cnv.dt$fusion -``` - -```{r, results='asis'} -blank_lines(2) -``` - -##### Chromosome - -```{r} -cnv.dt$chromsome -``` - -```{r, results='asis'} -blank_lines(2) -``` - ### UMCCR __Gene Somatic__ CNV Calls {#gene-panel-cnv}
    diff --git a/man/cancer_rmd.Rd b/man/cancer_rmd.Rd index a27128f..39c9fd2 100644 --- a/man/cancer_rmd.Rd +++ b/man/cancer_rmd.Rd @@ -11,6 +11,7 @@ cancer_rmd( conda_list, img_dir, key_genes, + oncokb_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, @@ -42,6 +43,8 @@ cancer_rmd( \item{key_genes}{Path to UMCCR cancer gene file.} +\item{oncokb_genes}{Path to OncoKB database file.} + \item{virusbreakend_tsv}{Path to VIRUSBreakend summary file.} \item{virusbreakend_vcf}{Path to VIRUSBreakend VCF file.} diff --git a/tests/testthat/test-roxytest-testexamples-purple.R b/tests/testthat/test-roxytest-testexamples-purple.R index 42cf1b3..a9fde90 100644 --- a/tests/testthat/test-roxytest-testexamples-purple.R +++ b/tests/testthat/test-roxytest-testexamples-purple.R @@ -19,7 +19,7 @@ test_that("Function purple_cnv_som_gene_process() @ L60", { }) -test_that("Function purple_cnv_som_read() @ L577", { +test_that("Function purple_cnv_som_read() @ L449", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (p <- purple_cnv_som_read(x)) @@ -27,7 +27,7 @@ test_that("Function purple_cnv_som_read() @ L577", { }) -test_that("Function purple_cnv_som_process() @ L610", { +test_that("Function purple_cnv_som_process() @ L482", { x <- system.file("extdata/purple/purple.cnv.somatic.tsv", package = "gpgr") (pp <- purple_cnv_som_process(x)) @@ -35,7 +35,7 @@ test_that("Function purple_cnv_som_process() @ L610", { }) -test_that("Function purple_qc_read() @ L678", { +test_that("Function purple_qc_read() @ L550", { x <- system.file("extdata/purple/purple.qc", package = "gpgr") (q <- purple_qc_read(x)) @@ -43,7 +43,7 @@ test_that("Function purple_qc_read() @ L678", { }) -test_that("Function purple_purity_read() @ L736", { +test_that("Function purple_purity_read() @ L608", { x <- system.file("extdata/purple/purple.purity.tsv", package = "gpgr") (p <- purple_purity_read(x)) From 5131a61492ec4b12ead41280a38d7590ddbbe57e Mon Sep 17 00:00:00 2001 From: Peter Diakumis Date: Wed, 31 Jan 2024 10:27:34 +1100 Subject: [PATCH 04/11] DRAGEN HRD support (#72) * dragen hrd support * adjust render.R for local debugging * add Dragen score description --- NAMESPACE | 1 + R/rmd.R | 3 +- R/umccrise.R | 60 +++++++++++++++++++++++++---- inst/rmd/umccrise/cancer_report.Rmd | 33 ++++++++++------ inst/rmd/umccrise/render.R | 51 ++++++++++++------------ man/cancer_rmd.Rd | 1 + man/dragen_hrd.Rd | 25 ++++++++++++ man/hrd_results_tabs.Rd | 7 +++- 8 files changed, 133 insertions(+), 48 deletions(-) create mode 100644 man/dragen_hrd.Rd diff --git a/NAMESPACE b/NAMESPACE index d661d49..0b7ef7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(af_summary) export(cancer_rmd) export(count_pieces) export(date_log) +export(dragen_hrd) export(get_oncokb_genes) export(hrd_results_tabs) export(is_vcf) diff --git a/R/rmd.R b/R/rmd.R index 8b500fc..14abe17 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -87,7 +87,7 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' #' @return Path to rendered HTML report. #' @export -cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, key_genes, +cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, dragen_hrd, img_dir, key_genes, oncokb_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_vcf, somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, @@ -119,6 +119,7 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, af_keygenes = af_keygenes, batch_name = batch_name, conda_list = conda_list, + dragen_hrd = dragen_hrd, img_dir = img_dir_b, key_genes = key_genes, oncokb_genes = oncokb_genes, diff --git a/R/umccrise.R b/R/umccrise.R index aa95137..6168e11 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,9 +1,34 @@ +#' Parse DRAGEN HRD File +#' +#' Parses DRAGEN `hrdscore.csv` file. +#' +#' @param x Path to DRAGEN `hrdscore.csv` file. +#' +#' @return Tibble with a single row and the following score columns: +#' - HRD (homologous recombination deficiency) +#' - LOH (loss of heterozygosity) +#' - TAI (telomeric allelic imbalance) +#' - LST (large-scale state transitions) +#' +#' If no input is provided, the fields are NA. +#' @export +dragen_hrd <- function(x = NULL) { + res <- tibble::tibble(Sample = NA, LOH = NA, TAI = NA, LST = NA, HRD = NA) + if (!is.null(x)) { + res <- x |> + readr::read_csv(col_types = readr::cols(.default = "c")) + names(res) <- sub("_Score", "", names(res)) + } + res +} + #' HRDetect and CHORD Summary Table #' #' Return a summary table with HRDetect and CHORD results. #' #' @param chord_res Result from running [sigrap::chord_run()]. #' @param hrdetect_res Result from running [sigrap::hrdetect_run()]. +#' @param dragen_res Result from running [gpgr::dragen_hrd()]. #' #' @return A list with a tibble and a gt_tbl object (see [gt::gt()]). #' @@ -20,11 +45,12 @@ #' vcf.snv = snv, sample.name = nm, #' df.sv = gpgr:::chord_mantavcf2df(sv) #' ) -#' hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res) +#' dragen_res <- gpgr::dragen_hrd("path/to/sample.hrdscore.csv") +#' hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res, dragen_res = dragen_res) #' } #' #' @export -hrd_results_tabs <- function(hrdetect_res, chord_res) { +hrd_results_tabs <- function(hrdetect_res, chord_res, dragen_res) { sn <- chord_res$prediction[, "sample", drop = T] assertthat::are_equal(hrdetect_res[, "sample", drop = T], sn) @@ -42,10 +68,17 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { ) |> dplyr::filter(col != "sample") + dragen_res_tab <- + dragen_res |> + dplyr::select("HRD", "LOH", "TAI", "LST") |> + unlist() |> + tibble::enframe(name = "col", value = "val") + colnames(dragen_res_tab) <- c("DRAGEN", "results_dragen") colnames(hrdetect_res_tab) <- c("HRDetect", "results_hrdetect") colnames(chord_res_tab) <- c("CHORD", "results_chord") + # idea is to make 9-row tibbles and cbind them tab1 <- dplyr::bind_rows( hrdetect_res_tab, @@ -54,7 +87,7 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { " ", " " ) ) - tab2 <- + tab2a <- dplyr::bind_rows( chord_res_tab[1:7, ], tibble::tribble( @@ -63,10 +96,14 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { " ", " " ) ) - tab3 <- chord_res_tab[8:16, ] |> + tab2b <- chord_res_tab[8:16, ] |> purrr::set_names(c("CHORD2", "results_chord2")) + tab3 <- dplyr::bind_rows( + dragen_res_tab, + tibble::tibble(DRAGEN = rep(" ", 5), results_dragen = rep(" ", 5)) + ) - hrd_results_tab <- dplyr::bind_cols(tab1, tab2, tab3) + hrd_results_tab <- dplyr::bind_cols(tab3, tab1, tab2a, tab2b) hrd_results_gt <- hrd_results_tab |> @@ -74,6 +111,11 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { gt::tab_header( title = glue::glue("HRD Results for {sn}") ) |> + gt::tab_spanner( + label = "DRAGEN", + id = "id_dragen", + columns = c("DRAGEN", "results_dragen") + ) |> gt::tab_spanner( label = "HRDetect", id = "id_hrdetect", @@ -85,6 +127,8 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { columns = c("CHORD", "results_chord", "CHORD2", "results_chord2") ) |> gt::cols_label( + DRAGEN = "", + results_dragen = "", HRDetect = "", results_hrdetect = "", CHORD = "", @@ -97,12 +141,12 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { gt::cell_text(weight = "bold") ), locations = gt::cells_body( - columns = c("HRDetect", "CHORD", "CHORD2") + columns = c("DRAGEN", "HRDetect", "CHORD", "CHORD2") ) ) |> gt::cols_align( align = "right", - columns = c("results_hrdetect") + columns = c("results_hrdetect", "results_dragen") ) |> gt::tab_style( style = gt::cell_borders( @@ -112,7 +156,7 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { style = "solid" ), locations = gt::cells_body( - columns = c("CHORD"), + columns = c("CHORD", "HRDetect"), rows = dplyr::everything() ) ) |> diff --git a/inst/rmd/umccrise/cancer_report.Rmd b/inst/rmd/umccrise/cancer_report.Rmd index d8bae71..71d42a2 100644 --- a/inst/rmd/umccrise/cancer_report.Rmd +++ b/inst/rmd/umccrise/cancer_report.Rmd @@ -14,6 +14,7 @@ params: af_keygenes: NULL batch_name: NULL conda_list: NULL + dragen_hrd: NULL img_dir: NULL key_genes: NULL oncokb_genes: NULL @@ -113,6 +114,7 @@ sv_detail_table <- function(tab) { ```{r hrd_prep, results='hide'} #---- Homologous Recombination Deficiency ----# +dragen_hrd_res <- gpgr::dragen_hrd(params$dragen_hrd) chord_res <- sigrap::chord_run( vcf.snv = params$somatic_snv_vcf, vcf.sv = params$somatic_sv_vcf, @@ -317,6 +319,7 @@ snv_summary <- snv_summary_counts |> paste(collapse = "; ") hrd_summary <- list( + dragen = dragen_hrd_res[, "HRD", drop = TRUE], chord = chord_res[["prediction"]][, "p_hrd", drop = TRUE], hrdetect = hrdetect_res[, "Probability", drop = TRUE] ) @@ -334,9 +337,9 @@ if (no_sv_found) { sv_summary <- tibble(unmelted = c(0), melted = c(0)) } else { sv_summary <- list( - unmelted = sv$map %>% dplyr::select(Type), - melted = sv$map_melted %>% dplyr::select(Type) - ) %>% + unmelted = sv$map %>% dplyr::select(Type), + melted = sv$map_melted %>% dplyr::select(Type) + ) %>% purrr::map(function(x) addmargins(table(x, useNA = "ifany"))) %>% do.call("rbind", .) %>% as.data.frame() %>% @@ -358,8 +361,8 @@ qc_summary_all <- dplyr::tribble( "2015 COSMIC (https://cancer.sanger.ac.uk/cosmic/signatures_v2.tt)", 5, "MutSigs (New)", glue::glue("{summarise_sigs(sigs_snv_2020)}"), "2020 COSMIC (https://cancer.sanger.ac.uk/cosmic/signatures)", - 6, "HRD", glue::glue("CHORD: {hrd_summary$chord}; HRDetect: {hrd_summary$hrdetect}"), - "Homologous recombination deficiency probability.", + 6, "HRD", glue::glue("DRAGEN: {hrd_summary$dragen}; CHORD: {hrd_summary$chord}; HRDetect: {hrd_summary$hrdetect}"), + "Homologous recombination deficiency score (DRAGEN) / probability (CHORD/HRDetect).", 20, "CNVs (Somatic)", glue::glue( "Min: {cnv_summary[['som']]$Min}; ", "Max: {cnv_summary[['som']]$Max}; ", @@ -752,6 +755,15 @@ blank_lines(1)
    Description +__DRAGEN__ + +Following scores are shown: + +- HRD: homologous recombination deficiency +- LOH: loss of heterozygosity +- TAI: telomeric allelic imbalance +- LST: large-scale state transitions + __CHORD__ For more details see . @@ -796,10 +808,11 @@ that includes HRDetect.
    -```{r chord_hrdetect_res} -hrd_results <- gpgr::hrd_results_tabs(hrdetect_res, chord_res) +```{r chord_hrdetect_dragen_res} +hrd_results <- gpgr::hrd_results_tabs(hrdetect_res, chord_res, dragen_hrd_res) gpgr::write_tsvjsongz(chord_res[["prediction"]], glue("hrd/{bnm}-chord"), result_outdir) gpgr::write_tsvjsongz(hrdetect_res, glue("hrd/{bnm}-hrdetect"), result_outdir) +gpgr::write_tsvjsongz(dragen_hrd_res, glue("hrd/{bnm}-dragen"), result_outdir) hrd_results$hrd_results_gt ``` @@ -1268,7 +1281,7 @@ if (nrow(vb_summary$tab) > 0) { buttons = c("csv", "excel"), dom = "Blfrtip" ) ) |> - DT::formatCurrency(~ Length + Reads, currency = "", interval = 3, mark = ",", digits = 0) + DT::formatCurrency(~ Length + Reads, currency = "", interval = 3, mark = ",", digits = 0) } else { cat("No oncoviral content detected.") } @@ -1286,7 +1299,6 @@ vb_integrations <- gpgr::virusbreakend_vcf_read(params$virusbreakend_vcf) details::details(knitr::kable(vb_integrations$descr), lang = "none", summary = "Column Description") if (nrow(vb_integrations$tab) > 0) { - format_common_cols <- c( "Position", "Fragment support", @@ -1303,8 +1315,7 @@ if (nrow(vb_integrations$tab) > 0) { buttons = c("csv", "excel"), dom = "Blfrtip" ) ) |> - DT::formatCurrency(format_common_cols, currency = "", interval = 3, mark = ",", digits = 0) - + DT::formatCurrency(format_common_cols, currency = "", interval = 3, mark = ",", digits = 0) } else { cat("No oncoviral integrations detected.") } diff --git a/inst/rmd/umccrise/render.R b/inst/rmd/umccrise/render.R index b482b5c..975faf0 100644 --- a/inst/rmd/umccrise/render.R +++ b/inst/rmd/umccrise/render.R @@ -1,37 +1,36 @@ #!/usr/bin/env Rscript -require(here) +require(here, include.only = "here") require(gpgr) -require(glue) +require(glue, include.only = "glue") require(purrr) -batch_name <- "SBJ02269__PRJ221202" -tumor_name <- "PRJ221202" -# umccrised_dir <- "/g/data/gx8/projects/diakumis/umccrise/data/SBJ02269" -# key_genes <- "/g/data/gx8/projects/diakumis/conda/envs/umccrise/lib/python3.7/site-packages/ngs_utils/reference_data/key_genes/umccr_cancer_genes.latest.tsv" -umccrised_dir <- here::here("nogit/umccrised_data/SBJ02269") -key_genes <- here::here("nogit/umccrised_data/umccr_cancer_genes.latest.tsv") +batch_name <- "SBJ00480_PTC_HCC1395t100pc" +tumor_name <- "PTC_HCC1395t100pc" +umccrised_dir <- here("nogit/umccrised_data/seqc_inputs.20231115") params <- list( - af_global = glue::glue("{umccrised_dir}/work/{batch_name}/cancer_report/afs/af_tumor.txt"), - af_keygenes = glue::glue("{umccrised_dir}/work/{batch_name}/cancer_report/afs/af_tumor_keygenes.txt"), + af_global = glue("{umccrised_dir}/sample_data/af_tumor.txt"), + af_keygenes = glue("{umccrised_dir}/sample_data/af_tumor_keygenes.txt"), batch_name = batch_name, - conda_list = glue::glue("{umccrised_dir}/work/{batch_name}/conda_pkg_list.txt"), - img_dir = glue::glue("{umccrised_dir}/img"), - key_genes = key_genes, - oncoviral_breakpoints_tsv = glue::glue("{umccrised_dir}/work/{batch_name}/oncoviruses/oncoviral_breakpoints.tsv"), - oncoviral_present_viruses = glue::glue("{umccrised_dir}/work/{batch_name}/oncoviruses/present_viruses.txt"), - out_file = here::here("nogit/umccrised_data/SBJ02269/foo_cancer_report.html"), - purple_purity = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.purity.tsv"), - purple_qc = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.qc"), - purple_som_cnv = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.cnv.somatic.tsv"), - purple_som_gene_cnv = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.cnv.gene.tsv"), - purple_som_snv_vcf = glue::glue("{umccrised_dir}/work/{batch_name}/purple/{batch_name}.purple.somatic.vcf.gz"), - result_outdir = glue::glue("{umccrised_dir}/{batch_name}/cancer_report_tables2"), - somatic_snv_summary = glue::glue("{umccrised_dir}/work/{batch_name}/cancer_report/somatic_snv_summary.json"), - somatic_snv_vcf = glue::glue("{umccrised_dir}/{batch_name}/small_variants/{batch_name}-somatic-PASS.vcf.gz"), - somatic_sv_tsv = glue::glue("{umccrised_dir}/{batch_name}/structural/{batch_name}-manta.tsv"), - somatic_sv_vcf = glue::glue("{umccrised_dir}/{batch_name}/structural/{batch_name}-manta.vcf.gz"), + conda_list = NULL, + dragen_hrd = glue("{umccrised_dir}/sample_data/{tumor_name}.hrdscore.csv"), + img_dir = glue("{umccrised_dir}/output/img"), + key_genes = glue("{umccrised_dir}/reference_data/umccr_cancer_genes.latest.tsv"), + oncokb_genes = glue("{umccrised_dir}/reference_data/oncokb_genes.20231113.tsv"), + virusbreakend_tsv = glue("{umccrised_dir}/sample_data/virusbreakend/{batch_name}.virusbreakend.vcf.summary.tsv"), + virusbreakend_vcf = glue("{umccrised_dir}/sample_data/virusbreakend/{batch_name}.virusbreakend.vcf"), + purple_purity = glue("{umccrised_dir}/sample_data/purple/{tumor_name}.purple.purity.tsv"), + purple_qc = glue("{umccrised_dir}/sample_data/purple/{tumor_name}.purple.qc"), + purple_som_cnv_ann = glue("{umccrised_dir}/sample_data/{tumor_name}.cnv.prioritised.tsv"), + purple_som_cnv = glue("{umccrised_dir}/sample_data/purple/{tumor_name}.purple.cnv.somatic.tsv"), + purple_som_gene_cnv = glue("{umccrised_dir}/sample_data/purple/{tumor_name}.purple.cnv.gene.tsv"), + purple_som_snv_vcf = glue("{umccrised_dir}/sample_data/purple/{tumor_name}.purple.somatic.vcf.gz"), + result_outdir = glue("{umccrised_dir}/output/cancer_report_tables"), + somatic_snv_vcf = glue("{umccrised_dir}/sample_data/{tumor_name}.pass.vcf.gz"), + somatic_snv_summary = glue("{umccrised_dir}/sample_data/{tumor_name}.somatic.variant_counts_process.json"), + somatic_sv_tsv = glue("{umccrised_dir}/sample_data/{tumor_name}.sv.prioritised.tsv"), + somatic_sv_vcf = glue("{umccrised_dir}/sample_data/{tumor_name}.sv.prioritised.vcf.gz"), tumor_name = tumor_name ) diff --git a/man/cancer_rmd.Rd b/man/cancer_rmd.Rd index 39c9fd2..ccc1012 100644 --- a/man/cancer_rmd.Rd +++ b/man/cancer_rmd.Rd @@ -9,6 +9,7 @@ cancer_rmd( af_keygenes, batch_name, conda_list, + dragen_hrd, img_dir, key_genes, oncokb_genes, diff --git a/man/dragen_hrd.Rd b/man/dragen_hrd.Rd new file mode 100644 index 0000000..2f85cb3 --- /dev/null +++ b/man/dragen_hrd.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/umccrise.R +\name{dragen_hrd} +\alias{dragen_hrd} +\title{Parse DRAGEN HRD File} +\usage{ +dragen_hrd(x = NULL) +} +\arguments{ +\item{x}{Path to DRAGEN \code{hrdscore.csv} file.} +} +\value{ +Tibble with a single row and the following score columns: +\itemize{ +\item HRD (homologous recombination deficiency) +\item LOH (loss of heterozygosity) +\item TAI (telomeric allelic imbalance) +\item LST (large-scale state transitions) +} + +If no input is provided, the fields are NA. +} +\description{ +Parses DRAGEN \code{hrdscore.csv} file. +} diff --git a/man/hrd_results_tabs.Rd b/man/hrd_results_tabs.Rd index be981c8..a8434af 100644 --- a/man/hrd_results_tabs.Rd +++ b/man/hrd_results_tabs.Rd @@ -4,12 +4,14 @@ \alias{hrd_results_tabs} \title{HRDetect and CHORD Summary Table} \usage{ -hrd_results_tabs(hrdetect_res, chord_res) +hrd_results_tabs(hrdetect_res, chord_res, dragen_res) } \arguments{ \item{hrdetect_res}{Result from running \code{\link[sigrap:hrdetect_run]{sigrap::hrdetect_run()}}.} \item{chord_res}{Result from running \code{\link[sigrap:chord_run]{sigrap::chord_run()}}.} + +\item{dragen_res}{Result from running \code{\link[=dragen_hrd]{dragen_hrd()}}.} } \value{ A list with a tibble and a gt_tbl object (see \code{\link[gt:gt]{gt::gt()}}). @@ -30,7 +32,8 @@ chord_res <- sigrap::chord_run( vcf.snv = snv, sample.name = nm, df.sv = gpgr:::chord_mantavcf2df(sv) ) -hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res) +dragen_res <- gpgr::dragen_hrd("path/to/sample.hrdscore.csv") +hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res, dragen_res = dragen_res) } } From 6c3338a22ef4bd9d3b3187523cb1128a4884da88 Mon Sep 17 00:00:00 2001 From: Peter Diakumis Date: Wed, 7 Feb 2024 09:29:13 +1100 Subject: [PATCH 05/11] bcftools stats QUAL plot (#73) * bcftools qual stats support * bcftools qual stats plot --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/rmd.R | 3 +- R/umccrise.R | 46 +++++++++++++++++++++++++++++ inst/rmd/umccrise/_navbar.html | 1 + inst/rmd/umccrise/cancer_report.Rmd | 9 ++++++ inst/rmd/umccrise/render.R | 1 + man/bcftools_stats_plot.Rd | 17 +++++++++++ 8 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 man/bcftools_stats_plot.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c75b5c9..dfc9657 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -52,5 +52,5 @@ ByteCompile: true Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 0b7ef7e..35964fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(abbreviate_effect) export(af_summary) +export(bcftools_stats_plot) export(cancer_rmd) export(count_pieces) export(date_log) diff --git a/R/rmd.R b/R/rmd.R index 14abe17..15f6bc1 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -87,7 +87,7 @@ linx_rmd <- function(sample, table_dir, plot_dir, out_file = NULL, quiet = FALSE #' #' @return Path to rendered HTML report. #' @export -cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, dragen_hrd, img_dir, key_genes, +cancer_rmd <- function(af_global, af_keygenes, batch_name, bcftools_stats, conda_list, dragen_hrd, img_dir, key_genes, oncokb_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_vcf, somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, @@ -118,6 +118,7 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, dragen_hr af_global = af_global, af_keygenes = af_keygenes, batch_name = batch_name, + bcftools_stats = bcftools_stats, conda_list = conda_list, dragen_hrd = dragen_hrd, img_dir = img_dir_b, diff --git a/R/umccrise.R b/R/umccrise.R index 6168e11..825e576 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,3 +1,49 @@ +#' Parse bcftools stats File +#' +#' Parses bcftools stats `bcftools_stats.txt` file. +#' +#' @param x Path to bcftools stats `bcftools_stats.txt` file. +#' @return A ggplot2 object. +#' @export +bcftools_stats_plot <- function(x = NULL) { + if (is.null(x)) { + return(NULL) + } + cnames <- c("QUAL_dummy", "id", "qual", "snps", "transi", "transv", "indels") + ln <- readr::read_lines(x) + d1 <- ln[grepl("QUAL", ln)] + # line1 ignore, line2 is colnames, just clean those up + d <- d1[3:length(d1)] |> + I() |> + readr::read_tsv( + col_names = cnames, col_types = readr::cols(.default = "d", "QUAL_dummy" = "c") + ) |> + dplyr::select(-"QUAL_dummy") + if (nrow(d) == 0) { + return(NULL) + } + d <- d |> + dplyr::select("qual", "snps") |> + tidyr::uncount(.data$snps) |> + dplyr::select("qual") + med <- median(d$qual, na.rm = TRUE) + tot <- nrow(d) + p <- d |> + ggplot2::ggplot(ggplot2::aes(x = .data$qual)) + + ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), binwidth = 4, fill = "lightblue") + + ggplot2::geom_density(alpha = 0.6) + + ggplot2::geom_vline(xintercept = med, colour = "blue", linetype = "dashed") + + ggplot2::scale_x_continuous(n.breaks = 10) + + ggplot2::annotate( + geom = "label", x = med + 1, y = +Inf, vjust = 2, + label = paste0("Median: ", med), + ) + + ggplot2::theme_bw() + + ggplot2::ggtitle(glue::glue("SNV quality score distribution (total SNVs: {tot})")) + p +} + + #' Parse DRAGEN HRD File #' #' Parses DRAGEN `hrdscore.csv` file. diff --git a/inst/rmd/umccrise/_navbar.html b/inst/rmd/umccrise/_navbar.html index 987d2b8..b87adb0 100644 --- a/inst/rmd/umccrise/_navbar.html +++ b/inst/rmd/umccrise/_navbar.html @@ -14,6 +14,7 @@