diff --git a/NAMESPACE b/NAMESPACE index 9c37dd2..5200896 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,18 +1,21 @@ # Generated by roxygen2: do not edit by hand +export(abbreviate_effect) export(af_summary) +export(bcftools_stats_plot) export(cancer_rmd) +export(count_pieces) export(date_log) +export(dragen_hrd) +export(get_oncokb_genes) export(hrd_results_tabs) export(is_vcf) -export(is_vcf_bpi) export(mkdir) export(pkg_exists) export(plot_bnd_sr_pr_tot_hist) export(plot_bnd_sr_pr_tot_lines) +export(process_cnv_tsv) export(process_sv) -export(purple_cnv_germ_process) -export(purple_cnv_germ_read) export(purple_cnv_som_gene_process) export(purple_cnv_som_gene_read) export(purple_cnv_som_process) @@ -21,11 +24,12 @@ export(purple_kataegis) export(purple_purity_read) export(purple_qc_read) export(purple_snv_vcf_read) -export(purple_version_read) +export(read_oncokb) export(session_info_kable) export(tsv_is_empty) -export(umccrise_read_sv_tsv) export(vcf_is_empty) +export(virusbreakend_summary_read) +export(virusbreakend_vcf_read) export(write_jsongz) export(write_tsvgz) export(write_tsvjsongz) diff --git a/R/oncokb.R b/R/oncokb.R new file mode 100644 index 0000000..03954aa --- /dev/null +++ b/R/oncokb.R @@ -0,0 +1,24 @@ +#' @export +read_oncokb <- function(x) { + readr::read_tsv(x) |> + dplyr::filter( + `OncoKB Annotated` == "Yes" + ) |> + dplyr::pull("Hugo Symbol") +} + +#' @export +get_oncokb_genes <- function(x, oncokb_genes) { + delimiters <- " ,&-" + delimiter_re <- paste0("[", delimiters, "]") + + oncokb_genes |> + # Create regexes for each match, utilising delimiters for boundaries. Handles most cases where a gene symbol contains the '-' delimiter + purrr::map(function(n) paste0("(?<=^|", delimiter_re, ")", n, "(?=", delimiter_re, "|$)")) |> + # Loop with nm iterations through regex and gene symbols + purrr::map(function(n) stringr::str_detect(x, n) |> tibble::as_tibble_row(.name_repair="unique_quiet")) |> + # Combine as tibble to access dplyr::summarise and compile list of detected OncoKB gene symbols for each effect + dplyr::bind_rows() |> + dplyr::summarise(dplyr::across(dplyr::everything(), function(v) { paste0(sort(oncokb_genes[v]), collapse=", ") })) |> + unlist() +} diff --git a/R/oncoviral.R b/R/oncoviral.R new file mode 100644 index 0000000..7c87f0d --- /dev/null +++ b/R/oncoviral.R @@ -0,0 +1,141 @@ +#' Read VIRUSBreakend Summary File +#' +#' Reads the `virusbreakend.vcf.summary.tsv` file. +#' +#' @param x Path to `virusbreakend.vcf.summary.tsv` file. +#' +#' @return List with two elements: +#' * `tab`: Tibble containing data. +#' * `descr`: Description of tibble columns. +#' +#' @examples +#' x <- system.file("extdata/virusbreakend/virusbreakend.vcf.summary.tsv", package = "gpgr") +#' (vb <- virusbreakend_summary_read(x)) +#' @testexamples +#' expect_equal(colnames(vb)[ncol(vb)], "QCStatus") +#' +#' @export +virusbreakend_summary_read <- function(x) { + + nm <- c( + "taxid_genus" = "c", + "name_genus" = "c", + "reads_genus_tree" = "i", + "taxid_species" = "c", + "name_species" = "c", + "reads_species_tree" = "i", + "taxid_assigned" = "c", + "name_assigned" = "c", + "reads_assigned_tree" = "i", + "reads_assigned_direct" = "i", + "reference" = "c", + "reference_taxid" = "c", + "reference_kmer_count" = "i", + "alternate_kmer_count" = "i", + "rname" = "c", + "startpos" = "i", + "endpos" = "i", + "numreads" = "i", + "covbases" = "d", + "coverage" = "d", + "meandepth" = "d", + "meanbaseq" = "d", + "meanmapq" = "d", + "integrations" = "i", + "QCStatus" = "c" + ) + + ctypes <- paste(nm, collapse = "") + virusbreakend_summary <- readr::read_tsv(x, col_types = ctypes) + + if (nrow(virusbreakend_summary) > 0) { + assertthat::assert_that(ncol(virusbreakend_summary) == length(nm)) + assertthat::assert_that(all(colnames(virusbreakend_summary) == names(nm))) + + virusbreakend_summary <- virusbreakend_summary |> + dplyr::select( + Virus="name_assigned", + Length="endpos", + Reads="numreads", + Coverage="coverage", + `Mean depth`="meandepth", + Intergrations="integrations", + QC="QCStatus", + ) + } + + descr <- dplyr::tribble( + ~Column, ~Description, + "Virus", "Assigned NCBI taxonomy name of viral reference", + "Length", "Length of viral contig", + "Reads", "Number of reads mapped to adjusted viral reference", + "Coverage", "Percentage of viral positions with at least one read mapped", + "Mean depth", "Mean alignment depth", + "Intergrations", "Number of detected integration breakpoints", + "QC", "QC status of viral intergrations", + ) + + list( + tab = virusbreakend_summary, + descr = descr + ) +} + +#' Read VIRUSBreakend VCF File +#' +#' Reads the `virusbreakend.vcf` file and selects data to present. +#' +#' @param x Path to `virusbreakend.vcf` file. +#' +#' @return List with two elements: +#' * `tab`: Tibble containing selected data. +#' * `descr`: Description of tibble columns. +#' +#' @examples +#' x <- system.file("extdata/virusbreakend/virusbreakend.vcf", package = "gpgr") +#' (vb <- virusbreakend_vcf_read(x)) +#' @testexamples +#' expect_equal(colnames(vb)[ncol(vb)], "QC") +#' +#' @export +virusbreakend_vcf_read <- function(x) { + + d <- bedr::read.vcf(x, split.info = TRUE, verbose = FALSE) + + if (nrow(d$vcf) > 0) { + virusbreakend_integrations <- tibble::as_tibble(d$vcf) |> + dplyr::select( + Contig="CHROM", + Position="POS", + "Fragment support"="BVF", + "Fragment support (unmapped)"="BUM", + "Softclip read support"="BSC", + Reference="REF", + Alt="ALT", + `Breakend ID`="ID", + `Mate ID`="MATEID", + QC="FILTER", + ) + } else { + virusbreakend_integrations <- tibble::tibble() + } + + descr <- dplyr::tribble( + ~Column, ~Description, + "Contig", "Name of contig", + "Position", "Position of breakend in contig", + "Breakend ID", "ID of integration breakend", + "Mate ID", "ID of integration breakend mate", + "Reference", "Reference allele", + "Alt", "Alternative allele", + "QC", "VCF filter values", + "Fragment support", "Total number of fragments supporting breakend", + "Fragment support (unmapped)", "Number of fragments supporting breakend that have one read unmapped", + "Softclip read support", "Number of softclipped reads supporting breakend" + ) + + list( + tab = virusbreakend_integrations, + description = descr + ) +} diff --git a/R/purple.R b/R/purple.R index a417d42..713e20c 100644 --- a/R/purple.R +++ b/R/purple.R @@ -18,13 +18,12 @@ purple_cnv_som_gene_read <- function(x) { nm <- c( "chromosome" = "c", "start" = "i", "end" = "i", "gene" = "c", - "minCopyNumber" = "d", "maxCopyNumber" = "d", - "unused" = "c", "somaticRegions" = "d", "germlineHomDeletionRegions" = "d", - "germlineHetToHomDeletionRegions" = "d", - "transcriptId" = "c", "transcriptVersion" = "c", "chromosomeBand" = "c", + "minCopyNumber" = "d", "maxCopyNumber" = "d", "somaticRegions" = "d", + "transcriptId" = "c", "isCanonical" = "c", "chromosomeBand" = "c", "minRegions" = "d", "minRegionStart" = "i", "minRegionEnd" = "i", "minRegionStartSupport" = "c", "minRegionEndSupport" = "c", - "minRegionMethod" = "c", "minMinorAlleleCopyNumber" = "d" + "minRegionMethod" = "c", "minMinorAlleleCopyNumber" = "d", + "depthWindowCount" = "i" ) ctypes <- paste(nm, collapse = "") @@ -34,7 +33,6 @@ purple_cnv_som_gene_read <- function(x) { purple_cnv_gene } - #' Process PURPLE CNV Gene File for UMCCRISE #' #' Processes the `purple.cnv.gene.tsv` file. Keeps genes that are in the @@ -43,8 +41,8 @@ purple_cnv_som_gene_read <- function(x) { #' #' @param x Path to `purple.cnv.gene.tsv` file. #' @param g Path to gene file containing at least three columns: -#' * `symbol`: gene name (character). -#' * `tumorsuppressor`: is this gene a tumor suppressor (TRUE/FALSE). +#' * `ensembl_gene_symbol`: gene name (character). +#' * `tsgene`: is this gene a tumor suppressor (TRUE/FALSE). #' * `oncogene`: is this gene an oncogene (TRUE/FALSE). #' #' @return List with two elements: @@ -66,27 +64,26 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { } genes <- readr::read_tsv(g, col_types = readr::cols( - symbol = "c", oncogene = "l", tumorsuppressor = "l" + ensembl_gene_symbol = "c", oncogene = "l", tsgene = "l" )) |> - dplyr::select("symbol", "oncogene", "tumorsuppressor") + dplyr::select("ensembl_gene_symbol", "oncogene", "tsgene") oncogenes <- genes |> dplyr::filter(.data$oncogene) |> - dplyr::pull(.data$symbol) + dplyr::pull(.data$ensembl_gene_symbol) tsgenes <- genes |> - dplyr::filter(.data$tumorsuppressor) |> - dplyr::pull(.data$symbol) + dplyr::filter(.data$tsgene) |> + dplyr::pull(.data$ensembl_gene_symbol) purple_cnv_gene <- purple_cnv_gene |> - dplyr::filter(.data$gene %in% genes$symbol) |> + dplyr::filter(.data$gene %in% genes$ensembl_gene_symbol) |> dplyr::mutate( chromosome = as.factor(.data$chromosome), - transcriptID = paste0(.data$transcriptId, ".", .data$transcriptVersion), + transcriptID = paste0(.data$transcriptId), minRegStartEnd = paste0(.data$minRegionStart, "-", .data$minRegionEnd), minRegSupportStartEndMethod = paste0( .data$minRegionStartSupport, "-", .data$minRegionEndSupport, " (", .data$minRegionMethod, ")" ), - germDelReg = paste0(.data$germlineHomDeletionRegions, "/", .data$germlineHetToHomDeletionRegions), oncogene = .data$gene %in% oncogenes, tsgene = .data$gene %in% tsgenes, onco_or_ts = dplyr::case_when( @@ -101,7 +98,7 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { chrom = "chromosome", "start", "end", chrBand = "chromosomeBand", "onco_or_ts", "transcriptID", minMinorAlleleCN = "minMinorAlleleCopyNumber", - somReg = "somaticRegions", "germDelReg", minReg = "minRegions", + somReg = "somaticRegions", minReg = "minRegions", "minRegStartEnd", "minRegSupportStartEndMethod" ) @@ -115,7 +112,6 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { "transcriptID", "Ensembl transcript ID (dot version)", "minMinorAlleleCN", "Minimum allele ploidy found over the gene exons - useful for identifying LOH events", "somReg (somaticRegions)", "Count of somatic copy number regions this gene spans", - "germDelReg (germlineHomDeletionRegions / germlineHetToHomDeletionRegions)", "Number of regions spanned by this gene that are (homozygously deleted in the germline / both heterozygously deleted in the germline and homozygously deleted in the tumor)", "minReg (minRegions)", "Number of somatic regions inside the gene that share the min copy number", "minRegStartEnd", "Start/End base of the copy number region overlapping the gene with the minimum copy number", "minRegSupportStartEndMethod", "Start/end support of the CN region overlapping the gene with the min CN (plus determination method)" @@ -127,6 +123,312 @@ purple_cnv_som_gene_process <- function(x, g = NULL) { ) } +sash_read_cnv_tsv <- function(x) { + + nm <- c( + "chromosome" = "c", + "start" = "i", + "end" = "i", + "svtype" = "c", + "baf" = "d", + "bafCount" = "d", + "copyNumber" = "d", + "depthWindowCount" = "i", + "gcContent" = "d", + "majorAlleleCopyNumber" = "d", + "method" = "c", + "minorAlleleCopyNumber" = "d", + "segmentEndSupport" = "c", + "segmentStartSupport" = "c", + "sv_top_tier" = "i", + "simple_ann" = "c" + ) + + ctypes <- paste(nm, collapse = "") + d <- readr::read_tsv(x, col_types = ctypes) + assertthat::assert_that(ncol(d) == length(nm)) + assertthat::assert_that(all(colnames(d) == names(nm))) + d +} + +filter_and_split_annotations_cnv <- function(x) { + + filter_conditions <- list( + # Chromosome effects + stringr::str_starts(x$Detail, "chrom_"), + # Fusion effects + x$Effect %in% c("BidFusG", "FusG") + ) + + x.grouped <- x |> + dplyr::group_by( + filter=ifelse(purrr::reduce(filter_conditions, `|`), "filter", "retain") + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(filter) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + retained = purrr::pluck(x.split, "retain"), + filtered = purrr::pluck(x.split, "filter") |> dplyr::arrange(Tier, `Event ID`) + ) +} + +collapse_effect_group <- function(x) { + x.tmp <- dplyr::first(x) + genes <- x.tmp |> + dplyr::pull(Genes.unique) |> + unlist() + + x.tmp |> + dplyr::mutate( + Genes = paste0(genes, collapse=", "), + Transcripts = "", + Detail = paste0(unique(x$Detail) |> sort(), collapse=", "), + Tier = min(x$Tier), + `Annotation ID` = NA, + ) +} + +set_many_genes_cnv <- function(x) { + # Count genes and set eligibility for collapsing + collapse_effects <- c("DelG", "Dup", "DelTx", "UpstreamGV", "DnstreamGV", "IntergenReg") + x.counts <- x |> + dplyr::group_by(`Event ID`, Effect) |> + dplyr::mutate( + Genes.unique = Genes |> stringr::str_split(", ") |> unlist() |> unique() |> list(), + `Gene count (effect group)` = Genes.unique |> dplyr::first() |> length(), + collapse = `Gene count (effect group)` > 2 & Effect %in% collapse_effects, + ) |> + dplyr::select(-`Gene count (effect group)`) + + # Collapse target groups + x.collapsed <- x.counts |> + dplyr::filter(collapse) |> + dplyr::select(-collapse) |> + dplyr::group_modify(~ collapse_effect_group(.x)) |> + # Update top tier, referring to complete data set within each group + dplyr::mutate( + `Tier (top)` = paste0(Tier, " (", min(x$Tier[x$`Event ID` == `Event ID`]), ")"), + ) |> + # Create new annotation ID + dplyr::ungroup() |> + dplyr::mutate( + `Annotation ID` = paste0(dplyr::row_number(), ".merged"), + ) + + # Bind collapsed rows with uncollapsed rows, then count genes per entry + x.tmp <- x.counts |> + dplyr::filter(!collapse) |> + dplyr::bind_rows(x.collapsed) |> + dplyr::select(-c(collapse, Genes.unique, Tier)) |> + dplyr::rowwise() |> + dplyr::mutate( + `Gene count` = Genes |> stringr::str_split(", ") |> unlist() |> unique() |> length(), + ) |> + dplyr::ungroup() |> + # Sort rows + dplyr::arrange(`Tier (top)`, `Event ID`) + + # Set many genes + x.tmp <- x.tmp |> + dplyr::mutate( + many_genes = ifelse(`Gene count` > 2, "many_genes", "few_genes"), + ) + + # Build the many genes table + x.many_genes <- x.tmp |> + dplyr::filter(many_genes == "many_genes") |> + # Remove unneeded columns and rename others + dplyr::select( + "Event ID", + "Annotation ID", + "Tier (top)", + "Start", + "End", + "SV Type", + "Effect", + "Gene count", + "Genes", + ) + + # Clear genes and transcripts where appropriate + x.ready <- x.tmp |> + dplyr::mutate( + Genes = ifelse( + many_genes == "few_genes" | is.na(many_genes), + Genes, + paste0("Many genes (", `Gene count`, ")") + ), + Transcripts = ifelse(many_genes == "few_genes" | is.na(many_genes), Transcripts, ""), + ) |> + dplyr::select(-c(many_genes, `Gene count`)) + + list( + ready = x.ready, + many_genes = x.many_genes + ) +} + +set_many_transcripts_cnv <- function(x) { + # Set many transcripts + x.tmp <- x |> + dplyr::rowwise() |> + dplyr::mutate( + `Transcript count` = stringr::str_split(Transcripts, ", ") |> unlist() |> unique() |> length() + ) |> + dplyr::ungroup() |> + dplyr::mutate( + many_transcripts = ifelse(`Transcript count` > 2, "many_transcripts", "few_transcripts"), + ) |> + # Sort rows + dplyr::arrange(`Tier (top)`, `Event ID`) + + # Build the many transcripts table + x.many_transcripts <- x.tmp |> + dplyr::filter(many_transcripts == "many_transcripts") |> + # Remove unneeded columns and rename others + dplyr::select( + "Event ID", + "Annotation ID", + "Tier (top)", + "Start", + "End", + "SV Type", + "Effect", + "Transcript count", + "Transcripts", + ) + + # Clear transcripts where appropriate + x.ready <- x.tmp |> + dplyr::mutate( + Transcripts = ifelse( + many_transcripts == "few_transcripts" | is.na(many_transcripts), + Transcripts, + paste0("Many transcripts (", `Transcript count`, ")") + ) + ) |> + dplyr::select(-c(many_transcripts, `Transcript count`)) + + list( + ready = x.ready, + many_transcripts = x.many_transcripts + ) +} + +#' @export +process_cnv_tsv <- function(x) { + # Read input + cnv.input <- sash_read_cnv_tsv(x) + + # Prepare input + cnv.ready <- cnv.input |> + dplyr::mutate( + chrom_simple = stringr::str_remove(chromosome, "chr"), + start = paste(chrom_simple, base::format(start, big.mark = ",", trim = TRUE), sep=":"), + end = paste(chrom_simple, base::format(end, big.mark = ",", trim = TRUE), sep=":"), + ) |> + dplyr::select(-c( + chromosome, + chrom_simple, + )) + + # Melt annotations + cnv.tmp <- cnv.ready |> + dplyr::mutate(`Event ID` = dplyr::row_number()) |> + # Split into individual annotations + dplyr::mutate(annotation = strsplit(simple_ann, ",")) |> + # Convert annotation fields into columns + tidyr::unnest(annotation) |> + tidyr::separate( + annotation, c("Event", "Effect", "Genes", "Transcripts", "Detail", "Tier"), + sep = "\\|", convert = FALSE + ) |> + # Create new columns and modify existing ones + dplyr::mutate( + copyNumber = as.numeric(copyNumber) |> round(2) %>% sprintf("%.2f", .), + minorAlleleCopyNumber = as.numeric(minorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), + majorAlleleCopyNumber = as.numeric(majorAlleleCopyNumber) |> round(2) %>% sprintf("%.2f", .), + "PURPLE CN Min+Maj" = paste0(minorAlleleCopyNumber, "+", majorAlleleCopyNumber), + "Genes" = stringr::str_replace_all(Genes, "&", ", "), + "Transcripts" = stringr::str_replace_all(Transcripts, "&", ", "), + ) |> + # Remove unused columns + dplyr::select(-c( + baf, + bafCount, + depthWindowCount, + Event, + gcContent, + majorAlleleCopyNumber, + method, + minorAlleleCopyNumber, + segmentEndSupport, + segmentStartSupport, + sv_top_tier, + simple_ann, + )) + + # Abbreviate effects + abbreviate_effectv <- Vectorize(abbreviate_effect) + cnv.tmp$Effect <- abbreviate_effectv(cnv.tmp$Effect) + + # Filter unwanted annotations + cnv.annotations.split <- filter_and_split_annotations_cnv(cnv.tmp) + + # Complete processing + cnv.tmp <- cnv.annotations.split$retained |> + # Reset sv_top_tier after removing annotations + dplyr::group_by(`Event ID`) |> + dplyr::mutate( + sv_top_tier = min(Tier), + "Tier (top)" = paste0(Tier, " (", sv_top_tier, ")"), + ) |> + dplyr::ungroup() |> + # Set unique annotation ID + dplyr::mutate(`Annotation ID` = as.character(dplyr::row_number())) |> + # Sort rows + dplyr::arrange(`Tier (top)`, `Event ID`) + + # Set column names + column_selector <- c( + "Event ID", + "Annotation ID", + "Tier (top)", + "Start" = "start", + "End" = "end", + "SV Type" = "svtype", + "Effect", + "Genes", + "Transcripts", + "Detail", + "PURPLE CN" = "copyNumber", + "PURPLE CN Min+Maj" + ) + cnv.tmp <- dplyr::select(cnv.tmp, tidyselect::all_of(c(column_selector, "Tier"))) + cnv.filtered <- dplyr::select(cnv.annotations.split$filtered, tidyselect::any_of(column_selector)) + + # Collapse selected annotations and set many genes + cnv.many_genes_data <- set_many_genes_cnv(cnv.tmp) + cnv.tmp <- cnv.many_genes_data$ready + + # Set many transcripts + cnv.many_transcripts_data <- set_many_transcripts_cnv(cnv.tmp) + + list( + annotations = cnv.many_transcripts_data$ready, + filtered = cnv.filtered, + many_genes = cnv.many_genes_data$many_genes, + many_transcripts = cnv.many_transcripts_data$many_transcripts + ) +} + #' Read PURPLE CNV Somatic File #' #' Reads the `purple.cnv.somatic.tsv` file, which contains the copy number @@ -162,8 +464,7 @@ purple_cnv_som_read <- function(x) { #' Process PURPLE CNV Somatic File for UMCCRISE #' -#' Processes the `purple.cnv.somatic.tsv` file. -#' and selects columns of interest. +#' Processes the `purple.cnv.somatic.tsv` file and selects columns of interest. #' #' @param x Path to `purple.cnv.somatic.tsv` file. #' @@ -230,80 +531,6 @@ purple_cnv_som_process <- function(x) { ) } -#' Read PURPLE CNV Germline File -#' -#' Reads the `purple.cnv.germline.tsv` file. -#' -#' @param x Path to `purple.cnv.germline.tsv` file. -#' -#' @return The input file as a tibble. -#' -#' @examples -#' x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -#' (p <- purple_cnv_germ_read(x)) -#' @testexamples -#' expect_equal(colnames(p)[ncol(p)], "majorAlleleCopyNumber") -#' -#' @export -purple_cnv_germ_read <- function(x) { - # as of PURPLE v2.39, germline and somatic files have same columns. - purple_cnv_germline <- purple_cnv_som_read(x) - purple_cnv_germline -} - -#' Process PURPLE CNV germline File for UMCCRISE -#' -#' Processes the `purple.cnv.germline.tsv` file. -#' and selects columns of interest. -#' -#' @param x Path to `purple.cnv.germline.tsv` file. -#' -#' @return List with two elements: -#' * `tab`: Tibble with more condensed columns. -#' * `descr`: Description of tibble columns. -#' -#' @examples -#' x <- system.file("extdata/purple/purple.cnv.germline.tsv", package = "gpgr") -#' (pp <- purple_cnv_germ_process(x)) -#' @testexamples -#' expect_equal(colnames(pp$tab)[ncol(pp$tab)], "GC (windowCount)") -#' -#' @export -purple_cnv_germ_process <- function(x) { - # as of PURPLE v2.39, germline and somatic files have same columns. - processed_purple_cnv_germline <- purple_cnv_som_process(x) - processed_purple_cnv_germline -} - -#' Read PURPLE version file -#' -#' Reads the `purple.version` file containing the PURPLE version and build date. -#' -#' @param x Path to the `purple.version` file. -#' -#' @return A list with the elements: -#' * `version`: The version of PURPLE used. -#' * `build_date`: The build date of the PURPLE version. -#' -#' @examples -#' x <- system.file("extdata/purple/purple.version", package = "gpgr") -#' (v <- purple_version_read(x)) -#' @testexamples -#' expect_equal(length(v), 2) -#' expect_equal(names(v), c("version", "build_date")) -#' expect_equal(v$version, "2.51") -#' -#' @export -purple_version_read <- function(x) { - tab <- readr::read_delim(x, delim = "=", col_names = c("key", "value"), col_types = "cc") - assertthat::assert_that(nrow(tab) == 2, all(tab$key == c("version", "build.date"))) - - list( - version = tab$value[tab$key == "version"], - build_date = tab$value[tab$key == "build.date"] - ) -} - #' Read PURPLE QC file #' #' Reads the `purple.qc` file. @@ -328,7 +555,8 @@ purple_qc_read <- function(x) { nm <- c( "QCStatus", "Method", "CopyNumberSegments", "UnsupportedCopyNumberSegments", "Purity", "AmberGender", - "CobaltGender", "DeletedGenes", "Contamination", "GermlineAberrations" + "CobaltGender", "DeletedGenes", "Contamination", "GermlineAberrations", + "AmberMeanDepth", "LohPercent" ) assertthat::assert_that(all(purple_qc$key == nm)) @@ -350,6 +578,8 @@ purple_qc_read <- function(x) { "Rate of contamination in tumor sample as determined by AMBER.", 16, "GermlineAberrations", glue::glue('{q["GermlineAberrations"]}'), "Can be one or more of: KLINEFELTER, TRISOMY_X/21/13/18/15, XYY, MOSAIC_X.", + 18, "AmberMeanDepth", glue::glue('{q["AmberMeanDepth"]}'), + "Mean depth as determined by AMBER.", ) list( @@ -392,7 +622,6 @@ purple_purity_read <- function(x) { "maxPloidy", "d", "minDiploidProportion", "d", "maxDiploidProportion", "d", - "version", "c", "somaticPenalty", "d", "wholeGenomeDuplication", "c", "msIndelsPerMb", "d", @@ -401,7 +630,9 @@ purple_purity_read <- function(x) { "tmlStatus", "c", "tmbPerMb", "d", "tmbStatus", "c", - "svTumorMutationalBurden", "d" + "svTumorMutationalBurden", "d", + "runMode", "c", + "targeted", "c" ) ctypes <- paste(tab$type, collapse = "") @@ -475,7 +706,7 @@ purple_snv_vcf_read <- function(x) { dplyr::select("ID", "Description") info_cols <- c( - "AF", "PURPLE_AF", "PURPLE_CN", + "PURPLE_AF", "PURPLE_CN", "PURPLE_GERMLINE", "PURPLE_MACN", "PURPLE_VCN", "HMF_HOTSPOT", "KT", "MH", "SUBCL", "TNC" ) @@ -506,7 +737,7 @@ purple_snv_vcf_read <- function(x) { purple_kataegis <- function(x) { d <- purple_snv_vcf_read(x) info_cols <- c( - "KT", "AF", "PURPLE_AF", "PURPLE_CN", + "KT", "PURPLE_AF", "PURPLE_CN", "PURPLE_MACN", "PURPLE_VCN", "SUBCL", "MH", "TNC" ) diff --git a/R/rmd.R b/R/rmd.R index 8e21b99..25a454c 100644 --- a/R/rmd.R +++ b/R/rmd.R @@ -12,17 +12,18 @@ #' @param conda_list Path to `conda_pkg_list.txt` file. #' @param img_dir Path to directory containing PURPLE plots. #' @param key_genes Path to UMCCR cancer gene file. -#' @param oncoviral_breakpoints_tsv Path to `oncoviruses/oncoviral_breakpoints.tsv`. -#' @param oncoviral_present_viruses Path to `oncoviruses/present_viruses.txt`. -#' @param purple_germ_cnv Path to `purple.cnv.germline.tsv`. +#' @param oncokb_genes Path to OncoKB database file. +#' @param virusbreakend_tsv Path to VIRUSBreakend summary file. +#' @param virusbreakend_vcf Path to VIRUSBreakend VCF file. #' @param purple_purity Path to `purple.purity.tsv`. #' @param purple_qc Path to `purple.qc`. #' @param purple_som_cnv Path to `purple.cnv.somatic.tsv`. +#' @param purple_som_cnv_ann Path to annotated and prioritised `purple.cnv.somatic.tsv`. #' @param purple_som_gene_cnv Path to `purple.cnv.gene.tsv`. #' @param purple_som_snv_vcf Path to `purple.somatic.vcf.gz`. #' @param result_outdir Path to directory to write tidy JSON/TSV results. -#' @param somatic_snv_summary Path to `somatic_snv_summary.json` JSON. #' @param somatic_snv_vcf Path to `somatic-PASS.vcf.gz` SNV VCF. +#' @param somatic_snv_summary Path to `somatic_snv_summary.json` JSON. #' @param somatic_sv_tsv Path to `manta.tsv` TSV file. #' @param somatic_sv_vcf Path to `manta.vcf.gz` VCF file. #' @param tumor_name Name of tumor sample. @@ -31,13 +32,11 @@ #' #' @return Path to rendered HTML report. #' @export -cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, key_genes, - oncoviral_breakpoints_tsv, oncoviral_present_viruses, - purple_germ_cnv, purple_purity, purple_qc, purple_som_cnv, - purple_som_gene_cnv, purple_som_snv_vcf, somatic_snv_summary, - somatic_snv_vcf, somatic_sv_tsv, somatic_sv_vcf, - result_outdir, tumor_name, - out_file = NULL, quiet = FALSE) { +cancer_rmd <- function(af_global, af_keygenes, batch_name, bcftools_stats, conda_list, dragen_hrd, img_dir, key_genes, + oncokb_genes, virusbreakend_tsv, virusbreakend_vcf, purple_purity, purple_qc, + purple_som_cnv_ann, purple_som_cnv, purple_som_gene_cnv, purple_som_snv_vcf, + somatic_snv_vcf, somatic_snv_summary, somatic_sv_tsv, somatic_sv_vcf, + result_outdir, tumor_name, out_file = NULL, quiet = FALSE) { assertthat::assert_that( dir.exists(img_dir), quiet %in% c(FALSE, TRUE) @@ -64,21 +63,24 @@ cancer_rmd <- function(af_global, af_keygenes, batch_name, conda_list, img_dir, af_global = af_global, af_keygenes = af_keygenes, batch_name = batch_name, + bcftools_stats = bcftools_stats, conda_list = conda_list, + dragen_hrd = dragen_hrd, img_dir = img_dir_b, key_genes = key_genes, - somatic_snv_summary = somatic_snv_summary, + oncokb_genes = oncokb_genes, somatic_snv_vcf = somatic_snv_vcf, + somatic_snv_summary = somatic_snv_summary, somatic_sv_tsv = somatic_sv_tsv, somatic_sv_vcf = somatic_sv_vcf, purple_som_gene_cnv = purple_som_gene_cnv, + purple_som_cnv_ann = purple_som_cnv_ann, purple_som_cnv = purple_som_cnv, - purple_germ_cnv = purple_germ_cnv, purple_purity = purple_purity, purple_qc = purple_qc, purple_som_snv_vcf = purple_som_snv_vcf, - oncoviral_present_viruses = oncoviral_present_viruses, - oncoviral_breakpoints_tsv = oncoviral_breakpoints_tsv, + virusbreakend_tsv = virusbreakend_tsv, + virusbreakend_vcf = virusbreakend_vcf, result_outdir = result_outdir, tumor_name = tumor_name ) diff --git a/R/sv.R b/R/sv.R index 53acd85..69df9ec 100644 --- a/R/sv.R +++ b/R/sv.R @@ -73,6 +73,7 @@ split_double_col <- function(d, nms) { #' expect_equal(m, 2) #' expect_error(gpgr:::count_pieces("foo", NA)) #' +#' @export count_pieces <- function(x, sep) { ifelse(nchar(x) == 0, 0, stringr::str_count(x, sep) + 1) } @@ -107,6 +108,7 @@ count_pieces <- function(x, sep) { #' expect_equal(e3, "TFBSDel, TFBSVar") #' expect_equal(e4, "badaboom, bar, foo, StopGain") #' +#' @export abbreviate_effect <- function(effects) { effect_abbrev_nms <- names(gpgr::EFFECT_ABBREVIATIONS) @@ -123,238 +125,328 @@ abbreviate_effect <- function(effects) { paste(collapse = ", ") } -#' Read SV TSV -#' -#' Reads the Manta TSV file output by umccrise. -#' -#' @param x Path to `manta.tsv` output by umccrise. -#' -#' @return A tibble corresponding to the input TSV file. -#' -#' @examples -#' x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -#' (sv <- umccrise_read_sv_tsv(x)$data) -#' @testexamples -#' expect_equal(colnames(sv)[ncol(sv)], "ALT") -#' -#' @export -umccrise_read_sv_tsv <- function(x) { +sash_read_sv_tsv <- function(x) { - # tsv column names, description and types tab <- dplyr::tribble( ~Column, ~Description, ~Type, - "caller", "Manta SV caller", "c", - "sample", "Tumor sample name", "c", "chrom", "CHROM column in VCF", "c", "start", "POS column in VCF", "i", - "end", "INFO/END: End position of the variant described in this record", "i", "svtype", "INFO/SVTYPE: Type of structural variant", "c", - "split_read_support", "FORMAT/SR of tumor sample: Split reads for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c", - "paired_support_PE", "FORMAT/PE of tumor sample: ??", "c", - "paired_support_PR", "FORMAT/PR of tumor sample: Spanning paired-read support for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c", - "AF_BPI", "INFO/BPI_AF: AF at each breakpoint (so AF_BPI1,AF_BPI2)", "c", - "somaticscore", "INFO/SOMATICSCORE: Somatic variant quality score", "i", + "SR_alt", "Split reads (or equivalent) of tumor sample", "i", + "PR_alt", "Paired reads (or equivalent) of tumor sample", "i", + "SR_asm_alt", "Split reads (assembly) of tumor sample", "i", + "PR_asm_alt", "Pair reads (assembly) of tumor sample", "i", + "IC_alt", "INDEL-containing reads of tumor sample", "i", + "SR_ref", "FORMAT/REF of tumor sample", "i", + "PR_ref", "FORMAT/REFPAIR of tumor sample", "i", + "QUAL", "QUAL column in VCF", "f", "tier", "INFO/SV_TOP_TIER (or 4 if missing): Highest priority tier for the effects of a variant entry", "c", "annotation", "INFO/SIMPLE_ANN: Simplified structural variant annotation: 'SVTYPE | EFFECT | GENE(s) | TRANSCRIPT | PRIORITY (1-4)'", "c", "AF_PURPLE", "INFO/PURPLE_AF: AF at each breakend (purity adjusted) (so AF_PURPLE1,AF_PURPLE2)", "c", "CN_PURPLE", "INFO/PURPLE_CN: CN at each breakend (purity adjusted) (so CN_PURPLE1,CN_PURPLE2)", "c", "CN_change_PURPLE", "INFO/PURPLE_CN_CHANGE: change in CN at each breakend (purity adjusted) (so CN_change_PURPLE1,CN_change_PURPLE2)", "c", - "Ploidy_PURPLE", "INFO/PURPLE_PLOIDY: Ploidy of variant (purity adjusted)", "d", "PURPLE_status", "INFERRED if FILTER=INFERRED, or RECOVERED if has INFO/RECOVERED, else blank. INFERRED: Breakend inferred from copy number transition", "c", - "START_BPI", "INFO/BPI_START: BPI adjusted breakend location", "i", - "END_BPI", "INFO/BPI_END: BPI adjusted breakend location", "i", "ID", "ID column in VCF", "c", "MATEID", "INFO/MATEID: ID of mate breakend", "c", "ALT", "ALT column in VCF", "c" ) ctypes <- paste(tab$Type, collapse = "") - somatic_sv_tsv <- readr::read_tsv(x, col_names = TRUE, col_types = ctypes) - assertthat::assert_that(ncol(somatic_sv_tsv) == nrow(tab)) - assertthat::assert_that(all(colnames(somatic_sv_tsv) == tab$Column)) + d <- readr::read_tsv(x, col_names = TRUE, col_types = ctypes) + assertthat::assert_that(ncol(d) == nrow(tab)) + assertthat::assert_that(all(colnames(d) == tab$Column)) list( - data = somatic_sv_tsv, + data = d, descr = tab ) } -#' Process Structural Variants -#' -#' Processes the Manta TSV file output by umccrise. -#' -#' @param x Path to `manta.tsv` output by umccrise. -#' @return A list with melted/unmelted tibbles (these are NULL if TSV file was empty). -#' -#' @examples -#' x <- system.file("extdata/umccrise/sv/manta.tsv", package = "gpgr") -#' (sv <- process_sv(x)) -#' @testexamples -#' expect_true(inherits(sv, "list")) -#' expect_equal(length(sv), 4) -#' expect_equal(names(sv), c("unmelted", "melted", "tsv_descr", "col_descr")) -#' -#' @export -process_sv <- function(x) { - sv <- umccrise_read_sv_tsv(x) - tsv_descr <- sv$descr - sv <- sv$data - - if (nrow(sv) == 0) { - return(list( - unmelted = NULL, - melted = NULL - )) - } - col_descr <- dplyr::tribble( - ~Column, ~Description, - "nrow", "Row number that connects variants between tables in same tab set.", - "vcfnum", "Original event row number that connects variants to events.", - "TierTop", "Top priority of the event (from simple_sv_annotation: 1 highest, 4 lowest).", - "Tier", "Priority of the specific event (from simple_sv_annotation: 1 highest, 4 lowest).", - "Tier (Top)", "Priority of the specific event (Top tier of all event's annotations): 1 highest, 4 lowest. ", - "Chr", "Chromosome.", - "Start", "Start position as inferred by BPI. For PURPLE-inferred SVs we use POS.", - "End", "End position. For BNDs = Chr:Start of the mate.Values are inferred by BPI (PURPLE-inferred SVs do not have an End).", - "ID", "ID of BND from Manta. For PURPLE-inferred SVs this is PURPLE.", - "MATEID", "ID of BND mate from Manta.", - "BND_ID", "ID of BND pair simplified. BNDs with the same BND_ID belong to the same translocation event.", - "BND_mate", "A or B depending on whether the first or second mate of the BND pair.", - "Genes", "Genes involved in the event. DEL/DUP/INS events involving more than 2 genes are shown within separate table.", - "Transcript", "Transcripts involved in the event. DEL/DUP/INS events involving more than 2 transcripts are shown within separate table.", - "Effect", "SV effect (based on http://snpeff.sourceforge.net/SnpEff_manual.html#input).", - "Detail", "Prioritisation detail (from simple_sv_annotation).", - "Ploidy", "Ploidy of variant from PURPLE (purity adjusted).", - "AF_PURPLE", "PURPLE AF at each breakend preceded by their average.", - "AF_BPI", "BPI AF at each breakend preceded by their average.", - "CN", "Copy Number at each breakend preceded by their average.", - "CNC", "Copy Number Change at each breakend preceded by their average.", - "SR_alt", "Number of Split Reads supporting the alt allele, where P(allele|read)>0.999.", - "PR_alt", "Number of Paired Reads supporting the alt allele, where P(allele|read)>0.999.", - "SR_PR_ref", "Number of Split Reads and Paired Reads supporting the ref allele, where P(allele|read)>0.999.", - "SR_PR_sum", "Sum of SR_alt and PR_alt.", - "Type", "Type of structural variant.", - "SScore", "Somatic variant quality score.", - "ntrx", "Number of transcripts for given event.", - "ngen", "Number of genes for given event.", - "nann", "Number of annotations for given event." +split_svs <- function(x) { + bps_types <- c("BND", "DEL", "DUP"," INS", "INV") + + x.grouped <- x |> + dplyr::group_by( + record_type=ifelse(Type %in% bps_types, "bps", "other") + ) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(record_type) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + bps = purrr::pluck(x.split, "bps"), + other = purrr::pluck(x.split, "other") ) +} - cols_to_split <- c("AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE") - double_cols <- split_double_col(sv, cols_to_split) - unmelted <- sv |> - dplyr::select(-dplyr::all_of(c(cols_to_split, "caller", "sample"))) |> - dplyr::bind_cols(double_cols) |> - tidyr::separate(.data$split_read_support, c("SR_ref", "SR_alt"), ",", convert = TRUE) |> - tidyr::separate(.data$paired_support_PR, c("PR_ref", "PR_alt"), ",", convert = TRUE) - unmelted <- unmelted |> +join_breakpoint_entries <- function(x) { + # Group by GRIDSS identifier (clipping trailing h/o [h: High, o: lOwer]) + bps <- x |> + tidyr::separate(ID, into = c("BND_group", "BND_mate"), sep = -1, convert = TRUE, remove = FALSE)|> + dplyr::group_by(BND_group) + + # Set a sequential breakpoint identifier + bps_groups <- bps |> dplyr::n_groups() + bps |> dplyr::mutate( - SR_PR_ref = paste0(.data$SR_ref, ",", .data$PR_ref), - Ploidy = round(as.double(.data$Ploidy_PURPLE), 2), - chrom = sub("chr", "", .data$chrom), - svtype = ifelse(is.na(.data$PURPLE_status), .data$svtype, "PURPLE_inf"), - # when BPI cannot be run, just use start - Start = ifelse(is.na(.data$PURPLE_status), - ifelse(is.na(.data$START_BPI), .data$start, .data$START_BPI), - .data$start - ), - nann = count_pieces(.data$annotation, ","), - vcfnum = dplyr::row_number(), - vcfnum = sprintf(glue::glue("%0{nchar(nrow(unmelted))}d"), .data$vcfnum) + # Assign a unique ID based on current group + BND_ID = sprintf(paste0("%0", nchar(bps_groups), "d"), dplyr::cur_group_id()), + BND_mate = ifelse(BND_mate == "o", "A", "B"), ) |> - dplyr::rowwise() |> - dplyr::mutate(SR_PR_sum = sum(.data$SR_alt, .data$PR_alt, na.rm = TRUE)) |> - dplyr::ungroup() - - # BND IDs - # Two BND mates share the same ID up to the last digit (0 or 1) - unmelted_bnd1 <- unmelted |> - dplyr::filter(.data$svtype == "BND") |> - tidyr::separate(.data$ID, into = c("BND_group", "BND_mate"), sep = -1, convert = TRUE, remove = FALSE) |> - dplyr::group_by(.data$BND_group) - unmelted_bnd1 <- unmelted_bnd1 |> + dplyr::ungroup() |> dplyr::mutate( - # index per group 1, 2, 3.. - BND_ID = dplyr::cur_group_id(), - # turns into 001, 002, 003... if you've got 100+ rows - BND_ID = sprintf(glue::glue("%0{nchar(nrow(unmelted_bnd1))}d"), .data$BND_ID), - BND_mate = ifelse(.data$BND_mate == 0, "A", "B") + end_position = sub("^.*:(\\d+).*$", "\\1", ALT) |> as.numeric() |> base::format(big.mark = ",", trim = TRUE), + end_chrom = sub("^.*chr(.*):.*$", "\\1", ALT), + end = paste0(end_chrom, ":", end_position), ) |> - dplyr::ungroup() + dplyr::select(-c(end_position, end_chrom)) +} - # Grab each BND mate's chrom - # Orphan mates have that info in the ALT field - match_id2mateid <- match(unmelted_bnd1$ID, unmelted_bnd1$MATEID) - unmelted_bnd2 <- unmelted_bnd1[match_id2mateid, c("chrom")] |> - dplyr::rename(BND_mate_chrom = .data$chrom) +remove_gene_fusion_dups <- function(.data, columns) { + # Order elements of multi-entry effect values for reliable comparison + v.groups <- c("frameshift_variant&gene_fusion", "gene_fusion") + v.effects_ordered <- sapply(.data$Effect, function(s) { + c <- stringr::str_split(s, "&") |> unlist() + paste0(sort(c), collapse="&") + }) - unmelted_bnd <- dplyr::bind_cols(unmelted_bnd1, unmelted_bnd2) |> - dplyr::mutate( - BND_mate_chrom = ifelse( - is.na(.data$BND_mate_chrom), - sub(".*chr(.*):.*", "orphan_\\1", .data$ALT), - .data$BND_mate_chrom - ), - BND_mate_end = sub(".*chr(.*):(\\d+).*", "\\2", .data$ALT), - BND_mate_end = as.numeric(.data$BND_mate_end) - ) + if (all(v.groups %in% v.effects_ordered)) { + .data |> dplyr::filter(Effect != "gene_fusion") + } else { + .data + } +} - unmelted_other <- unmelted |> - dplyr::filter(.data$svtype != "BND") +filter_and_split_annotations_sv <- function(x) { - unmelted_all <- - dplyr::bind_rows( - unmelted_bnd, - unmelted_other + filter_conditions <- list( + # Empty Gene field + x$Genes == "", + # Only Ensembl identifiers in the Gene field + stringr::str_split(x$Genes, "[&-]|, ") |> purrr::map_lgl(\(x) stringr::str_starts(x, "ENSG") |> all()), + # Fusions that do not involve genes + x$Effect == "Fus", + # PURPLE inferred SVs + x$Type == "PURPLE_inf" + ) + + x.grouped <- x |> + dplyr::group_by( + filter=ifelse(purrr::reduce(filter_conditions, `|`), "filter", "retain") ) |> + dplyr::select(-c(Type, `Top Tier`)) + + keys <- x.grouped |> + dplyr::group_keys() |> + dplyr::pull(filter) + + x.split <- x.grouped |> + dplyr::group_split(.keep=FALSE) |> + purrr::set_names(keys) + + list( + retained = purrr::pluck(x.split, "retain"), + filtered = purrr::pluck(x.split, "filter") + ) +} + +set_many_transcripts_sv <- function(x) { + # Set many transcripts + x.tmp <- x |> + dplyr::rowwise() |> dplyr::mutate( - Start = base::format(.data$Start, big.mark = ",", trim = TRUE), - Start = paste0(.data$chrom, ":", .data$Start), - END_tmp = ifelse(is.na(.data$END_BPI), .data$end, .data$END_BPI), - END_tmp = ifelse(is.na(.data$END_tmp) & .data$svtype == "BND", .data$BND_mate_end, .data$END_tmp), - END_tmp = base::format(.data$END_tmp, big.mark = ",", trim = TRUE), - End = paste0( - ifelse(.data$svtype == "BND", .data$BND_mate_chrom, .data$chrom), - ":", - .data$END_tmp - ) + `Transcript count` = stringr::str_split(Transcripts, ", ") |> unlist() |> unique() |> length() ) |> + dplyr::ungroup() |> + dplyr::mutate( + many_transcripts = ifelse(`Transcript count` > 2, "many_transcripts", "few_transcripts") + ) + + # Build the many transcripts table + mt <- x.tmp |> + dplyr::filter(many_transcripts == "many_transcripts") |> + # Remove unneeded columns and rename others dplyr::select( - "vcfnum", "nann", - TierTop = "tier", - "Start", "End", - Type = "svtype", - "BND_ID", "BND_mate", - "SR_alt", "PR_alt", "SR_PR_sum", "SR_PR_ref", "Ploidy", - "AF_PURPLE", "AF_BPI", CNC = "CN_change_PURPLE", - CN = "CN_PURPLE", SScore = "somaticscore", "annotation" + "Record ID", + "Annotation ID", + "Tier (top)", + "Start", + "End", + "Transcript count", + "Transcripts", ) - abbreviate_effectv <- Vectorize(abbreviate_effect) + # Clear transcript where appropriate + x.ready <- x.tmp |> + dplyr::mutate( + Transcripts = ifelse( + many_transcripts == "few_transcripts" | is.na(many_transcripts), + Transcripts, + paste0("Many transcripts (", `Transcript count`, ")") + ) + ) |> + dplyr::select(-c(many_transcripts, `Transcript count`)) + + list( + many_transcripts = mt, + sv = x.ready + ) +} + +#' @export +process_sv <- function(x) { + # Read input and set column information + sv.input <- sash_read_sv_tsv(x) + + # Early return if no data to process + if (nrow(sv.input$data) == 0) { + return() + } + + # Prepare input + sv.ready <- sv.input$data |> + dplyr::mutate( + "annotation_count" = count_pieces(annotation, ","), + "Top Tier" = tier, + "SR_PR_ref" = paste0(SR_ref, ",", PR_ref), + "SR_PR_sum" = dplyr::case_when( + is.na(SR_alt) & is.na(PR_alt) ~ NA, + is.na(SR_alt) ~ PR_alt, + is.na(PR_alt) ~ SR_alt, + .default = SR_alt + PR_alt, + ), + "SR_PR_asm_sum" = dplyr::case_when( + is.na(SR_asm_alt) & is.na(PR_asm_alt) ~ NA, + is.na(SR_asm_alt) ~ PR_asm_alt, + is.na(PR_asm_alt) ~ SR_asm_alt, + .default = SR_asm_alt + PR_asm_alt, + ), + start = paste(chrom, base::format(start, big.mark = ",", trim = TRUE), sep=":"), + Type = ifelse(is.na(PURPLE_status), svtype, "PURPLE_inf"), + "Record ID" = dplyr::row_number(), + ) |> + dplyr::select(-c( + chrom, + PURPLE_status, + tier, + svtype, + )) + + # Split out breakpoints for merging + sv.split <- split_svs(sv.ready) + + # Complete breakpoint records with corresponding mate, then combine with non-breakend records + sv.bps <- join_breakpoint_entries(sv.split$bps) + sv.tmp <- dplyr::bind_rows(sv.bps, sv.split$other) + + # Format some columns + cols_to_split <- c("AF_PURPLE", "CN_PURPLE") + double_cols <- split_double_col(sv.tmp, cols_to_split) + sv.tmp <- sv.tmp |> + dplyr::select(-c(cols_to_split)) |> + dplyr::bind_cols(double_cols) + + # Format a table for to be used as the SV Map + sv.map <- sv.tmp |> + dplyr::select( + "Record ID", + "Annotations" = "annotation_count", + "Top Tier", + "Start" = "start", + "End" = "end", + "Type", + "Breakend ID" = "BND_ID", + "Breakend Mate" = "BND_mate", + "SR_alt", + "PR_alt", + "SR_PR_sum", + "SR_asm_alt", + "PR_asm_alt", + "SR_PR_asm_sum", + "IC_alt", + "SR_PR_ref", + "PURPLE AF" = "AF_PURPLE", + "PURPLE CN" = "CN_PURPLE", + ) |> + dplyr::arrange(`Record ID`) - melted <- unmelted_all |> - dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |> - tidyr::unnest(.data$annotation) |> + # Melt annotations + sv.melted_all <- sv.tmp |> + # Split into individual annotations + dplyr::mutate(annotation = strsplit(annotation, ",")) |> + # Convert annotation fields into columns + tidyr::unnest(annotation) |> tidyr::separate( - .data$annotation, c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), + annotation, c("Event", "Effect", "Genes", "Transcripts", "Detail", "Tier"), sep = "\\|", convert = FALSE ) |> + # Remove gene_fusion annotations for variants where frameshift_variant&gene_fusion already exist + dplyr::group_by(across(-Effect)) |> + dplyr::group_modify(remove_gene_fusion_dups) |> + dplyr::ungroup() |> + # Remove unused columns + dplyr::select(c(-Event, -ALT)) |> + # Create columns, modify others dplyr::mutate( - ntrx = count_pieces(.data$Transcript, "&"), - ngen = count_pieces(.data$Genes, "&"), - neff = count_pieces(.data$Effect, "&"), - Transcript = .data$Transcript |> stringr::str_replace_all("&", ", "), - Genes = .data$Genes |> stringr::str_replace_all("&", ", "), - Effect = abbreviate_effectv(.data$Effect), - `Tier (Top)` = glue::glue("{Tier} ({TierTop})") + "Annotation ID" = dplyr::row_number(), + "Tier (top)" = paste0(Tier, " (", `Top Tier`, ")"), + "Genes" = stringr::str_replace_all(Genes, "&", ", "), + "Transcripts" = stringr::str_replace_all(Transcripts, "&", ", "), ) |> - dplyr::distinct() |> - dplyr::arrange(.data$`Tier (Top)`, .data$Genes, .data$Effect) + # Sort rows + dplyr::arrange(`Tier (top)`, `Record ID`, Genes, Effect) + + # Abbreviate effects + abbreviate_effectv <- Vectorize(abbreviate_effect) + sv.melted_all$Effect <- abbreviate_effectv(sv.melted_all$Effect) + + # Select and rename columns + sv.melted_all <- sv.melted_all |> + dplyr::select( + "Record ID", + "Annotation ID", + "Tier (top)", + "Start" = "start", + "End" = "end", + "Breakend ID" = "BND_ID", + "Breakend Mate" = "BND_mate", + "Effect", + "Genes", + "Transcripts", + "Effect", + "Detail", + "SR_alt", + "PR_alt", + "SR_PR_sum", + "SR_asm_alt", + "PR_asm_alt", + "SR_PR_asm_sum", + "IC_alt", + "PURPLE AF" = "AF_PURPLE", + "PURPLE CN" = "CN_PURPLE", + # Dropped after ops for non-map outputs + "Top Tier", + "Type", + ) + + # Create and set many transcript values + sv.annotations.many_transcript_data <- set_many_transcripts_sv(sv.melted_all) + sv.annotations.many_transcripts <- purrr::pluck(sv.annotations.many_transcript_data, "many_transcripts") + sv.annotations <- purrr::pluck(sv.annotations.many_transcript_data, "sv") + + # Filter unwanted annotations + sv.annotations.split <- filter_and_split_annotations_sv(sv.annotations) list( - unmelted = unmelted_all, - melted = melted, - tsv_descr = tsv_descr, - col_descr = col_descr + map = sv.map, + map_melted = sv.melted_all, + annotations = sv.annotations.split$retained, + annotations_filtered = sv.annotations.split$filtered, + many_transcripts = sv.annotations.many_transcripts ) } @@ -380,7 +472,7 @@ plot_bnd_sr_pr_tot_lines <- function(d, assertthat::assert_that(all(c("Type", "SR_alt", "PR_alt") %in% colnames(d))) dplot <- d |> dplyr::filter(.data$Type == "BND") |> - dplyr::select(SR = "SR_alt", PR = "PR_alt", Tier = "TierTop", "BND_ID") |> + dplyr::select(SR = "SR_alt", PR = "PR_alt", Tier = "Top Tier", "Breakend ID") |> dplyr::distinct() |> dplyr::mutate( PR = ifelse(is.na(.data$PR), 0, .data$PR), @@ -443,7 +535,7 @@ plot_bnd_sr_pr_tot_hist <- function(d, assertthat::assert_that(all(c("Type", "SR_alt", "PR_alt") %in% colnames(d))) dplot <- d |> dplyr::filter(.data$Type == "BND") |> - dplyr::select(SR = "SR_alt", PR = "PR_alt", "BND_ID") |> + dplyr::select(SR = "SR_alt", PR = "PR_alt", "Breakend ID") |> dplyr::distinct() |> dplyr::mutate( PR = ifelse(is.na(.data$PR), 0, .data$PR), diff --git a/R/umccrise.R b/R/umccrise.R index aa95137..1c81b27 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -1,9 +1,80 @@ +#' Parse bcftools stats File +#' +#' Parses bcftools stats `bcftools_stats.txt` file. +#' +#' @param x Path to bcftools stats `bcftools_stats.txt` file. +#' @return A ggplot2 object. +#' @export +bcftools_stats_plot <- function(x = NULL) { + if (is.null(x)) { + return(NULL) + } + cnames <- c("QUAL_dummy", "id", "qual", "snps", "transi", "transv", "indels") + ln <- readr::read_lines(x) + d1 <- ln[grepl("QUAL", ln)] + # line1 ignore, line2 is colnames, just clean those up + d <- d1[3:length(d1)] |> + I() |> + readr::read_tsv( + col_names = cnames, col_types = readr::cols(.default = "d", "QUAL_dummy" = "c") + ) |> + dplyr::select(-"QUAL_dummy") + if (nrow(d) == 0) { + return(NULL) + } + d <- d |> + dplyr::select("qual", "snps", "indels") |> + tidyr::uncount(.data$snps + .data$indels) |> + dplyr::select("qual") + med <- median(d$qual, na.rm = TRUE) + tot <- nrow(d) + p <- d |> + ggplot2::ggplot(ggplot2::aes(x = .data$qual)) + + ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), binwidth = 4, fill = "lightblue") + + ggplot2::geom_density(alpha = 0.6) + + ggplot2::geom_vline(xintercept = med, colour = "blue", linetype = "dashed") + + ggplot2::scale_x_continuous(n.breaks = 10) + + ggplot2::annotate( + geom = "label", x = med + 1, y = +Inf, vjust = 2, + label = paste0("Median: ", med), + ) + + ggplot2::theme_bw() + + ggplot2::ggtitle(glue::glue("SNV quality score distribution (total SNVs: {tot})")) + p +} + + +#' Parse DRAGEN HRD File +#' +#' Parses DRAGEN `hrdscore.csv` file. +#' +#' @param x Path to DRAGEN `hrdscore.csv` file. +#' +#' @return Tibble with a single row and the following score columns: +#' - HRD (homologous recombination deficiency) +#' - LOH (loss of heterozygosity) +#' - TAI (telomeric allelic imbalance) +#' - LST (large-scale state transitions) +#' +#' If no input is provided, the fields are NA. +#' @export +dragen_hrd <- function(x = NULL) { + res <- tibble::tibble(Sample = NA, LOH = NA, TAI = NA, LST = NA, HRD = NA) + if (!is.null(x)) { + res <- x |> + readr::read_csv(col_types = readr::cols(.default = "c")) + names(res) <- sub("_Score", "", names(res)) + } + res +} + #' HRDetect and CHORD Summary Table #' #' Return a summary table with HRDetect and CHORD results. #' #' @param chord_res Result from running [sigrap::chord_run()]. #' @param hrdetect_res Result from running [sigrap::hrdetect_run()]. +#' @param dragen_res Result from running [gpgr::dragen_hrd()]. #' #' @return A list with a tibble and a gt_tbl object (see [gt::gt()]). #' @@ -20,11 +91,12 @@ #' vcf.snv = snv, sample.name = nm, #' df.sv = gpgr:::chord_mantavcf2df(sv) #' ) -#' hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res) +#' dragen_res <- gpgr::dragen_hrd("path/to/sample.hrdscore.csv") +#' hrd_results_tabs(hrdetect_res = hrdetect_res, chord_res = chord_res, dragen_res = dragen_res) #' } #' #' @export -hrd_results_tabs <- function(hrdetect_res, chord_res) { +hrd_results_tabs <- function(hrdetect_res, chord_res, dragen_res) { sn <- chord_res$prediction[, "sample", drop = T] assertthat::are_equal(hrdetect_res[, "sample", drop = T], sn) @@ -42,10 +114,17 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { ) |> dplyr::filter(col != "sample") + dragen_res_tab <- + dragen_res |> + dplyr::select("HRD", "LOH", "TAI", "LST") |> + unlist() |> + tibble::enframe(name = "col", value = "val") + colnames(dragen_res_tab) <- c("DRAGEN", "results_dragen") colnames(hrdetect_res_tab) <- c("HRDetect", "results_hrdetect") colnames(chord_res_tab) <- c("CHORD", "results_chord") + # idea is to make 9-row tibbles and cbind them tab1 <- dplyr::bind_rows( hrdetect_res_tab, @@ -54,7 +133,7 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { " ", " " ) ) - tab2 <- + tab2a <- dplyr::bind_rows( chord_res_tab[1:7, ], tibble::tribble( @@ -63,10 +142,14 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { " ", " " ) ) - tab3 <- chord_res_tab[8:16, ] |> + tab2b <- chord_res_tab[8:16, ] |> purrr::set_names(c("CHORD2", "results_chord2")) + tab3 <- dplyr::bind_rows( + dragen_res_tab, + tibble::tibble(DRAGEN = rep(" ", 5), results_dragen = rep(" ", 5)) + ) - hrd_results_tab <- dplyr::bind_cols(tab1, tab2, tab3) + hrd_results_tab <- dplyr::bind_cols(tab3, tab1, tab2a, tab2b) hrd_results_gt <- hrd_results_tab |> @@ -74,6 +157,11 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { gt::tab_header( title = glue::glue("HRD Results for {sn}") ) |> + gt::tab_spanner( + label = "DRAGEN", + id = "id_dragen", + columns = c("DRAGEN", "results_dragen") + ) |> gt::tab_spanner( label = "HRDetect", id = "id_hrdetect", @@ -85,6 +173,8 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { columns = c("CHORD", "results_chord", "CHORD2", "results_chord2") ) |> gt::cols_label( + DRAGEN = "", + results_dragen = "", HRDetect = "", results_hrdetect = "", CHORD = "", @@ -97,12 +187,12 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { gt::cell_text(weight = "bold") ), locations = gt::cells_body( - columns = c("HRDetect", "CHORD", "CHORD2") + columns = c("DRAGEN", "HRDetect", "CHORD", "CHORD2") ) ) |> gt::cols_align( align = "right", - columns = c("results_hrdetect") + columns = c("results_hrdetect", "results_dragen") ) |> gt::tab_style( style = gt::cell_borders( @@ -112,7 +202,7 @@ hrd_results_tabs <- function(hrdetect_res, chord_res) { style = "solid" ), locations = gt::cells_body( - columns = c("CHORD"), + columns = c("CHORD", "HRDetect"), rows = dplyr::everything() ) ) |> diff --git a/R/utils.R b/R/utils.R index e0b3852..d57901e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,13 +1,3 @@ -#' BPI run on VCF -#' -#' @param x Path to VCF file. -#' @export -is_vcf_bpi <- function(x) { - assertthat::assert_that(is_vcf(x)) - d <- bedr::read.vcf(x, nrows = 0, verbose = FALSE) # nrows 0 parses just the vcf header - "bpiVersion" %in% names(d[["header"]]) -} - #' Does the input file look like a VCF? #' #' Quickly checks that the input file has a 'vcf' or 'vcf.gz' suffix, and that @@ -44,9 +34,11 @@ is_vcf <- function(x) { n_max = 1 ) + # COMMIT NOTE: FORMAT FIELD IS NOT MANDITORY + vcf_cols <- c( "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", - "FILTER", "INFO", "FORMAT" + "FILTER", "INFO" ) d_cols <- colnames(d) diff --git a/README.md b/README.md index 9931b82..e70f982 100644 --- a/README.md +++ b/README.md @@ -120,13 +120,12 @@ export PATH="${gpgr_cli}:${PATH}" SOMATIC_SNV_SUMMARY --somatic_snv_vcf SOMATIC_SNV_VCF --somatic_sv_tsv SOMATIC_SV_TSV --somatic_sv_vcf SOMATIC_SV_VCF --purple_som_gene_cnv PURPLE_SOM_GENE_CNV - --purple_som_cnv PURPLE_SOM_CNV --purple_germ_cnv - PURPLE_GERM_CNV --purple_purity PURPLE_PURITY --purple_qc - PURPLE_QC --purple_som_snv_vcf PURPLE_SOM_SNV_VCF - --oncoviral_present_viruses ONCOVIRAL_PRESENT_VIRUSES - --oncoviral_breakpoints_tsv ONCOVIRAL_BREAKPOINTS_TSV - [--out_file OUT_FILE] [--quiet] --result_outdir - RESULT_OUTDIR --tumor_name TUMOR_NAME + --purple_som_cnv PURPLE_SOM_CNV --purple_purity + PURPLE_PURITY --purple_qc PURPLE_QC --purple_som_snv_vcf + PURPLE_SOM_SNV_VCF --oncoviral_present_viruses + ONCOVIRAL_PRESENT_VIRUSES --oncoviral_breakpoints_tsv + ONCOVIRAL_BREAKPOINTS_TSV [--out_file OUT_FILE] [--quiet] + --result_outdir RESULT_OUTDIR --tumor_name TUMOR_NAME options: -h, --help show this help message and exit @@ -153,8 +152,6 @@ export PATH="${gpgr_cli}:${PATH}" Path to `purple.cnv.gene.tsv`. --purple_som_cnv PURPLE_SOM_CNV Path to `purple.cnv.somatic.tsv`. - --purple_germ_cnv PURPLE_GERM_CNV - Path to `purple.cnv.germline.tsv`. --purple_purity PURPLE_PURITY Path to `purple.purity.tsv`. --purple_qc PURPLE_QC diff --git a/inst/cli/canrep.R b/inst/cli/canrep.R index 520598c..b50a351 100644 --- a/inst/cli/canrep.R +++ b/inst/cli/canrep.R @@ -3,21 +3,24 @@ canrep_add_args <- function(subp) { canrep$add_argument("--af_global", help = "Path to `af_tumor.txt` file.", required = TRUE) canrep$add_argument("--af_keygenes", help = "Path to `af_tumor_keygenes.txt` file.", required = TRUE) canrep$add_argument("--batch_name", help = "Name of batch sample.", required = TRUE) - canrep$add_argument("--conda_list", help = "Path to `conda_pkg_list.txt` file.", required = TRUE) + canrep$add_argument("--conda_list", help = "Path to `conda_pkg_list.txt` file.") canrep$add_argument("--img_dir", help = "Path to directory containing PURPLE plots.", required = TRUE) canrep$add_argument("--key_genes", help = "Path to UMCCR cancer gene file.", required = TRUE) - canrep$add_argument("--somatic_snv_summary", help = "Path to `somatic_snv_summary.json`.", required = TRUE) + canrep$add_argument("--oncokb_genes", help = "Path to OncoKB database file.", required = TRUE) canrep$add_argument("--somatic_snv_vcf", help = "Path to `somatic-PASS.vcf.gz` SNV VCF.", required = TRUE) + canrep$add_argument("--somatic_snv_summary", help = "Path to `somatic_snv_summary.json` JSON.", required = TRUE) canrep$add_argument("--somatic_sv_tsv", help = "Path to `manta.tsv` TSV file.", required = TRUE) canrep$add_argument("--somatic_sv_vcf", help = "Path to `manta.vcf.gz` VCF file.", required = TRUE) canrep$add_argument("--purple_som_gene_cnv", help = "Path to `purple.cnv.gene.tsv`.", required = TRUE) + canrep$add_argument("--purple_som_cnv_ann", help = "Path to annotated and prioritised `purple.cnv.somatic.tsv`.", required = TRUE) canrep$add_argument("--purple_som_cnv", help = "Path to `purple.cnv.somatic.tsv`.", required = TRUE) - canrep$add_argument("--purple_germ_cnv", help = "Path to `purple.cnv.germline.tsv`.", required = TRUE) canrep$add_argument("--purple_purity", help = "Path to `purple.purity.tsv`.", required = TRUE) canrep$add_argument("--purple_qc", help = "Path to `purple.qc`.", required = TRUE) canrep$add_argument("--purple_som_snv_vcf", help = "Path to `purple.somatic.vcf.gz`.", required = TRUE) - canrep$add_argument("--oncoviral_present_viruses", help = "Path to `oncoviruses/present_viruses.txt`.", required = TRUE) - canrep$add_argument("--oncoviral_breakpoints_tsv", help = "Path to `oncoviruses/oncoviral_breakpoints.tsv`.", required = TRUE) + canrep$add_argument("--virusbreakend_tsv", help = "Path to VIRUSBreakend summary file.", required = TRUE) + canrep$add_argument("--virusbreakend_vcf", help = "Path to VIRUSBreakend VCF file.", required = TRUE) + canrep$add_argument("--dragen_hrd", help = "Path to DRAGEN HRD file", required = TRUE) + canrep$add_argument("--bcftools_stats", help = "Path to bcftools stats file", required = TRUE) canrep$add_argument("--out_file", help = "Path to output HTML file (needs '.html' suffix) [def: {tumor_name}_cancer_report.html].") canrep$add_argument("--quiet", help = "Suppress log printing during rendering.", action = "store_true") canrep$add_argument("--result_outdir", help = "Path to directory to write tidy JSON/TSV results.", required = TRUE) @@ -33,18 +36,21 @@ canrep_parse_args <- function(args) { conda_list = args$conda_list, img_dir = args$img_dir, key_genes = args$key_genes, - somatic_snv_summary = args$somatic_snv_summary, + oncokb_genes = args$oncokb_genes, somatic_snv_vcf = args$somatic_snv_vcf, + somatic_snv_summary = args$somatic_snv_summary, somatic_sv_tsv = args$somatic_sv_tsv, somatic_sv_vcf = args$somatic_sv_vcf, purple_som_gene_cnv = args$purple_som_gene_cnv, + purple_som_cnv_ann = args$purple_som_cnv_ann, purple_som_cnv = args$purple_som_cnv, - purple_germ_cnv = args$purple_germ_cnv, purple_purity = args$purple_purity, purple_qc = args$purple_qc, purple_som_snv_vcf = args$purple_som_snv_vcf, - oncoviral_present_viruses = args$oncoviral_present_viruses, - oncoviral_breakpoints_tsv = args$oncoviral_breakpoints_tsv, + virusbreakend_tsv = args$virusbreakend_tsv, + virusbreakend_vcf = args$virusbreakend_vcf, + dragen_hrd = args$dragen_hrd, + bcftools_stats = args$bcftools_stats, out_file = args$out_file, quiet = args$quiet, result_outdir = args$result_outdir, diff --git a/inst/rmd/umccrise/_navbar.html b/inst/rmd/umccrise/_navbar.html index 9c3a4ed..b87adb0 100644 --- a/inst/rmd/umccrise/_navbar.html +++ b/inst/rmd/umccrise/_navbar.html @@ -14,6 +14,7 @@
diff --git a/inst/rmd/umccrise/cancer_report.Rmd b/inst/rmd/umccrise/cancer_report.Rmd index 45d15a2..7bc1fb0 100644 --- a/inst/rmd/umccrise/cancer_report.Rmd +++ b/inst/rmd/umccrise/cancer_report.Rmd @@ -13,20 +13,23 @@ params: af_global: NULL af_keygenes: NULL batch_name: NULL + bcftools_stats: NULL conda_list: NULL + dragen_hrd: NULL img_dir: NULL key_genes: NULL - oncoviral_breakpoints_tsv: NULL - oncoviral_present_viruses: NULL - purple_germ_cnv: NULL + oncokb_genes: NULL + virusbreakend_tsv: NULL + virusbreakend_vcf: NULL purple_purity: NULL purple_qc: NULL + purple_som_cnv_ann: NULL purple_som_cnv: NULL purple_som_gene_cnv: NULL purple_som_snv_vcf: NULL result_outdir: NULL - somatic_snv_summary: NULL somatic_snv_vcf: NULL + somatic_snv_summary: NULL somatic_sv_tsv: NULL somatic_sv_vcf: NULL title: "UMCCR Cancer Report" @@ -110,14 +113,13 @@ sv_detail_table <- function(tab) { } ``` - ```{r hrd_prep, results='hide'} #---- Homologous Recombination Deficiency ----# -chord_sv_df <- sigrap::chord_mantavcf2df(params$somatic_sv_vcf) +dragen_hrd_res <- gpgr::dragen_hrd(params$dragen_hrd) chord_res <- sigrap::chord_run( vcf.snv = params$somatic_snv_vcf, - df.sv = chord_sv_df, - sv.caller = "manta", + vcf.sv = params$somatic_sv_vcf, + sv.caller = "gridss", sample.name = params$tumor_name, ref.genome = "hg38", verbose = TRUE @@ -183,35 +185,54 @@ gpgr::write_tsvjsongz(sigs_indel, glue("sigs/{bnm}-indel"), result_outdir) ``` ```{r sv_prep} +#---- Misc ----# +oncokb_genes <- gpgr::read_oncokb(params$oncokb_genes) + #---- Structural Variants ----# sv_path <- params$somatic_sv_tsv sv <- NULL -sv_unmelted <- NULL -sv_all <- NULL -no_sv_found <- TRUE -sv_tsv_descr <- NULL -sv_col_descr <- NULL -if (sv_path == "NA") { - sv_unmelted <- tibble(WARNING = "Structural variants were not called") - sv_all <- tibble(WARNING = "Structural variants were not called") -} else { - no_sv_found <- gpgr::tsv_is_empty(sv_path) - if (no_sv_found) { - sv_unmelted <- tibble(WARNING = "THERE WERE 0 SVs PRIORITISED!") - sv_all <- tibble(WARNING = "THERE WERE 0 SVs PRIORITISED!") - sv_tsv_descr <- tibble(Column = "THERE WERE 0 SVs PRIORITISED!") - sv_col_descr <- tibble(Column = "THERE WERE 0 SVs PRIORITISED!") - } else { - sv <- gpgr::process_sv(sv_path) - sv_unmelted <- sv$unmelted - sv_all <- sv$melted - sv_tsv_descr <- sv$tsv_descr - sv_col_descr <- sv$col_descr - } +no_sv_found <- gpgr::tsv_is_empty(sv_path) +if (!no_sv_found) { + sv <- gpgr::process_sv(sv_path) + + # Set OncoKB genes + sv$annotations <- sv$annotations |> + dplyr::mutate( + "Genes (OncoKB)" = gpgr::get_oncokb_genes(Genes, oncokb_genes), + .after = "Genes", + ) } -gpgr::write_tsvjsongz(sv_unmelted, glue("sv/{bnm}-01_sv_unmelted"), result_outdir) -gpgr::write_tsvjsongz(sv_all, glue("sv/{bnm}-02_sv_melted"), result_outdir) +#---- Copy Number Variants ----# +cnv <- gpgr::process_cnv_tsv(params$purple_som_cnv_ann) + +# Set OncoKB genes +cnv$annotations <- cnv$annotations |> + dplyr::mutate( + "Genes (OncoKB)" = get_oncokb_genes(Genes, oncokb_genes), + .after = "Genes", + ) + +# Set OncoKB genes for Many Genes table and transfer to the main table where appropriate +cnv$many_genes <- cnv$many_genes |> + dplyr::mutate( + `Genes (OncoKB)` = get_oncokb_genes(Genes, oncokb_genes), + .before = "Genes", + ) |> + # Create transfer column, set entries with more than n genes as 'Many genes (m)' + dplyr::rowwise() |> + dplyr::mutate( + gene.oncokb.count = strsplit(`Genes (OncoKB)`, ", ") |> unlist() |> unique() |> length(), + gene.oncokb.tx = ifelse(gene.oncokb.count > 2, paste0("Many genes (", gene.oncokb.count, ")"), `Genes (OncoKB)`), + ) + +# Transfer selected OncoKB gene entries +v.selector <- match(cnv$many_genes$`Annotation ID`, cnv$annotations$`Annotation ID`) +cnv$annotations$`Genes (OncoKB)`[v.selector] <- cnv$many_genes$gene.oncokb.tx + +# Clean up columns +cnv$many_genes <- cnv$many_genes |> + dplyr::select(-c(gene.oncokb.count, gene.oncokb.tx)) ``` ```{r purple_prep} @@ -225,15 +246,13 @@ purple_qc_summary <- dplyr::bind_rows( #---- PURPLE CNV Tables ----# purple_cnv_som <- gpgr::purple_cnv_som_process(params$purple_som_cnv) purple_cnv_som_gene <- gpgr::purple_cnv_som_gene_process(params$purple_som_gene_cnv, params$key_genes) -purple_cnv_germ <- gpgr::purple_cnv_germ_process(params$purple_germ_cnv) gpgr::write_tsvjsongz(purple_cnv_som$tab, glue("purple/{bnm}-purple_cnv_som"), result_outdir) gpgr::write_tsvjsongz(purple_cnv_som_gene$tab, glue("purple/{bnm}-purple_cnv_som_gene"), result_outdir) -gpgr::write_tsvjsongz(purple_cnv_germ$tab, glue("purple/{bnm}-purple_cnv_germ"), result_outdir) ``` ```{r, results='asis'} -blank_lines(2) +blank_lines(1) ``` ## __Summary__ @@ -241,23 +260,6 @@ blank_lines(2)