Skip to content

Commit

Permalink
Patch/issue 124 (#128)
Browse files Browse the repository at this point in the history
* Add verbosity setting

* Rename

* Add small fixes and regenerate docs

* Bump version
  • Loading branch information
anngvu authored Sep 12, 2023
1 parent 47f5542 commit 5d341a0
Show file tree
Hide file tree
Showing 16 changed files with 103 additions and 44 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
Version: 0.0.0.9400
Version: 0.0.0.9410
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
4 changes: 3 additions & 1 deletion R/annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
#' @param manifest A table manifest. Needs to contain `entityId`.
#' @param ignore_na Whether to ignore annotations that are `NA`; default TRUE.
#' @param ignore_blank Whether to ignore annotations that are empty strings; default TRUE.
#' @param verbose Be chatty, default FALSE.
#' @export
annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE) {
annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE, verbose = FALSE) {
# Split by `entityId`
annotations <- as.data.table(manifest)
if("Filename" %in% names(annotations)) annotations[, Filename := NULL]
Expand All @@ -22,6 +23,7 @@ annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TR
for(entity in names(annotations)) {
.syn$setAnnotations(entity = entity, annotations = as.list(annotations[[entity]]))
}
if (verbose) message("Annotations submitted")
}


Expand Down
27 changes: 20 additions & 7 deletions R/find.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,31 @@ find_data_root <- function(project_id) {
#'
#' @param syn_out Id of top-level folder that corresponds to `publishDir` in a nextflow workflow.
#' @param asset Name of asset to find.
#' @param workflow Specify workflow, "rna-seq" or "sarek"; defaults to "rna-seq"
#' @returns Id of the requested asset file.
#' @export
find_nf_asset <- function(syn_out,
asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats")) {
asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats"),
workflow = "rna-seq") {

asset <- match.arg(asset)
path <- switch(asset,
software_versions = "pipeline_info/software_versions.yml",
multiqc_report = "multiqc/star_salmon/multiqc_report.html",
samplesheet = "pipeline_info/samplesheet.valid.csv",
samtools_stats = "multiqc/star_salmon/multiqc_data/multiqc_samtools_stats.txt"
)
# Asset paths can differ slightly by workflow (except for `software_versions.yml`), so branch on workflow first
if(workflow == "rna-seq") {
path <- switch(asset,
software_versions = "pipeline_info/software_versions.yml",
multiqc_report = "multiqc/star_salmon/multiqc_report.html",
samplesheet = "pipeline_info/samplesheet.valid.csv",
samtools_stats = "multiqc/star_salmon/multiqc_data/multiqc_samtools_stats.txt"
)
} else if(workflow == "sarek") {
path <- switch(asset,
software_versions = "pipeline_info/software_versions.yml",
multiqc_report = "multiqc/multiqc_report.html",
# samplesheet not yet stored for sarek
samtools_stats = "multiqc/multiqc_data/multiqc_samtools_stats.txt")
} else {
stop("Unrecognized workflow.")
}

id <- find_in(syn_out, path)
if(is.null(id)) stop("File not found. Is this the right output directory/path?")
Expand Down
45 changes: 29 additions & 16 deletions R/nextflow_utils.R → R/nextflow_annotation_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,23 @@ map_sample_output_rnaseq <- function(syn_out) {

#' Map sample to output from nf-sarek
#'
#' See https://nf-co.re/sarek. Processed outputs are nested by sample and variant callers, i.e.
#' `*VariantCalling/<TUMOR_vs_NORMAL>/<CALLER>`. This walks through the output destination (URI of `*VariantCalling`)
#' See https://nf-co.re/sarek. Most processed outputs are nested by sample and variant callers, i.e.
#' `*VariantCalling/<TUMOR_vs_NORMAL>/<CALLER>`. Other times the data is organized as
#' `*VariantCalling/<CALLER>/<TUMOR_vs_NORMAL>`.
#' This walks through the output destination (URI of `*VariantCalling`)
#' with similar intention to \code{\link{map_sample_output_rnaseq}}, but for Sarek outputs.
#'
#' Note: An additional step post-Sarek will create MAFs in the output subdirectory DeepVariant.
#' If this is run _after_ the MAF creation step, this will return file indexes with those .maf files.
#'
#' @param syn_out Syn id of syn output destination with files of interest.
#' @param sample_level If callers are organized by sample, use 2 (default); if samples are organized by caller, use 3. See details.
#' @import data.table
#' @return A `data.table` with cols `caller` `caller_path` `caller_syn` `output_name` `output_id` `sample` `workflow`
#' @export
map_sample_output_sarek <- function(syn_out) {
map_sample_output_sarek <- function(syn_out, sample_level = 2) {

workflow_level <- if(sample_level == 2) 3 else 2

# `walk` can be very slow
ls <- walk(syn_out)
Expand All @@ -93,9 +98,9 @@ map_sample_output_sarek <- function(syn_out) {
})
)
paths <- strsplit(outputs$caller_path, "/", fixed = TRUE)
outputs[, sample := sapply(paths, `[[`, 2)]
outputs[, sample := sapply(paths, `[[`, sample_level)]
outputs[, sample := strsplit(sample, "_vs_")]
outputs[, workflow := sapply(paths, `[[`, 3)]
outputs[, workflow := sapply(paths, `[[`, workflow_level)]
return(outputs)
}

Expand All @@ -106,20 +111,22 @@ map_sample_output_sarek <- function(syn_out) {
#'
#' @inheritParams map_sample_input_ss
#' @inheritParams map_sample_output_rnaseq
#' @inheritParams map_sample_output_sarek
#' @param workflow Workflow.
#' @return A table with `sample` `level` `output_id` `output_name` `input_id`.
#' @export
map_sample_io <- function(workflow = c("nf-rnaseq", "nf-sarek"),
samplesheet,
syn_out) {
syn_out,
sample_level = 2) {

workflow <- match.arg(workflow)
sample_inputs <- map_sample_input_ss(samplesheet)

if(workflow == "nf-rnaseq") {
sample_outputs <- map_sample_output_rnaseq(syn_out)
} else if(workflow == "nf-sarek") {
sample_outputs <- map_sample_output_sarek(syn_out)
sample_outputs <- map_sample_output_sarek(syn_out, sample_level = sample_level)
# sample can contain 2 samples (tumor vs normal from same indiv) -> take first
sample_outputs[, sample := sapply(sample, first)]
}
Expand Down Expand Up @@ -187,9 +194,15 @@ annotate_with_tool_stats <- function(samtools_stats_file = NULL,

#' Derive annotations for processed output data
#'
#' Build annotations through inheritance from inputs. If multiple inputs, inherit props from the FIRST input.
#' Files that pass through this naturally have `dataSubtype` set to "processed" and `fileFormat` set
#' to the actual new file format. In the future, `template` itself may define `format` so we don't need to specify explicitly.
#' A processed or derived file can inherit annotations from the input file(s).
#' Currently, this generously facilitates inheritance of many properties except ones that
#' "obviously" shouldn't be inherited, such as "fileFormat" or "comments".
#' These rules are hard-coded and might need to be expanded as the data model changes,
#' and the manifest generated should still be reviewed.
#'
#' If multiple inputs are given, this will inherit annotations from the FIRST input.
#' Files that pass through this naturally have `dataSubtype` automatically set to "processed" and `fileFormat` set
#' to the actual new file format. In the future, `template` itself may define default `format` so we don't need to specify explicitly.
#'
#' @param format File format of the processed data, e.g. "vcf".
#' @keywords internal
Expand All @@ -206,7 +219,7 @@ derive_annotations <- function(sample_io,
if(verbose) message(glue::glue("Creating annotations for ", n, " {format} files"))

props <- get_dependency_from_json_schema(id = template, schema = schema)
props <- props[!props %in% c("comments", "entityId", "fileFormat", "dataType", "dataSubtype", "progressReportNumber")]
props <- props[!props %in% c("comments", "entityId", "fileFormat", "dataType", "dataSubtype", "progressReportNumber", "Component")]

from <- sapply(x$input_id, `[`, 1)
to <- x$output_id
Expand Down Expand Up @@ -311,10 +324,12 @@ annotate_expression <- function(sample_io,
#'
#' @inheritParams annotate_aligned_reads
#' @param sample_io Table mapping input to outputs, which reference output `.vcf.gz` or `maf` files.
#' @param workflow_ref Character vector with names for workflow and values as version-specific links.
#' @param format Variant format, "auto" to handle any "vcf" or "maf" files present automatically, or specify one explicitly. See details.
#' @param data_type Variant type, use "auto" to infer from naming scheme and current NF processing SOP, or specify more explicitly.
#' @export
annotate_called_variants <- function(sample_io,
workflow_ref,
format = c("auto", "vcf", "maf"),
template = "bts:ProcessedVariantCallsTemplate",
schema = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
Expand Down Expand Up @@ -350,13 +365,10 @@ annotate_called_variants <- function(sample_io,
} else {
data_type_assign <- function(name, format) { data_type }
}
annotations[, data_type := data_type_assign(Filename, fileFormat), by = entityId]
annotations[, dataType := data_type_assign(Filename, fileFormat), by = entityId]

annotations[fileFormat == "vcf" & workflow == "Strelka", workflowLink := "https://nf-co.re/sarek/2.7.1/output#strelka2"]
annotations[fileFormat == "vcf" & workflow == "Strelka", workflow := "Strelka2"]
annotations[fileFormat == "vcf" & workflow == "Mutect2", workflowLink := "https://nf-co.re/sarek/2.7.1/output#gatk-mutect2"]
annotations[fileFormat == "vcf" & workflow == "FreeBayes", workflowLink := "https://nf-co.re/sarek/2.7.1/output#freebayes"]
annotations[fileFormat == "vcf" & workflow == "DeepVariant", workflowLink := "https://github.com/google/deepvariant/tree/r1.1"]
annotations[fileFormat == "vcf", workflowLink := workflow_ref[workflow]]

annotations[fileFormat == "maf", workflow := "nf-vcf2maf"]
annotations[fileFormat == "maf", workflowLink := "https://github.com/Sage-Bionetworks-Workflows/nf-vcf2maf/tree/1.0.1"]
Expand All @@ -365,6 +377,7 @@ annotate_called_variants <- function(sample_io,
annotate_with_manifest(annotations)
if(verbose) message("Applied annotations.")
}

return(annotations)
}

Expand Down
2 changes: 1 addition & 1 deletion man/annotate_aligned_reads.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/annotate_called_variants.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/annotate_cnv.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/annotate_expression.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion man/annotate_with_manifest.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/annotate_with_tool_stats.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 11 additions & 4 deletions man/derive_annotations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/find_nf_asset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/map_sample_input_ss.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 9 additions & 2 deletions man/map_sample_io.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/map_sample_output_rnaseq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 8 additions & 4 deletions man/map_sample_output_sarek.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5d341a0

Please sign in to comment.