Patch/issue 124 (#128)

* Add verbosity setting * Rename * Add small fixes and regenerate docs * Bump version
nf-osi · Sep 12, 2023 · 5d341a0 · 5d341a0
1 parent 47f5542
commit 5d341a0
Show file tree

Hide file tree

Showing 16 changed files with 103 additions and 44 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nfportalutils
 Title: NF Portal Utilities
-Version: 0.0.0.9400
+Version: 0.0.0.9410
 Authors@R: c(
     person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
            email = "[email protected]",

diff --git a/R/annotations.R b/R/annotations.R
@@ -9,8 +9,9 @@
 #' @param manifest A table manifest. Needs to contain `entityId`.
 #' @param ignore_na Whether to ignore annotations that are `NA`; default TRUE.
 #' @param ignore_blank Whether to ignore annotations that are empty strings; default TRUE.
+#' @param verbose Be chatty, default FALSE.
 #' @export   
-annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE) {
+annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE, verbose = FALSE) {
   # Split by `entityId`
   annotations <- as.data.table(manifest)
   if("Filename" %in% names(annotations)) annotations[, Filename := NULL]
@@ -22,6 +23,7 @@ annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TR
   for(entity in names(annotations)) {
     .syn$setAnnotations(entity = entity, annotations = as.list(annotations[[entity]]))
   }
+  if (verbose) message("Annotations submitted")
 }
 
 

diff --git a/R/find.R b/R/find.R
@@ -68,18 +68,31 @@ find_data_root <- function(project_id) {
 #' 
 #' @param syn_out Id of top-level folder that corresponds to `publishDir` in a nextflow workflow.
 #' @param asset Name of asset to find.
+#' @param workflow Specify workflow, "rna-seq" or "sarek"; defaults to "rna-seq"
 #' @returns Id of samplesheet. 
 #' @export
 find_nf_asset <- function(syn_out, 
-                          asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats")) {
+                          asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats"),
+                          workflow = "rna-seq") {
 
   asset <- match.arg(asset)
-  path <- switch(asset,
-                 software_versions = "pipeline_info/software_versions.yml",
-                 multiqc_report = "multiqc/star_salmon/multiqc_report.html",
-                 samplesheet = "pipeline_info/samplesheet.valid.csv",
-                 samtools_stats = "multiqc/star_salmon/multiqc_data/multiqc_samtools_stats.txt"
-  )
+  # Asset paths differ slightly by workflow (except for `software_versions.yml`), so branch on workflow first
+  if(workflow == "rna-seq") {
+    path <- switch(asset,
+                   software_versions = "pipeline_info/software_versions.yml",
+                   multiqc_report = "multiqc/star_salmon/multiqc_report.html",
+                   samplesheet = "pipeline_info/samplesheet.valid.csv",
+                   samtools_stats = "multiqc/star_salmon/multiqc_data/multiqc_samtools_stats.txt"
+    )
+  } else if(workflow == "sarek") {
+    path <- switch(asset,
+                   software_versions = "pipeline_info/software_versions.yml",
+                   multiqc_report = "multiqc/multiqc_report.html",
+                   # samplesheet not yet stored for sarek
+                   samtools_stats = "multiqc/multiqc_data/multiqc_samtools_stats.txt")
+  } else {
+    stop("Unrecognized workflow.")
+  }
 
   id <- find_in(syn_out, path)
   if(is.null(id)) stop("File not found. Is this the right output directory/path?")

diff --git a/R/nextflow_utils.R → R/nextflow_annotation_utils.R b/R/nextflow_utils.R → R/nextflow_annotation_utils.R
@@ -65,18 +65,23 @@ map_sample_output_rnaseq <- function(syn_out) {
 
 #' Map sample to output from nf-sarek
 #' 
-#' See https://nf-co.re/sarek. Processed outputs are nested by sample and variant callers, i.e. 
-#' `*VariantCalling/<TUMOR_vs_NORMAL>/<CALLER>`. This walks through the output destination (URI of `*VariantCalling`)
+#' See https://nf-co.re/sarek. Most processed outputs are nested by sample and variant callers, i.e. 
+#' `*VariantCalling/<TUMOR_vs_NORMAL>/<CALLER>`. Other times the data is organized as
+#'  `*VariantCalling/<CALLER>/<TUMOR_vs_NORMAL>`.
+#' This walks through the output destination (URI of `*VariantCalling`)
 #' with similar intention to \code{\link{map_sample_output_rnaseq}}, but for Sarek outputs.
 #' 
 #' Note: An additional step post-Sarek will create MAFs in the output subdirectory DeepVariant. 
 #' If this is run _after_ the MAF creation step, this will return file indexes with those .maf files.
 #' 
 #' @param syn_out Syn id of syn output destination with files of interest. 
+#' @param sample_level Path level that holds the sample name: use 2 (default) if callers are nested under samples, or 3 if samples are nested under callers. See details. 
 #' @import data.table
 #' @return A `data.table` with cols `caller` `caller_path` `caller_syn` `output_name` `output_id` `sample` `workflow`
 #' @export
-map_sample_output_sarek <- function(syn_out) {
+map_sample_output_sarek <- function(syn_out, sample_level = 2) {
+
+  workflow_level <- if(sample_level == 2) 3 else 2
 
   # `walk` can be very slow
   ls <- walk(syn_out)
@@ -93,9 +98,9 @@ map_sample_output_sarek <- function(syn_out) {
     })
   )
   paths <- strsplit(outputs$caller_path, "/", fixed = TRUE)
-  outputs[, sample := sapply(paths, `[[`, 2)]
+  outputs[, sample := sapply(paths, `[[`, sample_level)]
   outputs[, sample := strsplit(sample, "_vs_")]
-  outputs[, workflow := sapply(paths, `[[`, 3)]
+  outputs[, workflow := sapply(paths, `[[`, workflow_level)]
   return(outputs)
 }
 
@@ -106,20 +111,22 @@ map_sample_output_sarek <- function(syn_out) {
 #' 
 #' @inheritParams map_sample_input_ss
 #' @inheritParams map_sample_output_rnaseq
+#' @inheritParams map_sample_output_sarek
 #' @param workflow Workflow. 
 #' @return A table with `sample` `level` `output_id` `output_name` `input_id`.
 #' @export
 map_sample_io <- function(workflow = c("nf-rnaseq", "nf-sarek"),
                           samplesheet,
-                          syn_out) {
+                          syn_out,
+                          sample_level = 2) {
 
   workflow <- match.arg(workflow)
   sample_inputs <- map_sample_input_ss(samplesheet)
 
   if(workflow == "nf-rnaseq") {
     sample_outputs <- map_sample_output_rnaseq(syn_out)
   } else if(workflow == "nf-sarek") {
-    sample_outputs <- map_sample_output_sarek(syn_out)
+    sample_outputs <- map_sample_output_sarek(syn_out, sample_level = sample_level)
     # sample can contain 2 samples (tumor vs normal from same indiv) -> take first
     sample_outputs[, sample := sapply(sample, first)]
   }
@@ -187,9 +194,15 @@ annotate_with_tool_stats <- function(samtools_stats_file = NULL,
 
 #' Derive annotations for processed output data
 #' 
-#' Build annotations through inheritance from inputs. If multiple inputs, inherit props from the FIRST input. 
-#' Files that pass through this naturally have `dataSubtype` set to "processed" and `fileFormat` set 
-#' to the actual new file format. In the future, `template` itself may define `format` so we don't need to specify explicitly.
+#' A processed or derived file can inherit annotations from the input file(s).
+#' Currently, this generously facilitates inheritance of many properties except ones that 
+#' "obviously" shouldn't be inherited, such as "fileFormat" or "comments". 
+#' These rules are hard-coded and might need to be expanded as the data model changes, 
+#' and the manifest generated should still be reviewed.
+#' 
+#' If multiple inputs are given, this will inherit annotations from the FIRST input. 
+#' Files that pass through this naturally have `dataSubtype` automatically set to "processed" and `fileFormat` set 
+#' to the actual new file format. In the future, `template` itself may define default `format` so we don't need to specify explicitly.
 #' 
 #' @param format File format of the processed data, e.g. "vcf".
 #' @keywords internal
@@ -206,7 +219,7 @@ derive_annotations <- function(sample_io,
   if(verbose) message(glue::glue("Creating annotations for ", n, " {format} files"))
 
   props <- get_dependency_from_json_schema(id = template, schema = schema)
-  props <- props[!props %in% c("comments", "entityId", "fileFormat", "dataType", "dataSubtype", "progressReportNumber")]
+  props <- props[!props %in% c("comments", "entityId", "fileFormat", "dataType", "dataSubtype", "progressReportNumber", "Component")]
 
   from <- sapply(x$input_id, `[`, 1)
   to <- x$output_id
@@ -311,10 +324,12 @@ annotate_expression <- function(sample_io,
 #' 
 #' @inheritParams annotate_aligned_reads
 #' @param sample_io Table mapping input to outputs, which reference output `.vcf.gz` or `maf` files.
+#' @param workflow_ref Character vector with names for workflow and values as version-specific links.
 #' @param format Variant format, "auto" to handle any "vcf" or "maf" files present automatically, or specify one explicitly. See details.
 #' @param data_type Variant type, use "auto" to infer from naming scheme and current NF processing SOP, or specify more explicitly.
 #' @export
 annotate_called_variants <- function(sample_io,
+                                     workflow_ref,
                                      format = c("auto", "vcf", "maf"),
                                      template = "bts:ProcessedVariantCallsTemplate",
                                      schema = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
@@ -350,13 +365,10 @@ annotate_called_variants <- function(sample_io,
   } else {
     data_type_assign <- function(name, format) { data_type }
   }
-  annotations[, data_type := data_type_assign(Filename, fileFormat), by = entityId]
+  annotations[, dataType := data_type_assign(Filename, fileFormat), by = entityId]
 
-  annotations[fileFormat == "vcf" & workflow == "Strelka", workflowLink := "https://nf-co.re/sarek/2.7.1/output#strelka2"]
   annotations[fileFormat == "vcf" & workflow == "Strelka", workflow := "Strelka2"]
-  annotations[fileFormat == "vcf" & workflow == "Mutect2", workflowLink := "https://nf-co.re/sarek/2.7.1/output#gatk-mutect2"]
-  annotations[fileFormat == "vcf" & workflow == "FreeBayes", workflowLink := "https://nf-co.re/sarek/2.7.1/output#freebayes"]
-  annotations[fileFormat == "vcf" & workflow == "DeepVariant", workflowLink := "https://github.com/google/deepvariant/tree/r1.1"]
+  annotations[fileFormat == "vcf", workflowLink := workflow_ref[workflow]]
 
   annotations[fileFormat == "maf", workflow := "nf-vcf2maf"] 
   annotations[fileFormat == "maf", workflowLink := "https://github.com/Sage-Bionetworks-Workflows/nf-vcf2maf/tree/1.0.1"] 
@@ -365,6 +377,7 @@ annotate_called_variants <- function(sample_io,
     annotate_with_manifest(annotations)
     if(verbose) message("Applied annotations.")
   }
+
   return(annotations)
 }
 

diff --git a/man/annotate_aligned_reads.Rd b/man/annotate_aligned_reads.Rd
diff --git a/man/annotate_called_variants.Rd b/man/annotate_called_variants.Rd
diff --git a/man/annotate_cnv.Rd b/man/annotate_cnv.Rd
diff --git a/man/annotate_expression.Rd b/man/annotate_expression.Rd
diff --git a/man/annotate_with_manifest.Rd b/man/annotate_with_manifest.Rd
diff --git a/man/annotate_with_tool_stats.Rd b/man/annotate_with_tool_stats.Rd
diff --git a/man/derive_annotations.Rd b/man/derive_annotations.Rd
diff --git a/man/find_nf_asset.Rd b/man/find_nf_asset.Rd
diff --git a/man/map_sample_input_ss.Rd b/man/map_sample_input_ss.Rd
diff --git a/man/map_sample_io.Rd b/man/map_sample_io.Rd
diff --git a/man/map_sample_output_rnaseq.Rd b/man/map_sample_output_rnaseq.Rd
diff --git a/man/map_sample_output_sarek.Rd b/man/map_sample_output_sarek.Rd