Patch/nf anno utils #186

Open
wants to merge 16 commits into
base: develop
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
Version: 0.9500
Version: 0.9600
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
@@ -14,7 +14,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Imports:
dplyr,
data.table,
12 changes: 6 additions & 6 deletions NAMESPACE
@@ -21,10 +21,10 @@ export(add_to_scope)
export(adjust_view)
export(annotate_aligned_reads)
export(annotate_called_variants)
export(annotate_cnv)
export(annotate_expression)
export(annotate_processed)
export(annotate_quantified_expression)
export(annotate_with_manifest)
export(annotate_with_tool_stats)
export(annotate_with_samtools_stats)
export(as_table_schema)
export(assign_study_data_types)
export(bad_url)
@@ -50,9 +50,11 @@ export(find_child_type)
export(find_data_root)
export(find_in)
export(find_nf_asset)
export(find_parent)
export(from_pubmed)
export(get_by_prop_from_json_schema)
export(get_dependency_from_json_schema)
export(get_path)
export(get_project_wiki)
export(get_valid_values_from_json_schema)
export(grant_specific_file_access)
@@ -75,11 +77,9 @@ export(meta_qc_project)
export(missing_annotation_email)
export(new_dataset)
export(new_project)
export(nf_cnv_dataset)
export(nf_sarek_datasets)
export(nf_star_salmon_datasets)
export(nf_workflow_version)
export(precheck_manifest)
export(processed_meta)
export(processing_flowchart)
export(register_study)
export(register_study_files)
191 changes: 34 additions & 157 deletions R/datasets.R
@@ -1,21 +1,21 @@
# -- Editing Collections -------------------------------------------------------#

# General helpers that should work for both datasets (collection of files)
# General helpers that should work for both datasets (collection of files)
# and dataset collections (collection of datasets).


#' Structure as collection items
#'
#'
#' Helper taking entity ids to create records used for dataset items *or* dataset collection items.
#' Collection items have the form `list(entityId = id, versionNumber = x)`.
#'
#' Note: For item version, dataset items allow two meanings of literal or absolute "latest"
#' Note: For item version, dataset items allow two meanings of literal or absolute "latest"
#' vs. "stable_latest", but with files either one can be used to mean the same thing
#' since there will be correct interpretation done under the hood.
#' See implementation in `latest_version`.
#'
#' @param ids Ids of entities to make into dataset items.
#' @param item_version Integer for version that will be used for all items, e.g. 1.
#' @param item_version Integer for version that will be used for all items, e.g. 1.
#' Otherwise, "latest" or "stable_latest". See details.
#' @keywords internal
as_coll_items <- function(ids, item_version = c("abs", "stable")) {
@@ -31,35 +31,35 @@ as_coll_items <- function(ids, item_version = c("abs", "stable")) {
}
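
For orientation, a hedged usage sketch (ids are hypothetical, the helper is internal, and the collapsed body may resolve "latest"/"stable_latest" versions via a logged-in `.syn` session):

# Hypothetical ids; an explicit integer version follows the @param note ("e.g. 1")
items <- nfportalutils:::as_coll_items(c("syn11111111", "syn22222222"), item_version = 1)
# Documented shape of each element:
# list(entityId = "syn11111111", versionNumber = 1)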


#' Apply updates to current collection of items
#'
#' This is essentially an internal transaction helper for trying to apply a changeset to a collection,
#' used in several higher-level collection utils.
#' INTERNAL - apply updates to a collection of items
#'
#' An internal transaction helper for trying to apply a changeset to a collection,
#' used in several higher-level collection utils.
#' Given the changeset that can represent updates of both types "replace" or "add",
#' this applies an update join keyed on `entityId` for the replace and
#' this applies an update join keyed on `entityId` for the replace and
#' appends the new items to get the updated collection.
#'
#' @param current_items List of lists representing a collection of items.
#' @param update_items Collection of items to apply as updates to `current_items`.
#'
#' @param current List of lists representing a collection of items.
#' @param update Collection of items to apply as updates to `current_items`.
#' @keywords internal
update_items <- function(current_coll, update_coll) {
current_coll <- data.table::rbindlist(current_coll)
update_coll <- data.table::rbindlist(update_coll)
update_items <- function(current, update) {

current_coll <- data.table::rbindlist(current)
update_coll <- data.table::rbindlist(update)
replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber]
added <- update_coll[!current_coll, on = .(entityId)]
updated <- rbind(replaced, added)
# reconversion; apply coerces rows to character, so versionNumber is cast back to integer explicitly
updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
updated
}
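
A minimal sketch of the update join with made-up ids (the helper is internal, hence `:::`): syn222 gets its version replaced, syn333 is appended, and syn111 is left untouched.

current <- list(list(entityId = "syn111", versionNumber = 1L),
                list(entityId = "syn222", versionNumber = 2L))
update  <- list(list(entityId = "syn222", versionNumber = 3L),
                list(entityId = "syn333", versionNumber = 1L))
nfportalutils:::update_items(current, update)
# Expected result: syn111 at v1, syn222 bumped to v3, syn333 added at v1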


#' Update item versions to "latest" in a collection
#'
#'
#' Update an _existing_ collection so that all items or a subset of items reference their latest version.
#' Should work for both datasets (collection of files) and dataset collections (collection of datasets).
#'
#'
#' @inheritParams latest_version
#' @param collection_id Collection id.
#' @param items Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all.
@@ -72,7 +72,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
if((length(items) == 1) && (items == "all")) {
coll$items <- as_coll_items(current_items, item_version = version_semantics)
} else {

# Check subset; if no check, this becomes `add_to_collection`
if(!all(items %in% current_items)) {
warning("Subset given includes items not actually in collection: ", items[!items %in% current_items])
@@ -86,7 +86,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
coll$items <- updated_items
}
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE))

}
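
A hedged usage sketch (collection and item ids are hypothetical; assumes the function is exported and `.syn` is a logged-in Synapse client):

# Point every item in the collection at its latest version
use_latest_in_collection("syn12345678", items = "all")
# Or only a subset of items
use_latest_in_collection("syn12345678", items = c("syn11111111", "syn22222222"))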


@@ -96,24 +96,24 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
#' For datasets, the items should be files. For dataset collections, the items should be datasets.
#' If an item attempting to be added happens to already be in the collection,
#' this might lead to version conflicts, so the update will be rejected unless `force` is true.
#'
#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
#'
#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
#' implement the dataset collection class and methods (but dataset and relevant methods like `add_item` are available).
#' Thus, while this is generic enough to handle both datasets and dataset collections
#' Thus, while this is generic enough to handle both datasets and dataset collections
#' it is expected to be used more for dataset collections given that the dataset method is provided.
#'
#'
#' @param collection_id Collection id.
#' @param items Character vector of one or more dataset entity ids to add.
#' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
#' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
#' @param force If some items are currently in the collection with a different version,
#' @param force If some items are currently in the collection with a different version,
#' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional.
#' @export
add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) {

coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
coll_type <- which_coll_type(coll)

if(check_items) {
item_type_check <- if(coll_type == "dataset") is_file else is_dataset
correct_item_type <- sapply(items, item_type_check)
@@ -126,7 +126,7 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force =
}
}
}

current_items <- sapply(coll$items, function(x) x$entityId)
if(any(items %in% current_items) && !force) {
stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.")
@@ -188,137 +188,14 @@ latest_version <- function(id, version_semantics = c("abs", "stable")) {
}
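
A hedged usage sketch for `add_to_collection` (ids are hypothetical; requires a logged-in `.syn` session):

# Add two datasets to a dataset collection, checking item types first to help avoid Synapse errors
add_to_collection("syn12345678", items = c("syn11111111", "syn22222222"), check_items = TRUE)
# If some items are already present at another version, re-run with force = TRUE once the update is confirmed intentional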


#' Create datasets for Sarek-called somatic or germline variants results
#'
#' Organize variant call files from Nextflow Sarek into 3-4 datasets,
#' grouping files by variant type and workflow with titles having the format:
#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline".
#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls.
#' This makes sense for NF because Germline calls can be treated differently.
#' This uses the latest version of all files and creates a Draft version of the dataset.
#'
#' Since we basically just need the syn entity id, variant type, and workflow to group the files,
#' instead of getting this info by running `map_*` as in the example,
#' you may prefer using a fileview, in which case you just need to download a table from a fileview
#' that has `id` => `output_id` + the `dataType` and `workflow` annotations.
#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_
#' files are annotated, then you have to use `map_*`.
#'
#' Finally, datasets cannot use the same name if stored in the same project,
#' so if there are multiple batches, the names will have to be made unique by adding
#' the batch number, source data id, processing date, or whatever makes sense.
#'
#' @inheritParams new_dataset
#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
#' @param workflow One of workflows used.
#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
#' @import data.table
#' @return A list of dataset objects.
#' @export
#' @examples
#'\dontrun{
#' syn_out <- "syn26648589"
#' m <- map_sample_output_sarek(syn_out)
#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
#'}
nf_sarek_datasets <- function(output_map,
parent,
workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
verbose = TRUE,
dry_run = TRUE) {

output_map <- as.data.table(output_map)
if(!is.null(output_map$dataType)) {
data_type <- unique(output_map$dataType)
if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
gvtype <- switch(gvtype,
SomaticVariants = "Somatic",
GermlineVariants = "Germline")

} else {
# Detect genomic variants type from first path name
gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
"Somatic"
} else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
"Germline"
} else {
stop("Could not assign either Germline or Somatic labels based on main output folder.
Check whether folder contains mixed types or is not the right one.")
}
}
pattern <- "vcf.gz(.tbi)?$"
workflow <- match.arg(workflow)
datasets <- list()
for(i in workflow) {
dataset <- output_map[workflow == i & grepl(pattern, output_name)]
if(nrow(dataset)) {
if(verbose) message(glue::glue("Creating {i} dataset with {nrow(dataset)} files"))
name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE)
if(dry_run) datasets[[i]] <- dataset else datasets[[i]] <- .syn$store(dataset)
}
}

return(datasets)

}
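
As a hedged illustration of the fileview route described in the details above (the view id and column names are assumptions; requires a logged-in `.syn` session), `output_map` can be built from a query instead of `map_sample_output_sarek`:

# Hypothetical fileview exposing id, name, dataType, and workflow for already-annotated files
q <- .syn$tableQuery("SELECT id, name, dataType, workflow FROM syn12345678")
output_map <- data.table::as.data.table(q$asDataFrame())
data.table::setnames(output_map, c("id", "name"), c("output_id", "output_name"))
datasets <- nf_sarek_datasets(output_map, parent = "syn26462036", workflow = "Strelka", dry_run = TRUE)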


#' Create dataset for STAR-Salmon expression quantification results
#'
#' With a level-3 manifest that is created from `annotate_expression`,
#' calls `new_dataset` to make the quantification files (.sf) into a dataset.
#' Uses the latest version of the files and creates a "Draft" dataset.
#' See `nf_sarek_datasets`.
#'
#' @inheritParams new_dataset
#' @inheritParams nf_sarek_datasets
#' @param manifest A table of annotated data manifest from `annotate_expression`.
#' @export
nf_star_salmon_datasets <- function(manifest,
parent,
dry_run = TRUE) {

items <- manifest$entityId
new_dataset(name = "Gene Expression Quantification from RNA-seq",
parent = parent,
items = items,
dry_run = dry_run)
}
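
A minimal hedged sketch (the parent id is hypothetical; a real manifest would come from `annotate_expression` and only needs an `entityId` column here):

manifest <- data.frame(entityId = c("syn11111111", "syn22222222"))
nf_star_salmon_datasets(manifest, parent = "syn12345678", dry_run = TRUE)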

#' Create dataset for CNVKit results
#'
#' Create dataset from all files in CNVKit output
#'
#' @inheritParams new_dataset
#' @param syn_out Output folder called 'cnvkit'
#' @export
nf_cnv_dataset <- function(syn_out,
parent,
dry_run = TRUE) {

files <- walk(syn_out)
files <- unlist(files)
df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE))
names(df) <- c("Filename", "id")
df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ]
items <- df$id
new_dataset(name = "Copy Number Variant - CNVkit",
parent = parent,
items = items,
dry_run = dry_run)
}
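
A hedged usage sketch (ids are hypothetical; `walk` traverses the Synapse output folder, so a logged-in `.syn` session is assumed):

nf_cnv_dataset(syn_out = "syn12345678", parent = "syn87654321", dry_run = TRUE)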


# -- Checks------------- -------------------------------------------------------#

# TODO Potentially move these type checks somewhere else like basic_utils
# TODO Better composition to reduce code, esp. if more will be added

#' Check whether entity is dataset
#'
#'
#' @keywords internal
is_dataset <- function(id) {
tryCatch({
@@ -329,7 +206,7 @@ is_dataset <- function(id) {
}

#' Check whether entity is dataset collection
#'
#'
#' @keywords internal
is_dataset_collection <- function(id) {
tryCatch({
@@ -341,17 +218,17 @@ is_dataset_collection <- function(id) {


#' Which collection type
#'
#'
#' Checks for a valid collection type or returns an error
#'
#'
#' @keywords internal
which_coll_type <- function(coll) {
coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))]
if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.")
}
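
Hedged sketches of the type checks (hypothetical id; the helpers are internal and need a logged-in `.syn` session):

nfportalutils:::is_dataset("syn12345678")             # TRUE only for a dataset entity
nfportalutils:::is_dataset_collection("syn12345678")  # TRUE only for a dataset collection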

#' Check whether entity is file
#'
#'
#' @keywords internal
is_file <- function(id) {
tryCatch({