Revalidation workflow (#162)

* Add helper finder * Add remanifest util * Refine development of remanifest * Add precheck/supplemental check util * Update docs * Update pkgdown index * Exports * Update expected errors and warnings * Expand handling of modern datasets elsewhere * Update output * Update exports, test message * Update docs * Add vignette
nf-osi · Jan 17, 2024 · 3f110ac · 3f110ac
1 parent c432501
commit 3f110ac
Show file tree

Hide file tree

Showing 16 changed files with 1,537 additions and 36 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -44,6 +44,7 @@ export(data_curator_app_subpage)
 export(delete_provenance)
 export(dsp_dataset_mapping)
 export(find_child)
+export(find_child_type)
 export(find_data_root)
 export(find_in)
 export(find_nf_asset)
@@ -61,6 +62,7 @@ export(make_folder)
 export(make_public)
 export(make_public_viewable)
 export(manifest_generate)
+export(manifest_passed)
 export(manifest_validate)
 export(map_sample_input_ss)
 export(map_sample_io)
@@ -75,9 +77,11 @@ export(nf_cnv_dataset)
 export(nf_sarek_datasets)
 export(nf_star_salmon_datasets)
 export(nf_workflow_version)
+export(precheck_manifest)
 export(processing_flowchart)
 export(register_study)
 export(register_study_files)
+export(remanifest)
 export(remove_button)
 export(remove_wiki_subpage)
 export(summarize_attribute)

diff --git a/R/annotation_qc.R b/R/annotation_qc.R
@@ -133,7 +133,7 @@ tersely <- function(error) {
 #' @param result Result list data from schematic service.
 #' @returns Boolean for whether passed.
 #' @returns List of structure `list(result = result, notes = notes)`, where `result` indicates whether the dataset passed.
-#' @keywords internal
+#' @export
 manifest_passed <- function(result) {
 
   errors <- length(result$errors)
@@ -280,26 +280,104 @@ meta_qc_project <- function(project_id, result_file = NULL, ...) {
 
 }
 
+
 #' List datasets in project
 #'
-#' Return a list of dataset folders if they are in expected location in project, otherwise NULL w/ explanatory message.
+#' Return a list of datasets. Datasets can be folders in the expected location in the project or actual dataset entities.
+#' Note that dataset-folders will always be what exists first in the project, as sort of the dataset precursor.
+#' The files in dataset-folders can be translated to dataset entities later.
+#' When not found, will return NULL w/ explanatory message.
 #'
 #' @inheritParams find_data_root
+#' @param type Whether to list datasets as immediate folders under "Raw Data" root (default, see details) or actual dataset entities in the project.
 #' @export
-list_project_datasets <- function(project_id) {
+list_project_datasets <- function(project_id,
+                                  type = c("folder", "dataset")) {
 
-  data_root <- find_data_root(project_id)
-  if(is.null(data_root)) {
+  type <- match.arg(type)
+  if(type == "folder") {
+    data_root <- find_data_root(project_id)
+    if(is.null(data_root)) {
 
-    warning("Data root could not be located.")
-    return()
+      warning("Data root could not be located.")
+      return()
 
-  } else {
+    } else {
 
-    in_data <- .syn$getChildren(data_root)
-    in_data <- reticulate::iterate(in_data)
-    datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.Folder", in_data)
-    if(!length(datasets)) warning("No datasets found under data root.")
+      in_data <- .syn$getChildren(data_root)
+      in_data <- reticulate::iterate(in_data)
+      datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.Folder", in_data)
+      if(!length(datasets)) warning("No datasets found under data root.")
+      datasets
+    }
+  } else {
+    children <- .syn$getChildren(project_id)
+    datasets <- reticulate::iterate(children)
+    datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.table.Dataset", datasets)
+    if(!length(datasets)) warning("No dataset entities found in project.")
     datasets
   }
 }
+
+#' Precheck a manifest
+#'
+#' Precheck before sending manifest off to schematic validation service.
+#' Provides additional context and helpful recommendations.
+#'
+#' @param manifest_csv Path to manifest_csv.
+#' @param official_props (Optional) Doc listing official model attributes. Currently this requires the LinkML format.
+#' @export
+precheck_manifest <- function(manifest_csv,
+                              official_props = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/modules/props.yaml") {
+
+  manifest <- fread(manifest_csv)
+  attributes <- names(manifest)
+
+  #-- ERRORS --#
+  if(!"Component" %in% attributes) {
+    message(glue::glue())
+  } else {
+    unique_components <- unique(manifest$Component)
+    if(length(unique_components) > 1) {
+      which_components <- glue::glue_collapse(shQuote(unique_components), ", ")
+      message(glue::glue("{emoji::emoji('x')} Multiple components detected in a single manifest: {which_components}. This can happen when files were annotated at different eras.
+                          Suggestions: 1) Split up the manifest because schematic can only validate one type at a type. 2) Harmonize the components if this is sensible.
+                          For example, RNASeqTemplate is an alias for GenomicsAssayTemplate"))
+    }
+
+    if("" %in% unique_components) {
+      message(glue::glue("{emoji::emoji('x')} Blank value '' for Component detected. This can happen because files were annotated before 2022, when Component was introduced for most DCCs."))
+    }
+
+  }
+
+  # Duplicate columns like age..1 etc.
+  likely_dups <- grep("[.][0-9]+", attributes, value = TRUE)
+  if(length(likely_dups)) {
+    likely_dups <- glue::glue_collapse(shQuote(likely_dups), ", ")
+    message(glue::glue("{emoji::emoji('x')} The pattern of these attribute names suggest duplicates: {likely_dups}. This may happen when metadata is supplemented programmatically with a data-type mismatch"))
+  }
+
+  #-- WARNINGS --#
+  # These technically don't break present-day schematic revalidation but should be cleaned up; many are from earlier schematic issues.
+
+  # See https://sagebionetworks.slack.com/archives/C01ANC02U59/p1681418154850589
+  if("Uuid" %in% attributes) {
+    message(crayon::yellow(glue::glue("{emoji::emoji('warning')} An attribute `Uuid` is present and should preferably be removed.")))
+  }
+
+  # See https://github.com/Sage-Bionetworks/schematic/issues/476#issuecomment-848853193
+  if("eTag" %in% attributes) {
+    message(crayon::yellow(glue::glue("{emoji::emoji('warning')} An attribute `eTag` is present and preferably be removed.")))
+  }
+
+  #-- INFO only --#
+  if(!is.null(official_props)) props <- names(yaml::read_yaml(official_props)$slots)
+  custom_attributes <- setdiff(attributes, props)
+  if(length(custom_attributes)) {
+    custom_attributes <- glue::glue_collapse(shQuote(custom_attributes), ", ")
+    message(crayon::blue(glue::glue("{emoji::emoji('information')} Custom attributes (not documented in data model) were found: {custom_attributes}. In general, custom attributes added by the researcher to help with data management are fine.
+                Just check that they are not PHI or added by mistake. If they are deemed generally useful or important enough, they can also be documented officially in the data model for others to reference.")))
+  }
+
+}
diff --git a/R/find.R b/R/find.R
@@ -1,15 +1,15 @@
 # Utils to help overcome nested folders
 
 #' Find in path
-#' 
+#'
 #' Get the Synapse id of an entity nested several folder layers deep without
 #' having to click through the UI or create a fileview as long as the structure/path is known.
-#' 
+#'
 #' @param scope Id of the container (project or folder) to begin search.
 #' @param path Path string in format "subdir1/subdir2/file.txt", where the last-level element will be the id returned.
 #' @export
 find_in <- function(scope, path) {
-  
+
   path_list <- strsplit(path, split = "/", fixed = TRUE)[[1]]
   here <- scope
   id <- NULL
@@ -28,11 +28,11 @@ find_in <- function(scope, path) {
 #' @param child_name Name of child entity.
 #' @export
 find_child <- function(child_name, parent) {
-  
+
   q <- .syn$getChildren(parent)
   child_id <- NULL
   repeat {
-    x <- reticulate::iter_next(q) 
+    x <- reticulate::iter_next(q)
     if(is.null(x) || x$name == child_name) {
       child_id <- x$id
       break
@@ -41,40 +41,59 @@ find_child <- function(child_name, parent) {
   child_id
 }
 
+
+#' Find children of type
+#'
+#' Small utility like`find_child` but retrieves files by type rather than by specific name.
+#' Returns a vector of ids, with entity names set as names.
+#'
+#' @inheritParams find_child
+#' @param child_type Type(s) as a list, even for only one type. Defaults to "file".
+#' @export
+find_child_type <- function(parent, child_type = list("file")) {
+
+  x <- .syn$getChildren(parent, includeTypes = child_type)
+  y <- reticulate::iterate(x)
+  if(!length(y)) return()
+  z <- setNames(sapply(y, `[[`, "id"), sapply(y, `[[`, "name"))
+  return(z)
+}
+
+
 #' Find data folder
-#' 
+#'
 #' Convenience function to find data folder, which can have slight name variations, in a project.
-#' 
+#'
 #' @param project_id Synapse project id.
 #' @export
 find_data_root <- function(project_id) {
-  
+
   data_root <- find_child("Data", parent = project_id)
   if(is.null(data_root)) data_root <- find_child("Raw Data", parent = project_id)
   data_root
-} 
+}
 
 
 # Find nextflow assets --------------------------------------------------------- #
 
 # Convenience functions for getting Synapse ids of nextflow assets
 
 #' Find a standard nextflow workflow output asset
-#' 
+#'
 #' Note that samplesheets became part of the output only for newer versions of nf-core/rna-seq;
-#' older runs may not find samplesheets. 
-#' Paths default to known working paths corresponding to the latest major workflow version, 
-#' but this may change and may need to be updated as part of util maintenance. 
-#' 
+#' older runs may not find samplesheets.
+#' Paths default to known working paths corresponding to the latest major workflow version,
+#' but this may change and may need to be updated as part of util maintenance.
+#'
 #' @param syn_out Id of top-level folder that corresponds to `publishDir` in a nextflow workflow.
 #' @param asset Name of asset to find.
 #' @param workflow Specify workflow, "rna-seq" or "sarek"; defaults to "rna-seq"
-#' @returns Id of samplesheet. 
+#' @returns Id of samplesheet.
 #' @export
-find_nf_asset <- function(syn_out, 
+find_nf_asset <- function(syn_out,
                           asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats"),
                           workflow = "rna-seq") {
-  
+
   asset <- match.arg(asset)
   # Assets and paths can differ slightly depending on workflow, except for `software_versions.yml`, get workflow first
   if(workflow == "rna-seq") {
@@ -93,20 +112,20 @@ find_nf_asset <- function(syn_out,
   } else {
     stop("Unrecognized workflow.")
   }
-  
+
   id <- find_in(syn_out, path)
   if(is.null(id)) stop("File not found. Is this the right output directory/path?")
   id
 }
 
 
 #' Return workflow version according to workflow meta
-#' 
+#'
 #' @inheritParams find_nf_asset
-#' @returns Version string. 
+#' @returns Version string.
 #' @export
 nf_workflow_version <- function(syn_out) {
-  
+
   version_meta <- find_nf_asset(syn_out, asset = "software_versions")
   file <- .syn$get(version_meta, downloadFile = TRUE)
   yml <- yaml::read_yaml(file$path)

diff --git a/R/remanifest.R b/R/remanifest.R
@@ -0,0 +1,108 @@
+#' Reconstitute a manifest
+#'
+#' Quickly "reconstitute" a manifest for files with existing annotations.
+#' This is a *power-user* alternative to schematic's `manifest_generate` with option `useAnnotations = TRUE` for certain scenarios:
+#'
+#' - Meant to be faster and more convenient for workflows focused on scalable/programmatic annotation/revalidation.
+#' Convenience and speed mainly comes from bypassing the schematic API layer and using the Synapse API directly,
+#' so manifest is reconstituted directly as R `data.table` **instead of having to download a Googlesheets or Excel intermediate**.
+#' **The result `.csv` is still intended to be schematic-compatible for validation via `manifest_validate`**.
+#'
+#' - Works with both a Synapse dataset entity as well as a folder;
+#' schematic currently only handles the latter (the folder concept predates Synapse datasets).
+#'
+#' - If folder-scoped, can choose between generating manifest with just immediate files *or*, like schematic, nested files.
+#' This is useful, for example, if the main scope folder is "RNA-seq" and the main files are fastqs, but there is a child folder called "md5" with md5 files,
+#' and the manifest is really meant for the fastqs.
+#'
+#' - Does not try to download `synapse_manifest.csv`, which can be useful to regenerate a manifest when ARs have been set.
+#'
+#' - Does not require an asset view.
+#'
+#' Other developer notes:
+#' See also `gather_annotations` for the under-the-hood workhorse.
+#' Intended for further development to be used as part of advanced annotation utils.
+#'
+#' @param scope Synapse id of something representing one or more datasets. Can be a folder *or* Synapse dataset entity.
+#' @param file Name of file to be written. Use `NULL` to return the `data.table` instead of writing to file.
+#' @param recurse Include everything nested within folder scope, default `TRUE`, or only the immediate files.
+#' Not used if dataset is Synapse dataset entity.
+#' @param mf Intermediate manifest to build upon; used when recursive.
+#' @import data.table
+#' @export
+remanifest <- function(scope,
+                       file = "manifest.csv",
+                       recurse = TRUE,
+                       mf = data.table()) {
+
+  dataset <- .syn$get(scope[1])
+  if(dataset$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset") {
+
+    dataset_items <- sapply(dataset$properties$datasetItems, `[[`, "entityId")
+    if(!length(dataset_items)) {
+
+      return(data.table())
+
+    } else {
+
+      manifest <- gather_annotations(dataset_items)
+      if(!is.null(file)) save_manifest(manifest, file) else manifest
+
+    }
+
+  } else if(dataset$properties$concreteType == "org.sagebionetworks.repo.model.Folder") {
+
+    files <- find_child_type(scope[1])
+    if(length(files)) {
+      ids <- files
+      manifest <- gather_annotations(ids)
+    } else {
+      manifest <- data.table()
+    }
+
+    if(recurse) {
+      more <- find_child_type(scope[1], list("folder"))
+      scope <- c(scope[-1], more)
+      if(!length(scope)) {
+        manifest <- rbind(mf, manifest, fill = T)
+        if(!is.null(file)) save_manifest(manifest, file) else manifest
+      } else {
+        remanifest(scope = scope,
+                   recurse = recurse,
+                   mf = rbind(mf, manifest, fill = T))
+      }
+    } else {
+      if(!is.null(file)) save_manifest(manifest, file) else manifest
+    }
+
+  } else {
+
+    "Dataset is only accepted as a folder or dataset entity. Please check the id used."
+
+  }
+}
+
+#' Internal helper for gathering annotations into a table using the REST API
+#'
+#' This is an internal implementation that works directly with the platform service JSON
+#' to afford more low-level control and avoid Python-R object conversion differences between reticulate versions.
+#'
+#' @param ids One or more ids.
+#' @param list_sep List separator for list annotations.
+#' @keywords internal
+gather_annotations <- function(ids, list_sep = ", ") {
+  results <- lapply(ids, function(id) .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/annotations2")))
+  annotations <- lapply(results, function(x) c(list(entityId = list(value = x$id)), x$annotations))
+  annotations <- lapply(annotations, function(x) lapply(x, function(col) paste(col$value, collapse = list_sep)))
+  manifest <- data.table::rbindlist(annotations, fill = TRUE)
+  manifest
+}
+
+
+#' Save manifest
+#'
+#' @keywords internal
+save_manifest <- function(manifest, file) {
+  fwrite(manifest, file = file)
+  checked_message(glue::glue("Saved manifest as {file}"))
+}