Patch/nf anno utils #186

Open
wants to merge 16 commits into
base: develop
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
Version: 0.9500
Version: 0.9600
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
@@ -14,7 +14,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Imports:
dplyr,
data.table,
12 changes: 6 additions & 6 deletions NAMESPACE
@@ -21,10 +21,10 @@ export(add_to_scope)
export(adjust_view)
export(annotate_aligned_reads)
export(annotate_called_variants)
export(annotate_cnv)
export(annotate_expression)
export(annotate_processed)
export(annotate_quantified_expression)
export(annotate_with_manifest)
export(annotate_with_tool_stats)
export(annotate_with_samtools_stats)
export(as_table_schema)
export(assign_study_data_types)
export(bad_url)
@@ -50,9 +50,11 @@ export(find_child_type)
export(find_data_root)
export(find_in)
export(find_nf_asset)
export(find_parent)
export(from_pubmed)
export(get_by_prop_from_json_schema)
export(get_dependency_from_json_schema)
export(get_path)
export(get_project_wiki)
export(get_valid_values_from_json_schema)
export(grant_specific_file_access)
@@ -75,11 +77,9 @@ export(meta_qc_project)
export(missing_annotation_email)
export(new_dataset)
export(new_project)
export(nf_cnv_dataset)
export(nf_sarek_datasets)
export(nf_star_salmon_datasets)
export(nf_workflow_version)
export(precheck_manifest)
export(processed_meta)
export(processing_flowchart)
export(register_study)
export(register_study_files)
191 changes: 34 additions & 157 deletions R/datasets.R
@@ -1,21 +1,21 @@
# -- Editing Collections -------------------------------------------------------#

# General helpers that should work for both datasets (collection of files)
# General helpers that should work for both datasets (collection of files)
# and dataset collections (collection of datasets).


#' Structure as collection items
#'
#'
#' Helper taking entity ids to create records used for dataset items *or* dataset collection items.
#' Collection items have the form `list(entityId = id, versionNumber = x)`.
#'
#' Note: For item version, dataset items allow two meanings of literal or absolute "latest"
#' Note: For item version, dataset items allow two meanings of literal or absolute "latest"
#' vs. "stable_latest", but with files either one can be used to mean the same thing
#' since there will be correct interpretation done under the hood.
#' See implementation in `latest_version`.
#'
#' @param ids Ids of entities to make into dataset items.
#' @param item_version Integer for version that will be used for all items, e.g. 1.
#' @param item_version Integer for version that will be used for all items, e.g. 1.
#' Otherwise, "latest" or "stable_latest". See details.
#' @keywords internal
as_coll_items <- function(ids, item_version = c("abs", "stable")) {
@@ -31,35 +31,35 @@ as_coll_items <- function(ids, item_version = c("abs", "stable")) {
}
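
For orientation, a hedged usage sketch (ids are hypothetical, the helper is internal, and the collapsed body may resolve "latest"/"stable_latest" versions via a logged-in `.syn` session):

# Hypothetical ids; an explicit integer version follows the @param note ("e.g. 1")
items <- nfportalutils:::as_coll_items(c("syn11111111", "syn22222222"), item_version = 1)
# Documented shape of each element:
# list(entityId = "syn11111111", versionNumber = 1)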


#' Apply updates to current collection of items
#'
#' This is essentially an internal transaction helper for trying to apply a changeset to a collection,
#' used in several higher-level collection utils.
#' INTERNAL - apply updates to a collection of items
#'
#' An internal transaction helper for trying to apply a changeset to a collection,
#' used in several higher-level collection utils.
#' Given the changeset that can represent updates of both types "replace" or "add",
#' this applies an update join keyed on `entityId` for the replace and
#' this applies an update join keyed on `entityId` for the replace and
#' appends the new items to get the updated collection.
#'
#' @param current_items List of lists representing a collection of items.
#' @param update_items Collection of items to apply as updates to `current_items`.
#'
#' @param current List of lists representing a collection of items.
#' @param update Collection of items to apply as updates to `current_items`.
#' @keywords internal
update_items <- function(current_coll, update_coll) {
current_coll <- data.table::rbindlist(current_coll)
update_coll <- data.table::rbindlist(update_coll)
update_items <- function(current, update) {

current_coll <- data.table::rbindlist(current)
update_coll <- data.table::rbindlist(update)
replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber]
added <- update_coll[!current_coll, on = .(entityId)]
updated <- rbind(replaced, added)
# reconversion; apply coerces rows to character, so versionNumber is cast back to integer explicitly
updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
updated
}
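
A minimal sketch of the update join with made-up ids (the helper is internal, hence `:::`): syn222 gets its version replaced, syn333 is appended, and syn111 is left untouched.

current <- list(list(entityId = "syn111", versionNumber = 1L),
                list(entityId = "syn222", versionNumber = 2L))
update  <- list(list(entityId = "syn222", versionNumber = 3L),
                list(entityId = "syn333", versionNumber = 1L))
nfportalutils:::update_items(current, update)
# Expected result: syn111 at v1, syn222 bumped to v3, syn333 added at v1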


#' Update item versions to "latest" in a collection
#'
#'
#' Update an _existing_ collection so that all items or a subset of items reference their latest version.
#' Should work for both datasets (collection of files) and dataset collections (collection of datasets).
#'
#'
#' @inheritParams latest_version
#' @param collection_id Collection id.
#' @param items Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all.
@@ -72,7 +72,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
if((length(items) == 1) && (items == "all")) {
coll$items <- as_coll_items(current_items, item_version = version_semantics)
} else {

# Check subset; if no check, this becomes `add_to_collection`
if(!all(items %in% current_items)) {
warning("Subset given includes items not actually in collection: ", items[!items %in% current_items])
@@ -86,7 +86,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
coll$items <- updated_items
}
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE))

}
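
A hedged usage sketch (collection and item ids are hypothetical; assumes the function is exported and `.syn` is a logged-in Synapse client):

# Point every item in the collection at its latest version
use_latest_in_collection("syn12345678", items = "all")
# Or only a subset of items
use_latest_in_collection("syn12345678", items = c("syn11111111", "syn22222222"))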


@@ -96,24 +96,24 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman
#' For datasets, the items should be files. For dataset collections, the items should be datasets.
#' If an item attempting to be added happens to already be in the collection,
#' this might lead to version conflicts, so the update will be rejected unless `force` is true.
#'
#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
#'
#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
#' implement the dataset collection class and methods (but dataset and relevant methods like `add_item` are available).
#' Thus, while this is generic enough to handle both datasets and dataset collections
#' Thus, while this is generic enough to handle both datasets and dataset collections
#' it is expected to be used more for dataset collections given that the dataset method is provided.
#'
#'
#' @param collection_id Collection id.
#' @param items Character vector of one or more dataset entity ids to add.
#' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
#' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
#' @param force If some items are currently in the collection with a different version,
#' @param force If some items are currently in the collection with a different version,
#' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional.
#' @export
add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) {

coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
coll_type <- which_coll_type(coll)

if(check_items) {
item_type_check <- if(coll_type == "dataset") is_file else is_dataset
correct_item_type <- sapply(items, item_type_check)
@@ -126,7 +126,7 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force =
}
}
}

current_items <- sapply(coll$items, function(x) x$entityId)
if(any(items %in% current_items) && !force) {
stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.")
@@ -188,137 +188,14 @@ latest_version <- function(id, version_semantics = c("abs", "stable")) {
}
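
A hedged usage sketch for `add_to_collection` (ids are hypothetical; requires a logged-in `.syn` session):

# Add two datasets to a dataset collection, checking item types first to help avoid Synapse errors
add_to_collection("syn12345678", items = c("syn11111111", "syn22222222"), check_items = TRUE)
# If some items are already present at another version, re-run with force = TRUE once the update is confirmed intentional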


#' Create datasets for Sarek-called somatic or germline variants results
#'
#' Organize variant call files from Nextflow Sarek into 3-4 datasets,
#' grouping files by variant type and workflow with titles having the format:
#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline".
#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls.
#' This makes sense for NF because Germline calls can be treated differently.
#' This uses the latest version of all files and creates a Draft version of the dataset.
#'
#' Since we basically just need the syn entity id, variant type, and workflow to group the files,
#' instead of getting this info by running `map_*` as in the example,
#' you may prefer using a fileview, in which case you just need to download a table from a fileview
#' that has `id` => `output_id` + the `dataType` and `workflow` annotations.
#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_
#' files are annotated, then you have to use `map_*`.
#'
#' Finally, datasets cannot use the same name if stored in the same project,
#' so if there are multiple batches, the names will have to be made unique by adding
#' the batch number, source data id, processing date, or whatever makes sense.
#'
#' @inheritParams new_dataset
#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
#' @param workflow One of workflows used.
#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
#' @import data.table
#' @return A list of dataset objects.
#' @export
#' @examples
#'\dontrun{
#' syn_out <- "syn26648589"
#' m <- map_sample_output_sarek(syn_out)
#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
#'}
nf_sarek_datasets <- function(output_map,
parent,
workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
verbose = TRUE,
dry_run = TRUE) {

output_map <- as.data.table(output_map)
if(!is.null(output_map$dataType)) {
data_type <- unique(output_map$dataType)
if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
gvtype <- switch(gvtype,
SomaticVariants = "Somatic",
GermlineVariants = "Germline")

} else {
# Detect genomic variants type from first path name
gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
"Somatic"
} else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
"Germline"
} else {
stop("Could not assign either Germline or Somatic labels based on main output folder.
Check whether folder contains mixed types or is not the right one.")
}
}
pattern <- "vcf.gz(.tbi)?$"
workflow <- match.arg(workflow)
datasets <- list()
for(i in workflow) {
dataset <- output_map[workflow == i & grepl(pattern, output_name)]
if(nrow(dataset)) {
if(verbose) message(glue::glue("Creating {i} dataset with {nrow(dataset)} files"))
name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE)
if(dry_run) datasets[[i]] <- dataset else datasets[[i]] <- .syn$store(dataset)
}
}

return(datasets)

}
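
As a hedged illustration of the fileview route described in the details above (the view id and column names are assumptions; requires a logged-in `.syn` session), `output_map` can be built from a query instead of `map_sample_output_sarek`:

# Hypothetical fileview exposing id, name, dataType, and workflow for already-annotated files
q <- .syn$tableQuery("SELECT id, name, dataType, workflow FROM syn12345678")
output_map <- data.table::as.data.table(q$asDataFrame())
data.table::setnames(output_map, c("id", "name"), c("output_id", "output_name"))
datasets <- nf_sarek_datasets(output_map, parent = "syn26462036", workflow = "Strelka", dry_run = TRUE)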


#' Create dataset for STAR-Salmon expression quantification results
#'
#' With a level-3 manifest that is created from `annotate_expression`,
#' calls `new_dataset` to make the quantification files (.sf) into a dataset.
#' Uses the latest version of the files and creates a "Draft" dataset.
#' See `nf_sarek_datasets`.
#'
#' @inheritParams new_dataset
#' @inheritParams nf_sarek_datasets
#' @param manifest A table of annotated data manifest from `annotate_expression`.
#' @export
nf_star_salmon_datasets <- function(manifest,
parent,
dry_run = TRUE) {

items <- manifest$entityId
new_dataset(name = "Gene Expression Quantification from RNA-seq",
parent = parent,
items = items,
dry_run = dry_run)
}
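
A minimal hedged sketch (the parent id is hypothetical; a real manifest would come from `annotate_expression` and only needs an `entityId` column here):

manifest <- data.frame(entityId = c("syn11111111", "syn22222222"))
nf_star_salmon_datasets(manifest, parent = "syn12345678", dry_run = TRUE)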

#' Create dataset for CNVKit results
#'
#' Create dataset from all files in CNVKit output
#'
#' @inheritParams new_dataset
#' @param syn_out Output folder called 'cnvkit'
#' @export
nf_cnv_dataset <- function(syn_out,
parent,
dry_run = TRUE) {

files <- walk(syn_out)
files <- unlist(files)
df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE))
names(df) <- c("Filename", "id")
df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ]
items <- df$id
new_dataset(name = "Copy Number Variant - CNVkit",
parent = parent,
items = items,
dry_run = dry_run)
}
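
A hedged usage sketch (ids are hypothetical; `walk` traverses the Synapse output folder, so a logged-in `.syn` session is assumed):

nf_cnv_dataset(syn_out = "syn12345678", parent = "syn87654321", dry_run = TRUE)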


# -- Checks------------- -------------------------------------------------------#

# TODO Potentially move these type checks somewhere else like basic_utils
# TODO Better composition to reduce code, esp. if more will be added

#' Check whether entity is dataset
#'
#'
#' @keywords internal
is_dataset <- function(id) {
tryCatch({
@@ -329,7 +206,7 @@ is_dataset <- function(id) {
}

#' Check whether entity is dataset collection
#'
#'
#' @keywords internal
is_dataset_collection <- function(id) {
tryCatch({
@@ -341,17 +218,17 @@ is_dataset_collection <- function(id) {


#' Which collection type
#'
#'
#' Checks for a valid collection type or returns an error
#'
#'
#' @keywords internal
which_coll_type <- function(coll) {
coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))]
if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.")
}
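
Hedged sketches of the type checks (hypothetical id; the helpers are internal and need a logged-in `.syn` session):

nfportalutils:::is_dataset("syn12345678")             # TRUE only for a dataset entity
nfportalutils:::is_dataset_collection("syn12345678")  # TRUE only for a dataset collection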

#' Check whether entity is file
#'
#'
#' @keywords internal
is_file <- function(id) {
tryCatch({