Dataset features #74

Merged
merged 25 commits on Jul 10, 2023
25 commits
10631b0
Add draft datasets util (wip)
anngvu Jun 29, 2023
ffdb661
Parameterize workflow per review comment
anngvu Jun 29, 2023
1603e7f
Factor out dataset item constructor to make more functional and flexible
anngvu Jun 29, 2023
04b0870
Implement util for adding to dataset collection
anngvu Jun 30, 2023
fae5fa4
Update explanation
anngvu Jun 30, 2023
e0bb731
Refactor into base fun
anngvu Jun 30, 2023
963ade1
Add tests
anngvu Jun 30, 2023
1048a42
Reorganize
anngvu Jul 1, 2023
0de3331
Resolve TODO checks
anngvu Jul 1, 2023
eea014f
Export, document
anngvu Jul 1, 2023
11488a9
More updates of docs to be consistent with changes
anngvu Jul 1, 2023
3ecc3a2
Add util for update and corresponding unit test
anngvu Jul 2, 2023
103f7cd
Update unit test and fix discovered bug
anngvu Jul 2, 2023
cefa999
More unit tests
anngvu Jul 2, 2023
50bff87
Add another collection util
anngvu Jul 2, 2023
1883265
Generalize and reorganize
anngvu Jul 2, 2023
a2bfed9
More tests and code qc
anngvu Jul 2, 2023
cc11a48
Draft dataset citation util
anngvu Jul 2, 2023
3887461
Mark internal or export, add doc details
anngvu Jul 3, 2023
c861904
Write new docs, update pkgdown reference
anngvu Jul 3, 2023
c1e1522
Fix bug for one failing test in test_dataset_utils
anngvu Jul 3, 2023
d158cb6
Update testing notes
anngvu Jul 3, 2023
2998455
Add dependency check
anngvu Jul 3, 2023
76336a2
Address CMD check doc complaints, update NAMESPACE and bump pkg version
anngvu Jul 3, 2023
53657a2
Final updates, reorg, and notes for experimental stuff
anngvu Jul 3, 2023
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
-Version: 0.0.0.9210
+Version: 0.0.0.9300
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
5 changes: 5 additions & 0 deletions NAMESPACE
@@ -16,6 +16,7 @@ export(add_people_from_table)
export(add_publication_from_pubmed)
export(add_publication_from_unpaywall)
export(add_publications_from_file)
+export(add_to_collection)
export(annotate_aligned_reads)
export(annotate_called_variants)
export(annotate_expression)
@@ -49,7 +50,10 @@ export(map_sample_io)
export(map_sample_output_rnaseq)
export(map_sample_output_sarek)
export(missing_annotation_email)
+export(new_dataset)
export(new_project)
+export(nf_sarek_datasets)
+export(nf_star_salmon_datasets)
export(processing_flowchart)
export(qc_manifest)
export(register_study)
@@ -61,6 +65,7 @@ export(syn_login)
export(syncBP_maf)
export(table_query)
export(update_study_annotations)
+export(use_latest_in_collection)
export(wiki_mod)
import(data.table)
import(reticulate)
54 changes: 54 additions & 0 deletions R/citation.R
@@ -0,0 +1,54 @@
# -- Citations -----------------------------------------------------------------#

#' Get DOI metadata if it exists
#'
#' Returns a list of metadata associated with the DOI if one exists, otherwise NULL.
#' Currently usable for certain entity types such as files or datasets,
#' though this should be revised to be more useful with other object types.
#' Note: Internal/experimental use only, not for production use.
#'
#' @param id Entity id (e.g. a file or dataset id).
#' @keywords internal
get_doi_meta <- function(id) {

# TODO Template query according to object type of id,
# i.e. folders can have dois, but they don't have version #s
obj <- .syn$get(id, downloadFile = FALSE)
versionNumber <- obj$properties$versionNumber # error if no versionNumber
tryCatch({
.syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/doi?id={id}&type=DATASET&version={versionNumber}"))
},
error = function(e) if(grepl("DOI association does not exist.", e$message)) NULL else e)
}
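
# Usage sketch (editor's illustration, not part of this PR; the id is hypothetical
# and an authenticated `.syn` Synapse session is assumed):
# doi_meta <- get_doi_meta("syn12345678")
# if (is.null(doi_meta)) message("No DOI association for this entity version.")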

#' Generate example dataset citation
#'
#' This is currently more for demo purposes, to check how well current metadata
#' could be formatted into citation text. Datasets where DOIs have been minted
#' *or* NF-OSI processed datasets within the official Portal Collection should
#' work well, while there are no guarantees for other cases.
#' Note: Internal/experimental use only, not for production use.
#'
#' @param id Dataset id.
#' @param format Currently just "Scientific Data" format.
#' @param output Currently only markdown, from which other utils can be used to generate LaTeX or HTML.
#' @keywords internal
cite_dataset <- function(id,
format = "Scientific Data",
output = c("markdown")) {
if(!is_dataset(id)) stop("Not a dataset")
if(length(get_doi_meta(id))) {
message("For now, please go to https://citation.crosscite.org/ for the most comprehensive citation options.")
return(NULL)
} else {
meta <- .syn$get_annotations(id)
title <- meta$title
creator <- meta$creator
repository <- "Synapse"
accession <- glue::glue("https://www.synapse.org/#!Synapse:{id}") # no DOI minted (handled above)
yearPublished <- meta$yearPublished
glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).")
}

}
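
# Usage sketch (editor's illustration, not part of this PR; hypothetical id,
# authenticated `.syn` session assumed):
# cite_dataset("syn12345678")
# # Returns a markdown citation string built from the dataset annotations,
# # or NULL (with a pointer to crosscite.org) if a DOI has been minted.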

308 changes: 308 additions & 0 deletions R/datasets.R
@@ -0,0 +1,308 @@
# -- Editing Collections -------------------------------------------------------#

# General helpers that should work for both datasets (collection of files)
# and dataset collections (collection of datasets).


#' As collection items
#'
#' Helper taking entity ids to create records used for dataset items or dataset collection items.
#' Collection items have the form `list(entityId = id, versionNumber = x)`.
#'
#' @param ids Ids of entities to make into dataset items.
#' @param item_version Integer for version that will be used for all items, e.g. 1.
#' If NULL, this will look up the latest version for each id and use that.
#' @keywords internal
as_coll_items <- function(ids, item_version = NULL) {
if(is.null(item_version)) {
item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber)
}
items <- Map(function(id, version) list(entityId = id, versionNumber = version), ids, item_version)
names(items) <- NULL # need to unname list for API
items
}
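
# Illustration (editor's sketch, not part of this PR): with explicit versions given,
# no Synapse lookups happen, so this runs offline. Ids are hypothetical.
# as_coll_items(c("syn111", "syn222"), item_version = c(1L, 2L))
# # -> list(list(entityId = "syn111", versionNumber = 1L),
# #         list(entityId = "syn222", versionNumber = 2L))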


#' Apply updates to current collection of items
#'
#' This is essentially an internal transaction helper for applying a changeset to a collection,
#' used in several higher-level collection utils.
#' Given a changeset that can contain both "replace" and "add" updates,
#' this applies an update join keyed on `entityId` for the replacements and
#' appends the genuinely new items to produce the updated collection.
#'
#' @param current_coll List of lists representing the current collection items.
#' @param update_coll List of items to apply as updates/additions to `current_coll`.
#' @keywords internal
update_items <- function(current_coll, update_coll) {

current_coll <- data.table::rbindlist(current_coll)
update_coll <- data.table::rbindlist(update_coll)
replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber]
added <- update_coll[!current_coll, on = .(entityId)]
updated <- rbind(replaced, added)
# convert back to list of lists; apply() coerces row values to character, so restore versionNumber to integer
updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
updated
}
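
# Sketch of the update-join semantics (editor's illustration, hypothetical ids):
# current <- list(list(entityId = "syn1", versionNumber = 1L),
#                 list(entityId = "syn2", versionNumber = 1L))
# update  <- list(list(entityId = "syn2", versionNumber = 3L),  # replace syn2
#                 list(entityId = "syn3", versionNumber = 1L))  # add syn3
# update_items(current, update)
# # -> syn1 stays at v1, syn2 is bumped to v3, syn3 is appended at v1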


#' Update item versions to "latest" in a collection
#'
#' Update an _existing_ collection so that all items or a subset of items reference their latest version.
#' This should work for both datasets (collection of files) and dataset collections (collection of datasets).
#'
#' @param collection_id Collection id.
#' @param items Vector of item ids for which to update the reference to the latest version,
#' or "all" (default) to update all items in the collection.
#' @export
use_latest_in_collection <- function(collection_id, items = "all") {
coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
current_items <- sapply(coll$items, function(i) i$entityId)

if((length(items) == 1) && (items == "all")) {
coll$items <- as_coll_items(current_items)
} else {

# Check subset; if no check, this becomes `add_to_collection`
if(!all(items %in% current_items)) {
warning("Subset given includes items not actually in collection: ", items[!items %in% current_items])
items <- items[items %in% current_items]
if(!length(items)) {
warning("No qualifying items to update. No updates applied.")
return(coll)
}
}
updated_items <- update_items(coll$items, as_coll_items(items))
coll$items <- updated_items
}
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE))

}
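
# Usage sketch (editor's illustration, not part of this PR; hypothetical ids,
# authenticated `.syn` session assumed):
# use_latest_in_collection("syn44444444")                    # refresh all items
# use_latest_in_collection("syn44444444", items = "syn111")  # refresh a subset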


#' Add to collection
#'
#' Add item(s) to an _existing_ collection, using the item(s)' current (latest) version.
#' For datasets, the items should be files. For dataset collections, the items should be datasets.
#' If an item being added is already in the collection, this could create a version conflict,
#' so the update will be rejected unless `force` is true.
#'
#' This is implemented with the lower-level REST API because the Python client (as of v2.7) doesn't yet
#' implement a dataset collection class and methods (though the dataset class and relevant methods such as `add_item` are available).
#' Thus, while this is generic enough to handle both datasets and dataset collections,
#' it is expected to be used mainly for dataset collections, given that a dataset-specific method already exists.
#'
#' @param collection_id Collection id.
#' @param items Character vector of one or more dataset entity ids to add.
#' @param check_items Whether to check that ids are the appropriate item type for the collection and drop any that are not,
#' to help avoid Synapse errors (default `FALSE`, because in most cases `items` are curated and the check is slower).
#' @param force If some items are currently in the collection with a different version,
#' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional.
#' @export
add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) {

coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
coll_type <- which_coll_type(coll)

if(check_items) {
item_type_check <- if(coll_type == "dataset") is_file else is_dataset
correct_item_type <- sapply(items, item_type_check)
if(any(!correct_item_type)) {
warning("Some items not correct entity types for the collection and will not be added: ", items[!correct_item_type])
items <- items[correct_item_type]
if(!length(items)) {
warning("No qualifying items to add. No updates applied.", call. = FALSE)
return(coll)
}
}
}

current_items <- sapply(coll$items, function(x) x$entityId)
if(any(items %in% current_items) && !force) {
stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.")
} else {
coll$items <- update_items(coll$items, as_coll_items(items))
}
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE))
}
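
# Usage sketch (editor's illustration, not part of this PR; hypothetical ids,
# authenticated `.syn` session assumed):
# add_to_collection("syn55555555", items = c("syn111", "syn222"))
# # To re-add an item already in the collection, pinning its latest version:
# add_to_collection("syn55555555", items = "syn111", force = TRUE)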


# -- Datasets ------------------------------------------------------------------#

#' Create new dataset with given items
#'
#' @inheritParams as_coll_items
#' @param name Name of the dataset. It should be unique within the `parent` project.
#' @param parent Synapse id of the parent project where the dataset will live.
#' Usually this is the same project storing the files, but in some cases it may be a different project.
#' @param items Id(s) of items to include.
#' @param dry_run If TRUE, don't actually store the dataset; just return the data object for inspection or further modification.
#' @export
new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) {

dataset_items <- as_coll_items(items, item_version)
dataset <- synapseclient$Dataset(name = name,
parent = parent,
dataset_items = dataset_items)
if(dry_run) dataset else .syn$store(dataset)
}
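
# Usage sketch (editor's illustration, not part of this PR; hypothetical ids):
# draft <- new_dataset(name = "Curated Files", parent = "syn00000000",
#                      items = c("syn111", "syn222"))  # dry_run = TRUE by default
# stored <- new_dataset(name = "Curated Files", parent = "syn00000000",
#                       items = c("syn111", "syn222"), dry_run = FALSE)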


#' Create Sarek-processed datasets
#'
#' Organize variant call files from Nextflow Sarek into 3-4 datasets,
#' grouping files by variant type and workflow, with titles of the format
#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline".
#' This assumes you want datasets that segregate somatic and germline calls,
#' which makes sense for NF because germline calls can be treated differently.
#' The latest version of each file is used, and the dataset is created as a draft.
#'
#' To group the files, we basically just need the Synapse entity id, variant type, and workflow.
#' Instead of getting this info by running `map_*` as in the example,
#' you may prefer using a fileview, in which case you just need to download a table from a fileview
#' that has `id` => `output_id` plus the `dataType` and `workflow` annotations.
#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_
#' files are annotated, then you have to use `map_*`.
#'
#' Finally, datasets cannot use the same name if stored in the same project,
#' so if there are multiple batches, the names will have to be made unique by adding
#' the batch number, source data id, processing date, or whatever makes sense.
#'
#' @inheritParams new_dataset
#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
#' @param workflow One or more of the workflows used.
#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
#' @import data.table
#' @return A list of dataset objects.
#' @export
#' @examples
#'\dontrun{
#' syn_out <- "syn26648589"
#' m <- map_sample_output_sarek(syn_out)
#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
#'}
nf_sarek_datasets <- function(output_map,
parent,
workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
verbose = TRUE,
dry_run = TRUE) {

output_map <- as.data.table(output_map)
if(!is.null(output_map$dataType)) {
data_type <- unique(output_map$dataType)
if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
gvtype <- switch(gvtype,
SomaticVariants = "Somatic",
GermlineVariants = "Germline")

} else {
# Detect genomic variants type from first path name
gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
"Somatic"
} else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
"Germline"
} else {
stop("Could not assign either Germline or Somatic labels based on main output folder.
Check whether folder contains mixed types or is not the right one.")
}
}
pattern <- "vcf.gz(.tbi)?$"
workflow <- match.arg(workflow, several.ok = TRUE) # allow creating datasets for several workflows at once
datasets <- list()
for(i in workflow) {
dataset <- output_map[workflow == i & grepl(pattern, output_name)]
if(nrow(dataset)) {
if(verbose) message(glue::glue("Creating {i} dataset with {nrow(dataset)} files"))
name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE)
if(dry_run) datasets[[i]] <- dataset else datasets[[i]] <- .syn$store(dataset)
}
}

return(datasets)

}


#' Create NF STAR-Salmon dataset
#'
#' Organize gene expression quantification files (.sf) into one dataset.
#' Uses latest version of the files and creates a "Draft" dataset.
#' See also `nf_sarek_datasets`.
#'
#' @inheritParams new_dataset
#' @inheritParams nf_sarek_datasets
#' @param output_map The `data.table` returned from `map_sample_output_rnaseq`.
#' @export
#' @examples
#'\dontrun{
#' syn_out <- "syn30840584"
#' m <- map_sample_output_rnaseq(syn_out)
#' datasets <- nf_star_salmon_datasets(m, parent = "syn4939902", dry_run = F)
#'}
nf_star_salmon_datasets <- function(output_map,
parent,
verbose = TRUE,
dry_run = TRUE) {

# Select the .sf quantification files
output_ids <- output_map[grepl(".sf$", output_name), output_id]
new_dataset(name = "Gene Expression Quantification from RNA-seq",
parent = parent,
items = output_ids,
dry_run = dry_run)
}


# -- Checks ----------------------------------------------------------------------#

# TODO Potentially move these type checks somewhere else like basic_utils
# TODO Better composition to reduce code, esp. if more will be added

#' Check whether entity is dataset
#'
#' @keywords internal
is_dataset <- function(id) {
tryCatch({
entity <- .syn$get(id, downloadFile = FALSE)
entity$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset"
},
error = function(e) FALSE)
}

#' Check whether entity is dataset collection
#'
#' @keywords internal
is_dataset_collection <- function(id) {
tryCatch({
entity <- .syn$get(id, downloadFile = FALSE)
entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection"
},
error = function(e) FALSE)
}


#' Which collection type
#'
#' Returns the collection type ("dataset" or "dataset collection") or throws an error if neither.
#'
#' @keywords internal
which_coll_type <- function(coll) {
coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))]
if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.")
}

#' Check whether entity is file
#'
#' @keywords internal
is_file <- function(id) {
tryCatch({
entity <- .syn$get(id, downloadFile = FALSE)
entity$properties$concreteType == "org.sagebionetworks.repo.model.FileEntity"
},
error = function(e) FALSE)
}
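
# Usage sketch (editor's illustration, not part of this PR; hypothetical id,
# authenticated `.syn` session assumed):
# is_file("syn111"); is_dataset("syn111"); is_dataset_collection("syn111")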
