From 10631b0dc3009de861e5ad4b90127e0ae152c1db Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 15:12:59 -0600
Subject: [PATCH 01/25] Add draft datasets util (wip)

---
 R/datasets.R | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 R/datasets.R

diff --git a/R/datasets.R b/R/datasets.R
new file mode 100644
index 00000000..cdbf0ce8
--- /dev/null
+++ b/R/datasets.R
@@ -0,0 +1,118 @@
+#' Create Sarek-processed datasets
+#'
+#' Organize variant call files from Nextflow Sarek into 3-4 datasets,
+#' grouping files by variant type and workflow, with titles having the format
+#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline".
+#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls.
+#' This makes sense for NF because Germline calls can be treated differently.
+#' This uses version 1 of all files and creates a Draft version of the dataset.
+#'
+#' We basically just need the syn entity id, variant type, and workflow to group the files.
+#' Instead of getting this info through running `map_*` as in the example,
+#' you may prefer using a fileview, in which case you just need to download a table from a fileview
+#' that has `id` => `output_id` + the `dataType` and `workflow` annotations.
+#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_
+#' files are annotated, then you have to use `map_*`.
+#'
+#' Finally, datasets cannot use the same name if stored in the same project,
+#' so if there are multiple batches, the names will have to be made unique by adding
+#' the batch number, source data id, processing date, or whatever makes sense.
+#'
+#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
+#' @param parent Synapse id of parent project where the datasets will live.
+#' Usually the same parent project storing the files, but in some cases it may be a different project.
+#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
+#' @param dry_run If TRUE, don't actually store the datasets but instead return the objects for inspection or modification,
+#' e.g. setting a better title or description than the default.
+#' @import data.table
+#' @return A list of dataset objects.
+#' @export
+#' @examples
+#'\dontrun{
+#' syn_out <- "syn26648589"
+#' m <- map_sample_output_sarek(syn_out)
+#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
+#'}
+nf_sarek_datasets <- function(output_map,
+                              parent,
+                              verbose = TRUE,
+                              dry_run = TRUE) {
+
+  output_map <- as.data.table(output_map)
+  if(!is.null(output_map$dataType)) {
+    data_type <- unique(output_map$dataType)
+    if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
+    gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
+    if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
+    gvtype <- switch(gvtype,
+                     SomaticVariants = "Somatic",
+                     GermlineVariants = "Germline")
+
+  } else {
+    # Detect genomic variants type from first path name
+    gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
+      "Somatic"
+    } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
+      "Germline"
+    } else {
+      stop("Could not assign either Germline or Somatic labels based on main output folder.
+           Check whether folder contains mixed types or is not the right one.")
+    }
+  }
+  pattern <- "vcf.gz(.tbi)?$"
+  WORKFLOW <- c("FreeBayes", "Mutect2", "Strelka", "DeepVariant")
+  datasets <- list()
+  for(i in WORKFLOW) {
+    dataset <- output_map[workflow == i & grepl(pattern, output_name)]
+    if(nrow(dataset)) {
+      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
+
+      name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
+      dataset_items <- lapply(dataset$output_id, function(entity) list(entityId = entity, versionNumber = 1L))
+
+      syn_dataset <- synapseclient$Dataset(name = name,
+                                           parent = parent,
+                                           dataset_items = dataset_items)
+
+      if(dry_run) datasets[[i]] <- syn_dataset else datasets[[i]] <- .syn$store(syn_dataset)
+    }
+  }
+
+  return(datasets)
+
+}
+
+
+#' Create NF STAR-Salmon dataset
+#'
+#' Organize gene expression quantification files (.sf) into one dataset.
+#' Uses version 1 of the files and creates a Draft dataset.
+#' See also `nf_sarek_datasets`.
+#'
+#' @inheritParams nf_sarek_datasets
+#' @param output_map The `data.table` returned from `map_sample_output_sarek`.
+#' @export
+#' @examples
+#'\dontrun{
+#' syn_out <- "syn30840584"
+#' m <- map_sample_output_rnaseq(syn_out)
+#' datasets <- nf_rnaseq_dataset(m, out, parent = "syn4939902", dry_run = F)
+#'}
+nf_star_salmon_datasets <- function(output_map,
+                                    parent,
+                                    verbose = TRUE,
+                                    dry_run = TRUE) {
+
+  # Select the .sf and index files
+  dataset_items <- output_map[grepl(".sf$", output_name), output_id]
+  name <- "Gene Expression Quantification from RNA-seq"
+  dataset_items <- lapply(dataset_items, function(entity) list(entityId = entity, versionNumber = 1L))
+  dataset <- synapseclient$Dataset(name = name,
+                                   parent = parent,
+                                   dataset_items = dataset_items)
+
+  if(dry_run) dataset else .syn$store(syn_dataset)
+
+}
+
+

From ffdb66147c5a34b37b90559b30fff48f9c466b5a Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 15:18:25 -0600
Subject: [PATCH 02/25] Parameterize workflow per review comment

---
 R/datasets.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index cdbf0ce8..e7149318 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -35,6 +35,7 @@
 #'}
 nf_sarek_datasets <- function(output_map,
                               parent,
+                              workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
                               verbose = TRUE,
                               dry_run = TRUE) {
 
@@ -60,9 +61,9 @@ nf_sarek_datasets <- function(output_map,
     }
   }
   pattern <- "vcf.gz(.tbi)?$"
-  WORKFLOW <- c("FreeBayes", "Mutect2", "Strelka", "DeepVariant")
+  workflow <- match.arg(workflow, several.ok = TRUE)
   datasets <- list()
-  for(i in WORKFLOW) {
+  for(i in workflow) {
     dataset <- output_map[workflow == i & grepl(pattern, output_name)]
     if(nrow(dataset)) {
       if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
@@ -86,7 +87,7 @@ nf_sarek_datasets <- function(output_map,
 #' Create NF STAR-Salmon dataset
 #'
 #' Organize gene expression quantification files (.sf) into one dataset.
-#' Uses version 1 of the files and creates a Draft dataset.
+#' Uses version 1 of the files and creates a "Draft" dataset.
 #' See also `nf_sarek_datasets`.
 #'
 #' @inheritParams nf_sarek_datasets

From 1603e7f9446cffa9a501fabb4cb2f0c11a1f538c Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 16:11:44 -0600
Subject: [PATCH 03/25] Factor out dataset item constructor to make more functional and flexible

---
 R/datasets.R | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index e7149318..376b87bc 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -1,3 +1,19 @@
+#' As dataset items
+#'
+#' Helper taking entity ids to create records in the structure needed for dataset creation.
+#' Note: Currently does not check that ids are "file" entities; technically dataset items can't be folders, for example.
+#'
+#' @param ids Ids of entities to make into dataset items.
+#' @param version Integer for version that will be used for all items, e.g. 1.
+#' If NULL, this will look up the latest version for each id and use that.
+as_dataset_items <- function(ids, version = NULL) {
+  if(is.null(version)) {
+    version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
+  }
+  dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version)
+  dataset_items
+}
+
 #' Create Sarek-processed datasets
 #'
 #' Organize variant call files from Nextflow Sarek into 3-4 datasets,
@@ -69,7 +85,7 @@ nf_sarek_datasets <- function(output_map,
       if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
 
       name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
-      dataset_items <- lapply(dataset$output_id, function(entity) list(entityId = entity, versionNumber = 1L))
+      dataset_items <- as_dataset_items(dataset$output_id)
 
       syn_dataset <- synapseclient$Dataset(name = name,
                                            parent = parent,
                                            dataset_items = dataset_items)
@@ -105,10 +121,9 @@ nf_star_salmon_datasets <- function(output_map,
                                     dry_run = TRUE) {
 
   # Select the .sf and index files
-  dataset_items <- output_map[grepl(".sf$", output_name), output_id]
-  name <- "Gene Expression Quantification from RNA-seq"
-  dataset_items <- lapply(dataset_items, function(entity) list(entityId = entity, versionNumber = 1L))
-  dataset <- synapseclient$Dataset(name = name,
+  output_ids <- output_map[grepl(".sf$", output_name), output_id]
+  dataset_items <- as_dataset_items(output_ids)
+  dataset <- synapseclient$Dataset(name = "Gene Expression Quantification from RNA-seq",
                                    parent = parent,
                                    dataset_items = dataset_items)
 
   if(dry_run) dataset else .syn$store(syn_dataset)
 
 }

From 04b08706c424f6fd23c3db32cc8d5a3691a23280 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 18:09:53 -0600
Subject: [PATCH 04/25] Implement util for adding to dataset collection

---
 R/datasets.R | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/R/datasets.R b/R/datasets.R
index 376b87bc..6d179da3 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -11,8 +11,25 @@ as_dataset_items <- function(ids, version = NULL) {
     version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
   }
   dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version)
+  names(dataset_items) <- NULL # need to unname list for API
   dataset_items
-}
+}
+
+#' Add to dataset collection
+#'
+#' Add dataset(s) to an _existing_ dataset collection.
+#' Notes:
+#' - If somehow non-dataset entities are included, Synapse will ignore these ids.
+#' - Implemented with lower-level REST calls because the Python client (as of v2.7) doesn't yet seem to have the method for dataset collections.
+#'
+#' @param dataset_ids Character vector of one or more dataset entity ids to add.
+#' @param collection_id Id of the dataset collection. +add_to_dataset_collection <- function(dataset_ids, collection_id) { + e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + items <- as_dataset_items(dataset_ids) + e$items <- c(e$items, items) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +} #' Create Sarek-processed datasets #' From fae5fa46eef999adee76902a4abe9c81a6ceaa5e Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 29 Jun 2023 18:11:10 -0600 Subject: [PATCH 05/25] Update explanation --- R/datasets.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 6d179da3..feb50038 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -1,7 +1,6 @@ #' As dataset items #' -#' Helper taking entity ids to create records in the structure needed for dataset creation. -#' Note: Currently does not check that ids are "file" entities; technically dataset items can't be folders, for example. +#' Helper taking entity ids to create records in the structure needed for dataset items or dataset collection items. #' #' @param ids Ids of entities to make into dataset items. #' @param version Integer for version that will be used for all items, e.g. 1. From e0bb7310ac9cffcc719dea514c0167c650c26a71 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 30 Jun 2023 13:02:29 -0600 Subject: [PATCH 06/25] Refactor into base fun --- R/datasets.R | 58 +++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index feb50038..9bd6cbf7 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -3,13 +3,13 @@ #' Helper taking entity ids to create records in the structure needed for dataset items or dataset collection items. #' #' @param ids Ids of entities to make into dataset items. -#' @param version Integer for version that will be used for all items, e.g. 1. +#' @param item_version Integer for version that will be used for all items, e.g. 1. #' If NULL, this will look up the latest version for each id and use that. -as_dataset_items <- function(ids, version = NULL) { - if(is.null(version)) { - version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber) +as_dataset_items <- function(ids, item_version = NULL) { + if(is.null(item_version)) { + item_version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber) } - dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version) + dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version) names(dataset_items) <- NULL # need to unname list for API dataset_items } @@ -30,6 +30,25 @@ add_to_dataset_collection <- function(dataset_ids, collection_id) { .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) } +#' New dataset with given items +#' +#' Make a _new_ dataset with given set of entities. +#' +#' @inheritParams as_dataset_items +#' @param name Name of the dataset. It should be unique within the `parent` project. +#' @param parent Synapse id of parent project where the datasets will live. +#' @param items Id(s) of items to include. +#' Usually the same parent project storing the files, but in some cases it may be a different project. 
+#' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification.
+new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) {
+
+  dataset_items <- as_dataset_items(items, item_version)
+  dataset <- synapseclient$Dataset(name = name,
+                                   parent = parent,
+                                   dataset_items = dataset_items)
+  if(dry_run) dataset else .syn$store(dataset)
+}
+
 #' Create Sarek-processed datasets
 #'
 #' Organize variant call files from Nextflow Sarek into 3-4 datasets,
@@ -50,12 +69,9 @@ add_to_dataset_collection <- function(dataset_ids, collection_id) {
 #' so if there are multiple batches, the names will have to be made unique by adding
 #' the batch number, source data id, processing date, or whatever makes sense.
 #'
+#' @inheritParams new_dataset
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
-#' @param parent Synapse id of parent project where the datasets will live.
-#' Usually the same parent project storing the files, but in some cases it may be a different project.
 #' @param verbose Optional, whether to be verbose -- defaults to TRUE.
-#' @param dry_run If TRUE, don't actually store the datasets but instead return the objects for inspection or modification,
-#' e.g. setting a better title or description than the default.
 #' @import data.table
 #' @return A list of dataset objects.
 #' @export
@@ -98,15 +114,9 @@ nf_sarek_datasets <- function(output_map,
   for(i in workflow) {
     dataset <- output_map[workflow == i & grepl(pattern, output_name)]
     if(nrow(dataset)) {
-      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
-
+      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
       name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
+      datasets[[i]] <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = dry_run)
     }
   }
@@ -122,6 +132,7 @@ nf_sarek_datasets <- function(output_map,
 #' Uses version 1 of the files and creates a "Draft" dataset.
 #' See also `nf_sarek_datasets`.
 #'
+#' @inheritParams new_dataset
 #' @inheritParams nf_sarek_datasets
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`.
#' @export @@ -138,13 +149,8 @@ nf_star_salmon_datasets <- function(output_map, # Select the .sf and index files output_ids <- output_map[grepl(".sf$", output_name), output_id] - dataset_items <- as_dataset_items(output_ids) - dataset <- synapseclient$Dataset(name = "Gene Expression Quantification from RNA-seq", - parent = parent, - dataset_items = dataset_items) - - if(dry_run) dataset else .syn$store(syn_dataset) - + new_dataset(name = "Gene Expression Quantification from RNA-seq", + parent = parent, + items = output_ids, + dry_run = dry_run) } - - From 963ade19594a210585ecbd4c24683bd55371c754 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 30 Jun 2023 17:56:18 -0600 Subject: [PATCH 07/25] Add tests --- tests/testthat/test_dataset_utils.R | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/testthat/test_dataset_utils.R diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R new file mode 100644 index 00000000..7ea57b92 --- /dev/null +++ b/tests/testthat/test_dataset_utils.R @@ -0,0 +1,53 @@ +test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { + + skip_if_no_synapseclient() + skip_if_no_token() + NF_test <- "syn26462036" + # Note that files are all version 2 on Synapse + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE) + .syn$delete(dataset) + expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) + +}) + +test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { + + skip_if_no_synapseclient() + skip_if_no_token() + NF_test <- "syn26462036" + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) + .syn$delete(dataset) + expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = "syn51239178", versionNumber = 1L), + list(entityId = "syn51239177", versionNumber = 1L)) + testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) + +}) + +# When providing an item not allowed to be a dataset item (a table or folder), the Synapse error will be something like +# ``` +# Error: synapseclient.core.exceptions.SynapseHTTPError: 400 Client Error: +# Currently, only files can be included in a dataset. 
syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity'
+# ```
+# This is a good, informative error
+test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (specifically, a table)", {
+
+  skip_if_no_synapseclient()
+  skip_if_no_token()
+  NF_test <- "syn26462036"
+  items <- c("syn51239179",
+             "syn51239178",
+             "syn27242487") # This is a table
+  testthat::expect_error(dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE))
+})
+

From 1048a42dc1db39f455b2d440a276ff3bb8cd50ba Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Sat, 1 Jul 2023 15:47:03 -0600
Subject: [PATCH 08/25] Reorganize

---
 R/datasets.R | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 9bd6cbf7..bc6cd5a8 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -7,29 +7,13 @@
 #' If NULL, this will look up the latest version for each id and use that.
 as_dataset_items <- function(ids, item_version = NULL) {
   if(is.null(item_version)) {
-    item_version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
+    item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber)
   }
   dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version)
   names(dataset_items) <- NULL # need to unname list for API
   dataset_items
-}
+}
 
-#' Add to dataset collection
-#'
-#' Add dataset(s) to an _existing_ dataset collection.
-#' Notes:
-#' - If somehow non-dataset entities are included, Synapse will ignore these ids.
-#' - Implemented with lower-level REST calls because the Python client (as of v2.7) doesn't yet seem to have the method for dataset collections.
-#'
-#' @param dataset_ids Character vector of one or more dataset entity ids to add.
-#' @param collection_id Id of the dataset collection.
-add_to_dataset_collection <- function(dataset_ids, collection_id) {
-  e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
-  items <- as_dataset_items(dataset_ids)
-  e$items <- c(e$items, items)
-  .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE))
-}
 
 #' New dataset with given items
             items = output_ids,
             dry_run = dry_run)
 }
+
+# -- Dataset Collections -------------------------------------------------------#
+
+#' Add to dataset collection
+#'
+#' Add dataset(s) to an _existing_ dataset collection.
+#' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+#' implement an `add_scope`-type method for dataset collections that is available for entity view.
+#' Notes:
+#' - TODO: Check that ids are unique with existing items or Synapse will reject.
+#' - TODO: Check that entities are datasets or this will fail.
+#'
+#' @param dataset_ids Character vector of one or more dataset entity ids to add.
+#' @param collection_id Id of the dataset collection.
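+#' @examples
+#' # Hypothetical usage sketch -- the syn ids below are placeholders, not real entities:
+#'\dontrun{
+#' add_to_dataset_collection(dataset_ids = c("syn111", "syn222"), collection_id = "syn333")
+#'}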
+add_to_dataset_collection <- function(dataset_ids, collection_id) { + e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + items <- as_dataset_items(dataset_ids) + e$items <- c(e$items, items) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +} + +# ------------------------------------------------------------------------------# From 0de33316fcdbba121085fbeeca72ef43c1793de5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 16:48:58 -0600 Subject: [PATCH 09/25] Resolve TODO checks --- R/datasets.R | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index bc6cd5a8..39fb9786 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -139,24 +139,56 @@ nf_star_salmon_datasets <- function(output_map, dry_run = dry_run) } +# -- Checks------------- -------------------------------------------------------# + +#' Check whether entity is dataset +#' +#' @keywords internal +is_dataset <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset" + }, + error = function(e) FALSE) +} + + # -- Dataset Collections -------------------------------------------------------# #' Add to dataset collection #' -#' Add dataset(s) to an _existing_ dataset collection. +#' Add dataset(s) to an _existing_ dataset collection. #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet #' implement an `add_scope`-type method for dataset collections that is available for entity view. -#' Notes: -#' - TODO: Check that ids are unique with existing items or Synapse will reject. -#' - TODO: Check that entities are datasets or this will fail. #' -#' @param dataset_ids Character vector of one or more dataset entity ids to add. +#' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. -add_to_dataset_collection <- function(dataset_ids, collection_id) { - e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - items <- as_dataset_items(dataset_ids) - e$items <- c(e$items, items) - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities with warning (default FALSE). +#' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. +#' @param replace If specified items are current items in the collection, replace items with the current version? +#' The safe default is FALSE to ensure any version changes are intentional. 
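+#' @examples
+#' # Hypothetical sketch with the safety flags -- ids are placeholders:
+#'\dontrun{
+#' add_to_dataset_collection(items = c("syn111", "syn222"), collection_id = "syn333",
+#'                           check_items = TRUE, replace = FALSE)
+#'}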
+add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { + + if(check_items) { + confirmed_dataset <- sapply(items, is_dataset) + if(any(!confirmed_dataset)) { + warning("Items which are not dataset entities will be ignored:", items[!confirmed_dataset]) + items <- items[confirmed_dataset] + } + } + dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + current_items <- sapply(dc$items, function(i) i$entityId) + + # Synapse will normally throw error if trying to add a dataset already in collection + if(any(items %in% current_items) && !replace) { + stop("Datasets to be added are already in collection. Use `replace = TRUE` if you want to override existing dataset versions.") + } else if (any(items %in% current_items && replace)) { + dc$items <- as_dataset_items(union(current_items, items)) + message("Some datasets replaced with their most current version:", items[items %in% current_items]) + } else { + dc$items <- c(dc$items, as_dataset_items(items)) + } + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } # ------------------------------------------------------------------------------# From eea014f8ad6a6f4ead756b1ecaf59cff57d046ad Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 17:01:22 -0600 Subject: [PATCH 10/25] Export, document --- R/datasets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 39fb9786..15162628 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -159,7 +159,7 @@ is_dataset <- function(id) { #' #' Add dataset(s) to an _existing_ dataset collection. #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement an `add_scope`-type method for dataset collections that is available for entity view. +#' implement dataset collection methods. #' #' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. From 11488a96884bc30b28a2ce7fdda5ee61eead705b Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 17:03:58 -0600 Subject: [PATCH 11/25] More updates of docs to be consistent with changes --- R/datasets.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 15162628..f2b397ea 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -40,7 +40,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE #' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". #' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. #' This makes sense for NF because Germline calls can be treated differently. -#' This uses version 1 of all files and creates a Draft version of the dataset. +#' This uses latest version of all files and creates a Draft version of the dataset. #' #' Since we basically just need the syn entity id, variant type, and workflow to group the files. #' Instead of getting this info through running `map_*` as in the example, @@ -113,7 +113,7 @@ nf_sarek_datasets <- function(output_map, #' Create NF STAR-Salmon dataset #' #' Organize gene expression quantification files (.sf) into one dataset. -#' Uses version 1 of the files and creates a "Draft" dataset. 
+#' Uses latest version of the files and creates a "Draft" dataset. #' See also `nf_sarek_datasets`. #' #' @inheritParams new_dataset @@ -167,6 +167,7 @@ is_dataset <- function(id) { #' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. #' @param replace If specified items are current items in the collection, replace items with the current version? #' The safe default is FALSE to ensure any version changes are intentional. +#' @export add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { if(check_items) { @@ -191,4 +192,6 @@ add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } +#' + # ------------------------------------------------------------------------------# From 3ecc3a2007b58508a9ad0f84c334eb17251eb42d Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 08:47:01 -0600 Subject: [PATCH 12/25] Add util for update and corresponding unit test --- R/datasets.R | 55 +++++++++++++------ .../testthat/test_dataset_collection_utils.R | 30 ++++++++++ 2 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 tests/testthat/test_dataset_collection_utils.R diff --git a/R/datasets.R b/R/datasets.R index f2b397ea..21837460 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -155,43 +155,64 @@ is_dataset <- function(id) { # -- Dataset Collections -------------------------------------------------------# +#' Apply updates to current collection of items +#' +#' A collection of items has items of the form `list(entityId = id, versionNumber = x)`. +#' Given another collection that can represent updates of both types replace or add, +#' this applies an update join keyed on `entityId` for the replace and +#' appends the new items to get the updated collection. +#' +#' @param current_items List of lists representing a collection of items. +#' @param update_items Collection of items to apply as updates to `current_items`. +update_items <- function(current_coll, update_coll) { + + current_coll <- data.table::rbindlist(current_coll) + update_coll <- data.table::rbindlist(update_coll) + updated_coll <- rbind( + current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber], # replace + update_coll[!current_coll, on = .(entityId)]) # add + updated_coll <- apply(updated_coll, 1, as.list) + updated_coll +} + #' Add to dataset collection #' -#' Add dataset(s) to an _existing_ dataset collection. +#' Add dataset(s) to an _existing_ dataset collection, using their current (latest) version. +#' If a dataset attempting to be added happens to already be in the dataset collection, +#' this might lead to version conflicts, so the update won't processed unless `force` is true. +#' #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement dataset collection methods. +#' implement dataset collection methods. #' -#' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. -#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities with warning (default FALSE). -#' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. 
-#' @param replace If specified items are current items in the collection, replace items with the current version? -#' The safe default is FALSE to ensure any version changes are intentional. +#' @param items Character vector of one or more dataset entity ids to add. +#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities (default FALSE) +#' to help avoid Synapse error. This may be useful given that sometimes "datasets" can be folder or file entities. +#' Note that using check will be slower. +#' @param force If some items are currently in the collection with a different version, +#' should these items be force added using current version? The safe default is FALSE to ensure any version changes are intentional. #' @export -add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { +add_to_dataset_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { if(check_items) { confirmed_dataset <- sapply(items, is_dataset) if(any(!confirmed_dataset)) { - warning("Items which are not dataset entities will be ignored:", items[!confirmed_dataset]) + warning("Items which are not dataset entities will not be added:", items[!confirmed_dataset]) items <- items[confirmed_dataset] } } dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - current_items <- sapply(dc$items, function(i) i$entityId) - # Synapse will normally throw error if trying to add a dataset already in collection - if(any(items %in% current_items) && !replace) { - stop("Datasets to be added are already in collection. Use `replace = TRUE` if you want to override existing dataset versions.") - } else if (any(items %in% current_items && replace)) { - dc$items <- as_dataset_items(union(current_items, items)) - message("Some datasets replaced with their most current version:", items[items %in% current_items]) + if(any(items %in% current_items) && !force) { + stop("Some datasets to be added are already in collection. 
Use `force = TRUE` to allow replacing existing dataset versions.") + } else if (any(items %in% current_items) && force) { + dc$items <- update_items(dc$items, as_dataset_items(items)) } else { dc$items <- c(dc$items, as_dataset_items(items)) } .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } -#' + # ------------------------------------------------------------------------------# diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R new file mode 100644 index 00000000..9d377084 --- /dev/null +++ b/tests/testthat/test_dataset_collection_utils.R @@ -0,0 +1,30 @@ +test_that("Update helper for dataset collection items works with combined replace/add updates", { + + current_items <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update_items <- list( + list(entityId = "syn3", versionNumber = 2L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 2L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + testthat::expect_equal(update_items(current_items, update_items), + expected) + +}) + + + + From 103f7cdae2ef06750431561a06117a22542adc11 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:32:15 -0600 Subject: [PATCH 13/25] Update unit test and fix discovered bug --- R/datasets.R | 11 ++++++----- tests/testthat/test_dataset_collection_utils.R | 11 ++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 21837460..40a6d9cb 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -168,11 +168,12 @@ update_items <- function(current_coll, update_coll) { current_coll <- data.table::rbindlist(current_coll) update_coll <- data.table::rbindlist(update_coll) - updated_coll <- rbind( - current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber], # replace - update_coll[!current_coll, on = .(entityId)]) # add - updated_coll <- apply(updated_coll, 1, as.list) - updated_coll + replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] + added <- update_coll[!current_coll, on = .(entityId)] + updated <- rbind(replaced, added) + # reconversion; using pure apply as.list coerces versionNumber into char + updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) + updated } #' Add to dataset collection diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R index 9d377084..73ecdf09 100644 --- a/tests/testthat/test_dataset_collection_utils.R +++ b/tests/testthat/test_dataset_collection_utils.R @@ -1,12 +1,12 @@ -test_that("Update helper for dataset collection items works with combined replace/add updates", { +test_that("Update helper for item collection works with combined 'replace' and 'add' update types", { - current_items <- list( + current <- list( list(entityId = "syn1", versionNumber = 1L), list(entityId = "syn2", versionNumber = 1L), list(entityId = "syn3", versionNumber = 1L) ) - update_items <- list( + update <- list( list(entityId = "syn3", versionNumber = 2L), list(entityId = "syn4", 
versionNumber = 2L), list(entityId = "syn5", versionNumber = 2L) @@ -20,11 +20,8 @@ test_that("Update helper for dataset collection items works with combined replac list(entityId = "syn5", versionNumber = 2L) ) - testthat::expect_equal(update_items(current_items, update_items), + testthat::expect_identical(update_items(current, update), expected) }) - - - From cefa999261b4f2265c92a081bed9ce5419028e00 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:44:38 -0600 Subject: [PATCH 14/25] More unit tests --- .../testthat/test_dataset_collection_utils.R | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R index 73ecdf09..3177b1dd 100644 --- a/tests/testthat/test_dataset_collection_utils.R +++ b/tests/testthat/test_dataset_collection_utils.R @@ -21,7 +21,58 @@ test_that("Update helper for item collection works with combined 'replace' and ' ) testthat::expect_identical(update_items(current, update), - expected) + expected) }) + +test_that("Update helper for item collection works with just 'replace' update type", { + + current <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update <- list( + list(entityId = "syn2", versionNumber = 2L), + list(entityId = "syn3", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 2L), + list(entityId = "syn3", versionNumber = 2L) + ) + + testthat::expect_identical(update_items(current, update), + expected) + +}) + + +test_that("Update helper for item collection works with just 'add' update type", { + + current <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update <- list( + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + testthat::expect_identical(update_items(current, update), + expected) + +}) From 50bff87fcb65e67c5f7715401d3cb84d9e1cc2d3 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:44:52 -0600 Subject: [PATCH 15/25] Add another collection util --- R/datasets.R | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 40a6d9cb..ada1c02d 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -153,7 +153,7 @@ is_dataset <- function(id) { } -# -- Dataset Collections -------------------------------------------------------# +# -- Collections ---------------------------------------------------------------# #' Apply updates to current collection of items #' @@ -176,6 +176,36 @@ update_items <- function(current_coll, update_coll) { updated } +#' Update item versions to latest in a collection +#' +#' Update the collection so that all items or a subset of items reference their latest version. +#' This should work for both datasets (collection of files) and dataset collections (collection of datasets). 
+#'
+#' @param collection_id
+#' @param items Vector of dataset ids for which to update reference to latest version,
+#' or "all" (default) to update all in the dataset collection.
+#' @export
+use_latest_in_collection(collection_id, items = "all") {
+  coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
+  item_set <- match.arg(items)
+  current_items <- sapply(coll$items, function(i) i$entityId)
+
+  if(item_set == "all") {
+    coll$items <- as_dataset_items(current_items)
+  } else {
+
+    # Check subset; if no check, this becomes like `add_to_dataset_collection`
+    if(!all(items %in% current_items)) {
+      warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items])
+      items <- items[items %in% current_items]
+      updated_items <- update_items(coll$items, as_dataset_items(items))
+      coll$items <- updated_items
+    }
+  }
+  .syn$store(coll)
+
+}

From 18832654de601a65dc48b87790796b09c1185d7f Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Sun, 2 Jul 2023 10:55:36 -0600
Subject: [PATCH 16/25] Generalize and reorganize

---
 R/datasets.R                                  | 242 +++++++++++-------
 ...ection_utils.R => test_collection_utils.R} |   0
 2 files changed, 145 insertions(+), 97 deletions(-)
 rename tests/testthat/{test_dataset_collection_utils.R => test_collection_utils.R} (100%)

diff --git a/R/datasets.R b/R/datasets.R
index ada1c02d..805d1516 100644
--- a/R/datasets.R
+++ b/R/datasets.R
+# -- Editing Collections -------------------------------------------------------#
+
+# General helpers that should work for both datasets (collection of files)
+# and dataset collections (collection of datasets).
+
+
+#' As collection items
+#'
+#' Helper taking entity ids to create records used for dataset items or dataset collection items.
+#' Collection items have the form `list(entityId = id, versionNumber = x)`.
+#'
+#' @param ids Ids of entities to make into dataset items.
+#' @param item_version Integer for version that will be used for all items, e.g. 1.
+#' If NULL, this will look up the latest version for each id and use that.
+#' @keywords internal
+as_coll_items <- function(ids, item_version = NULL) {
+  if(is.null(item_version)) {
+    item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber)
+  }
+  items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version)
+  names(items) <- NULL # need to unname list for API
+  items
+}
+
+
+#' Apply updates to current collection of items
+#'
+#' Given another collection that can represent updates of both types, "replace" or "add",
+#' this applies an update join keyed on `entityId` for the replace and
+#' appends the new items to get the updated collection.
+#'
+#' @param current_coll List of lists representing a collection of items.
+#' @param update_coll Collection of items to apply as updates to `current_coll`.
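+#' @examples
+#' # Illustrative sketch only: replace syn2 with its version 2 and add syn3
+#'\dontrun{
+#' current <- list(list(entityId = "syn1", versionNumber = 1L),
+#'                 list(entityId = "syn2", versionNumber = 1L))
+#' update <- list(list(entityId = "syn2", versionNumber = 2L),
+#'                list(entityId = "syn3", versionNumber = 1L))
+#' update_items(current, update)
+#'}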
+update_items <- function(current_coll, update_coll) {
+
+  current_coll <- data.table::rbindlist(current_coll)
+  update_coll <- data.table::rbindlist(update_coll)
+  replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber]
+  added <- update_coll[!current_coll, on = .(entityId)]
+  updated <- rbind(replaced, added)
+  # reconversion; using pure apply as.list coerces versionNumber into char
+  updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
+  updated
+}
+
+
+#' Update item versions to "latest" in a collection
+#'
+#' Update an _existing_ collection so that all items or a subset of items reference their latest version.
+#' This should work for both datasets (collection of files) and dataset collections (collection of datasets).
+#'
+#' @param collection_id
+#' @param items Vector of dataset ids for which to update reference to latest version,
+#' or "all" (default) to update all in the dataset collection.
+#' @export
+use_latest_in_collection(collection_id, items = "all") {
+  coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
+  item_set <- match.arg(items)
+  current_items <- sapply(coll$items, function(i) i$entityId)
+
+  if(item_set == "all") {
+    coll$items <- as_coll_items(current_items)
+  } else {
+
+    # Check subset; if no check, this becomes `add_to_collection`
+    if(!all(items %in% current_items)) {
+      warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items])
+      items <- items[items %in% current_items]
+      updated_items <- update_items(coll$items, as_coll_items(items))
+      coll$items <- updated_items
+    }
+  }
+  .syn$store(coll)
+
+}
+
+
+#' Add to collection
+#'
+#' Add item(s) to an _existing_ collection, using the item(s)' current (latest) version.
+#' For datasets, the items should be files. For dataset collections, the items should be datasets.
+#' If an item attempting to be added happens to already be in the collection,
+#' this might lead to version conflicts, so the update will be rejected unless `force` is true.
+#'
+#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+#' implement dataset collection class and methods (but the dataset class and relevant methods like `add_item` are available).
+#' Thus, while this is generic enough to handle both datasets and dataset collections,
+#' it is expected to be used more for dataset collections given that the dataset method is provided.
+#'
+#' @param collection_id Id of the collection.
+#' @param items Character vector of one or more dataset entity ids to add.
+#' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
+#' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
+#' @param force If some items are currently in the collection with a different version,
+#' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional.
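+#' @examples
+#' # Hypothetical sketch (ids are placeholders): add two datasets to a dataset
+#' # collection, verifying item types first and leaving existing versions alone
+#'\dontrun{
+#' add_to_collection("syn333", items = c("syn111", "syn222"), check_items = TRUE, force = FALSE)
+#'}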
+#' @export +add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { + + coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] + if(!length(coll_type)) stop("Entity is not a dataset or dataset collection.") + + if(check_items) { + expected_type_check <- if(coll_type == "dataset") is_file else is_dataset + correct_item_type <- sapply(items, expected_type_check) + if(any(!correct_item_type)) { + warning("Some items not correct entity types for the collection! These will not be added:", items[!correct_item_type]) + items <- items[correct_item_type] + } + } + + current_items <- sapply(coll$items, function(x) x$entityId) + if(any(items %in% current_items) && !force) { + stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") + } else { + coll$items <- update_items(coll$items, as_coll_items(items)) + } + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) +} + + +# -- Datasets ------------------------------------------------------------------# + +#' Create new dataset with given items #' -#' @inheritParams as_dataset_items +#' @inheritParams as_coll_items #' @param name Name of the dataset. It should be unique within the `parent` project. -#' @param parent Synapse id of parent project where the datasets will live. +#' @param parent Synapse id of parent project where the dataset will live. #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { - dataset_items <- as_dataset_items(items, item_version) + dataset_items <- as_coll_items(items, item_version) dataset <- synapseclient$Dataset(name = name, parent = parent, dataset_items = dataset_items) if(dry_run) dataset else .syn$store(dataset) } + #' Create Sarek-processed datasets #' #' Organize variant call files from Nextflow Sarek into 3-4 datasets, @@ -141,6 +247,9 @@ nf_star_salmon_datasets <- function(output_map, # -- Checks------------- -------------------------------------------------------# +# TODO Better composition to reduce code, esp. if more will be added +# TODO Potentially move somewhere else like basic_utils + #' Check whether entity is dataset #' #' @keywords internal @@ -152,98 +261,37 @@ is_dataset <- function(id) { error = function(e) FALSE) } - -# -- Collections ---------------------------------------------------------------# - -#' Apply updates to current collection of items +#' Check whether entity is dataset collection #' -#' A collection of items has items of the form `list(entityId = id, versionNumber = x)`. -#' Given another collection that can represent updates of both types replace or add, -#' this applies an update join keyed on `entityId` for the replace and -#' appends the new items to get the updated collection. -#' -#' @param current_items List of lists representing a collection of items. -#' @param update_items Collection of items to apply as updates to `current_items`. 
-update_items <- function(current_coll, update_coll) { - - current_coll <- data.table::rbindlist(current_coll) - update_coll <- data.table::rbindlist(update_coll) - replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] - added <- update_coll[!current_coll, on = .(entityId)] - updated <- rbind(replaced, added) - # reconversion; using pure apply as.list coerces versionNumber into char - updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) - updated -} +#' @keywords internal +is_dataset_collection <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" + }, + error = function(e) FALSE) +} -#' Update item versions to latest in a collection -#' -#' Update the collection so that all items or a subset of items reference their latest version. -#' This should work for both datasets (collection of files) and dataset collections (collection of datasets). +#' Check whether entity is dataset collection #' -#' @param collection_id -#' @param items Vector of dataset ids for which to update reference to latest version, -#' or "all" (default) to update all in the dataset collection. -#' @export -use_latest_in_collection(collection_id, items = "all") { - coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - item_set <- match.arg(items) - current_items <- sapply(coll$items, function(i) i$entityId) - - if(item_set == "all") { - coll$items <- as_dataset_items(current_items) - } else { - - # Check subset; if no check, this becomes like `add_to_dataset_collection` - if(!all(items %in% current_items)) { - warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items]) - items <- items[items %in% current_items] - updated_items <- update_items(coll$items, as_dataset_items(items)) - coll$items <- updated_items - } - } - .syn$store(coll) - +#' @keywords internal +is_dataset_collection <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" + }, + error = function(e) FALSE) } -#' Add to dataset collection -#' -#' Add dataset(s) to an _existing_ dataset collection, using their current (latest) version. -#' If a dataset attempting to be added happens to already be in the dataset collection, -#' this might lead to version conflicts, so the update won't processed unless `force` is true. +#' Check whether entity is file #' -#' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement dataset collection methods. -#' -#' @param collection_id Id of the dataset collection. -#' @param items Character vector of one or more dataset entity ids to add. -#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities (default FALSE) -#' to help avoid Synapse error. This may be useful given that sometimes "datasets" can be folder or file entities. -#' Note that using check will be slower. -#' @param force If some items are currently in the collection with a different version, -#' should these items be force added using current version? The safe default is FALSE to ensure any version changes are intentional. 
-#' @export -add_to_dataset_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { - - if(check_items) { - confirmed_dataset <- sapply(items, is_dataset) - if(any(!confirmed_dataset)) { - warning("Items which are not dataset entities will not be added:", items[!confirmed_dataset]) - items <- items[confirmed_dataset] - } - } - dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - - if(any(items %in% current_items) && !force) { - stop("Some datasets to be added are already in collection. Use `force = TRUE` to allow replacing existing dataset versions.") - } else if (any(items %in% current_items) && force) { - dc$items <- update_items(dc$items, as_dataset_items(items)) - } else { - dc$items <- c(dc$items, as_dataset_items(items)) - } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) +#' @keywords internal +is_file <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.File" + }, + error = function(e) FALSE) } - -# ------------------------------------------------------------------------------# diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_collection_utils.R similarity index 100% rename from tests/testthat/test_dataset_collection_utils.R rename to tests/testthat/test_collection_utils.R From a2bfed9339e924fc7059c8d57e50ca32f2f3ec50 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 16:34:40 -0600 Subject: [PATCH 17/25] More tests and code qc --- R/datasets.R | 54 +++++++------ tests/testthat/test_dataset_utils.R | 117 +++++++++++++++++++++++++--- 2 files changed, 134 insertions(+), 37 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 805d1516..4adfbed9 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -17,7 +17,7 @@ as_coll_items <- function(ids, item_version = NULL) { if(is.null(item_version)) { item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber) } - items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version) + items <- Map(function(id, version) list(entityId = id, versionNumber = version), ids, item_version) names(items) <- NULL # need to unname list for API items } @@ -25,7 +25,7 @@ as_coll_items <- function(ids, item_version = NULL) { #' Apply updates to current collection of items #' -#' Given another collection that can represent updates of both types replace or add, +#' Given another collection that can represent updates of both types "replace" or "add", #' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. #' @@ -53,24 +53,27 @@ update_items <- function(current_coll, update_coll) { #' @param items Vector of dataset ids for which to update reference to latest version, #' or "all" (default) to update all in the dataset collection. 
#' @export -use_latest_in_collection(collection_id, items = "all") { +use_latest_in_collection <- function(collection_id, items = "all") { coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - item_set <- match.arg(items) current_items <- sapply(coll$items, function(i) i$entityId) - if(item_set == "all") { + if((length(items) == 1) && (items == "all")) { coll$items <- as_coll_items(current_items) } else { # Check subset; if no check, this becomes `add_to_collection` if(!all(items %in% current_items)) { - warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items]) + warning("Subset given includes items not actually in collection: ", items[!items %in% current_items]) items <- items[items %in% current_items] - updated_items <- update_items(coll$items, as_coll_items(items)) - coll$items <- updated_items + if(!length(items)) { + warning("No qualifying items to update. No updates applied.") + return(coll) + } } + updated_items <- update_items(coll$items, as_coll_items(items)) + coll$items <- updated_items } - .syn$store(coll) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) } @@ -97,21 +100,24 @@ use_latest_in_collection(collection_id, items = "all") { add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] - if(!length(coll_type)) stop("Entity is not a dataset or dataset collection.") + coll_type <- which_coll_type(coll) if(check_items) { - expected_type_check <- if(coll_type == "dataset") is_file else is_dataset - correct_item_type <- sapply(items, expected_type_check) + item_type_check <- if(coll_type == "dataset") is_file else is_dataset + correct_item_type <- sapply(items, item_type_check) if(any(!correct_item_type)) { - warning("Some items not correct entity types for the collection! These will not be added:", items[!correct_item_type]) + warning("Some items not correct entity types for the collection and will not be added: ", items[!correct_item_type]) items <- items[correct_item_type] + if(!length(items)) { + warning("No qualifying items to add. No updates applied.", call. = FALSE) + return(coll) + } } } current_items <- sapply(coll$items, function(x) x$entityId) if(any(items %in% current_items) && !force) { - stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") + stop("Some items to be added are already in collection. 
Use `force = TRUE` to allow replacing existing versions.") } else { coll$items <- update_items(coll$items, as_coll_items(items)) } @@ -272,15 +278,14 @@ is_dataset_collection <- function(id) { error = function(e) FALSE) } -#' Check whether entity is dataset collection + +#' Which collection type #' -#' @keywords internal -is_dataset_collection <- function(id) { - tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) - entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" - }, - error = function(e) FALSE) +#' Checks for a valid collection type or returns error +#' +which_coll_type <- function(coll) { + coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] + if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.") } #' Check whether entity is file @@ -289,9 +294,8 @@ is_dataset_collection <- function(id) { is_file <- function(id) { tryCatch({ entity <- .syn$get(id, downloadFile = FALSE) - entity$properties$concreteType == "org.sagebionetworks.repo.model.File" + entity$properties$concreteType == "org.sagebionetworks.repo.model.FileEntity" }, error = function(e) FALSE) } - diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R index 7ea57b92..5bef2af3 100644 --- a/tests/testthat/test_dataset_utils.R +++ b/tests/testthat/test_dataset_utils.R @@ -1,3 +1,16 @@ +# Create a basic draft dataset from some files at version 1; all files have a latest version 2 +# Returns dataset id only +create_dataset_fixture <- function(instance = 1) { + NF_test <- "syn26462036" + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = paste0("test_fixture_dataset_", instance), parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) + dataset_id <- dataset$properties$id + dataset_id +} + + test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { skip_if_no_synapseclient() @@ -8,14 +21,15 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali "syn51239178", "syn51239177") dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE) - .syn$delete(dataset) - expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 2L), - list(entityId = "syn51239178", versionNumber = 2L), - list(entityId = "syn51239177", versionNumber = 2L)) + expected_items_in_dataset <- list( + list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - + .syn$delete(dataset) }) + test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { skip_if_no_synapseclient() @@ -25,21 +39,21 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali "syn51239178", "syn51239177") dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) - .syn$delete(dataset) - expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 1L), - list(entityId = "syn51239178", versionNumber = 1L), - list(entityId = "syn51239177", versionNumber = 1L)) + expected_items_in_dataset <- list( + list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = 
"syn51239178", versionNumber = 1L), + list(entityId = "syn51239177", versionNumber = 1L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - + .syn$delete(dataset) }) + # When providing an item not allowed to be a dataset item (a table or folder), the Synapse error will be something like # ``` # Error: synapseclient.core.exceptions.SynapseHTTPError: 400 Client Error: # Currently, only files can be included in a dataset. syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity' # ``` -# This is a good, informative error -test_that("Creating dataset with `new_dataset` will fail when trying to include a non-vaid item (specifically, a table)", { +test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (a table)", { skip_if_no_synapseclient() skip_if_no_token() @@ -51,3 +65,82 @@ test_that("Creating dataset with `new_dataset` will fail when trying to include }) +test_that("Updating a dataset to make a subset of files reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_id <- create_dataset_fixture() + items_to_update <- c("syn51239178", "syn51239177") # both should be updated to Version 2 + updated <- use_latest_in_collection(collection_id = dataset_id, items = items_to_update) + expected_updated_items <- list( + list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + testthat::expect_identical(updated$items, expected_updated_items) + .syn$delete(dataset_id) +}) + + +test_that("Updating a dataset to make _all_ files reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_id <- create_dataset_fixture() + expected_updated_items <- list( + list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + updated <- use_latest_in_collection(collection_id = dataset_id, items = "all") + testthat::expect_identical(updated$items, expected_updated_items) + .syn$delete(dataset_id) +}) + + +# Dataset collections ---------------------------------------------------------# + +test_that("Updating a dataset collection to make a subset of datasets reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + dataset_item_to_update <- "syn51809898" + .syn$create_snapshot_version(dataset_item_to_update) + new_version <- .syn$get(dataset_item_to_update, downloadFile = FALSE)$properties$versionNumber + DC <- use_latest_in_collection(collection_id = dataset_collection_id, items = dataset_item_to_update) + updated_item <- Filter(function(item) item$entityId == dataset_item_to_update, DC$items) + testthat::expect_equal(updated_item$versionNumber, new_version) +}) + + +test_that("Adding new dataset to dataset collection works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + coll_state <- coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}")) + one_more_item <- create_dataset_fixture() + new_coll_state <- add_to_collection(collection_id = dataset_collection_id, items = one_more_item) + testthat::expect_equal(length(new_coll_state$items), length(coll_state$items) + 1L) + # cleanup: set collection to previous items state + new_coll_state$items <- coll_state$items + 
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) + # delete dataset + .syn$delete(one_more_item) +}) + + +test_that("Adding non-datasets to dataset collection gives expected handling and warning", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + bad_items <- "syn51106349" # a folder + testthat::expect_warning(add_to_collection(collection_id = dataset_collection_id, items = bad_items, check_items = TRUE), + regexp = paste("No qualifying items to add. No updates applied.")) +}) + From cc11a48e190201aa1819a107e6a15d1638c87c04 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 17:41:09 -0600 Subject: [PATCH 18/25] Draft dataset citation util --- R/datasets.R | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 4adfbed9..be283da8 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -251,10 +251,37 @@ nf_star_salmon_datasets <- function(output_map, dry_run = dry_run) } +# -- Citations -----------------------------------------------------------------# + +# TODO Potentially move somewhere else + +#' Generate dataset citation +#' +#' This is currently more for demo purposes, to check how well current metadata +#' could be formatted into citation text. Datasets within the official +#' Portal Collection should work well enough, while there are no guarantees for +#' unofficial/community-contributed datasets. +#' @param dataset_id Dataset id. +#' @param format Currently just "Scientific Data" format. +#' @param output Currently just markdown format. There are many ways to +#' generate LaTeX or HTML from markdown. +#' @keywords internal +dataset_citation <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { + if(!is_dataset(id)) stop("Not a dataset") + meta <- .syn$get_annotations(id) + doi <- tryCatch(meta$doi, error = function(e) NULL) + title <- meta$title + creator <- meta$creator + yearPublished <- meta$yearPublished + repository <- "Synapse" + accession <- if(length(doi)) doi else glue::glue("https://www.synapse.org/#!Synapse:{id}") + glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).") +} + # -- Checks------------- -------------------------------------------------------# +# TODO Potentially move these type checks somewhere else like basic_utils # TODO Better composition to reduce code, esp. if more will be added -# TODO Potentially move somewhere else like basic_utils #' Check whether entity is dataset #' From 388746189512234394b8d104475426f7835a3691 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 3 Jul 2023 08:17:17 -0600 Subject: [PATCH 19/25] Mark internal or export, add doc details --- R/datasets.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index be283da8..164263b2 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -25,12 +25,15 @@ as_coll_items <- function(ids, item_version = NULL) { #' Apply updates to current collection of items #' -#' Given another collection that can represent updates of both types "replace" or "add", +#' This is essentially an internal transaction helper for trying to apply a changeset to a collection, +#' used in several higher-level collection utils. 
+#' Given the changeset that can represent updates of both types "replace" or "add", #' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. #' #' @param current_items List of lists representing a collection of items. #' @param update_items Collection of items to apply as updates to `current_items`. +#' @keywords internal update_items <- function(current_coll, update_coll) { current_coll <- data.table::rbindlist(current_coll) @@ -135,6 +138,7 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. +#' @export new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { dataset_items <- as_coll_items(items, item_version) @@ -266,7 +270,7 @@ nf_star_salmon_datasets <- function(output_map, #' @param output Currently just markdown format. There are many ways to #' generate LaTeX or HTML from markdown. #' @keywords internal -dataset_citation <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { +cite_dataset <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { if(!is_dataset(id)) stop("Not a dataset") meta <- .syn$get_annotations(id) doi <- tryCatch(meta$doi, error = function(e) NULL) @@ -310,6 +314,7 @@ is_dataset_collection <- function(id) { #' #' Checks for a valid collection type or returns error #' +#' @keywords internal which_coll_type <- function(coll) { coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.") From c861904964cb9d7246a3bb6086d244c5253ecc97 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 3 Jul 2023 08:52:07 -0600 Subject: [PATCH 20/25] Write new docs, update pkgdown reference --- _pkgdown.yml | 17 +++++++++++ man/add_to_collection.Rd | 31 +++++++++++++++++++ man/as_coll_items.Rd | 19 ++++++++++++ man/cite_dataset.Rd | 23 ++++++++++++++ man/is_dataset.Rd | 12 ++++++++ man/is_dataset_collection.Rd | 12 ++++++++ man/is_file.Rd | 12 ++++++++ man/new_dataset.Rd | 24 +++++++++++++++ man/nf_sarek_datasets.Rd | 53 +++++++++++++++++++++++++++++++++ man/nf_star_salmon_datasets.Rd | 29 ++++++++++++++++++ man/update_items.Rd | 21 +++++++++++++ man/use_latest_in_collection.Rd | 18 +++++++++++ man/which_coll_type.Rd | 12 ++++++++ 13 files changed, 283 insertions(+) create mode 100644 man/add_to_collection.Rd create mode 100644 man/as_coll_items.Rd create mode 100644 man/cite_dataset.Rd create mode 100644 man/is_dataset.Rd create mode 100644 man/is_dataset_collection.Rd create mode 100644 man/is_file.Rd create mode 100644 man/new_dataset.Rd create mode 100644 man/nf_sarek_datasets.Rd create mode 100644 man/nf_star_salmon_datasets.Rd create mode 100644 man/update_items.Rd create mode 100644 man/use_latest_in_collection.Rd create mode 100644 man/which_coll_type.Rd diff --git a/_pkgdown.yml b/_pkgdown.yml index 5c8b9ff0..47c3195d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -44,6 +44,23 @@ reference: - annotate_reports_sarek - annotate_with_tool_stats +- title: Dataset Creation and Management +- subtitle: General dataset creation and citation + desc: Create datasets generally, create citation example text +- 
contents:
+  - new_dataset
+  - cite_dataset
+- subtitle: Specialized dataset creation
+  desc: Create specialized datasets for nextflow processed data, i.e. with some custom construction queries and title templating.
+- contents:
+  - nf_sarek_datasets
+  - nf_star_salmon_datasets
+- subtitle: Working with dataset collections to manage datasets after creation
+- contents:
+  - add_to_collection
+  - use_latest_in_collection
+  - update_items
+
 - title: Data Model Utils
   desc: Talk to a JSON-LD data model important to the portal data (i.e. NF-metadata-dictionary)
 - contents:
diff --git a/man/add_to_collection.Rd b/man/add_to_collection.Rd
new file mode 100644
index 00000000..e6d774a4
--- /dev/null
+++ b/man/add_to_collection.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{add_to_collection}
+\alias{add_to_collection}
+\title{Add to collection}
+\usage{
+add_to_collection(collection_id, items, check_items = FALSE, force = FALSE)
+}
+\arguments{
+\item{collection_id}{Id of the collection.}
+
+\item{items}{Character vector of one or more dataset entity ids to add.}
+
+\item{check_items}{Whether to check that ids are really appropriate item types and remove non-appropriate item types
+to help avoid Synapse errors (default \code{FALSE} because in most cases \code{items} are curated, and using check will be slower).}
+
+\item{force}{If some items are currently in the collection with a different version,
+should these items be force-added using current version? The safe default is \code{FALSE} to ensure any such updates are intentional.}
+}
+\description{
+Add item(s) to an \emph{existing} collection, using the item(s)' current (latest) version.
+For datasets, the items should be files. For dataset collections, the items should be datasets.
+If an item attempting to be added happens to already be in the collection,
+this might lead to version conflicts, so the update will be rejected unless \code{force} is true.
+}
+\details{
+This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+implement dataset collection class and methods (but dataset and relevant methods like \code{add_item} are available).
+Thus, while this is generic enough to handle both datasets and dataset collections
+it is expected to be used more for dataset collections given that the dataset method is provided.
+}
diff --git a/man/as_coll_items.Rd b/man/as_coll_items.Rd
new file mode 100644
index 00000000..9c97671b
--- /dev/null
+++ b/man/as_coll_items.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{as_coll_items}
+\alias{as_coll_items}
+\title{As collection items}
+\usage{
+as_coll_items(ids, item_version = NULL)
+}
+\arguments{
+\item{ids}{Ids of entities to make into dataset items.}
+
+\item{item_version}{Integer for version that will be used for all items, e.g. 1.
+If NULL, this will look up the latest version for each id and use that.}
+}
+\description{
+Helper taking entity ids to create records used for dataset items or dataset collection items.
+Collection items have the form \code{list(entityId = id, versionNumber = x)}.
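+For example, \code{as_coll_items(c("syn111", "syn222"), item_version = 1L)} (ids hypothetical)
+returns two such records, both pinned at version 1.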
+} +\keyword{internal} diff --git a/man/cite_dataset.Rd b/man/cite_dataset.Rd new file mode 100644 index 00000000..6a53c059 --- /dev/null +++ b/man/cite_dataset.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{cite_dataset} +\alias{cite_dataset} +\title{Generate dataset citation} +\usage{ +cite_dataset(dataset_id, format = "Scientific Data", output = c("markdown")) +} +\arguments{ +\item{dataset_id}{Dataset id.} + +\item{format}{Currently just "Scientific Data" format.} + +\item{output}{Currently just markdown format. There are many ways to +generate LaTeX or HTML from markdown.} +} +\description{ +This is currently more for demo purposes, to check how well current metadata +could be formatted into citation text. Datasets within the official +Portal Collection should work well enough, while there are no guarantees for +unofficial/community-contributed datasets. +} +\keyword{internal} diff --git a/man/is_dataset.Rd b/man/is_dataset.Rd new file mode 100644 index 00000000..ea75bdb6 --- /dev/null +++ b/man/is_dataset.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_dataset} +\alias{is_dataset} +\title{Check whether entity is dataset} +\usage{ +is_dataset(id) +} +\description{ +Check whether entity is dataset +} +\keyword{internal} diff --git a/man/is_dataset_collection.Rd b/man/is_dataset_collection.Rd new file mode 100644 index 00000000..adb1036b --- /dev/null +++ b/man/is_dataset_collection.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_dataset_collection} +\alias{is_dataset_collection} +\title{Check whether entity is dataset collection} +\usage{ +is_dataset_collection(id) +} +\description{ +Check whether entity is dataset collection +} +\keyword{internal} diff --git a/man/is_file.Rd b/man/is_file.Rd new file mode 100644 index 00000000..09b1315c --- /dev/null +++ b/man/is_file.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_file} +\alias{is_file} +\title{Check whether entity is file} +\usage{ +is_file(id) +} +\description{ +Check whether entity is file +} +\keyword{internal} diff --git a/man/new_dataset.Rd b/man/new_dataset.Rd new file mode 100644 index 00000000..985d3935 --- /dev/null +++ b/man/new_dataset.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{new_dataset} +\alias{new_dataset} +\title{Create new dataset with given items} +\usage{ +new_dataset(name, parent, items, item_version = NULL, dry_run = TRUE) +} +\arguments{ +\item{name}{Name of the dataset. It should be unique within the \code{parent} project.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{items}{Id(s) of items to include. +Usually the same parent project storing the files, but in some cases it may be a different project.} + +\item{item_version}{Integer for version that will be used for all items, e.g. 1. 
+If NULL, this will look up the latest version for each id and use that.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\description{ +Create new dataset with given items +} diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd new file mode 100644 index 00000000..db4012d6 --- /dev/null +++ b/man/nf_sarek_datasets.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{nf_sarek_datasets} +\alias{nf_sarek_datasets} +\title{Create Sarek-processed datasets} +\usage{ +nf_sarek_datasets( + output_map, + parent, + workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), + verbose = TRUE, + dry_run = TRUE +) +} +\arguments{ +\item{output_map}{The \code{data.table} returned from \code{map_sample_output_sarek}. See details for alternatives.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{verbose}{Optional, whether to be verbose -- defaults to TRUE.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\value{ +A list of dataset objects. +} +\description{ +Organize variant call files from Nextflow Sarek into 3-4 datasets, +grouping files by variant type and workflow with titles having the format: +"{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". +As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. +This makes sense for NF because Germline calls can be treated differently. +This uses latest version of all files and creates a Draft version of the dataset. +} +\details{ +Since we basically just need the syn entity id, variant type, and workflow to group the files. +Instead of getting this info through running \verb{map_*} as in the example, +you may prefer using a fileview, in which case you just need to download a table from a fileview +that has \code{id} => \code{output_id} + the \code{dataType} and \code{workflow} annotations. +The fileview can be used \emph{after} the files are annotated. If you want to create datasets \emph{before} +files are annotated, then you have to use \verb{map_*}. + +Finally, datasets cannot use the same name if stored in the same project, +so if there are multiple batches, the names will have to be made unique by adding +the batch number, source data id, processing date, or whatever makes sense. 
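+For example, a second batch could be titled
+"Somatic Genomic Variants - Strelka Pipeline - Batch 2" (suffix shown for illustration only).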
+} +\examples{ +\dontrun{ +syn_out <- "syn26648589" +m <- map_sample_output_sarek(syn_out) +datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project +} +} diff --git a/man/nf_star_salmon_datasets.Rd b/man/nf_star_salmon_datasets.Rd new file mode 100644 index 00000000..64d0c149 --- /dev/null +++ b/man/nf_star_salmon_datasets.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{nf_star_salmon_datasets} +\alias{nf_star_salmon_datasets} +\title{Create NF STAR-Salmon dataset} +\usage{ +nf_star_salmon_datasets(output_map, parent, verbose = TRUE, dry_run = TRUE) +} +\arguments{ +\item{output_map}{The \code{data.table} returned from \code{map_sample_output_sarek}.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{verbose}{Optional, whether to be verbose -- defaults to TRUE.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\description{ +Organize gene expression quantification files (.sf) into one dataset. +Uses latest version of the files and creates a "Draft" dataset. +See also \code{nf_sarek_datasets}. +} +\examples{ +\dontrun{ +syn_out <- "syn30840584" +m <- map_sample_output_rnaseq(syn_out) +datasets <- nf_rnaseq_dataset(m, out, parent = "syn4939902", dry_run = F) +} +} diff --git a/man/update_items.Rd b/man/update_items.Rd new file mode 100644 index 00000000..c7a516e3 --- /dev/null +++ b/man/update_items.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{update_items} +\alias{update_items} +\title{Apply updates to current collection of items} +\usage{ +update_items(current_coll, update_coll) +} +\arguments{ +\item{current_items}{List of lists representing a collection of items.} + +\item{update_items}{Collection of items to apply as updates to \code{current_items}.} +} +\description{ +This is essentially an internal transaction helper for trying to apply a changeset to a collection, +used in several higher-level collection utils. +Given the changeset that can represent updates of both types "replace" or "add", +this applies an update join keyed on \code{entityId} for the replace and +appends the new items to get the updated collection. +} +\keyword{internal} diff --git a/man/use_latest_in_collection.Rd b/man/use_latest_in_collection.Rd new file mode 100644 index 00000000..00c376d9 --- /dev/null +++ b/man/use_latest_in_collection.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{use_latest_in_collection} +\alias{use_latest_in_collection} +\title{Update item versions to "latest" in a collection} +\usage{ +use_latest_in_collection(collection_id, items = "all") +} +\arguments{ +\item{collection_id}{} + +\item{items}{Vector of dataset ids for which to update reference to latest version, +or "all" (default) to update all in the dataset collection.} +} +\description{ +Update an \emph{existing} collection so that all items or a subset of items reference their latest version. +This should work for both datasets (collection of files) and dataset collections (collection of datasets). 
+}
diff --git a/man/which_coll_type.Rd b/man/which_coll_type.Rd
new file mode 100644
index 00000000..ab0eb92e
--- /dev/null
+++ b/man/which_coll_type.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{which_coll_type}
+\alias{which_coll_type}
+\title{Which collection type}
+\usage{
+which_coll_type(coll)
+}
+\description{
+Checks for a valid collection type or returns error
+}
+\keyword{internal}

From c1e152208cbb3385b4894b6632f7fa59745f9fe6 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:19:33 -0600
Subject: [PATCH 21/25] Fix bug for one failing test in test_dataset_utils

---
 tests/testthat/test_dataset_utils.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R
index 5bef2af3..214e0f66 100644
--- a/tests/testthat/test_dataset_utils.R
+++ b/tests/testthat/test_dataset_utils.R
@@ -111,7 +111,7 @@ test_that("Updating a dataset collection to make a subset of datasets reference
   new_version <- .syn$get(dataset_item_to_update, downloadFile = FALSE)$properties$versionNumber
   DC <- use_latest_in_collection(collection_id = dataset_collection_id, items = dataset_item_to_update)
   updated_item <- Filter(function(item) item$entityId == dataset_item_to_update, DC$items)
-  testthat::expect_equal(updated_item$versionNumber, new_version)
+  testthat::expect_equal(updated_item[[1]]$versionNumber, new_version)
 })

From d158cb6dac34123626d1022361c3b5c387b1147b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:38:28 -0600
Subject: [PATCH 22/25] Update testing notes

---
 tests/README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/README.md b/tests/README.md
index 4465b673..1afaac71 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -29,7 +29,13 @@
 If you are a potential contributor who is not an NF-OSI member, request to be added to the test repo for writing/running tests.
 **If `TEST_SYNAPSE_AUTH_TOKEN` is not available, dependent tests are simply skipped.**
 
-- During tests, the `TEST_SYNAPSE_AUTH_TOKEN` is temporarily set as `SYNAPSE_AUTH_TOKEN` to create the synapseclient object.
+- During tests, the `TEST_SYNAPSE_AUTH_TOKEN` is temporarily set as `SYNAPSE_AUTH_TOKEN`.
+
+- For pkg testing during development:
+  - Use `testthat::test_local()` in the pkg root to test all functions. Recommended.
+  - Write a `test_that` function to test a new package function and run it interactively.
+    Note `test_local` does things like importing Python modules into the environment (i.e. all the stuff that happens during package load),
+    so standalone function testing is a little trickier in that you have to check for and set up the dependencies in the environment yourself.
- Given that most tests depend on a successful login, and that tests are run according to the alphabetical naming of test*.R files,

From 2998455807da58180caadfe8df21d448871ec6af Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:38:50 -0600
Subject: [PATCH 23/25] Add dependency check

---
 tests/testthat/helpers.R | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R
index d4ff7c99..c3fce801 100644
--- a/tests/testthat/helpers.R
+++ b/tests/testthat/helpers.R
@@ -1,14 +1,24 @@
 # Implementing skips according to suggested handling when using reticulate
 # See https://rstudio.github.io/reticulate/articles/package.html
-
 # Skips tests on CRAN machines or other incompatible testing environments
 # where Python can't be configured so package checks don't fail
+
+# Skip if Python synapseclient module not installed/accessible
+# This is normally imported upon package load, see `zzz.R`
 skip_if_no_synapseclient <- function() {
   have_synapseclient <- py_module_available("synapseclient")
   if(!have_synapseclient)
     skip("synapseclient not available for testing")
 }
 
+# Skip if Python synapseutils module not installed/accessible
+# This is normally imported upon package load, see `zzz.R`
+skip_if_no_synapseutils <- function() {
+  have_synapseutils <- py_module_available("synapseutils")
+  if(!have_synapseutils)
+    skip("synapseutils not available for testing")
+}
+
 # Skip if no pandas; pandas is needed for smaller subset of functions in the package
 skip_if_no_pandas <- function() {
   have_pandas <- py_module_available("pandas")

From 76336a2d8cb0714f690d81623dafc284907c502b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:47:55 -0600
Subject: [PATCH 24/25] Address CMD check doc complaints, update NAMESPACE and bump pkg version

---
 DESCRIPTION                     | 2 +-
 NAMESPACE                       | 5 +++++
 R/datasets.R                    | 5 +++--
 man/add_to_collection.Rd        | 2 +-
 man/nf_sarek_datasets.Rd        | 2 ++
 man/use_latest_in_collection.Rd | 2 +-
 6 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b4e735c1..5cd8e93c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nfportalutils
 Title: NF Portal Utilities
-Version: 0.0.0.9210
+Version: 0.0.0.9300
 Authors@R: c(
     person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
            email = "robert.allaway@sagebionetworks.org",
diff --git a/NAMESPACE b/NAMESPACE
index 6c68ffc5..c739fdbd 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,6 +16,7 @@ export(add_people_from_table)
 export(add_publication_from_pubmed)
 export(add_publication_from_unpaywall)
 export(add_publications_from_file)
+export(add_to_collection)
 export(annotate_aligned_reads)
 export(annotate_called_variants)
 export(annotate_expression)
@@ -49,7 +50,10 @@ export(map_sample_io)
 export(map_sample_output_rnaseq)
 export(map_sample_output_sarek)
 export(missing_annotation_email)
+export(new_dataset)
 export(new_project)
+export(nf_sarek_datasets)
+export(nf_star_salmon_datasets)
 export(processing_flowchart)
 export(qc_manifest)
 export(register_study)
@@ -61,6 +65,7 @@ export(syn_login)
 export(syncBP_maf)
 export(table_query)
 export(update_study_annotations)
+export(use_latest_in_collection)
 export(wiki_mod)
 import(data.table)
 import(reticulate)
diff --git a/R/datasets.R b/R/datasets.R
index 164263b2..1cb41b07 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -52,7 +52,7 @@ update_items <- function(current_coll, update_coll) {
 #' Update an _existing_ collection so that all items or a subset of items reference their latest version.
 #' This should work for both datasets (collection of files) and dataset collections (collection of datasets).
 #'
-#' @param collection_id
+#' @param collection_id Collection id.
 #' @param items Vector of dataset ids for which to update reference to latest version,
 #' or "all" (default) to update all in the dataset collection.
 #' @export
@@ -93,7 +93,7 @@
 #' Thus, while this is generic enough to handle both datasets and dataset collections
 #' it is expected to be used more for dataset collections given that the dataset method is provided.
 #'
-#' @param collection_id Id of the collection.
+#' @param collection_id Collection id.
 #' @param items Character vector of one or more dataset entity ids to add.
 #' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
 #' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
@@ -171,6 +171,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE
 #'
 #' @inheritParams new_dataset
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
+#' @param workflow One of the workflows used.
 #' @param verbose Optional, whether to be verbose -- defaults to TRUE.
 #' @import data.table
 #' @return A list of dataset objects.
diff --git a/man/add_to_collection.Rd b/man/add_to_collection.Rd
index e6d774a4..0b52c9e6 100644
--- a/man/add_to_collection.Rd
+++ b/man/add_to_collection.Rd
@@ -7,7 +7,7 @@
 add_to_collection(collection_id, items, check_items = FALSE, force = FALSE)
 }
 \arguments{
-\item{collection_id}{Id of the collection.}
+\item{collection_id}{Collection id.}
 
 \item{items}{Character vector of one or more dataset entity ids to add.}
 
diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd
index db4012d6..46ddbe61 100644
--- a/man/nf_sarek_datasets.Rd
+++ b/man/nf_sarek_datasets.Rd
@@ -17,6 +17,8 @@ nf_sarek_datasets(
 
 \item{parent}{Synapse id of parent project where the dataset will live.}
 
+\item{workflow}{One of the workflows used.}
+
 \item{verbose}{Optional, whether to be verbose -- defaults to TRUE.}
 
 \item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.}
diff --git a/man/use_latest_in_collection.Rd b/man/use_latest_in_collection.Rd
index 00c376d9..8a8983d3 100644
--- a/man/use_latest_in_collection.Rd
+++ b/man/use_latest_in_collection.Rd
@@ -7,7 +7,7 @@
 use_latest_in_collection(collection_id, items = "all")
 }
 \arguments{
-\item{collection_id}{}
+\item{collection_id}{Collection id.}
 
 \item{items}{Vector of dataset ids for which to update reference to latest version,
 or "all" (default) to update all in the dataset collection.}
 
From 53657a24d052bb656dec3e7acf59f45019c235dc Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 17:58:25 -0600
Subject: [PATCH 25/25] Final updates, reorg, and notes for experimental stuff

---
 R/citation.R        | 54 +++++++++++++++++++++++++++++++++++++++++++++
 R/datasets.R        | 26 ----------------------
 _pkgdown.yml        |  5 +++--
 man/cite_dataset.Rd | 18 +++++++--------
 man/get_doi_meta.Rd | 18 ++++++++++++++++
 5 files changed, 84 insertions(+), 37 deletions(-)
 create mode 100644 R/citation.R
 create mode 100644 man/get_doi_meta.Rd

diff --git a/R/citation.R b/R/citation.R
new file mode 100644
index 00000000..7d145317
--- /dev/null
+++ b/R/citation.R
@@ -0,0 +1,54 @@
+# -- Citations -----------------------------------------------------------------#
+
+#' Get DOI metadata if it exists
+#'
+#' Returns list of metadata associated with the DOI if one exists, otherwise NULL.
+#' Currently usable for certain entity types like files or datasets,
+#' though this should be revised to make it more useful with other objects.
+#' Note: Internal/experimental use only, not for production use.
+#'
+#' @param id Dataset id.
+#' @keywords internal
+get_doi_meta <- function(id) {
+
+  # TODO Template query according to object type of id,
+  # i.e. folders can have dois, but they don't have version #s
+  obj <- .syn$get(id, downloadFile = FALSE)
+  versionNumber <- obj$properties$versionNumber # error if no versionNumber
+  tryCatch({
+    .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/doi?id={id}&type=DATASET&version={versionNumber}"))
+  },
+  error = function(e) if(grepl("DOI association does not exist.", e$message)) NULL else e)
+}
+
+#' Generate example dataset citation
+#'
+#' This is currently more for demo purposes, to check how well current metadata
+#' could be formatted into citation text. Datasets where DOIs have been minted
+#' *or* NF-OSI processed datasets within the official Portal Collection should
+#' work well, while there are no guarantees for other cases.
+#' Note: Internal/experimental use only, not for production use.
+#'
+#' @param id Dataset id.
+#' @param format Currently just "Scientific Data" format.
+#' @param output Currently only markdown, from which other utils can be used to generate LaTeX or HTML.
+#' @keywords internal
+cite_dataset <- function(id,
+                         format = "Scientific Data",
+                         output = c("markdown")) {
+  if(!is_dataset(id)) stop("Not a dataset")
+  if(length(get_doi_meta(id))) {
+    message("For now, please go to https://citation.crosscite.org/ for the most comprehensive citation options.")
+    return(NULL)
+  } else {
+    meta <- .syn$get_annotations(id)
+    title <- meta$title
+    creator <- meta$creator
+    repository <- "Synapse"
+    accession <- glue::glue("https://www.synapse.org/#!Synapse:{id}") # no DOI in this branch, so cite the Synapse URL
+    yearPublished <- meta$yearPublished
+    glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).")
+  }
+
+}
+
diff --git a/R/datasets.R b/R/datasets.R
index 164263b2..fffc9a8d 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -256,32 +256,6 @@ nf_star_salmon_datasets <- function(output_map,
                            dry_run = dry_run)
 }
 
-# -- Citations -----------------------------------------------------------------#
-
-# TODO Potentially move somewhere else
-
-#' Generate dataset citation
-#'
-#' This is currently more for demo purposes, to check how well current metadata
-#' could be formatted into citation text. Datasets within the official
-#' Portal Collection should work well enough, while there are no guarantees for
-#' unofficial/community-contributed datasets.
-#' @param dataset_id Dataset id.
-#' @param format Currently just "Scientific Data" format.
-#' @param output Currently just markdown format. There are many ways to
-#' generate LaTeX or HTML from markdown.
-#' @keywords internal
-cite_dataset <- function(dataset_id, format = "Scientific Data", output = c("markdown")) {
-  if(!is_dataset(id)) stop("Not a dataset")
-  meta <- .syn$get_annotations(id)
-  doi <- tryCatch(meta$doi, error = function(e) NULL)
-  title <- meta$title
-  creator <- meta$creator
-  yearPublished <- meta$yearPublished
-  repository <- "Synapse"
-  accession <- if(length(doi)) doi else glue::glue("https://www.synapse.org/#!Synapse:{id}")
-  glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).")
-}
 
 # -- Checks------------- -------------------------------------------------------#
 
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 47c3195d..fd004201 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -46,10 +46,9 @@ reference:
 
 - title: Dataset Creation and Management
 - subtitle: General dataset creation and citation
-  desc: Create datasets generally, create citation example text
+  desc: Create datasets in general
 - contents:
   - new_dataset
-  - cite_dataset
 - subtitle: Specialized dataset creation
   desc: Create specialized datasets for nextflow processed data, i.e. with some custom construction queries and title templating.
 - contents:
@@ -137,4 +136,6 @@ reference:
   - .replace_string_column_with_stringlist_column
   - .store_rows
   - missing_annotation_email
+  - get_doi_meta
+  - cite_dataset
 
diff --git a/man/cite_dataset.Rd b/man/cite_dataset.Rd
index 6a53c059..8314d9d3 100644
--- a/man/cite_dataset.Rd
+++ b/man/cite_dataset.Rd
@@ -1,23 +1,23 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/datasets.R
+% Please edit documentation in R/citation.R
 \name{cite_dataset}
 \alias{cite_dataset}
-\title{Generate dataset citation}
+\title{Generate example dataset citation}
 \usage{
-cite_dataset(dataset_id, format = "Scientific Data", output = c("markdown"))
+cite_dataset(id, format = "Scientific Data", output = c("markdown"))
 }
 \arguments{
-\item{dataset_id}{Dataset id.}
+\item{id}{Dataset id.}
 
 \item{format}{Currently just "Scientific Data" format.}
 
-\item{output}{Currently just markdown format. There are many ways to
-generate LaTeX or HTML from markdown.}
+\item{output}{Currently only markdown, from which other utils can be used to generate LaTeX or HTML.}
 }
 \description{
-This is currently more for demo purposes, to check how well current metadata
-could be formatted into citation text. Datasets within the official
-Portal Collection should work well enough, while there are no guarantees for
-unofficial/community-contributed datasets.
+This is currently more for demo purposes, to check how well current metadata
+could be formatted into citation text. Datasets where DOIs have been minted
+\emph{or} NF-OSI processed datasets within the official Portal Collection should
+work well, while there are no guarantees for other cases.
+Note: Internal/experimental use only, not for production use.
 }
 \keyword{internal}
diff --git a/man/get_doi_meta.Rd b/man/get_doi_meta.Rd
new file mode 100644
index 00000000..4a6ac20c
--- /dev/null
+++ b/man/get_doi_meta.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/citation.R
+\name{get_doi_meta}
+\alias{get_doi_meta}
+\title{Get DOI metadata if it exists}
+\usage{
+get_doi_meta(id)
+}
+\arguments{
+\item{id}{Dataset id.}
+}
+\description{
+Returns list of metadata associated with the DOI if one exists, otherwise NULL.
+Currently usable for certain entity types like files or datasets,
+though this should be revised to make it more useful with other objects.
+Note: Internal/experimental use only, not for production use.
+}
+\keyword{internal}
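Taken together, the utilities from this series compose roughly as below. This is a sketch only: it assumes a logged-in session (e.g. via `syn_login()`), all `syn...` ids are placeholders rather than real entities, and `cite_dataset()` remains an internal, experimental helper.

```r
# Create a draft dataset from three files (placeholder ids), pinned at version 1
dataset <- new_dataset(name = "Example Draft Dataset",
                       parent = "syn00000000",  # placeholder project id
                       items = c("syn00000001", "syn00000002", "syn00000003"),
                       item_version = 1L,
                       dry_run = FALSE)

# Add the stored dataset to an existing dataset collection (placeholder id);
# check_items = TRUE verifies each id really is a dataset before adding
add_to_collection(collection_id = "syn00000010",
                  items = dataset$properties$id,
                  check_items = TRUE)

# Later, point every item in the collection at its latest version
use_latest_in_collection(collection_id = "syn00000010", items = "all")

# Experimental and not exported: preview citation text for the dataset
nfportalutils:::cite_dataset(dataset$properties$id)
```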