From 10631b0dc3009de861e5ad4b90127e0ae152c1db Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 15:12:59 -0600
Subject: [PATCH 01/25] Add draft datasets util (wip)

---
 R/datasets.R | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 R/datasets.R

diff --git a/R/datasets.R b/R/datasets.R
new file mode 100644
index 00000000..cdbf0ce8
--- /dev/null
+++ b/R/datasets.R
@@ -0,0 +1,118 @@
+#' Create Sarek-processed datasets
+#'
+#' Organize variant call files from Nextflow Sarek into 3-4 datasets,
+#' grouping files by variant type and workflow, with titles having the format
+#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline".
+#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls.
+#' This makes sense for NF because Germline calls can be treated differently.
+#' This uses version 1 of all files and creates a Draft version of the dataset.
+#'
+#' We basically just need the syn entity id, variant type, and workflow to group the files.
+#' Instead of getting this info through running `map_*` as in the example,
+#' you may prefer using a fileview, in which case you just need to download a table from a fileview
+#' that has `id` => `output_id` + the `dataType` and `workflow` annotations.
+#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_
+#' files are annotated, then you have to use `map_*`.
+#'
+#' Finally, datasets cannot use the same name if stored in the same project,
+#' so if there are multiple batches, the names will have to be made unique by adding
+#' the batch number, source data id, processing date, or whatever makes sense.
+#'
+#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
+#' @param parent Synapse id of parent project where the datasets will live.
+#' Usually the same parent project storing the files, but in some cases it may be a different project.
+#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
+#' @param dry_run If TRUE, don't actually store the datasets but instead return the objects for inspection or modification,
+#' e.g. setting a better title or description than the default.
+#' @import data.table
+#' @return A list of dataset objects.
+#' @export
+#' @examples
+#'\dontrun{
+#' syn_out <- "syn26648589"
+#' m <- map_sample_output_sarek(syn_out)
+#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
+#'}
+nf_sarek_datasets <- function(output_map,
+                              parent,
+                              verbose = TRUE,
+                              dry_run = TRUE) {
+
+  output_map <- as.data.table(output_map)
+  if(!is.null(output_map$dataType)) {
+    data_type <- unique(output_map$dataType)
+    if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
+    gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
+    if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
+    gvtype <- switch(gvtype,
+                     SomaticVariants = "Somatic",
+                     GermlineVariants = "Germline")
+
+  } else {
+    # Detect genomic variants type from first path name
+    gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
+      "Somatic"
+    } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
+      "Germline"
+    } else {
+      stop("Could not assign either Germline or Somatic labels based on main output folder.
+           Check whether folder contains mixed types or is not the right one.")
+    }
+  }
+  pattern <- "vcf.gz(.tbi)?$"
+  WORKFLOW <- c("FreeBayes", "Mutect2", "Strelka", "DeepVariant")
+  datasets <- list()
+  for(i in WORKFLOW) {
+    dataset <- output_map[workflow == i & grepl(pattern, output_name)]
+    if(nrow(dataset)) {
+      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
+
+      name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
+      dataset_items <- lapply(dataset$output_id, function(entity) list(entityId = entity, versionNumber = 1L))
+
+      syn_dataset <- synapseclient$Dataset(name = name,
+                                           parent = parent,
+                                           dataset_items = dataset_items)
+
+      if(dry_run) datasets[[i]] <- syn_dataset else datasets[[i]] <- .syn$store(syn_dataset)
+    }
+  }
+
+  return(datasets)
+
+}
+
+
+#' Create NF STAR-Salmon dataset
+#'
+#' Organize gene expression quantification files (.sf) into one dataset.
+#' Uses version 1 of the files and creates a Draft dataset.
+#' See also `nf_sarek_datasets`.
+#'
+#' @inheritParams nf_sarek_datasets
+#' @param output_map The `data.table` returned from `map_sample_output_sarek`.
+#' @export
+#' @examples
+#'\dontrun{
+#' syn_out <- "syn30840584"
+#' m <- map_sample_output_rnaseq(syn_out)
+#' datasets <- nf_rnaseq_dataset(m, out, parent = "syn4939902", dry_run = F)
+#'}
+nf_star_salmon_datasets <- function(output_map,
+                                    parent,
+                                    verbose = TRUE,
+                                    dry_run = TRUE) {
+
+  # Select the .sf and index files
+  dataset_items <- output_map[grepl(".sf$", output_name), output_id]
+  name <- "Gene Expression Quantification from RNA-seq"
+  dataset_items <- lapply(dataset_items, function(entity) list(entityId = entity, versionNumber = 1L))
+  dataset <- synapseclient$Dataset(name = name,
+                                   parent = parent,
+                                   dataset_items = dataset_items)
+
+  if(dry_run) dataset else .syn$store(syn_dataset)
+
+}
+
+

From ffdb66147c5a34b37b90559b30fff48f9c466b5a Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 15:18:25 -0600
Subject: [PATCH 02/25] Parameterize workflow per review comment

---
 R/datasets.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index cdbf0ce8..e7149318 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -35,6 +35,7 @@
 #'}
 nf_sarek_datasets <- function(output_map,
                               parent,
+                              workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
                               verbose = TRUE,
                               dry_run = TRUE) {
 
@@ -60,9 +61,9 @@ nf_sarek_datasets <- function(output_map,
     }
   }
   pattern <- "vcf.gz(.tbi)?$"
-  WORKFLOW <- c("FreeBayes", "Mutect2", "Strelka", "DeepVariant")
+  workflow <- match.arg(workflow, several.ok = TRUE)
   datasets <- list()
-  for(i in WORKFLOW) {
+  for(i in workflow) {
     dataset <- output_map[workflow == i & grepl(pattern, output_name)]
     if(nrow(dataset)) {
       if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
@@ -86,7 +87,7 @@ nf_sarek_datasets <- function(output_map,
 #' Create NF STAR-Salmon dataset
 #'
 #' Organize gene expression quantification files (.sf) into one dataset.
-#' Uses version 1 of the files and creates a Draft dataset.
+#' Uses version 1 of the files and creates a "Draft" dataset.
 #' See also `nf_sarek_datasets`.
 #'
 #' @inheritParams nf_sarek_datasets

From 1603e7f9446cffa9a501fabb4cb2f0c11a1f538c Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 16:11:44 -0600
Subject: [PATCH 03/25] Factor out dataset item constructor to make more functional and flexible

---
 R/datasets.R | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index e7149318..376b87bc 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -1,3 +1,19 @@
+#' As dataset items
+#'
+#' Helper taking entity ids to create records in the structure needed for dataset creation.
+#' Note: Currently does not check that ids are "file" entities; technically dataset items can't be folders, for example.
+#'
+#' @param ids Ids of entities to make into dataset items.
+#' @param version Integer for version that will be used for all items, e.g. 1.
+#' If NULL, this will look up the latest version for each id and use that.
+as_dataset_items <- function(ids, version = NULL) {
+  if(is.null(version)) {
+    version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
+  }
+  dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version)
+  dataset_items
+}
+
 #' Create Sarek-processed datasets
 #'
 #' Organize variant call files from Nextflow Sarek into 3-4 datasets,
@@ -69,7 +85,7 @@ nf_sarek_datasets <- function(output_map,
       if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
 
       name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
-      dataset_items <- lapply(dataset$output_id, function(entity) list(entityId = entity, versionNumber = 1L))
+      dataset_items <- as_dataset_items(dataset$output_id)
 
       syn_dataset <- synapseclient$Dataset(name = name,
                                            parent = parent,
                                            dataset_items = dataset_items)
@@ -105,10 +121,9 @@ nf_star_salmon_datasets <- function(output_map,
                                     dry_run = TRUE) {
 
   # Select the .sf and index files
-  dataset_items <- output_map[grepl(".sf$", output_name), output_id]
-  name <- "Gene Expression Quantification from RNA-seq"
-  dataset_items <- lapply(dataset_items, function(entity) list(entityId = entity, versionNumber = 1L))
-  dataset <- synapseclient$Dataset(name = name,
+  output_ids <- output_map[grepl(".sf$", output_name), output_id]
+  dataset_items <- as_dataset_items(output_ids)
+  dataset <- synapseclient$Dataset(name = "Gene Expression Quantification from RNA-seq",
                                    parent = parent,
                                    dataset_items = dataset_items)
 
   if(dry_run) dataset else .syn$store(syn_dataset)
 
 }

From 04b08706c424f6fd23c3db32cc8d5a3691a23280 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 29 Jun 2023 18:09:53 -0600
Subject: [PATCH 04/25] Implement util for adding to dataset collection

---
 R/datasets.R | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/R/datasets.R b/R/datasets.R
index 376b87bc..6d179da3 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -11,8 +11,25 @@ as_dataset_items <- function(ids, version = NULL) {
     version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
   }
   dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version)
+  names(dataset_items) <- NULL # need to unname list for API
   dataset_items
-}
+}
+
+#' Add to dataset collection
+#'
+#' Add dataset(s) to an _existing_ dataset collection.
+#' Notes:
+#' - If somehow non-dataset entities are included, Synapse will ignore these ids.
+#' - Implemented with lower-level REST calls because the Python client (as of v2.7) doesn't yet seem to have the method for dataset collections.
+#'
+#' @param dataset_ids Character vector of one or more dataset entity ids to add.
+#' @param collection_id Id of the dataset collection. +add_to_dataset_collection <- function(dataset_ids, collection_id) { + e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + items <- as_dataset_items(dataset_ids) + e$items <- c(e$items, items) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +} #' Create Sarek-processed datasets #' From fae5fa46eef999adee76902a4abe9c81a6ceaa5e Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 29 Jun 2023 18:11:10 -0600 Subject: [PATCH 05/25] Update explanation --- R/datasets.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 6d179da3..feb50038 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -1,7 +1,6 @@ #' As dataset items #' -#' Helper taking entity ids to create records in the structure needed for dataset creation. -#' Note: Currently does not check that ids are "file" entities; technically dataset items can't be folders, for example. +#' Helper taking entity ids to create records in the structure needed for dataset items or dataset collection items. #' #' @param ids Ids of entities to make into dataset items. #' @param version Integer for version that will be used for all items, e.g. 1. From e0bb7310ac9cffcc719dea514c0167c650c26a71 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 30 Jun 2023 13:02:29 -0600 Subject: [PATCH 06/25] Refactor into base fun --- R/datasets.R | 58 +++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index feb50038..9bd6cbf7 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -3,13 +3,13 @@ #' Helper taking entity ids to create records in the structure needed for dataset items or dataset collection items. #' #' @param ids Ids of entities to make into dataset items. -#' @param version Integer for version that will be used for all items, e.g. 1. +#' @param item_version Integer for version that will be used for all items, e.g. 1. #' If NULL, this will look up the latest version for each id and use that. -as_dataset_items <- function(ids, version = NULL) { - if(is.null(version)) { - version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber) +as_dataset_items <- function(ids, item_version = NULL) { + if(is.null(item_version)) { + item_version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber) } - dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, version) + dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version) names(dataset_items) <- NULL # need to unname list for API dataset_items } @@ -30,6 +30,25 @@ add_to_dataset_collection <- function(dataset_ids, collection_id) { .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) } +#' New dataset with given items +#' +#' Make a _new_ dataset with given set of entities. +#' +#' @inheritParams as_dataset_items +#' @param name Name of the dataset. It should be unique within the `parent` project. +#' @param parent Synapse id of parent project where the datasets will live. +#' @param items Id(s) of items to include. +#' Usually the same parent project storing the files, but in some cases it may be a different project. 
+#' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification.
+new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) {
+
+  dataset_items <- as_dataset_items(items, item_version)
+  dataset <- synapseclient$Dataset(name = name,
+                                   parent = parent,
+                                   dataset_items = dataset_items)
+  if(dry_run) dataset else .syn$store(dataset)
+}
+
 #' Create Sarek-processed datasets
 #'
 #' Organize variant call files from Nextflow Sarek into 3-4 datasets,
@@ -50,12 +69,9 @@ add_to_dataset_collection <- function(dataset_ids, collection_id) {
 #' so if there are multiple batches, the names will have to be made unique by adding
 #' the batch number, source data id, processing date, or whatever makes sense.
 #'
+#' @inheritParams new_dataset
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
-#' @param parent Synapse id of parent project where the datasets will live.
-#' Usually the same parent project storing the files, but in some cases it may be a different project.
 #' @param verbose Optional, whether to be verbose -- defaults to TRUE.
-#' @param dry_run If TRUE, don't actually store the datasets but instead return the objects for inspection or modification,
-#' e.g. setting a better title or description than the default.
 #' @import data.table
 #' @return A list of dataset objects.
 #' @export
@@ -98,15 +114,9 @@ nf_sarek_datasets <- function(output_map,
   for(i in workflow) {
     dataset <- output_map[workflow == i & grepl(pattern, output_name)]
     if(nrow(dataset)) {
-      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
-
+      if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files")
       name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
+      datasets[[i]] <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = dry_run)
     }
   }
@@ -122,6 +132,7 @@ nf_sarek_datasets <- function(output_map,
 #' Uses version 1 of the files and creates a "Draft" dataset.
 #' See also `nf_sarek_datasets`.
 #'
+#' @inheritParams new_dataset
 #' @inheritParams nf_sarek_datasets
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`.
#' @export @@ -138,13 +149,8 @@ nf_star_salmon_datasets <- function(output_map, # Select the .sf and index files output_ids <- output_map[grepl(".sf$", output_name), output_id] - dataset_items <- as_dataset_items(output_ids) - dataset <- synapseclient$Dataset(name = "Gene Expression Quantification from RNA-seq", - parent = parent, - dataset_items = dataset_items) - - if(dry_run) dataset else .syn$store(syn_dataset) - + new_dataset(name = "Gene Expression Quantification from RNA-seq", + parent = parent, + items = output_ids, + dry_run = dry_run) } - - From 963ade19594a210585ecbd4c24683bd55371c754 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 30 Jun 2023 17:56:18 -0600 Subject: [PATCH 07/25] Add tests --- tests/testthat/test_dataset_utils.R | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/testthat/test_dataset_utils.R diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R new file mode 100644 index 00000000..7ea57b92 --- /dev/null +++ b/tests/testthat/test_dataset_utils.R @@ -0,0 +1,53 @@ +test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { + + skip_if_no_synapseclient() + skip_if_no_token() + NF_test <- "syn26462036" + # Note that files are all version 2 on Synapse + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE) + .syn$delete(dataset) + expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) + +}) + +test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { + + skip_if_no_synapseclient() + skip_if_no_token() + NF_test <- "syn26462036" + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) + .syn$delete(dataset) + expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = "syn51239178", versionNumber = 1L), + list(entityId = "syn51239177", versionNumber = 1L)) + testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) + +}) + +# When providing an item not allowed to be a dataset item (a table or folder), the Synapse error will be something like +# ``` +# Error: synapseclient.core.exceptions.SynapseHTTPError: 400 Client Error: +# Currently, only files can be included in a dataset. 
syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity'
+# ```
+# This is a good, informative error
+test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (specifically, a table)", {
+
+  skip_if_no_synapseclient()
+  skip_if_no_token()
+  NF_test <- "syn26462036"
+  items <- c("syn51239179",
+             "syn51239178",
+             "syn27242487") # This is a table
+  testthat::expect_error(dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE))
+})
+

From 1048a42dc1db39f455b2d440a276ff3bb8cd50ba Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Sat, 1 Jul 2023 15:47:03 -0600
Subject: [PATCH 08/25] Reorganize

---
 R/datasets.R | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/R/datasets.R b/R/datasets.R
index 9bd6cbf7..bc6cd5a8 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -7,29 +7,13 @@
 #' If NULL, this will look up the latest version for each id and use that.
 as_dataset_items <- function(ids, item_version = NULL) {
   if(is.null(item_version)) {
-    item_version <- lapply(ids, function(id) .syn$get(id)$properties$versionNumber)
+    item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber)
   }
   dataset_items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version)
   names(dataset_items) <- NULL # need to unname list for API
   dataset_items
-}
+}
 
-#' Add to dataset collection
-#'
-#' Add dataset(s) to an _existing_ dataset collection.
-#' Notes:
-#' - If somehow non-dataset entities are included, Synapse will ignore these ids.
-#' - Implemented with lower-level REST calls because the Python client (as of v2.7) doesn't yet seem to have the method for dataset collections.
-#'
-#' @param dataset_ids Character vector of one or more dataset entity ids to add.
-#' @param collection_id Id of the dataset collection.
-add_to_dataset_collection <- function(dataset_ids, collection_id) {
-  e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
-  items <- as_dataset_items(dataset_ids)
-  e$items <- c(e$items, items)
-  .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE))
-}
 
 #' New dataset with given items
             items = output_ids,
             dry_run = dry_run)
 }
+
+# -- Dataset Collections -------------------------------------------------------#
+
+#' Add to dataset collection
+#'
+#' Add dataset(s) to an _existing_ dataset collection.
+#' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+#' implement an `add_scope`-type method for dataset collections that is available for entity view.
+#' Notes:
+#' - TODO: Check that ids are unique with existing items or Synapse will reject.
+#' - TODO: Check that entities are datasets or this will fail.
+#'
+#' @param dataset_ids Character vector of one or more dataset entity ids to add.
+#' @param collection_id Id of the dataset collection.
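+#' @examples
+#' # Hypothetical usage sketch -- the syn ids below are placeholders, not real entities:
+#'\dontrun{
+#' add_to_dataset_collection(dataset_ids = c("syn111", "syn222"), collection_id = "syn333")
+#'}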
+add_to_dataset_collection <- function(dataset_ids, collection_id) { + e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + items <- as_dataset_items(dataset_ids) + e$items <- c(e$items, items) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +} + +# ------------------------------------------------------------------------------# From 0de33316fcdbba121085fbeeca72ef43c1793de5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 16:48:58 -0600 Subject: [PATCH 09/25] Resolve TODO checks --- R/datasets.R | 52 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index bc6cd5a8..39fb9786 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -139,24 +139,56 @@ nf_star_salmon_datasets <- function(output_map, dry_run = dry_run) } +# -- Checks------------- -------------------------------------------------------# + +#' Check whether entity is dataset +#' +#' @keywords internal +is_dataset <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset" + }, + error = function(e) FALSE) +} + + # -- Dataset Collections -------------------------------------------------------# #' Add to dataset collection #' -#' Add dataset(s) to an _existing_ dataset collection. +#' Add dataset(s) to an _existing_ dataset collection. #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet #' implement an `add_scope`-type method for dataset collections that is available for entity view. -#' Notes: -#' - TODO: Check that ids are unique with existing items or Synapse will reject. -#' - TODO: Check that entities are datasets or this will fail. #' -#' @param dataset_ids Character vector of one or more dataset entity ids to add. +#' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. -add_to_dataset_collection <- function(dataset_ids, collection_id) { - e <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - items <- as_dataset_items(dataset_ids) - e$items <- c(e$items, items) - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(e, auto_unbox = TRUE)) +#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities with warning (default FALSE). +#' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. +#' @param replace If specified items are current items in the collection, replace items with the current version? +#' The safe default is FALSE to ensure any version changes are intentional. 
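+#' @examples
+#' # Hypothetical sketch with the safety flags -- ids are placeholders:
+#'\dontrun{
+#' add_to_dataset_collection(items = c("syn111", "syn222"), collection_id = "syn333",
+#'                           check_items = TRUE, replace = FALSE)
+#'}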
+add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { + + if(check_items) { + confirmed_dataset <- sapply(items, is_dataset) + if(any(!confirmed_dataset)) { + warning("Items which are not dataset entities will be ignored:", items[!confirmed_dataset]) + items <- items[confirmed_dataset] + } + } + dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + current_items <- sapply(dc$items, function(i) i$entityId) + + # Synapse will normally throw error if trying to add a dataset already in collection + if(any(items %in% current_items) && !replace) { + stop("Datasets to be added are already in collection. Use `replace = TRUE` if you want to override existing dataset versions.") + } else if (any(items %in% current_items && replace)) { + dc$items <- as_dataset_items(union(current_items, items)) + message("Some datasets replaced with their most current version:", items[items %in% current_items]) + } else { + dc$items <- c(dc$items, as_dataset_items(items)) + } + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } # ------------------------------------------------------------------------------# From eea014f8ad6a6f4ead756b1ecaf59cff57d046ad Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 17:01:22 -0600 Subject: [PATCH 10/25] Export, document --- R/datasets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 39fb9786..15162628 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -159,7 +159,7 @@ is_dataset <- function(id) { #' #' Add dataset(s) to an _existing_ dataset collection. #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement an `add_scope`-type method for dataset collections that is available for entity view. +#' implement dataset collection methods. #' #' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. From 11488a96884bc30b28a2ce7fdda5ee61eead705b Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sat, 1 Jul 2023 17:03:58 -0600 Subject: [PATCH 11/25] More updates of docs to be consistent with changes --- R/datasets.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 15162628..f2b397ea 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -40,7 +40,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE #' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". #' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. #' This makes sense for NF because Germline calls can be treated differently. -#' This uses version 1 of all files and creates a Draft version of the dataset. +#' This uses latest version of all files and creates a Draft version of the dataset. #' #' Since we basically just need the syn entity id, variant type, and workflow to group the files. #' Instead of getting this info through running `map_*` as in the example, @@ -113,7 +113,7 @@ nf_sarek_datasets <- function(output_map, #' Create NF STAR-Salmon dataset #' #' Organize gene expression quantification files (.sf) into one dataset. -#' Uses version 1 of the files and creates a "Draft" dataset. 
+#' Uses latest version of the files and creates a "Draft" dataset. #' See also `nf_sarek_datasets`. #' #' @inheritParams new_dataset @@ -167,6 +167,7 @@ is_dataset <- function(id) { #' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. #' @param replace If specified items are current items in the collection, replace items with the current version? #' The safe default is FALSE to ensure any version changes are intentional. +#' @export add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { if(check_items) { @@ -191,4 +192,6 @@ add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } +#' + # ------------------------------------------------------------------------------# From 3ecc3a2007b58508a9ad0f84c334eb17251eb42d Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 08:47:01 -0600 Subject: [PATCH 12/25] Add util for update and corresponding unit test --- R/datasets.R | 55 +++++++++++++------ .../testthat/test_dataset_collection_utils.R | 30 ++++++++++ 2 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 tests/testthat/test_dataset_collection_utils.R diff --git a/R/datasets.R b/R/datasets.R index f2b397ea..21837460 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -155,43 +155,64 @@ is_dataset <- function(id) { # -- Dataset Collections -------------------------------------------------------# +#' Apply updates to current collection of items +#' +#' A collection of items has items of the form `list(entityId = id, versionNumber = x)`. +#' Given another collection that can represent updates of both types replace or add, +#' this applies an update join keyed on `entityId` for the replace and +#' appends the new items to get the updated collection. +#' +#' @param current_items List of lists representing a collection of items. +#' @param update_items Collection of items to apply as updates to `current_items`. +update_items <- function(current_coll, update_coll) { + + current_coll <- data.table::rbindlist(current_coll) + update_coll <- data.table::rbindlist(update_coll) + updated_coll <- rbind( + current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber], # replace + update_coll[!current_coll, on = .(entityId)]) # add + updated_coll <- apply(updated_coll, 1, as.list) + updated_coll +} + #' Add to dataset collection #' -#' Add dataset(s) to an _existing_ dataset collection. +#' Add dataset(s) to an _existing_ dataset collection, using their current (latest) version. +#' If a dataset attempting to be added happens to already be in the dataset collection, +#' this might lead to version conflicts, so the update won't processed unless `force` is true. +#' #' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement dataset collection methods. +#' implement dataset collection methods. #' -#' @param items Character vector of one or more dataset entity ids to add, using their current version. #' @param collection_id Id of the dataset collection. -#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities with warning (default FALSE). -#' This may be useful given that sometimes "datasets" can be folder or file entities. Note that using check will be slower. 
-#' @param replace If specified items are current items in the collection, replace items with the current version? -#' The safe default is FALSE to ensure any version changes are intentional. +#' @param items Character vector of one or more dataset entity ids to add. +#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities (default FALSE) +#' to help avoid Synapse error. This may be useful given that sometimes "datasets" can be folder or file entities. +#' Note that using check will be slower. +#' @param force If some items are currently in the collection with a different version, +#' should these items be force added using current version? The safe default is FALSE to ensure any version changes are intentional. #' @export -add_to_dataset_collection <- function(items, collection_id, check_items = FALSE, replace = FALSE) { +add_to_dataset_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { if(check_items) { confirmed_dataset <- sapply(items, is_dataset) if(any(!confirmed_dataset)) { - warning("Items which are not dataset entities will be ignored:", items[!confirmed_dataset]) + warning("Items which are not dataset entities will not be added:", items[!confirmed_dataset]) items <- items[confirmed_dataset] } } dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - current_items <- sapply(dc$items, function(i) i$entityId) - # Synapse will normally throw error if trying to add a dataset already in collection - if(any(items %in% current_items) && !replace) { - stop("Datasets to be added are already in collection. Use `replace = TRUE` if you want to override existing dataset versions.") - } else if (any(items %in% current_items && replace)) { - dc$items <- as_dataset_items(union(current_items, items)) - message("Some datasets replaced with their most current version:", items[items %in% current_items]) + if(any(items %in% current_items) && !force) { + stop("Some datasets to be added are already in collection. 
Use `force = TRUE` to allow replacing existing dataset versions.") + } else if (any(items %in% current_items) && force) { + dc$items <- update_items(dc$items, as_dataset_items(items)) } else { dc$items <- c(dc$items, as_dataset_items(items)) } .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) } -#' + # ------------------------------------------------------------------------------# diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R new file mode 100644 index 00000000..9d377084 --- /dev/null +++ b/tests/testthat/test_dataset_collection_utils.R @@ -0,0 +1,30 @@ +test_that("Update helper for dataset collection items works with combined replace/add updates", { + + current_items <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update_items <- list( + list(entityId = "syn3", versionNumber = 2L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 2L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + testthat::expect_equal(update_items(current_items, update_items), + expected) + +}) + + + + From 103f7cdae2ef06750431561a06117a22542adc11 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:32:15 -0600 Subject: [PATCH 13/25] Update unit test and fix discovered bug --- R/datasets.R | 11 ++++++----- tests/testthat/test_dataset_collection_utils.R | 11 ++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 21837460..40a6d9cb 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -168,11 +168,12 @@ update_items <- function(current_coll, update_coll) { current_coll <- data.table::rbindlist(current_coll) update_coll <- data.table::rbindlist(update_coll) - updated_coll <- rbind( - current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber], # replace - update_coll[!current_coll, on = .(entityId)]) # add - updated_coll <- apply(updated_coll, 1, as.list) - updated_coll + replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] + added <- update_coll[!current_coll, on = .(entityId)] + updated <- rbind(replaced, added) + # reconversion; using pure apply as.list coerces versionNumber into char + updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) + updated } #' Add to dataset collection diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R index 9d377084..73ecdf09 100644 --- a/tests/testthat/test_dataset_collection_utils.R +++ b/tests/testthat/test_dataset_collection_utils.R @@ -1,12 +1,12 @@ -test_that("Update helper for dataset collection items works with combined replace/add updates", { +test_that("Update helper for item collection works with combined 'replace' and 'add' update types", { - current_items <- list( + current <- list( list(entityId = "syn1", versionNumber = 1L), list(entityId = "syn2", versionNumber = 1L), list(entityId = "syn3", versionNumber = 1L) ) - update_items <- list( + update <- list( list(entityId = "syn3", versionNumber = 2L), list(entityId = "syn4", 
versionNumber = 2L), list(entityId = "syn5", versionNumber = 2L) @@ -20,11 +20,8 @@ test_that("Update helper for dataset collection items works with combined replac list(entityId = "syn5", versionNumber = 2L) ) - testthat::expect_equal(update_items(current_items, update_items), + testthat::expect_identical(update_items(current, update), expected) }) - - - From cefa999261b4f2265c92a081bed9ce5419028e00 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:44:38 -0600 Subject: [PATCH 14/25] More unit tests --- .../testthat/test_dataset_collection_utils.R | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_dataset_collection_utils.R index 73ecdf09..3177b1dd 100644 --- a/tests/testthat/test_dataset_collection_utils.R +++ b/tests/testthat/test_dataset_collection_utils.R @@ -21,7 +21,58 @@ test_that("Update helper for item collection works with combined 'replace' and ' ) testthat::expect_identical(update_items(current, update), - expected) + expected) }) + +test_that("Update helper for item collection works with just 'replace' update type", { + + current <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update <- list( + list(entityId = "syn2", versionNumber = 2L), + list(entityId = "syn3", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 2L), + list(entityId = "syn3", versionNumber = 2L) + ) + + testthat::expect_identical(update_items(current, update), + expected) + +}) + + +test_that("Update helper for item collection works with just 'add' update type", { + + current <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L) + ) + + update <- list( + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + expected <- list( + list(entityId = "syn1", versionNumber = 1L), + list(entityId = "syn2", versionNumber = 1L), + list(entityId = "syn3", versionNumber = 1L), + list(entityId = "syn4", versionNumber = 2L), + list(entityId = "syn5", versionNumber = 2L) + ) + + testthat::expect_identical(update_items(current, update), + expected) + +}) From 50bff87fcb65e67c5f7715401d3cb84d9e1cc2d3 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 09:44:52 -0600 Subject: [PATCH 15/25] Add another collection util --- R/datasets.R | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 40a6d9cb..ada1c02d 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -153,7 +153,7 @@ is_dataset <- function(id) { } -# -- Dataset Collections -------------------------------------------------------# +# -- Collections ---------------------------------------------------------------# #' Apply updates to current collection of items #' @@ -176,6 +176,36 @@ update_items <- function(current_coll, update_coll) { updated } +#' Update item versions to latest in a collection +#' +#' Update the collection so that all items or a subset of items reference their latest version. +#' This should work for both datasets (collection of files) and dataset collections (collection of datasets). 
+#'
+#' @param collection_id
+#' @param items Vector of dataset ids for which to update reference to latest version,
+#' or "all" (default) to update all in the dataset collection.
+#' @export
+use_latest_in_collection(collection_id, items = "all") {
+  coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
+  item_set <- match.arg(items)
+  current_items <- sapply(coll$items, function(i) i$entityId)
+
+  if(item_set == "all") {
+    coll$items <- as_dataset_items(current_items)
+  } else {
+
+    # Check subset; if no check, this becomes like `add_to_dataset_collection`
+    if(!all(items %in% current_items)) {
+      warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items])
+      items <- items[items %in% current_items]
+      updated_items <- update_items(coll$items, as_dataset_items(items))
+      coll$items <- updated_items
+    }
+  }
+  .syn$store(coll)
+
+}

From 18832654de601a65dc48b87790796b09c1185d7f Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Sun, 2 Jul 2023 10:55:36 -0600
Subject: [PATCH 16/25] Generalize and reorganize

---
 R/datasets.R                                  | 242 +++++++++++-------
 ...ection_utils.R => test_collection_utils.R} |   0
 2 files changed, 145 insertions(+), 97 deletions(-)
 rename tests/testthat/{test_dataset_collection_utils.R => test_collection_utils.R} (100%)

diff --git a/R/datasets.R b/R/datasets.R
index ada1c02d..805d1516 100644
--- a/R/datasets.R
+++ b/R/datasets.R
+# -- Editing Collections -------------------------------------------------------#
+
+# General helpers that should work for both datasets (collection of files)
+# and dataset collections (collection of datasets).
+
+
+#' As collection items
+#'
+#' Helper taking entity ids to create records used for dataset items or dataset collection items.
+#' Collection items have the form `list(entityId = id, versionNumber = x)`.
+#'
+#' @param ids Ids of entities to make into dataset items.
+#' @param item_version Integer for version that will be used for all items, e.g. 1.
+#' If NULL, this will look up the latest version for each id and use that.
+#' @keywords internal
+as_coll_items <- function(ids, item_version = NULL) {
+  if(is.null(item_version)) {
+    item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber)
+  }
+  items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version)
+  names(items) <- NULL # need to unname list for API
+  items
+}
+
+
+#' Apply updates to current collection of items
+#'
+#' Given another collection that can represent updates of both types, "replace" or "add",
+#' this applies an update join keyed on `entityId` for the replace and
+#' appends the new items to get the updated collection.
+#'
+#' @param current_coll List of lists representing a collection of items.
+#' @param update_coll Collection of items to apply as updates to `current_coll`.
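+#' @examples
+#' # Illustrative sketch only: replace syn2 with its version 2 and add syn3
+#'\dontrun{
+#' current <- list(list(entityId = "syn1", versionNumber = 1L),
+#'                 list(entityId = "syn2", versionNumber = 1L))
+#' update <- list(list(entityId = "syn2", versionNumber = 2L),
+#'                list(entityId = "syn3", versionNumber = 1L))
+#' update_items(current, update)
+#'}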
+update_items <- function(current_coll, update_coll) {
+
+  current_coll <- data.table::rbindlist(current_coll)
+  update_coll <- data.table::rbindlist(update_coll)
+  replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber]
+  added <- update_coll[!current_coll, on = .(entityId)]
+  updated <- rbind(replaced, added)
+  # reconversion; using pure apply as.list coerces versionNumber into char
+  updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2])))
+  updated
+}
+
+
+#' Update item versions to "latest" in a collection
+#'
+#' Update an _existing_ collection so that all items or a subset of items reference their latest version.
+#' This should work for both datasets (collection of files) and dataset collections (collection of datasets).
+#'
+#' @param collection_id
+#' @param items Vector of dataset ids for which to update reference to latest version,
+#' or "all" (default) to update all in the dataset collection.
+#' @export
+use_latest_in_collection(collection_id, items = "all") {
+  coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"))
+  item_set <- match.arg(items)
+  current_items <- sapply(coll$items, function(i) i$entityId)
+
+  if(item_set == "all") {
+    coll$items <- as_coll_items(current_items)
+  } else {
+
+    # Check subset; if no check, this becomes `add_to_collection`
+    if(!all(items %in% current_items)) {
+      warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items])
+      items <- items[items %in% current_items]
+      updated_items <- update_items(coll$items, as_coll_items(items))
+      coll$items <- updated_items
+    }
+  }
+  .syn$store(coll)
+
+}
+
+
+#' Add to collection
+#'
+#' Add item(s) to an _existing_ collection, using the item(s)' current (latest) version.
+#' For datasets, the items should be files. For dataset collections, the items should be datasets.
+#' If an item attempting to be added happens to already be in the collection,
+#' this might lead to version conflicts, so the update will be rejected unless `force` is true.
+#'
+#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+#' implement dataset collection class and methods (but the dataset class and relevant methods like `add_item` are available).
+#' Thus, while this is generic enough to handle both datasets and dataset collections,
+#' it is expected to be used more for dataset collections given that the dataset method is provided.
+#'
+#' @param collection_id Id of the collection.
+#' @param items Character vector of one or more dataset entity ids to add.
+#' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
+#' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
+#' @param force If some items are currently in the collection with a different version,
+#' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional.
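+#' @examples
+#' # Hypothetical sketch (ids are placeholders): add two datasets to a dataset
+#' # collection, verifying item types first and leaving existing versions alone
+#'\dontrun{
+#' add_to_collection("syn333", items = c("syn111", "syn222"), check_items = TRUE, force = FALSE)
+#'}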
+#' @export +add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { + + coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] + if(!length(coll_type)) stop("Entity is not a dataset or dataset collection.") + + if(check_items) { + expected_type_check <- if(coll_type == "dataset") is_file else is_dataset + correct_item_type <- sapply(items, expected_type_check) + if(any(!correct_item_type)) { + warning("Some items not correct entity types for the collection! These will not be added:", items[!correct_item_type]) + items <- items[correct_item_type] + } + } + + current_items <- sapply(coll$items, function(x) x$entityId) + if(any(items %in% current_items) && !force) { + stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") + } else { + coll$items <- update_items(coll$items, as_coll_items(items)) + } + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) +} + + +# -- Datasets ------------------------------------------------------------------# + +#' Create new dataset with given items #' -#' @inheritParams as_dataset_items +#' @inheritParams as_coll_items #' @param name Name of the dataset. It should be unique within the `parent` project. -#' @param parent Synapse id of parent project where the datasets will live. +#' @param parent Synapse id of parent project where the dataset will live. #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { - dataset_items <- as_dataset_items(items, item_version) + dataset_items <- as_coll_items(items, item_version) dataset <- synapseclient$Dataset(name = name, parent = parent, dataset_items = dataset_items) if(dry_run) dataset else .syn$store(dataset) } + #' Create Sarek-processed datasets #' #' Organize variant call files from Nextflow Sarek into 3-4 datasets, @@ -141,6 +247,9 @@ nf_star_salmon_datasets <- function(output_map, # -- Checks------------- -------------------------------------------------------# +# TODO Better composition to reduce code, esp. if more will be added +# TODO Potentially move somewhere else like basic_utils + #' Check whether entity is dataset #' #' @keywords internal @@ -152,98 +261,37 @@ is_dataset <- function(id) { error = function(e) FALSE) } - -# -- Collections ---------------------------------------------------------------# - -#' Apply updates to current collection of items +#' Check whether entity is dataset collection #' -#' A collection of items has items of the form `list(entityId = id, versionNumber = x)`. -#' Given another collection that can represent updates of both types replace or add, -#' this applies an update join keyed on `entityId` for the replace and -#' appends the new items to get the updated collection. -#' -#' @param current_items List of lists representing a collection of items. -#' @param update_items Collection of items to apply as updates to `current_items`. 
-update_items <- function(current_coll, update_coll) { - - current_coll <- data.table::rbindlist(current_coll) - update_coll <- data.table::rbindlist(update_coll) - replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] - added <- update_coll[!current_coll, on = .(entityId)] - updated <- rbind(replaced, added) - # reconversion; using pure apply as.list coerces versionNumber into char - updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) - updated -} +#' @keywords internal +is_dataset_collection <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" + }, + error = function(e) FALSE) +} -#' Update item versions to latest in a collection -#' -#' Update the collection so that all items or a subset of items reference their latest version. -#' This should work for both datasets (collection of files) and dataset collections (collection of datasets). +#' Check whether entity is dataset collection #' -#' @param collection_id -#' @param items Vector of dataset ids for which to update reference to latest version, -#' or "all" (default) to update all in the dataset collection. -#' @export -use_latest_in_collection(collection_id, items = "all") { - coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - item_set <- match.arg(items) - current_items <- sapply(coll$items, function(i) i$entityId) - - if(item_set == "all") { - coll$items <- as_dataset_items(current_items) - } else { - - # Check subset; if no check, this becomes like `add_to_dataset_collection` - if(!all(items %in% current_items)) { - warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items]) - items <- items[items %in% current_items] - updated_items <- update_items(coll$items, as_dataset_items(items)) - coll$items <- updated_items - } - } - .syn$store(coll) - +#' @keywords internal +is_dataset_collection <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" + }, + error = function(e) FALSE) } -#' Add to dataset collection -#' -#' Add dataset(s) to an _existing_ dataset collection, using their current (latest) version. -#' If a dataset attempting to be added happens to already be in the dataset collection, -#' this might lead to version conflicts, so the update won't processed unless `force` is true. +#' Check whether entity is file #' -#' Implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet -#' implement dataset collection methods. -#' -#' @param collection_id Id of the dataset collection. -#' @param items Character vector of one or more dataset entity ids to add. -#' @param check_items Whether to check that ids are really dataset entities and remove non-dataset entities (default FALSE) -#' to help avoid Synapse error. This may be useful given that sometimes "datasets" can be folder or file entities. -#' Note that using check will be slower. -#' @param force If some items are currently in the collection with a different version, -#' should these items be force added using current version? The safe default is FALSE to ensure any version changes are intentional. 
-#' @export -add_to_dataset_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { - - if(check_items) { - confirmed_dataset <- sapply(items, is_dataset) - if(any(!confirmed_dataset)) { - warning("Items which are not dataset entities will not be added:", items[!confirmed_dataset]) - items <- items[confirmed_dataset] - } - } - dc <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - - if(any(items %in% current_items) && !force) { - stop("Some datasets to be added are already in collection. Use `force = TRUE` to allow replacing existing dataset versions.") - } else if (any(items %in% current_items) && force) { - dc$items <- update_items(dc$items, as_dataset_items(items)) - } else { - dc$items <- c(dc$items, as_dataset_items(items)) - } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(dc, auto_unbox = TRUE)) +#' @keywords internal +is_file <- function(id) { + tryCatch({ + entity <- .syn$get(id, downloadFile = FALSE) + entity$properties$concreteType == "org.sagebionetworks.repo.model.File" + }, + error = function(e) FALSE) } - -# ------------------------------------------------------------------------------# diff --git a/tests/testthat/test_dataset_collection_utils.R b/tests/testthat/test_collection_utils.R similarity index 100% rename from tests/testthat/test_dataset_collection_utils.R rename to tests/testthat/test_collection_utils.R From a2bfed9339e924fc7059c8d57e50ca32f2f3ec50 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 16:34:40 -0600 Subject: [PATCH 17/25] More tests and code qc --- R/datasets.R | 54 +++++++------ tests/testthat/test_dataset_utils.R | 117 +++++++++++++++++++++++++--- 2 files changed, 134 insertions(+), 37 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index 805d1516..4adfbed9 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -17,7 +17,7 @@ as_coll_items <- function(ids, item_version = NULL) { if(is.null(item_version)) { item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber) } - items <- Map(function(id, version) list(entityId = id, versionNumber = 1L), ids, item_version) + items <- Map(function(id, version) list(entityId = id, versionNumber = version), ids, item_version) names(items) <- NULL # need to unname list for API items } @@ -25,7 +25,7 @@ as_coll_items <- function(ids, item_version = NULL) { #' Apply updates to current collection of items #' -#' Given another collection that can represent updates of both types replace or add, +#' Given another collection that can represent updates of both types "replace" or "add", #' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. #' @@ -53,24 +53,27 @@ update_items <- function(current_coll, update_coll) { #' @param items Vector of dataset ids for which to update reference to latest version, #' or "all" (default) to update all in the dataset collection. 
#' @export -use_latest_in_collection(collection_id, items = "all") { +use_latest_in_collection <- function(collection_id, items = "all") { coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - item_set <- match.arg(items) current_items <- sapply(coll$items, function(i) i$entityId) - if(item_set == "all") { + if((length(items) == 1) && (items == "all")) { coll$items <- as_coll_items(current_items) } else { # Check subset; if no check, this becomes `add_to_collection` if(!all(items %in% current_items)) { - warning("Subset given includes items not actually in collection. These will be ignored:", items[!items %in% current_items]) + warning("Subset given includes items not actually in collection: ", items[!items %in% current_items]) items <- items[items %in% current_items] - updated_items <- update_items(coll$items, as_coll_items(items)) - coll$items <- updated_items + if(!length(items)) { + warning("No qualifying items to update. No updates applied.") + return(coll) + } } + updated_items <- update_items(coll$items, as_coll_items(items)) + coll$items <- updated_items } - .syn$store(coll) + .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) } @@ -97,21 +100,24 @@ use_latest_in_collection(collection_id, items = "all") { add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) - coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] - if(!length(coll_type)) stop("Entity is not a dataset or dataset collection.") + coll_type <- which_coll_type(coll) if(check_items) { - expected_type_check <- if(coll_type == "dataset") is_file else is_dataset - correct_item_type <- sapply(items, expected_type_check) + item_type_check <- if(coll_type == "dataset") is_file else is_dataset + correct_item_type <- sapply(items, item_type_check) if(any(!correct_item_type)) { - warning("Some items not correct entity types for the collection! These will not be added:", items[!correct_item_type]) + warning("Some items not correct entity types for the collection and will not be added: ", items[!correct_item_type]) items <- items[correct_item_type] + if(!length(items)) { + warning("No qualifying items to add. No updates applied.", call. = FALSE) + return(coll) + } } } current_items <- sapply(coll$items, function(x) x$entityId) if(any(items %in% current_items) && !force) { - stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") + stop("Some items to be added are already in collection. 
Use `force = TRUE` to allow replacing existing versions.") } else { coll$items <- update_items(coll$items, as_coll_items(items)) } @@ -272,15 +278,14 @@ is_dataset_collection <- function(id) { error = function(e) FALSE) } -#' Check whether entity is dataset collection + +#' Which collection type #' -#' @keywords internal -is_dataset_collection <- function(id) { - tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) - entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" - }, - error = function(e) FALSE) +#' Checks for a valid collection type or returns error +#' +which_coll_type <- function(coll) { + coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] + if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.") } #' Check whether entity is file @@ -289,9 +294,8 @@ is_dataset_collection <- function(id) { is_file <- function(id) { tryCatch({ entity <- .syn$get(id, downloadFile = FALSE) - entity$properties$concreteType == "org.sagebionetworks.repo.model.File" + entity$properties$concreteType == "org.sagebionetworks.repo.model.FileEntity" }, error = function(e) FALSE) } - diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R index 7ea57b92..5bef2af3 100644 --- a/tests/testthat/test_dataset_utils.R +++ b/tests/testthat/test_dataset_utils.R @@ -1,3 +1,16 @@ +# Create a basic draft dataset from some files at version 1; all files have a latest version 2 +# Returns dataset id only +create_dataset_fixture <- function(instance = 1) { + NF_test <- "syn26462036" + items <- c("syn51239179", + "syn51239178", + "syn51239177") + dataset <- new_dataset(name = paste0("test_fixture_dataset_", instance), parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) + dataset_id <- dataset$properties$id + dataset_id +} + + test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { skip_if_no_synapseclient() @@ -8,14 +21,15 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali "syn51239178", "syn51239177") dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, dry_run = FALSE) - .syn$delete(dataset) - expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 2L), - list(entityId = "syn51239178", versionNumber = 2L), - list(entityId = "syn51239177", versionNumber = 2L)) + expected_items_in_dataset <- list( + list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - + .syn$delete(dataset) }) + test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { skip_if_no_synapseclient() @@ -25,21 +39,21 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali "syn51239178", "syn51239177") dataset <- new_dataset(name = "test_dataset", parent = NF_test, items = items, item_version = 1L, dry_run = FALSE) - .syn$delete(dataset) - expected_items_in_dataset <- list(list(entityId = "syn51239179", versionNumber = 1L), - list(entityId = "syn51239178", versionNumber = 1L), - list(entityId = "syn51239177", versionNumber = 1L)) + expected_items_in_dataset <- list( + list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = 
"syn51239178", versionNumber = 1L), + list(entityId = "syn51239177", versionNumber = 1L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - + .syn$delete(dataset) }) + # When providing an item not allowed to be a dataset item (a table or folder), the Synapse error will be something like # ``` # Error: synapseclient.core.exceptions.SynapseHTTPError: 400 Client Error: # Currently, only files can be included in a dataset. syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity' # ``` -# This is a good, informative error -test_that("Creating dataset with `new_dataset` will fail when trying to include a non-vaid item (specifically, a table)", { +test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (a table)", { skip_if_no_synapseclient() skip_if_no_token() @@ -51,3 +65,82 @@ test_that("Creating dataset with `new_dataset` will fail when trying to include }) +test_that("Updating a dataset to make a subset of files reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_id <- create_dataset_fixture() + items_to_update <- c("syn51239178", "syn51239177") # both should be updated to Version 2 + updated <- use_latest_in_collection(collection_id = dataset_id, items = items_to_update) + expected_updated_items <- list( + list(entityId = "syn51239179", versionNumber = 1L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + testthat::expect_identical(updated$items, expected_updated_items) + .syn$delete(dataset_id) +}) + + +test_that("Updating a dataset to make _all_ files reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_id <- create_dataset_fixture() + expected_updated_items <- list( + list(entityId = "syn51239179", versionNumber = 2L), + list(entityId = "syn51239178", versionNumber = 2L), + list(entityId = "syn51239177", versionNumber = 2L)) + updated <- use_latest_in_collection(collection_id = dataset_id, items = "all") + testthat::expect_identical(updated$items, expected_updated_items) + .syn$delete(dataset_id) +}) + + +# Dataset collections ---------------------------------------------------------# + +test_that("Updating a dataset collection to make a subset of datasets reference the latest version works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + dataset_item_to_update <- "syn51809898" + .syn$create_snapshot_version(dataset_item_to_update) + new_version <- .syn$get(dataset_item_to_update, downloadFile = FALSE)$properties$versionNumber + DC <- use_latest_in_collection(collection_id = dataset_collection_id, items = dataset_item_to_update) + updated_item <- Filter(function(item) item$entityId == dataset_item_to_update, DC$items) + testthat::expect_equal(updated_item$versionNumber, new_version) +}) + + +test_that("Adding new dataset to dataset collection works", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + coll_state <- coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}")) + one_more_item <- create_dataset_fixture() + new_coll_state <- add_to_collection(collection_id = dataset_collection_id, items = one_more_item) + testthat::expect_equal(length(new_coll_state$items), length(coll_state$items) + 1L) + # cleanup: set collection to previous items state + new_coll_state$items <- coll_state$items + 
.syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) + # delete dataset + .syn$delete(one_more_item) +}) + + +test_that("Adding non-datasets to dataset collection gives expected handling and warning", { + + skip_if_no_synapseclient() + skip_if_no_token() + + dataset_collection_id <- "syn51809938" + bad_items <- "syn51106349" # a folder + testthat::expect_warning(add_to_collection(collection_id = dataset_collection_id, items = bad_items, check_items = TRUE), + regexp = paste("No qualifying items to add. No updates applied.")) +}) + From cc11a48e190201aa1819a107e6a15d1638c87c04 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Sun, 2 Jul 2023 17:41:09 -0600 Subject: [PATCH 18/25] Draft dataset citation util --- R/datasets.R | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/R/datasets.R b/R/datasets.R index 4adfbed9..be283da8 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -251,10 +251,37 @@ nf_star_salmon_datasets <- function(output_map, dry_run = dry_run) } +# -- Citations -----------------------------------------------------------------# + +# TODO Potentially move somewhere else + +#' Generate dataset citation +#' +#' This is currently more for demo purposes, to check how well current metadata +#' could be formatted into citation text. Datasets within the official +#' Portal Collection should work well enough, while there are no guarantees for +#' unofficial/community-contributed datasets. +#' @param dataset_id Dataset id. +#' @param format Currently just "Scientific Data" format. +#' @param output Currently just markdown format. There are many ways to +#' generate LaTeX or HTML from markdown. +#' @keywords internal +dataset_citation <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { + if(!is_dataset(id)) stop("Not a dataset") + meta <- .syn$get_annotations(id) + doi <- tryCatch(meta$doi, error = function(e) NULL) + title <- meta$title + creator <- meta$creator + yearPublished <- meta$yearPublished + repository <- "Synapse" + accession <- if(length(doi)) doi else glue::glue("https://www.synapse.org/#!Synapse:{id}") + glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).") +} + # -- Checks------------- -------------------------------------------------------# +# TODO Potentially move these type checks somewhere else like basic_utils # TODO Better composition to reduce code, esp. if more will be added -# TODO Potentially move somewhere else like basic_utils #' Check whether entity is dataset #' From 388746189512234394b8d104475426f7835a3691 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 3 Jul 2023 08:17:17 -0600 Subject: [PATCH 19/25] Mark internal or export, add doc details --- R/datasets.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index be283da8..164263b2 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -25,12 +25,15 @@ as_coll_items <- function(ids, item_version = NULL) { #' Apply updates to current collection of items #' -#' Given another collection that can represent updates of both types "replace" or "add", +#' This is essentially an internal transaction helper for trying to apply a changeset to a collection, +#' used in several higher-level collection utils. 
+#' Given the changeset that can represent updates of both types "replace" or "add", #' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. #' #' @param current_items List of lists representing a collection of items. #' @param update_items Collection of items to apply as updates to `current_items`. +#' @keywords internal update_items <- function(current_coll, update_coll) { current_coll <- data.table::rbindlist(current_coll) @@ -135,6 +138,7 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. +#' @export new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { dataset_items <- as_coll_items(items, item_version) @@ -266,7 +270,7 @@ nf_star_salmon_datasets <- function(output_map, #' @param output Currently just markdown format. There are many ways to #' generate LaTeX or HTML from markdown. #' @keywords internal -dataset_citation <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { +cite_dataset <- function(dataset_id, format = "Scientific Data", output = c("markdown")) { if(!is_dataset(id)) stop("Not a dataset") meta <- .syn$get_annotations(id) doi <- tryCatch(meta$doi, error = function(e) NULL) @@ -310,6 +314,7 @@ is_dataset_collection <- function(id) { #' #' Checks for a valid collection type or returns error #' +#' @keywords internal which_coll_type <- function(coll) { coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] if(length(coll_type)) coll_type else stop("Entity is not a dataset or dataset collection.") From c861904964cb9d7246a3bb6086d244c5253ecc97 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 3 Jul 2023 08:52:07 -0600 Subject: [PATCH 20/25] Write new docs, update pkgdown reference --- _pkgdown.yml | 17 +++++++++++ man/add_to_collection.Rd | 31 +++++++++++++++++++ man/as_coll_items.Rd | 19 ++++++++++++ man/cite_dataset.Rd | 23 ++++++++++++++ man/is_dataset.Rd | 12 ++++++++ man/is_dataset_collection.Rd | 12 ++++++++ man/is_file.Rd | 12 ++++++++ man/new_dataset.Rd | 24 +++++++++++++++ man/nf_sarek_datasets.Rd | 53 +++++++++++++++++++++++++++++++++ man/nf_star_salmon_datasets.Rd | 29 ++++++++++++++++++ man/update_items.Rd | 21 +++++++++++++ man/use_latest_in_collection.Rd | 18 +++++++++++ man/which_coll_type.Rd | 12 ++++++++ 13 files changed, 283 insertions(+) create mode 100644 man/add_to_collection.Rd create mode 100644 man/as_coll_items.Rd create mode 100644 man/cite_dataset.Rd create mode 100644 man/is_dataset.Rd create mode 100644 man/is_dataset_collection.Rd create mode 100644 man/is_file.Rd create mode 100644 man/new_dataset.Rd create mode 100644 man/nf_sarek_datasets.Rd create mode 100644 man/nf_star_salmon_datasets.Rd create mode 100644 man/update_items.Rd create mode 100644 man/use_latest_in_collection.Rd create mode 100644 man/which_coll_type.Rd diff --git a/_pkgdown.yml b/_pkgdown.yml index 5c8b9ff0..47c3195d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -44,6 +44,23 @@ reference: - annotate_reports_sarek - annotate_with_tool_stats +- title: Dataset Creation and Management +- subtitle: General dataset creation and citation + desc: Create datasets generally, create citation example text +- 
contents:
+  - new_dataset
+  - cite_dataset
+- subtitle: Specialized dataset creation
+  desc: Create specialized datasets for nextflow processed data, i.e. with some custom construction queries and title templating.
+- contents:
+  - nf_sarek_datasets
+  - nf_star_salmon_datasets
+- subtitle: Working with dataset collections to manage datasets after creation
+- contents:
+  - add_to_collection
+  - use_latest_in_collection
+  - update_items
+
 - title: Data Model Utils
   desc: Talk to a JSON-LD data model important to the portal data (i.e. NF-metadata-dictionary)
 - contents:
diff --git a/man/add_to_collection.Rd b/man/add_to_collection.Rd
new file mode 100644
index 00000000..e6d774a4
--- /dev/null
+++ b/man/add_to_collection.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{add_to_collection}
+\alias{add_to_collection}
+\title{Add to collection}
+\usage{
+add_to_collection(collection_id, items, check_items = FALSE, force = FALSE)
+}
+\arguments{
+\item{collection_id}{Id of the collection.}
+
+\item{items}{Character vector of one or more dataset entity ids to add.}
+
+\item{check_items}{Whether to check that ids are really appropriate item types and remove non-appropriate item types
+to help avoid Synapse errors (default \code{FALSE} because in most cases \code{items} are curated, and using check will be slower).}
+
+\item{force}{If some items are currently in the collection with a different version,
+should these items be force-added using current version? The safe default is \code{FALSE} to ensure any such updates are intentional.}
+}
+\description{
+Add item(s) to an \emph{existing} collection, using the item(s)' current (latest) version.
+For datasets, the items should be files. For dataset collections, the items should be datasets.
+If an item attempting to be added happens to already be in the collection,
+this might lead to version conflicts, so the update will be rejected unless \code{force} is true.
+}
+\details{
+This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet
+implement dataset collection class and methods (but dataset and relevant methods like \code{add_item} are available).
+Thus, while this is generic enough to handle both datasets and dataset collections
+it is expected to be used more for dataset collections given that the dataset method is provided.
+}
diff --git a/man/as_coll_items.Rd b/man/as_coll_items.Rd
new file mode 100644
index 00000000..9c97671b
--- /dev/null
+++ b/man/as_coll_items.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{as_coll_items}
+\alias{as_coll_items}
+\title{As collection items}
+\usage{
+as_coll_items(ids, item_version = NULL)
+}
+\arguments{
+\item{ids}{Ids of entities to make into dataset items.}
+
+\item{item_version}{Integer for version that will be used for all items, e.g. 1.
+If NULL, this will look up the latest version for each id and use that.}
+}
+\description{
+Helper taking entity ids to create records used for dataset items or dataset collection items.
+Collection items have the form \code{list(entityId = id, versionNumber = x)}.
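+For example, \code{as_coll_items(c("syn111", "syn222"), item_version = 1L)} (ids hypothetical)
+returns two such records, both pinned at version 1.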
+} +\keyword{internal} diff --git a/man/cite_dataset.Rd b/man/cite_dataset.Rd new file mode 100644 index 00000000..6a53c059 --- /dev/null +++ b/man/cite_dataset.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{cite_dataset} +\alias{cite_dataset} +\title{Generate dataset citation} +\usage{ +cite_dataset(dataset_id, format = "Scientific Data", output = c("markdown")) +} +\arguments{ +\item{dataset_id}{Dataset id.} + +\item{format}{Currently just "Scientific Data" format.} + +\item{output}{Currently just markdown format. There are many ways to +generate LaTeX or HTML from markdown.} +} +\description{ +This is currently more for demo purposes, to check how well current metadata +could be formatted into citation text. Datasets within the official +Portal Collection should work well enough, while there are no guarantees for +unofficial/community-contributed datasets. +} +\keyword{internal} diff --git a/man/is_dataset.Rd b/man/is_dataset.Rd new file mode 100644 index 00000000..ea75bdb6 --- /dev/null +++ b/man/is_dataset.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_dataset} +\alias{is_dataset} +\title{Check whether entity is dataset} +\usage{ +is_dataset(id) +} +\description{ +Check whether entity is dataset +} +\keyword{internal} diff --git a/man/is_dataset_collection.Rd b/man/is_dataset_collection.Rd new file mode 100644 index 00000000..adb1036b --- /dev/null +++ b/man/is_dataset_collection.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_dataset_collection} +\alias{is_dataset_collection} +\title{Check whether entity is dataset collection} +\usage{ +is_dataset_collection(id) +} +\description{ +Check whether entity is dataset collection +} +\keyword{internal} diff --git a/man/is_file.Rd b/man/is_file.Rd new file mode 100644 index 00000000..09b1315c --- /dev/null +++ b/man/is_file.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{is_file} +\alias{is_file} +\title{Check whether entity is file} +\usage{ +is_file(id) +} +\description{ +Check whether entity is file +} +\keyword{internal} diff --git a/man/new_dataset.Rd b/man/new_dataset.Rd new file mode 100644 index 00000000..985d3935 --- /dev/null +++ b/man/new_dataset.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{new_dataset} +\alias{new_dataset} +\title{Create new dataset with given items} +\usage{ +new_dataset(name, parent, items, item_version = NULL, dry_run = TRUE) +} +\arguments{ +\item{name}{Name of the dataset. It should be unique within the \code{parent} project.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{items}{Id(s) of items to include. +Usually the same parent project storing the files, but in some cases it may be a different project.} + +\item{item_version}{Integer for version that will be used for all items, e.g. 1. 
+If NULL, this will look up the latest version for each id and use that.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\description{ +Create new dataset with given items +} diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd new file mode 100644 index 00000000..db4012d6 --- /dev/null +++ b/man/nf_sarek_datasets.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{nf_sarek_datasets} +\alias{nf_sarek_datasets} +\title{Create Sarek-processed datasets} +\usage{ +nf_sarek_datasets( + output_map, + parent, + workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), + verbose = TRUE, + dry_run = TRUE +) +} +\arguments{ +\item{output_map}{The \code{data.table} returned from \code{map_sample_output_sarek}. See details for alternatives.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{verbose}{Optional, whether to be verbose -- defaults to TRUE.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\value{ +A list of dataset objects. +} +\description{ +Organize variant call files from Nextflow Sarek into 3-4 datasets, +grouping files by variant type and workflow with titles having the format: +"{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". +As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. +This makes sense for NF because Germline calls can be treated differently. +This uses latest version of all files and creates a Draft version of the dataset. +} +\details{ +Since we basically just need the syn entity id, variant type, and workflow to group the files. +Instead of getting this info through running \verb{map_*} as in the example, +you may prefer using a fileview, in which case you just need to download a table from a fileview +that has \code{id} => \code{output_id} + the \code{dataType} and \code{workflow} annotations. +The fileview can be used \emph{after} the files are annotated. If you want to create datasets \emph{before} +files are annotated, then you have to use \verb{map_*}. + +Finally, datasets cannot use the same name if stored in the same project, +so if there are multiple batches, the names will have to be made unique by adding +the batch number, source data id, processing date, or whatever makes sense. 
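+For example, a second batch could be titled
+"Somatic Genomic Variants - Strelka Pipeline - Batch 2" (suffix shown for illustration only).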
+} +\examples{ +\dontrun{ +syn_out <- "syn26648589" +m <- map_sample_output_sarek(syn_out) +datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project +} +} diff --git a/man/nf_star_salmon_datasets.Rd b/man/nf_star_salmon_datasets.Rd new file mode 100644 index 00000000..64d0c149 --- /dev/null +++ b/man/nf_star_salmon_datasets.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{nf_star_salmon_datasets} +\alias{nf_star_salmon_datasets} +\title{Create NF STAR-Salmon dataset} +\usage{ +nf_star_salmon_datasets(output_map, parent, verbose = TRUE, dry_run = TRUE) +} +\arguments{ +\item{output_map}{The \code{data.table} returned from \code{map_sample_output_sarek}.} + +\item{parent}{Synapse id of parent project where the dataset will live.} + +\item{verbose}{Optional, whether to be verbose -- defaults to TRUE.} + +\item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} +} +\description{ +Organize gene expression quantification files (.sf) into one dataset. +Uses latest version of the files and creates a "Draft" dataset. +See also \code{nf_sarek_datasets}. +} +\examples{ +\dontrun{ +syn_out <- "syn30840584" +m <- map_sample_output_rnaseq(syn_out) +datasets <- nf_rnaseq_dataset(m, out, parent = "syn4939902", dry_run = F) +} +} diff --git a/man/update_items.Rd b/man/update_items.Rd new file mode 100644 index 00000000..c7a516e3 --- /dev/null +++ b/man/update_items.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{update_items} +\alias{update_items} +\title{Apply updates to current collection of items} +\usage{ +update_items(current_coll, update_coll) +} +\arguments{ +\item{current_items}{List of lists representing a collection of items.} + +\item{update_items}{Collection of items to apply as updates to \code{current_items}.} +} +\description{ +This is essentially an internal transaction helper for trying to apply a changeset to a collection, +used in several higher-level collection utils. +Given the changeset that can represent updates of both types "replace" or "add", +this applies an update join keyed on \code{entityId} for the replace and +appends the new items to get the updated collection. +} +\keyword{internal} diff --git a/man/use_latest_in_collection.Rd b/man/use_latest_in_collection.Rd new file mode 100644 index 00000000..00c376d9 --- /dev/null +++ b/man/use_latest_in_collection.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{use_latest_in_collection} +\alias{use_latest_in_collection} +\title{Update item versions to "latest" in a collection} +\usage{ +use_latest_in_collection(collection_id, items = "all") +} +\arguments{ +\item{collection_id}{} + +\item{items}{Vector of dataset ids for which to update reference to latest version, +or "all" (default) to update all in the dataset collection.} +} +\description{ +Update an \emph{existing} collection so that all items or a subset of items reference their latest version. +This should work for both datasets (collection of files) and dataset collections (collection of datasets). 
+}
diff --git a/man/which_coll_type.Rd b/man/which_coll_type.Rd
new file mode 100644
index 00000000..ab0eb92e
--- /dev/null
+++ b/man/which_coll_type.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/datasets.R
+\name{which_coll_type}
+\alias{which_coll_type}
+\title{Which collection type}
+\usage{
+which_coll_type(coll)
+}
+\description{
+Checks for a valid collection type or returns error
+}
+\keyword{internal}

From c1e152208cbb3385b4894b6632f7fa59745f9fe6 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:19:33 -0600
Subject: [PATCH 21/25] Fix bug for one failing test in test_dataset_utils

---
 tests/testthat/test_dataset_utils.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R
index 5bef2af3..214e0f66 100644
--- a/tests/testthat/test_dataset_utils.R
+++ b/tests/testthat/test_dataset_utils.R
@@ -111,7 +111,7 @@ test_that("Updating a dataset collection to make a subset of datasets reference
   new_version <- .syn$get(dataset_item_to_update, downloadFile = FALSE)$properties$versionNumber
   DC <- use_latest_in_collection(collection_id = dataset_collection_id, items = dataset_item_to_update)
   updated_item <- Filter(function(item) item$entityId == dataset_item_to_update, DC$items)
-  testthat::expect_equal(updated_item$versionNumber, new_version)
+  testthat::expect_equal(updated_item[[1]]$versionNumber, new_version)
 })

From d158cb6dac34123626d1022361c3b5c387b1147b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:38:28 -0600
Subject: [PATCH 22/25] Update testing notes

---
 tests/README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/README.md b/tests/README.md
index 4465b673..1afaac71 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -29,7 +29,13 @@
 If you are a potential contributor who is not an NF-OSI member, request to be added to the test repo for writing/running tests.
 **If `TEST_SYNAPSE_AUTH_TOKEN` is not available, dependent tests are simply skipped.**
 
-- During tests, the `TEST_SYNAPSE_AUTH_TOKEN` is temporarily set as `SYNAPSE_AUTH_TOKEN` to create the synapseclient object.
+- During tests, the `TEST_SYNAPSE_AUTH_TOKEN` is temporarily set as `SYNAPSE_AUTH_TOKEN`.
+
+- For pkg testing during development:
+  - Use `testthat::test_local()` in the pkg root to test all functions. Recommended.
+  - Write a `test_that` function to test a new package function and run it interactively.
+    Note `test_local` does things like importing Python modules into the environment (i.e. all the stuff that happens during package load),
+    so standalone function testing is a little trickier in that you have to check for and set up the dependencies in the environment yourself.
- Given that most tests depend on a successful login, and that tests are run according to the alphabetical naming of test*.R files,

From 2998455807da58180caadfe8df21d448871ec6af Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:38:50 -0600
Subject: [PATCH 23/25] Add dependency check

---
 tests/testthat/helpers.R | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/helpers.R b/tests/testthat/helpers.R
index d4ff7c99..c3fce801 100644
--- a/tests/testthat/helpers.R
+++ b/tests/testthat/helpers.R
@@ -1,14 +1,24 @@
 # Implementing skips according to suggested handling when using reticulate
 # See https://rstudio.github.io/reticulate/articles/package.html
-
 # Skips tests on CRAN machines or other incompatible testing environments
 # where Python can't be configured so package checks don't fail
+
+# Skip if Python synapseclient module not installed/accessible
+# This is normally imported upon package load, see `zzz.R`
 skip_if_no_synapseclient <- function() {
   have_synapseclient <- py_module_available("synapseclient")
   if(!have_synapseclient)
     skip("synapseclient not available for testing")
 }
 
+# Skip if Python synapseutils module not installed/accessible
+# This is normally imported upon package load, see `zzz.R`
+skip_if_no_synapseutils <- function() {
+  have_synapseutils <- py_module_available("synapseutils")
+  if(!have_synapseutils)
+    skip("synapseutils not available for testing")
+}
+
 # Skip if no pandas; pandas is needed for smaller subset of functions in the package
 skip_if_no_pandas <- function() {
   have_pandas <- py_module_available("pandas")

From 76336a2d8cb0714f690d81623dafc284907c502b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 12:47:55 -0600
Subject: [PATCH 24/25] Address CMD check doc complaints, update NAMESPACE and bump pkg version

---
 DESCRIPTION                     | 2 +-
 NAMESPACE                       | 5 +++++
 R/datasets.R                    | 5 +++--
 man/add_to_collection.Rd        | 2 +-
 man/nf_sarek_datasets.Rd        | 2 ++
 man/use_latest_in_collection.Rd | 2 +-
 6 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b4e735c1..5cd8e93c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nfportalutils
 Title: NF Portal Utilities
-Version: 0.0.0.9210
+Version: 0.0.0.9300
 Authors@R: c(
     person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
            email = "robert.allaway@sagebionetworks.org",
diff --git a/NAMESPACE b/NAMESPACE
index 6c68ffc5..c739fdbd 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,6 +16,7 @@ export(add_people_from_table)
 export(add_publication_from_pubmed)
 export(add_publication_from_unpaywall)
 export(add_publications_from_file)
+export(add_to_collection)
 export(annotate_aligned_reads)
 export(annotate_called_variants)
 export(annotate_expression)
@@ -49,7 +50,10 @@ export(map_sample_io)
 export(map_sample_output_rnaseq)
 export(map_sample_output_sarek)
 export(missing_annotation_email)
+export(new_dataset)
 export(new_project)
+export(nf_sarek_datasets)
+export(nf_star_salmon_datasets)
 export(processing_flowchart)
 export(qc_manifest)
 export(register_study)
@@ -61,6 +65,7 @@ export(syn_login)
 export(syncBP_maf)
 export(table_query)
 export(update_study_annotations)
+export(use_latest_in_collection)
 export(wiki_mod)
 import(data.table)
 import(reticulate)
diff --git a/R/datasets.R b/R/datasets.R
index 164263b2..1cb41b07 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -52,7 +52,7 @@ update_items <- function(current_coll, update_coll) {
 #' Update an _existing_ collection so that all items or a subset of items reference their latest version.
 #' This should work for both datasets (collection of files) and dataset collections (collection of datasets).
 #'
-#' @param collection_id
+#' @param collection_id Collection id.
 #' @param items Vector of dataset ids for which to update reference to latest version,
 #' or "all" (default) to update all in the dataset collection.
 #' @export
@@ -93,7 +93,7 @@
 #' Thus, while this is generic enough to handle both datasets and dataset collections
 #' it is expected to be used more for dataset collections given that the dataset method is provided.
 #'
-#' @param collection_id Id of the collection.
+#' @param collection_id Collection id.
 #' @param items Character vector of one or more dataset entity ids to add.
 #' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types
 #' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower).
@@ -171,6 +171,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE
 #'
 #' @inheritParams new_dataset
 #' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
+#' @param workflow One of the workflows used.
 #' @param verbose Optional, whether to be verbose -- defaults to TRUE.
 #' @import data.table
 #' @return A list of dataset objects.
diff --git a/man/add_to_collection.Rd b/man/add_to_collection.Rd
index e6d774a4..0b52c9e6 100644
--- a/man/add_to_collection.Rd
+++ b/man/add_to_collection.Rd
@@ -7,7 +7,7 @@
 add_to_collection(collection_id, items, check_items = FALSE, force = FALSE)
 }
 \arguments{
-\item{collection_id}{Id of the collection.}
+\item{collection_id}{Collection id.}
 
 \item{items}{Character vector of one or more dataset entity ids to add.}
 
diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd
index db4012d6..46ddbe61 100644
--- a/man/nf_sarek_datasets.Rd
+++ b/man/nf_sarek_datasets.Rd
@@ -17,6 +17,8 @@ nf_sarek_datasets(
 
 \item{parent}{Synapse id of parent project where the dataset will live.}
 
+\item{workflow}{One of the workflows used.}
+
 \item{verbose}{Optional, whether to be verbose -- defaults to TRUE.}
 
 \item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.}
diff --git a/man/use_latest_in_collection.Rd b/man/use_latest_in_collection.Rd
index 00c376d9..8a8983d3 100644
--- a/man/use_latest_in_collection.Rd
+++ b/man/use_latest_in_collection.Rd
@@ -7,7 +7,7 @@
 use_latest_in_collection(collection_id, items = "all")
 }
 \arguments{
-\item{collection_id}{}
+\item{collection_id}{Collection id.}
 
 \item{items}{Vector of dataset ids for which to update reference to latest version,
 or "all" (default) to update all in the dataset collection.}
 
From 53657a24d052bb656dec3e7acf59f45019c235dc Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Mon, 3 Jul 2023 17:58:25 -0600
Subject: [PATCH 25/25] Final updates, reorg, and notes for experimental stuff

---
 R/citation.R        | 54 +++++++++++++++++++++++++++++++++++++++++++++
 R/datasets.R        | 26 ----------------------
 _pkgdown.yml        |  5 +++--
 man/cite_dataset.Rd | 18 +++++++--------
 man/get_doi_meta.Rd | 18 ++++++++++++++++
 5 files changed, 84 insertions(+), 37 deletions(-)
 create mode 100644 R/citation.R
 create mode 100644 man/get_doi_meta.Rd

diff --git a/R/citation.R b/R/citation.R
new file mode 100644
index 00000000..7d145317
--- /dev/null
+++ b/R/citation.R
@@ -0,0 +1,54 @@
+# -- Citations -----------------------------------------------------------------#
+
+#' Get DOI metadata if it exists
+#'
+#' Returns list of metadata associated with the DOI if one exists, otherwise NULL.
+#' Currently usable for certain entity types like files or datasets,
+#' though this should be revised to make it more useful with other objects.
+#' Note: Internal/experimental use only, not for production use.
+#'
+#' @param id Dataset id.
+#' @keywords internal
+get_doi_meta <- function(id) {
+
+  # TODO Template query according to object type of id,
+  # i.e. folders can have dois, but they don't have version #s
+  obj <- .syn$get(id, downloadFile = FALSE)
+  versionNumber <- obj$properties$versionNumber # error if no versionNumber
+  tryCatch({
+    .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/doi?id={id}&type=DATASET&version={versionNumber}"))
+  },
+  error = function(e) if(grepl("DOI association does not exist.", e$message)) NULL else e)
+}
+
+#' Generate example dataset citation
+#'
+#' This is currently more for demo purposes, to check how well current metadata
+#' could be formatted into citation text. Datasets where DOIs have been minted
+#' *or* NF-OSI processed datasets within the official Portal Collection should
+#' work well, while there are no guarantees for other cases.
+#' Note: Internal/experimental use only, not for production use.
+#'
+#' @param id Dataset id.
+#' @param format Currently just "Scientific Data" format.
+#' @param output Currently only markdown, from which other utils can be used to generate LaTeX or HTML.
+#' @keywords internal
+cite_dataset <- function(id,
+                         format = "Scientific Data",
+                         output = c("markdown")) {
+  if(!is_dataset(id)) stop("Not a dataset")
+  if(length(get_doi_meta(id))) {
+    message("For now, please go to https://citation.crosscite.org/ for the most comprehensive citation options.")
+    return(NULL)
+  } else {
+    meta <- .syn$get_annotations(id)
+    title <- meta$title
+    creator <- meta$creator
+    repository <- "Synapse"
+    accession <- glue::glue("https://www.synapse.org/#!Synapse:{id}") # no DOI in this branch, so cite the Synapse URL
+    yearPublished <- meta$yearPublished
+    glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).")
+  }
+
+}
+
diff --git a/R/datasets.R b/R/datasets.R
index 164263b2..fffc9a8d 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -256,32 +256,6 @@ nf_star_salmon_datasets <- function(output_map,
                            dry_run = dry_run)
 }
 
-# -- Citations -----------------------------------------------------------------#
-
-# TODO Potentially move somewhere else
-
-#' Generate dataset citation
-#'
-#' This is currently more for demo purposes, to check how well current metadata
-#' could be formatted into citation text. Datasets within the official
-#' Portal Collection should work well enough, while there are no guarantees for
-#' unofficial/community-contributed datasets.
-#' @param dataset_id Dataset id.
-#' @param format Currently just "Scientific Data" format.
-#' @param output Currently just markdown format. There are many ways to
-#' generate LaTeX or HTML from markdown.
-#' @keywords internal
-cite_dataset <- function(dataset_id, format = "Scientific Data", output = c("markdown")) {
-  if(!is_dataset(id)) stop("Not a dataset")
-  meta <- .syn$get_annotations(id)
-  doi <- tryCatch(meta$doi, error = function(e) NULL)
-  title <- meta$title
-  creator <- meta$creator
-  yearPublished <- meta$yearPublished
-  repository <- "Synapse"
-  accession <- if(length(doi)) doi else glue::glue("https://www.synapse.org/#!Synapse:{id}")
-  glue::glue("{creator}. _{repository}_ {accession} ({yearPublished}).")
-}
 
 # -- Checks------------- -------------------------------------------------------#
 
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 47c3195d..fd004201 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -46,10 +46,9 @@ reference:
 
 - title: Dataset Creation and Management
 - subtitle: General dataset creation and citation
-  desc: Create datasets generally, create citation example text
+  desc: Create datasets in general
 - contents:
   - new_dataset
-  - cite_dataset
 - subtitle: Specialized dataset creation
   desc: Create specialized datasets for nextflow processed data, i.e. with some custom construction queries and title templating.
 - contents:
@@ -137,4 +136,6 @@ reference:
   - .replace_string_column_with_stringlist_column
   - .store_rows
   - missing_annotation_email
+  - get_doi_meta
+  - cite_dataset
 
diff --git a/man/cite_dataset.Rd b/man/cite_dataset.Rd
index 6a53c059..8314d9d3 100644
--- a/man/cite_dataset.Rd
+++ b/man/cite_dataset.Rd
@@ -1,23 +1,23 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/datasets.R
+% Please edit documentation in R/citation.R
 \name{cite_dataset}
 \alias{cite_dataset}
-\title{Generate dataset citation}
+\title{Generate example dataset citation}
 \usage{
-cite_dataset(dataset_id, format = "Scientific Data", output = c("markdown"))
+cite_dataset(id, format = "Scientific Data", output = c("markdown"))
 }
 \arguments{
-\item{dataset_id}{Dataset id.}
+\item{id}{Dataset id.}
 
 \item{format}{Currently just "Scientific Data" format.}
 
-\item{output}{Currently just markdown format. There are many ways to
-generate LaTeX or HTML from markdown.}
+\item{output}{Currently only markdown, from which other utils can be used to generate LaTeX or HTML.}
 }
 \description{
-This is currently more for demo purposes, to check how well current metadata
-could be formatted into citation text. Datasets within the official
-Portal Collection should work well enough, while there are no guarantees for
-unofficial/community-contributed datasets.
+This is currently more for demo purposes, to check how well current metadata
+could be formatted into citation text. Datasets where DOIs have been minted
+\emph{or} NF-OSI processed datasets within the official Portal Collection should
+work well, while there are no guarantees for other cases.
+Note: Internal/experimental use only, not for production use.
 }
 \keyword{internal}
diff --git a/man/get_doi_meta.Rd b/man/get_doi_meta.Rd
new file mode 100644
index 00000000..4a6ac20c
--- /dev/null
+++ b/man/get_doi_meta.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/citation.R
+\name{get_doi_meta}
+\alias{get_doi_meta}
+\title{Get DOI metadata if it exists}
+\usage{
+get_doi_meta(id)
+}
+\arguments{
+\item{id}{Dataset id.}
+}
+\description{
+Returns list of metadata associated with the DOI if one exists, otherwise NULL.
+Currently usable for certain entity types like files or datasets,
+though this should be revised to make it more useful with other objects.
+Note: Internal/experimental use only, not for production use.
+}
+\keyword{internal}
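Taken together, the utilities from this series compose roughly as below. This is a sketch only: it assumes a logged-in session (e.g. via `syn_login()`), all `syn...` ids are placeholders rather than real entities, and `cite_dataset()` remains an internal, experimental helper.

```r
# Create a draft dataset from three files (placeholder ids), pinned at version 1
dataset <- new_dataset(name = "Example Draft Dataset",
                       parent = "syn00000000",  # placeholder project id
                       items = c("syn00000001", "syn00000002", "syn00000003"),
                       item_version = 1L,
                       dry_run = FALSE)

# Add the stored dataset to an existing dataset collection (placeholder id);
# check_items = TRUE verifies each id really is a dataset before adding
add_to_collection(collection_id = "syn00000010",
                  items = dataset$properties$id,
                  check_items = TRUE)

# Later, point every item in the collection at its latest version
use_latest_in_collection(collection_id = "syn00000010", items = "all")

# Experimental and not exported: preview citation text for the dataset
nfportalutils:::cite_dataset(dataset$properties$id)
```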