From 7c8804d4b16bd9215f91cb93c1f17ebbdba73cc6 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com> Date: Thu, 17 Aug 2023 08:57:59 -0600 Subject: [PATCH] Handle version semantics for collections (#115) * Handling of latest semantics * Fix docs * Run pkg check, bump version --- DESCRIPTION | 2 +- R/datasets.R | 67 ++++++++++++++++++++++++++------- R/views.R | 28 ++++++++++++++ man/as_coll_items.Rd | 14 +++++-- man/latest_version.Rd | 27 +++++++++++++ man/new_dataset.Rd | 2 +- man/new_view.Rd | 23 +++++++++++ man/use_latest_in_collection.Rd | 13 +++++-- 8 files changed, 152 insertions(+), 24 deletions(-) create mode 100644 R/views.R create mode 100644 man/latest_version.Rd create mode 100644 man/new_view.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 91cd27ac..18b0089a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: nfportalutils Title: NF Portal Utilities -Version: 0.0.0.9310 +Version: 0.0.0.9320 Authors@R: c( person(given = "Robert", family = "Allaway", role = c("aut", "cre"), email = "robert.allaway@sagebionetworks.org", diff --git a/R/datasets.R b/R/datasets.R index 5ecf7edd..d484d3d1 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -4,19 +4,27 @@ # and dataset collections (collection of datasets). -#' As collection items +#' Structure as collection items #' -#' Helper taking entity ids to create records used for dataset items or dataset collection items. +#' Helper taking entity ids to create records used for dataset items *or* dataset collection items. #' Collection items have the form `list(entityId = id, versionNumber = x)`. #' +#' Note: For item version, dataset items allow two meanings of literal or absolute "latest" +#' vs. "stable_latest", but with files either one can be used to mean the same thing +#' since there will be correct interpretation done under the hood. +#' See implementation in `latest_version`. +#' #' @param ids Ids of entities to make into dataset items. #' @param item_version Integer for version that will be used for all items, e.g. 1. -#' If NULL, this will look up the latest version for each id and use that. +#' Otherwise, "latest" or "stable_latest". See details. #' @keywords internal -as_coll_items <- function(ids, item_version = NULL) { - if(is.null(item_version)) { - item_version <- lapply(ids, function(id) .syn$get(id, downloadFile = FALSE)$properties$versionNumber) +as_coll_items <- function(ids, item_version = c("abs", "stable")) { + + if(!is.integer(item_version)) { + version_semantics <- match.arg(item_version) + item_version <- lapply(ids, function(id) latest_version(id, version_semantics)) } + items <- Map(function(id, version) list(entityId = id, versionNumber = version), ids, item_version) names(items) <- NULL # need to unname list for API items @@ -44,24 +52,25 @@ update_items <- function(current_coll, update_coll) { # reconversion; using pure apply as.list coerces versionNumber into char updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) updated -} +} #' Update item versions to "latest" in a collection #' #' Update an _existing_ collection so that all items or a subset of items reference their latest version. -#' This should work for both datasets (collection of files) and dataset collections (collection of datasets). +#' Should work for both datasets (collection of files) and dataset collections (collection of datasets). #' +#' @inheritParams latest_version #' @param collection_id Collection id. -#' @param items Vector of dataset ids for which to update reference to latest version, -#' or "all" (default) to update all in the dataset collection. +#' @param items Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all. #' @export -use_latest_in_collection <- function(collection_id, items = "all") { +use_latest_in_collection <- function(collection_id, items = "all", version_semantics = "abs") { + coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) current_items <- sapply(coll$items, function(i) i$entityId) - + if((length(items) == 1) && (items == "all")) { - coll$items <- as_coll_items(current_items) + coll$items <- as_coll_items(current_items, item_version = version_semantics) } else { # Check subset; if no check, this becomes `add_to_collection` @@ -73,7 +82,7 @@ use_latest_in_collection <- function(collection_id, items = "all") { return(coll) } } - updated_items <- update_items(coll$items, as_coll_items(items)) + updated_items <- update_items(coll$items, as_coll_items(items, item_version = version_semantics)) coll$items <- updated_items } .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) @@ -149,6 +158,36 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE } +#' Get the latest version +#' +#' Get latest version, with special handling for semantics of "latest" regarding new collection types. +#' Datasets and dataset collections always start out as draft so unlike other entities +#' there is a concept of a stable version which is the "real" latest, but which might not always exist. +#' For datasets/dataset collections the latest version refers to a DRAFT, so latest stable version is `versionNumber` - 1 +#' under the condition that the `versionNumber` is greater or equal to 2. +#' When `versionNumber` = 1 and `isLatestVersion` is TRUE, this means there is not yet a stable version. +#' When using stable version semantics, if a stable version does not exist an error will be thrown. +#' +#' The parameter `version_semantics` allows user to specify "what type of *latest* do you mean?". +#' +#' Note: Do not use with versioned ids of the form "syn12345678.3" +#' +#' @param id Dataset id. See details. +#' @param version_semantics Use "abs" for absolute latest version or "stable". Only used for collection entities. See details. +latest_version <- function(id, version_semantics = c("abs", "stable")) { + + entity <- .syn$get(id, downloadFile = FALSE) + version <- entity$properties$versionNumber + if(entity$properties$concreteType %in% c("org.sagebionetworks.repo.model.table.Dataset", "org.sagebionetworks.repo.model.table.DatasetCollection") + && version_semantics == "stable_latest") { + version <- version - 1 + if(!version) stop("No stable version exists for ", id) + } + + version +} + + #' Create Sarek-processed datasets #' #' Organize variant call files from Nextflow Sarek into 3-4 datasets, diff --git a/R/views.R b/R/views.R new file mode 100644 index 00000000..3a0b7820 --- /dev/null +++ b/R/views.R @@ -0,0 +1,28 @@ +#' Create a view +#' +#' This creates a generic view, including by default just file entities and the default columns +#' (i.e. defaults to a generic fileview). +#' This is often useful to get ids of files for a large number of nested files by creating a temp fileview +#' (the alternative is to use `walk`, but if the tree structure is not regular it can be messy to parse the output). +#' +#' @param scope Character id(s) of project or folder container(s) in scope. +#' @param project Parent project id to create the view in. +#' @param name Name of view. +#' @param include Which entity type(s) to include in scope. Defaults to files. +#' +new_view <- function(scope, + project, + name = "New View", + include = "FILE") { + + included <- match.arg(include, several.ok = TRUE) + view <- synapseclient$EntityViewSchema( + name = name, + columns = list(), + parent = project, + scopes = scope, + includeEntityTypes = list(synapseclient$EntityViewType), + add_default_columns = TRUE) + view <- .syn$store(view) + invisible(view) +} diff --git a/man/as_coll_items.Rd b/man/as_coll_items.Rd index 9c97671b..e56a2848 100644 --- a/man/as_coll_items.Rd +++ b/man/as_coll_items.Rd @@ -2,18 +2,24 @@ % Please edit documentation in R/datasets.R \name{as_coll_items} \alias{as_coll_items} -\title{As collection items} +\title{Structure as collection items} \usage{ -as_coll_items(ids, item_version = NULL) +as_coll_items(ids, item_version = c("abs", "stable")) } \arguments{ \item{ids}{Ids of entities to make into dataset items.} \item{item_version}{Integer for version that will be used for all items, e.g. 1. -If NULL, this will look up the latest version for each id and use that.} +Otherwise, "latest" or "stable_latest". See details.} } \description{ -Helper taking entity ids to create records used for dataset items or dataset collection items. +Helper taking entity ids to create records used for dataset items \emph{or} dataset collection items. Collection items have the form \code{list(entityId = id, versionNumber = x)}. } +\details{ +Note: For item version, dataset items allow two meanings of literal or absolute "latest" +vs. "stable_latest", but with files either one can be used to mean the same thing +since there will be correct interpretation done under the hood. +See implementation in \code{latest_version}. +} \keyword{internal} diff --git a/man/latest_version.Rd b/man/latest_version.Rd new file mode 100644 index 00000000..f1843dd8 --- /dev/null +++ b/man/latest_version.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.R +\name{latest_version} +\alias{latest_version} +\title{Get the latest version} +\usage{ +latest_version(id, version_semantics = c("abs", "stable")) +} +\arguments{ +\item{id}{Dataset id. See details.} + +\item{version_semantics}{Use "abs" for absolute latest version or "stable". Only used for collection entities. See details.} +} +\description{ +Get latest version, with special handling for semantics of "latest" regarding new collection types. +Datasets and dataset collections always start out as draft so unlike other entities +there is a concept of a stable version which is the "real" latest, but which might not always exist. +For datasets/dataset collections the latest version refers to a DRAFT, so latest stable version is \code{versionNumber} - 1 +under the condition that the \code{versionNumber} is greater or equal to 2. +When \code{versionNumber} = 1 and \code{isLatestVersion} is TRUE, this means there is not yet a stable version. +When using stable version semantics, if a stable version does not exist an error will be thrown. +} +\details{ +The parameter \code{version_semantics} allows user to specify "what type of \emph{latest} do you mean?". + +Note: Do not use with versioned ids of the form "syn12345678.3" +} diff --git a/man/new_dataset.Rd b/man/new_dataset.Rd index 985d3935..7023a7d8 100644 --- a/man/new_dataset.Rd +++ b/man/new_dataset.Rd @@ -15,7 +15,7 @@ new_dataset(name, parent, items, item_version = NULL, dry_run = TRUE) Usually the same parent project storing the files, but in some cases it may be a different project.} \item{item_version}{Integer for version that will be used for all items, e.g. 1. -If NULL, this will look up the latest version for each id and use that.} +Otherwise, "latest" or "stable_latest". See details.} \item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} } diff --git a/man/new_view.Rd b/man/new_view.Rd new file mode 100644 index 00000000..d9f3f951 --- /dev/null +++ b/man/new_view.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/views.R +\name{new_view} +\alias{new_view} +\title{Create a view} +\usage{ +new_view(scope, project, name = "New View", include = "FILE") +} +\arguments{ +\item{scope}{Character id(s) of project or folder container(s) in scope.} + +\item{project}{Parent project id to create the view in.} + +\item{name}{Name of view.} + +\item{include}{Which entity type(s) to include in scope. Defaults to files.} +} +\description{ +This creates a generic view, including by default just file entities and the default columns +(i.e. defaults to a generic fileview). +This is often useful to get ids of files for a large number of nested files by creating a temp fileview +(the alternative is to use \code{walk}, but if the tree structure is not regular it can be messy to parse the output). +} diff --git a/man/use_latest_in_collection.Rd b/man/use_latest_in_collection.Rd index 8a8983d3..27e6ab72 100644 --- a/man/use_latest_in_collection.Rd +++ b/man/use_latest_in_collection.Rd @@ -4,15 +4,20 @@ \alias{use_latest_in_collection} \title{Update item versions to "latest" in a collection} \usage{ -use_latest_in_collection(collection_id, items = "all") +use_latest_in_collection( + collection_id, + items = "all", + version_semantics = "abs" +) } \arguments{ \item{collection_id}{Collection id.} -\item{items}{Vector of dataset ids for which to update reference to latest version, -or "all" (default) to update all in the dataset collection.} +\item{items}{Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all.} + +\item{version_semantics}{Use "abs" for absolute latest version or "stable". Only used for collection entities. See details.} } \description{ Update an \emph{existing} collection so that all items or a subset of items reference their latest version. -This should work for both datasets (collection of files) and dataset collections (collection of datasets). +Should work for both datasets (collection of files) and dataset collections (collection of datasets). }