From f3ae7d2bd839656ad02cf37dd228221a9df8b26b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Thu, 22 Feb 2024 18:58:42 -0700
Subject: [PATCH 01/37] Start refactor on access utils

---
 R/access_utils.R | 61 ++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/R/access_utils.R b/R/access_utils.R
index 25008acc..bb75c48a 100644
--- a/R/access_utils.R
+++ b/R/access_utils.R
@@ -29,9 +29,8 @@ summarize_file_access <- function(principal_id, # 3378999 for NF-OSI
                                   fileview_id # "syn16858331"
                                   ) {
-  .check_login()
   tryCatch({
-    view <- .syn$tableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}"))
+    view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}"))
   }, error = function(e) stop("Could not query view!"))
   view <- as.data.table(view$asDataFrame())
   files_by_benefactor <- view[type == "file", .N, by = .(benefactorId)]
@@ -56,7 +55,7 @@ check_access <- function(id,
   stopifnot(is.numeric(principal_id))

   acl_result <- tryCatch({
-    .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>%
+    synapser::synRestGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>%
       rbindlist(.)
   }, error = function(e) stop(glue::glue("Error for {id}: {e$message}")))

@@ -69,26 +68,26 @@ check_access <- function(id,

 # -- SETTING ACCESS -------------------------------------------------------------#

-#' Set public access to VIEW (READ) only for an entity
-#'
-#' Set both registered users and non-registered users to have VIEW-only permissions.
+#' Set public access to VIEW (READ) only for an entity
+#'
+#' Set both registered users and non-registered users to have VIEW-only permissions.
 #' See \code{\link{make_public}} for more permissive permissions to download (for registered users), which is usually set later at data release time.
-#'
+#'
 #' @param id Synapse entity id.
 #' @export
 make_public_viewable <- function(id) {
-  .check_login()
+
   ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948"
   PUBLIC_GROUP <- "273949"
   # set registered synapse users to view only
-  .syn$setPermissions(entity = id,
-                      principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
-                      accessType = list("READ"))
-
+  synapser::synSetPermissions(entity = id,
+                              principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP,
+                              accessType = list("READ"))
+
   # set public to view
-  .syn$setPermissions(entity = id,
-                      principalId = PUBLIC_GROUP,
-                      accessType = list("READ"))
+  synapser::synSetPermissions(entity = id,
+                              principalId = PUBLIC_GROUP,
+                              accessType = list("READ"))
 }

@@ -101,18 +100,18 @@ make_public_viewable <- function(id) {
 #' @param id Synapse entity id.
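+#' @examples
+#' \dontrun{
+#' # Usage sketch with a placeholder id -- replace with an entity you administer
+#' make_public("syn12345678")
+#' }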
#' @export make_public <- function(id) { - .check_login() + ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948" PUBLIC_GROUP <- "273949" # set registered synapse users to view, download - .syn$setPermissions(entity = id, - principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, - accessType = list("READ","DOWNLOAD")) + synapser::synSetPermissions(entity = id, + principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, + accessType = list("READ","DOWNLOAD")) # set public to view - .syn$setPermissions(entity = id, - principalId = PUBLIC_GROUP, - accessType = list("READ")) + synapser::synSetPermissions(entity = id, + principalId = PUBLIC_GROUP, + accessType = list("READ")) } @@ -138,14 +137,14 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset # set registered synapse users to view, download sapply(entity_ids, function(id){ - .syn$setPermissions(entity = id, - principalId = principal_id, - accessType = list("READ","DOWNLOAD")) + synapser::synSetPermissions(entity = id, + principalId = principal_id, + accessType = list("READ","DOWNLOAD")) }) ##need to grab the current versions for dataset creation dataset_items <- lapply(entity_ids, function(id){ - vsn <- .syn$get(id, downloadFile = F)$versionNumber + vsn <- synapser::synGet(id, downloadFile = F)$versionNumber list(entityId = id, versionNumber = vsn) }) @@ -156,15 +155,15 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset if(create_dataset){ tryCatch({ # First attempt with addAnnotationColumns = TRUE - dataset <- .syn$store(synapseclient$Dataset(name = dataset_name, + dataset <- synapser::synStore(synapseclient$Dataset(name = dataset_name, parent = project_id, dataset_items = dataset_items, addAnnotationColumns = TRUE)) message(glue::glue("{emoji::emoji(\"thumbsup\")} Dataset created with annotation columns at {dataset$properties$id}")) }, error = function(e) { # If error, retry with addAnnotationColumns = FALSE - dataset <- .syn$store(synapseclient$Dataset(name = dataset_name, + dataset <- synapser::synStore(synapseclient$Dataset(name = dataset_name, parent = project_id, dataset_items = dataset_items, addAnnotationColumns = FALSE)) - .syn$setPermissions(entity = dataset$properties$id, principalId = principal_id, - accessType = list("READ", "DOWNLOAD")) + synapser::synSetPermissions(entity = dataset$properties$id, principalId = principal_id, + accessType = list("READ", "DOWNLOAD")) message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. Annotation columns will need to be added manually.")) }) } @@ -172,7 +171,7 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset message(glue::glue('{emoji::emoji("astonished")} Principal {principal_id} added to {length(entity_ids)} entities')) #TODO: set schema programmatically? might be easier to add annotations to schema in web client as needed to support principal_id... - ## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name. + ## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name. 
} From bc03f00efee74ff58b2b9f66308e9bf7a77a7472 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 09:44:09 -0700 Subject: [PATCH 02/37] Rework manifest_generate bc can no longer access inner creds obj --- R/annotation_qc.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/annotation_qc.R b/R/annotation_qc.R index 6d6589d0..e76e7d67 100644 --- a/R/annotation_qc.R +++ b/R/annotation_qc.R @@ -11,6 +11,7 @@ #' @param output_format Format of 'excel', 'google_sheet', or 'dataframe'. Defaults to 'excel'. #' @param use_annotations Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF. #' @param service Service endpoint to use. Defaults to the schematic production endpoint. +#' @param access_token Synapse auth token, defaults to `SYNAPSE_AUTH_TOKEN` set in env. #' @returns For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data. #' @export manifest_generate <- function(data_type, @@ -20,11 +21,11 @@ manifest_generate <- function(data_type, asset_view = "syn16858331", output_format = "excel", use_annotations = TRUE, - service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate") { + service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate", + access_token = Sys.getenv("SYNAPSE_AUTH_TOKEN")) { # yes, param needs to be re-encoded like this for 'dataframe' output_format_param <- if (output_format == "dataframe") "dataframe (only if getting existing manifests)" else output_format - access_token <- .syn$credentials$secret use_annotations <- tolower(as.character(use_annotations)) req <- httr::GET(service, From e5c73e70bbaf068388ecf8bd2e3b3bac789bcc34 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 09:55:52 -0700 Subject: [PATCH 03/37] Update some find_ funs bec on which list_project_datasets has dep --- R/find.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/find.R b/R/find.R index 6cd91eb8..b3590e73 100644 --- a/R/find.R +++ b/R/find.R @@ -29,10 +29,10 @@ find_in <- function(scope, path) { #' @export find_child <- function(child_name, parent) { - q <- .syn$getChildren(parent) + q <- synapser::synGetChildren(parent) child_id <- NULL repeat { - x <- reticulate::iter_next(q) + x <- synapser::nextElem(q) if(is.null(x) || x$name == child_name) { child_id <- x$id break @@ -52,8 +52,8 @@ find_child <- function(child_name, parent) { #' @export find_child_type <- function(parent, child_type = list("file")) { - x <- .syn$getChildren(parent, includeTypes = child_type) - y <- reticulate::iterate(x) + x <- synapser::synGetChildren(parent, includeTypes = child_type) + y <- synapser::as.list(x) if(!length(y)) return() z <- setNames(sapply(y, `[[`, "id"), sapply(y, `[[`, "name")) return(z) From cebe432af50a5947d3b870bfce3c6329d448f025 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 09:59:28 -0700 Subject: [PATCH 04/37] Update annotations.R --- R/annotations.R | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/R/annotations.R b/R/annotations.R index 17d14b9d..7748331f 100644 --- a/R/annotations.R +++ b/R/annotations.R @@ -1,16 +1,16 @@ #' Set annotations from a manifest -#' -#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) -#' suggest doing batch annotations through a fileview. 
However, it is often simpler to -#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. +#' +#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) +#' suggest doing batch annotations through a fileview. However, it is often simpler to +#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. #' This is like how schematic works, except without any validation (so works best for power-users who know the data model well). -#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. -#' +#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. +#' #' @param manifest A table manifest. Needs to contain `entityId`. #' @param ignore_na Whether to ignore annotations that are `NA`; default TRUE. #' @param ignore_blank Whether to ignore annotations that are that empty strings; default TRUE. #' @param verbose Be chatty, default FALSE. -#' @export +#' @export annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE, verbose = FALSE) { # Split by `entityId` annotations <- as.data.table(manifest) @@ -21,31 +21,29 @@ annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TR filterBlank <- if(ignore_blank) function(x) !any(x == "") else TRUE # same as above annotations <- lapply(annotations, function(x) Filter(function(x) filterNA(x) & filterBlank(x) & length(x), unlist(x, recursive = F))) for(entity in names(annotations)) { - .syn$setAnnotations(entity = entity, annotations = as.list(annotations[[entity]])) + synapser::synSetAnnotations(entity = entity, annotations = as.list(annotations[[entity]])) } if (verbose) message("Annotations submitted") } #' Copy annotations -#' +#' #' Copy annotations (all or selectively) from a source entity to one or more target entities. -#' If annotations already exist on target entities, the copy will replace the current values. -#' -#' @param entity_from Syn id from which to copy. -#' @param entity_to One or more syn ids to copy annotations to. -#' @param select Vector of properties to selectively copy if present on the entity. +#' If annotations already exist on target entities, the copy will replace the current values. +#' +#' @param entity_from Syn id from which to copy. +#' @param entity_to One or more syn ids to copy annotations to. +#' @param select Vector of properties to selectively copy if present on the entity. #' If not specified, will copy over everything, which may not be desirable. -#' @param update Whether to immediately update or return annotation objects only. +#' @param update Whether to immediately update or return annotation objects only. 
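+#' @examples
+#' \dontrun{
+#' # Usage sketch with placeholder ids and example properties: copy selected
+#' # annotations from one entity to two others and apply them immediately
+#' copy_annotations(entity_from = "syn111",
+#'                  entity_to = c("syn222", "syn333"),
+#'                  select = c("assay", "dataType"),
+#'                  update = TRUE)
+#' }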
#' @export
copy_annotations <- function(entity_from,
                             entity_to,
                             select = NULL,
                             update = FALSE) {
-
-  .check_login()
-
-  annotations <- .syn$get_annotations(entity_from)
+
+  annotations <- synapser::synGetAnnotations(entity_from)
   if(is.null(select)) {
     cp <- annotations
   } else {
@@ -54,10 +52,10 @@ copy_annotations <- function(entity_from,
       if(k %in% select) cp[k] <- annotations[k]
     }
   }
-
+
   if(update) {
     for(e in entity_to) {
-      .syn$setAnnotations(e, annotations = cp)
+      synapser::synSetAnnotations(e, annotations = cp)
     }
   } else {
     return(cp)

From 13cd6696dbbe84a0ee5a39f6321b2b1149a5335f Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Fri, 23 Feb 2024 09:59:55 -0700
Subject: [PATCH 05/37] Update annotation_qc.R

---
 R/annotation_qc.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/annotation_qc.R b/R/annotation_qc.R
index e76e7d67..2c15085a 100644
--- a/R/annotation_qc.R
+++ b/R/annotation_qc.R
@@ -305,15 +305,15 @@ list_project_datasets <- function(project_id,

     } else {

-      in_data <- .syn$getChildren(data_root)
-      in_data <- reticulate::iterate(in_data)
+      in_data <- synapser::synGetChildren(data_root)
+      in_data <- synapser::as.list(in_data)
       datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.Folder", in_data)
       if(!length(datasets)) warning("No datasets found under data root.")
       datasets
     }
   } else {
-    children <- .syn$getChildren(project_id)
-    datasets <- reticulate::iterate(children)
+    children <- synapser::synGetChildren(project_id)
+    datasets <- synapser::as.list(children)
     datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.table.Dataset", datasets)
     if(!length(datasets)) warning("No dataset entities found in project.")
     datasets

From 33b8bd165056d90aad755c77fe360afb864e43e9 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Fri, 23 Feb 2024 10:00:37 -0700
Subject: [PATCH 06/37] Update assign_study_data_types.R

---
 R/assign_study_data_types.R | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index 6cb92a56..8f125e90 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -26,8 +26,6 @@ assign_study_data_types <- function(study_table_id,
                                     attribute = "dataType",
                                     dry_run = TRUE) {

-  .check_login()
-
   # get studies within scope from study table
   studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()

@@ -72,7 +70,7 @@ summarize_attribute <- function(summary_query,
                                 dry_run = TRUE,
                                 check_fun = NULL) {

-  values <- .syn$tableQuery(summary_query,includeRowIdAndRowVersion = F)$asDataFrame()
+  values <- synapser::synTableQuery(summary_query, includeRowIdAndRowVersion = F) %>% synapser::as.data.frame()
   meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces
   if(is_valid_syn_id(entity_id)) {
     names(meta) <- entity_id
@@ -82,14 +80,14 @@ summarize_attribute <- function(summary_query,

   result_list <- list()
   for(entity in names(meta)) {
-    entity_meta <- .syn$get_annotations(entity)
+    entity_meta <- synapser::synGetAnnotations(entity)
     entity_meta[attribute] <- meta[[entity]]
     result_list[[entity]] <- entity_meta
     if(!dry_run) {
      if(is.function(check_fun)) {
-       if(check_fun(meta[[entity]])) .syn$set_annotations(entity_meta) else message("Skipped update for {entity}.")
+       if(check_fun(meta[[entity]])) synapser::synSetAnnotations(entity_meta) else message(glue::glue("Skipped update for {entity}."))
      } else {
-       .syn$set_annotations(entity_meta)
+
synapser::synSetAnnotations(entity_meta) message(glue::glue("Updated {entity} {attribute}.")) } } From db3601c461255d73269ef2a343240bab30f33b2e Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:01:16 -0700 Subject: [PATCH 07/37] Update access_utils.R --- R/access_utils.R | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/R/access_utils.R b/R/access_utils.R index bb75c48a..e4088e30 100644 --- a/R/access_utils.R +++ b/R/access_utils.R @@ -32,7 +32,7 @@ summarize_file_access <- function(principal_id, # 3378999 for NF-OSI tryCatch({ view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) }, error = function(e) stop("Could not query view!")) - view <- as.data.table(view$asDataFrame()) + view <- synapser::as.data.frame(view) files_by_benefactor <- view[type == "file", .N, by = .(benefactorId)] access <- view[, check_access(benefactorId, principal_id, access_type), by = .(benefactorId)] # files_by_benefactor can be smaller than access because there are folders without files @@ -129,9 +129,8 @@ make_public <- function(id) { #' @param dataset_name Optional name for dataset to be created #' @export grant_specific_file_access <- function(principal_id, entity_ids, create_dataset = F, project_id = NULL, dataset_name = NULL) { - # .check_login() - if(create_dataset & is.null(project_id)){ + if(create_dataset && is.null(project_id)){ stop("project_id must be provided if create_dataset = T") } @@ -142,12 +141,6 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset accessType = list("READ","DOWNLOAD")) }) - ##need to grab the current versions for dataset creation - dataset_items <- lapply(entity_ids, function(id){ - vsn <- synapser::synGet(id, downloadFile = F)$versionNumber - list(entityId = id, versionNumber = vsn) - }) - if(is.null(dataset_name)){ dataset_name <- glue::glue("Dataset {Sys.Date()} for {principal_id}") } @@ -155,16 +148,24 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset if(create_dataset){ tryCatch({ # First attempt with addAnnotationColumns = TRUE - dataset <- synapser::synStore(synapseclient$Dataset(name = dataset_name, - parent = project_id, dataset_items = dataset_items, addAnnotationColumns = TRUE)) + dataset <- new_dataset(name = dataset_name, + parent = project_id, + items = entity_ids, + addAnnotationColumns = TRUE, + dry_run = FALSE) message(glue::glue("{emoji::emoji(\"thumbsup\")} Dataset created with annotation columns at {dataset$properties$id}")) }, error = function(e) { # If error, retry with addAnnotationColumns = FALSE - dataset <- synapser::synStore(synapseclient$Dataset(name = dataset_name, - parent = project_id, dataset_items = dataset_items, addAnnotationColumns = FALSE)) - synapser::synSetPermissions(entity = dataset$properties$id, principalId = principal_id, + dataset <- new_dataset(name = dataset_name, + parent = project_id, + items = entity_ids, + addAnnotationColumns = FALSE, + dry_run = FALSE) + synapser::synSetPermissions(entity = dataset$properties$id, + principalId = principal_id, accessType = list("READ", "DOWNLOAD")) - message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. Annotation columns will need to be added manually.")) + message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. 
+                             Annotation columns will need to be added manually."))
     })
   }

From 92c39a7ac1a17c0f582fb9beb3acae85bbbcf8e7 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Fri, 23 Feb 2024 10:01:59 -0700
Subject: [PATCH 08/37] Update access utils-related vignette

---
 vignettes/access-utilities.Rmd    | 74 +++++++++++++++++++++++++++++++
 vignettes/survey-public-files.Rmd | 52 ----------------------
 2 files changed, 74 insertions(+), 52 deletions(-)
 create mode 100644 vignettes/access-utilities.Rmd
 delete mode 100644 vignettes/survey-public-files.Rmd

diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd
new file mode 100644
index 00000000..7506d414
--- /dev/null
+++ b/vignettes/access-utilities.Rmd
@@ -0,0 +1,74 @@
+---
+title: "Access utilities"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Access utilities}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+## Intro
+
+This explains some access-related utilities and use cases where they're helpful.
+
+## Set up
+
+The usual setup:
+```{r setup, eval=F}
+library(nfportalutils)
+synapser::synLogin()
+```
+
+## Give selected access to an individual or team
+
+Sometimes a selected set of private (embargoed) data needs to be made available to a collaborator or other researcher outside the project.
+One way to go about this is to identify the file ids, add the individual/team to each file's sharing settings (this creates local sharing settings), and then collect those files into a dataset so a single dataset link can be shared (this makes it easier especially if files are spread across multiple folders).
+
+The main convenience util to do this is `grant_specific_file_access`.
+To test it out, substitute some file ids you own below, and share with the NF-OSI Team as the collaborator (replace the example with an appropriate project id that you own).
+
+```{r, eval=F}
+
+file_ids <- c("syn123", "syn456")
+outside_collaborator <- "" # NF-OSI Team
+project_id <- "" # replace
+grant_specific_file_access(principal_id = outside_collaborator,
+                           entity_ids = file_ids,
+                           create_dataset = T,
+                           project_id = project_id, # required when create_dataset = T
+                           dataset_name = NULL) # optional
+
+```
+
+## Survey files downloadable for Synapse registered users
+
+There's often reference to "public" files, which usually means files that are viewable and downloadable to Synapse users.
+If we just have a fileview with ids of the files, how do we know which ones are "public"?
+The group of Synapse users has id `273948`, and we can use a util called `summarize_file_access`, passing in this group id, the permissions we're checking, and the fileview id.
+
+```{r query-1, eval=F}
+public_access <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331")
+public_access
+```
+
+Often this data is retrieved because someone wants a breakdown of "public" files, both as absolute numbers and as proportions.
+Getting the breakdown looks like this:
+```{r summarize-1, eval=F}
+public_access[, .(n_files = sum(N)), by = access][, .(access, n_files, proportion = n_files / sum(n_files))]
+```
+
+### Some nuances
+
+`summarize_file_access` has a step where the file benefactor (a parent container that sets the permissions on the child files) is first identified.
+Since access is looked up most efficiently through the benefactor, the function queries the API for each benefactor's current permissions and then uses that information to derive the permissions that must be on the files.
+The API **only returns access permissions as they stand at present**.
+If you use an older fileview snapshot to do the benefactor lookup, you might get inaccurate results, because files are moved around and inherit from different benefactors over time.
+

diff --git a/vignettes/survey-public-files.Rmd b/vignettes/survey-public-files.Rmd
deleted file mode 100644
index a3041013..00000000
--- a/vignettes/survey-public-files.Rmd
+++ /dev/null
@@ -1,52 +0,0 @@
----
-title: "Surveying public files in the portal"
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Surveying public files in the portal}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-## Intro
-
-This quick makes use of some functions to survey files in the portal and their access.
-
-## Set up
-
-The usual setup:
-```{r setup, eval=F}
-library(nfportalutils)
-syn_login()
-```
-
-## Files downloadable for Synapse registered users
-
-When talking about "public" files, this usually means files that are viewable and downloadable to Synapse users.
-This group has id `273948`, so we use in the query below:
-
-```{r query-1, eval=F}
-public_access <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331")
-public_access
-```
-
-Breakdown as absolute number and as proportions:
-```{r summarize-1, eval=F}
-public_access[, .(n_files = sum(N)), by = access][, .(access, n_files, proportion = n_files / sum(n_files))]
-```
-
-## Some Nuances
-
-While it would be nice to see the file access restrictions at different points in time, note that the underlying API only returns access control info at present.
-A file may have inherited from a benefactor at an earlier point, but then becomes its own benefactor later (i.e. more granular access control),
-so queries based on a past state will likely not work.
-Don't try something like:
-```{r query-bad, eval=F}
-public_access_q3_2022 <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331.47")
-```
\ No newline at end of file

From 52d32cf9334a1892f0678fbcf5947fa119c20c6b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Fri, 23 Feb 2024 10:02:51 -0700
Subject: [PATCH 09/37] Update add_publication_from_pubmed.R

---
 R/add_publication_from_pubmed.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/add_publication_from_pubmed.R b/R/add_publication_from_pubmed.R
index 5f80cc49..4f44380e 100644
--- a/R/add_publication_from_pubmed.R
+++ b/R/add_publication_from_pubmed.R
@@ -8,8 +8,6 @@
   function(pmid, study_id, disease_focus, manifestation,
            publication_table_id, study_table_id, dry_run = T) {

-    .check_login()
-
     counter <<- counter + 1L
     # cat("current record:", counter) # make verbose?
     # Query only for data needed, i.e.
PMID to check non-dup; result can be cached @@ -25,7 +23,7 @@ if(!length(record)) return() study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ") - study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame() + study <- synapser::as.data.frame(synapser::synTableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))) record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)), studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency))) @@ -37,7 +35,7 @@ new_data <- as_table_schema(record, publication_table_id) } if(!dry_run) { - new_data <- .syn$store(new_data) + new_data <- synapser::synStore(new_data) message(glue::glue('PMID:{new_data$asDataFrame()$pmid} added!')) } else { new_data From 29c22a742f6d59b7ad373c8a2a944b57f6fa3337 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:03:08 -0700 Subject: [PATCH 10/37] Update datasets.R --- R/datasets.R | 209 +++++++++++---------------------------------------- 1 file changed, 45 insertions(+), 164 deletions(-) diff --git a/R/datasets.R b/R/datasets.R index c063c92f..4a954fb6 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -1,21 +1,21 @@ # -- Editing Collections -------------------------------------------------------# -# General helpers that should work for both datasets (collection of files) +# General helpers that should work for both datasets (collection of files) # and dataset collections (collection of datasets). #' Structure as collection items -#' +#' #' Helper taking entity ids to create records used for dataset items *or* dataset collection items. #' Collection items have the form `list(entityId = id, versionNumber = x)`. #' -#' Note: For item version, dataset items allow two meanings of literal or absolute "latest" +#' Note: For item version, dataset items allow two meanings of literal or absolute "latest" #' vs. "stable_latest", but with files either one can be used to mean the same thing #' since there will be correct interpretation done under the hood. #' See implementation in `latest_version`. #' #' @param ids Ids of entities to make into dataset items. -#' @param item_version Integer for version that will be used for all items, e.g. 1. +#' @param item_version Integer for version that will be used for all items, e.g. 1. #' Otherwise, "latest" or "stable_latest". See details. #' @keywords internal as_coll_items <- function(ids, item_version = c("abs", "stable")) { @@ -32,34 +32,34 @@ as_coll_items <- function(ids, item_version = c("abs", "stable")) { #' Apply updates to current collection of items -#' +#' #' This is essentially an internal transaction helper for trying to apply a changeset to a collection, -#' used in several higher-level collection utils. +#' used in several higher-level collection utils. #' Given the changeset that can represent updates of both types "replace" or "add", -#' this applies an update join keyed on `entityId` for the replace and +#' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. -#' +#' #' @param current_items List of lists representing a collection of items. -#' @param update_items Collection of items to apply as updates to `current_items`. +#' @param update_items Collection of items to apply as updates to `current_items`. 
#' @keywords internal update_items <- function(current_coll, update_coll) { - + current_coll <- data.table::rbindlist(current_coll) update_coll <- data.table::rbindlist(update_coll) replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] added <- update_coll[!current_coll, on = .(entityId)] updated <- rbind(replaced, added) # reconversion; using pure apply as.list coerces versionNumber into char - updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) + updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) updated } #' Update item versions to "latest" in a collection -#' +#' #' Update an _existing_ collection so that all items or a subset of items reference their latest version. #' Should work for both datasets (collection of files) and dataset collections (collection of datasets). -#' +#' #' @inheritParams latest_version #' @param collection_id Collection id. #' @param items Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all. @@ -72,7 +72,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman if((length(items) == 1) && (items == "all")) { coll$items <- as_coll_items(current_items, item_version = version_semantics) } else { - + # Check subset; if no check, this becomes `add_to_collection` if(!all(items %in% current_items)) { warning("Subset given includes items not actually in collection: ", items[!items %in% current_items]) @@ -85,8 +85,8 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman updated_items <- update_items(coll$items, as_coll_items(items, item_version = version_semantics)) coll$items <- updated_items } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) - + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) + } @@ -96,24 +96,24 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman #' For datasets, the items should be files. For dataset collections, the items should be datasets. #' If an item attempting to be added happens to already be in the collection, #' this might lead to version conflicts, so the update will be rejected unless `force` is true. -#' -#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet +#' +#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet #' implement dataset collection class and methods (but dataset and relevant methods like `add_item` method are available). -#' Thus, while this is generic enough to handle both datasets and dataset collections +#' Thus, while this is generic enough to handle both datasets and dataset collections #' it is expected to be used more for dataset collections given that the dataset method is provided. -#' +#' #' @param collection_id Collection id. #' @param items Character vector of one or more dataset entity ids to add. #' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types #' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower). 
-#' @param force If some items are currently in the collection with a different version, +#' @param force If some items are currently in the collection with a different version, #' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional. #' @export add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { - - coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + + coll <- synapser::synRestGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) coll_type <- which_coll_type(coll) - + if(check_items) { item_type_check <- if(coll_type == "dataset") is_file else is_dataset correct_item_type <- sapply(items, item_type_check) @@ -126,14 +126,14 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = } } } - + current_items <- sapply(coll$items, function(x) x$entityId) if(any(items %in% current_items) && !force) { stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") } else { coll$items <- update_items(coll$items, as_coll_items(items)) } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) } @@ -141,20 +141,25 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = #' Create new dataset with given items #' +#' Offers somewhat more convenient interface than the base `dataset` constructor: +#' needs only item ids and creates structure needed + uses the LATEST version for items by default. +#' #' @inheritParams as_coll_items #' @param name Name of the dataset. It should be unique within the `parent` project. #' @param parent Synapse id of parent project where the dataset will live. #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. +#' @param addAnnotationColumns Whether to add annotation columns, default `FALSE`. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. #' @export -new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { +new_dataset <- function(name, parent, items, item_version = NULL, addAnnotationColumns = FALSE, dry_run = TRUE) { dataset_items <- as_coll_items(items, item_version) - dataset <- synapseclient$Dataset(name = name, - parent = parent, - dataset_items = dataset_items) - if(dry_run) dataset else .syn$store(dataset) + dataset <- synapser::Dataset(name = name, + parent = parent, + dataset_items = dataset_items, + addAnnotationColumns = addAnnotationColumns) + if(dry_run) dataset else synapser::synStore(dataset) } @@ -176,7 +181,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE #' @param version_semantics Use "abs" for absolute latest version or "stable". Only used for collection entities. See details. 
latest_version <- function(id, version_semantics = c("abs", "stable")) { - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) version <- entity$properties$versionNumber if(entity$properties$concreteType %in% c("org.sagebionetworks.repo.model.table.Dataset", "org.sagebionetworks.repo.model.table.DatasetCollection") && version_semantics == "stable_latest") { @@ -188,152 +193,28 @@ latest_version <- function(id, version_semantics = c("abs", "stable")) { } -#' Create datasets for Sarek-called somatic or germline variants results -#' -#' Organize variant call files from Nextflow Sarek into 3-4 datasets, -#' grouping files by variant type and workflow with titles having the format: -#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". -#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. -#' This makes sense for NF because Germline calls can be treated differently. -#' This uses latest version of all files and creates a Draft version of the dataset. -#' -#' Since we basically just need the syn entity id, variant type, and workflow to group the files. -#' Instead of getting this info through running `map_*` as in the example, -#' you may prefer using a fileview, in which case you just need to download a table from a fileview -#' that has `id` => `output_id` + the `dataType` and `workflow` annotations. -#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_ -#' files are annotated, then you have to use `map_*`. -#' -#' Finally, datasets cannot use the same name if stored in the same project, -#' so if there are multiple batches, the names will have to be made unique by adding -#' the batch number, source data id, processing date, or whatever makes sense. -#' -#' @inheritParams new_dataset -#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives. -#' @param workflow One of workflows used. -#' @param verbose Optional, whether to be verbose -- defaults to TRUE. -#' @import data.table -#' @return A list of dataset objects. -#' @export -#' @examples -#'\dontrun{ -#' syn_out <- "syn26648589" -#' m <- map_sample_output_sarek(syn_out) -#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project -#'} -nf_sarek_datasets <- function(output_map, - parent, - workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), - verbose = TRUE, - dry_run = TRUE) { - - output_map <- as.data.table(output_map) - if(!is.null(output_map$dataType)) { - data_type <- unique(output_map$dataType) - if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.") - gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T) - if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.") - gvtype <- switch(gvtype, - SomaticVariants = "Somatic", - GermlineVariants = "Germline") - - } else { - # Detect genomic variants type from first path name - gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) { - "Somatic" - } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) { - "Germline" - } else { - stop("Could not assign either Germline or Somatic labels based on main output folder. 
- Check whether folder contains mixed types or is not the right one.") - } - } - pattern <- "vcf.gz(.tbi)?$" - workflow <- match.arg(workflow) - datasets <- list() - for(i in workflow) { - dataset <- output_map[workflow == i & grepl(pattern, output_name)] - if(nrow(dataset)) { - if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files") - name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline") - dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE) - if(dry_run) datasets[[i]] <- syn_dataset else datasets[[i]] <- .syn$store(syn_dataset) - } - } - - return(datasets) - -} - - -#' Create dataset for STAR-Salmon expression quantification results -#' -#' With a level-3 manifest that is created from `annotate_expression`, -#' calls `new_dataset` to make quantification files (.sf) into dataset. -#' Uses latest version of the files and creates a "Draft" dataset. -#' See `nf_sarek_datasets`. -#' -#' @inheritParams new_dataset -#' @inheritParams nf_sarek_datasets -#' @param manifest A table of annotated data manifest from `annotate_expression`. -#' @export -nf_star_salmon_datasets <- function(manifest, - parent, - dry_run = TRUE) { - - items <- manifest$entityId - new_dataset(name = "Gene Expression Quantification from RNA-seq", - parent = parent, - items = items, - dry_run = dry_run) -} - -#' Create dataset for CNVKit results -#' -#' Create dataset from all files in CNVKit output -#' -#' @inheritParams new_dataset -#' @param syn_out Output folder called 'cnvkit' -#' @export -nf_cnv_dataset <- function(syn_out, - parent, - dry_run = TRUE) { - - files <- walk(syn_out) - files <- unlist(files) - df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE)) - names(df) <- c("Filename", "id") - df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ] - items <- df$id - new_dataset(name = "Copy Number Variant - CNVkit", - parent = parent, - items = items, - dry_run = dry_run) -} - - # -- Checks------------- -------------------------------------------------------# # TODO Potentially move these type checks somewhere else like basic_utils # TODO Better composition to reduce code, esp. 
if more will be added #' Check whether entity is dataset -#' +#' #' @keywords internal is_dataset <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset" }, error = function(e) FALSE) } #' Check whether entity is dataset collection -#' +#' #' @keywords internal is_dataset_collection <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" }, error = function(e) FALSE) @@ -341,9 +222,9 @@ is_dataset_collection <- function(id) { #' Which collection type -#' +#' #' Checks for a valid collection type or returns error -#' +#' #' @keywords internal which_coll_type <- function(coll) { coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] @@ -351,11 +232,11 @@ which_coll_type <- function(coll) { } #' Check whether entity is file -#' +#' #' @keywords internal is_file <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.FileEntity" }, error = function(e) FALSE) From be7e67d58352cc754cfa38674bcf29f6302e4060 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:03:20 -0700 Subject: [PATCH 11/37] Update tests --- tests/testthat/test-add_pubmed_publications.R | 3 - tests/testthat/test_auth_login.R | 13 +++- tests/testthat/test_dataset_utils.R | 69 +++++++++---------- 3 files changed, 44 insertions(+), 41 deletions(-) delete mode 100644 tests/testthat/test-add_pubmed_publications.R diff --git a/tests/testthat/test-add_pubmed_publications.R b/tests/testthat/test-add_pubmed_publications.R deleted file mode 100644 index 8849056e..00000000 --- a/tests/testthat/test-add_pubmed_publications.R +++ /dev/null @@ -1,3 +0,0 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) diff --git a/tests/testthat/test_auth_login.R b/tests/testthat/test_auth_login.R index 525e95ab..b3d2d86f 100644 --- a/tests/testthat/test_auth_login.R +++ b/tests/testthat/test_auth_login.R @@ -1,7 +1,16 @@ -test_that("Implicit login SYNAPSE_AUTH_TOKEN works", { +# REMOVE this once every functionality has switched to second login method below +test_that("(Legacy) Implicit login SYNAPSE_AUTH_TOKEN works", { skip_if_no_synapseclient() skip_if_no_token() withr::local_envvar(SYNAPSE_AUTH_TOKEN = Sys.getenv("TEST_SYNAPSE_AUTH_TOKEN")) # Testing for .syn in the global environment expect_is(syn_login(), "synapseclient.client.Synapse") -}) \ No newline at end of file +}) + + +test_that("(synapser) Implicit authtoken login SYNAPSE_AUTH_TOKEN works", { + skip_if_no_synapseclient() + skip_if_no_token() + withr::local_envvar(SYNAPSE_AUTH_TOKEN = Sys.getenv("TEST_SYNAPSE_AUTH_TOKEN")) + testthat::expect_equal(synapser::synLogin(), NULL) +}) diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R index 214e0f66..8715fd17 100644 --- a/tests/testthat/test_dataset_utils.R +++ b/tests/testthat/test_dataset_utils.R @@ -1,6 +1,8 @@ -# Create a basic draft dataset from some files at version 1; all files have a latest version 2 +# Create a basic draft dataset from some files at version 1; all files have a latest version 2 # Returns dataset id only create_dataset_fixture <- function(instance = 1) { + + 
skip_if_no_login() NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -12,9 +14,9 @@ create_dataset_fixture <- function(instance = 1) { test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" # Note that files are all version 2 on Synapse items <- c("syn51239179", @@ -24,16 +26,16 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali expected_items_in_dataset <- list( list(entityId = "syn51239179", versionNumber = 2L), list(entityId = "syn51239178", versionNumber = 2L), - list(entityId = "syn51239177", versionNumber = 2L)) + list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - .syn$delete(dataset) + synapser::synDelete(dataset) }) test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -44,7 +46,7 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali list(entityId = "syn51239178", versionNumber = 1L), list(entityId = "syn51239177", versionNumber = 1L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - .syn$delete(dataset) + synapser::synDelete(dataset) }) @@ -54,9 +56,9 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali # Currently, only files can be included in a dataset. syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity' # ``` test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (a table)", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -66,10 +68,9 @@ test_that("Creating dataset with `new_dataset` will fail when trying to include test_that("Updating a dataset to make a subset of files reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_id <- create_dataset_fixture() items_to_update <- c("syn51239178", "syn51239177") # both should be updated to Version 2 updated <- use_latest_in_collection(collection_id = dataset_id, items = items_to_update) @@ -78,15 +79,14 @@ test_that("Updating a dataset to make a subset of files reference the latest ver list(entityId = "syn51239178", versionNumber = 2L), list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_identical(updated$items, expected_updated_items) - .syn$delete(dataset_id) + synapser::synDelete(dataset_id) }) test_that("Updating a dataset to make _all_ files reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_id <- create_dataset_fixture() expected_updated_items <- list( list(entityId = "syn51239179", versionNumber = 2L), @@ -94,17 +94,16 @@ test_that("Updating a dataset to make _all_ files reference the latest version w list(entityId = "syn51239177", versionNumber = 2L)) updated <- use_latest_in_collection(collection_id = dataset_id, items = "all") testthat::expect_identical(updated$items, expected_updated_items) - .syn$delete(dataset_id) + synapser::synDelete(dataset_id) }) # Dataset collections 
---------------------------------------------------------# test_that("Updating a dataset collection to make a subset of datasets reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_collection_id <- "syn51809938" dataset_item_to_update <- "syn51809898" .syn$create_snapshot_version(dataset_item_to_update) @@ -117,9 +116,8 @@ test_that("Updating a dataset collection to make a subset of datasets reference test_that("Adding new dataset to dataset collection works", { - skip_if_no_synapseclient() - skip_if_no_token() - + skip_if_no_login() + dataset_collection_id <- "syn51809938" coll_state <- coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}")) one_more_item <- create_dataset_fixture() @@ -127,17 +125,16 @@ test_that("Adding new dataset to dataset collection works", { testthat::expect_equal(length(new_coll_state$items), length(coll_state$items) + 1L) # cleanup: set collection to previous items state new_coll_state$items <- coll_state$items - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) # delete dataset - .syn$delete(one_more_item) + synapser::synDelete(one_more_item) }) test_that("Adding non-datasets to dataset collection gives expected handling and warning", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_collection_id <- "syn51809938" bad_items <- "syn51106349" # a folder testthat::expect_warning(add_to_collection(collection_id = dataset_collection_id, items = bad_items, check_items = TRUE), From ee83bd3931911f0dc796f2d054b115e512be76a5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:06:07 -0700 Subject: [PATCH 12/37] Reorganize datasets, separate out subset --- R/datasets_nf.R | 122 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 R/datasets_nf.R diff --git a/R/datasets_nf.R b/R/datasets_nf.R new file mode 100644 index 00000000..b4d00445 --- /dev/null +++ b/R/datasets_nf.R @@ -0,0 +1,122 @@ +#' Create datasets for Sarek-called somatic or germline variants results +#' +#' Organize variant call files from Nextflow Sarek into 3-4 datasets, +#' grouping files by variant type and workflow with titles having the format: +#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". +#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. +#' This makes sense for NF because Germline calls can be treated differently. +#' This uses latest version of all files and creates a Draft version of the dataset. +#' +#' Since we basically just need the syn entity id, variant type, and workflow to group the files. +#' Instead of getting this info through running `map_*` as in the example, +#' you may prefer using a fileview, in which case you just need to download a table from a fileview +#' that has `id` => `output_id` + the `dataType` and `workflow` annotations. +#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_ +#' files are annotated, then you have to use `map_*`. 
+#'
+#' Finally, datasets cannot use the same name if stored in the same project,
+#' so if there are multiple batches, the names will have to be made unique by adding
+#' the batch number, source data id, processing date, or whatever makes sense.
+#'
+#' @inheritParams new_dataset
+#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives.
+#' @param workflow One of workflows used.
+#' @param verbose Optional, whether to be verbose -- defaults to TRUE.
+#' @import data.table
+#' @return A list of dataset objects.
+#' @export
+#' @examples
+#'\dontrun{
+#' syn_out <- "syn26648589"
+#' m <- map_sample_output_sarek(syn_out)
+#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project
+#'}
+nf_sarek_datasets <- function(output_map,
+                              parent,
+                              workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"),
+                              verbose = TRUE,
+                              dry_run = TRUE) {
+
+  output_map <- as.data.table(output_map)
+  if(!is.null(output_map$dataType)) {
+    data_type <- unique(output_map$dataType)
+    if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.")
+    gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T)
+    if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.")
+    gvtype <- switch(gvtype,
+                     SomaticVariants = "Somatic",
+                     GermlineVariants = "Germline")
+
+  } else {
+    # Detect genomic variants type from first path name
+    gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) {
+      "Somatic"
+    } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) {
+      "Germline"
+    } else {
+      stop("Could not assign either Germline or Somatic labels based on main output folder.
+           Check whether folder contains mixed types or is not the right one.")
+    }
+  }
+  pattern <- "vcf.gz(.tbi)?$"
+  workflow <- match.arg(workflow)
+  datasets <- list()
+  for(i in workflow) {
+    dataset <- output_map[workflow == i & grepl(pattern, output_name)]
+    if(nrow(dataset)) {
+      if(verbose) message(glue::glue("Creating {i} dataset with {nrow(dataset)} files"))
+      name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline")
+      dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE)
+      if(dry_run) datasets[[i]] <- dataset else datasets[[i]] <- synapser::synStore(dataset)
+    }
+  }
+
+  return(datasets)
+
+}
+
+
+#' Create dataset for STAR-Salmon expression quantification results
+#'
+#' With a level-3 manifest that is created from `annotate_expression`,
+#' calls `new_dataset` to make quantification files (.sf) into dataset.
+#' Uses latest version of the files and creates a "Draft" dataset.
+#' See `nf_sarek_datasets`.
+#'
+#' @inheritParams new_dataset
+#' @inheritParams nf_sarek_datasets
+#' @param manifest A table of annotated data manifest from `annotate_expression`.
+#' @export +nf_star_salmon_datasets <- function(manifest, + parent, + dry_run = TRUE) { + + items <- manifest$entityId + new_dataset(name = "Gene Expression Quantification from RNA-seq", + parent = parent, + items = items, + dry_run = dry_run) +} + +#' Create dataset for CNVKit results +#' +#' Create dataset from all files in CNVKit output +#' +#' @inheritParams new_dataset +#' @param syn_out Output folder called 'cnvkit' +#' @export +nf_cnv_dataset <- function(syn_out, + parent, + dry_run = TRUE) { + + files <- walk(syn_out) + files <- unlist(files) + df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE)) + names(df) <- c("Filename", "id") + df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ] + items <- df$id + new_dataset(name = "Copy Number Variant - CNVkit", + parent = parent, + items = items, + dry_run = dry_run) +} \ No newline at end of file From fd638f0c2d1fdf2191d73a806c808ee264f00169 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:20:56 -0700 Subject: [PATCH 13/37] Update docs --- man/manifest_generate.Rd | 5 ++++- man/new_dataset.Rd | 14 ++++++++++++-- man/nf_cnv_dataset.Rd | 2 +- man/nf_sarek_datasets.Rd | 2 +- man/nf_star_salmon_datasets.Rd | 2 +- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/man/manifest_generate.Rd b/man/manifest_generate.Rd index b4791b87..4d3f075f 100644 --- a/man/manifest_generate.Rd +++ b/man/manifest_generate.Rd @@ -13,7 +13,8 @@ manifest_generate( asset_view = "syn16858331", output_format = "excel", use_annotations = TRUE, - service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate" + service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate", + access_token = Sys.getenv("SYNAPSE_AUTH_TOKEN") ) } \arguments{ @@ -32,6 +33,8 @@ manifest_generate( \item{use_annotations}{Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF.} \item{service}{Service endpoint to use. Defaults to the schematic production endpoint.} + +\item{access_token}{Synapse auth token, defaults to \code{SYNAPSE_AUTH_TOKEN} set in env.} } \value{ For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data. diff --git a/man/new_dataset.Rd b/man/new_dataset.Rd index 7023a7d8..9312345d 100644 --- a/man/new_dataset.Rd +++ b/man/new_dataset.Rd @@ -4,7 +4,14 @@ \alias{new_dataset} \title{Create new dataset with given items} \usage{ -new_dataset(name, parent, items, item_version = NULL, dry_run = TRUE) +new_dataset( + name, + parent, + items, + item_version = NULL, + addAnnotationColumns = FALSE, + dry_run = TRUE +) } \arguments{ \item{name}{Name of the dataset. It should be unique within the \code{parent} project.} @@ -17,8 +24,11 @@ Usually the same parent project storing the files, but in some cases it may be a \item{item_version}{Integer for version that will be used for all items, e.g. 1. Otherwise, "latest" or "stable_latest". See details.} +\item{addAnnotationColumns}{Whether to add annotation columns, default \code{FALSE}.} + \item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} } \description{ -Create new dataset with given items +Offers somewhat more convenient interface than the base \code{dataset} constructor: +needs only item ids and creates structure needed + uses the LATEST version for items by default. 
} diff --git a/man/nf_cnv_dataset.Rd b/man/nf_cnv_dataset.Rd index 880df7d0..327ae332 100644 --- a/man/nf_cnv_dataset.Rd +++ b/man/nf_cnv_dataset.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_cnv_dataset} \alias{nf_cnv_dataset} \title{Create dataset for CNVKit results} diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd index 009d3030..c57bf053 100644 --- a/man/nf_sarek_datasets.Rd +++ b/man/nf_sarek_datasets.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_sarek_datasets} \alias{nf_sarek_datasets} \title{Create datasets for Sarek-called somatic or germline variants results} diff --git a/man/nf_star_salmon_datasets.Rd b/man/nf_star_salmon_datasets.Rd index ca4a91c0..a1b38bfd 100644 --- a/man/nf_star_salmon_datasets.Rd +++ b/man/nf_star_salmon_datasets.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_star_salmon_datasets} \alias{nf_star_salmon_datasets} \title{Create dataset for STAR-Salmon expression quantification results} From 5e045500d38f2e81aa9badee3924ae1af0c2d4a5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:36:12 -0700 Subject: [PATCH 14/37] Resolve additional in annotation_qc --- R/annotation_qc.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/annotation_qc.R b/R/annotation_qc.R index 2c15085a..98ac5703 100644 --- a/R/annotation_qc.R +++ b/R/annotation_qc.R @@ -158,13 +158,13 @@ manifest_passed <- function(result) { #' @export infer_data_type <- function(dataset_id) { - children <- .syn$getChildren(dataset_id) - children <- reticulate::iterate(children) + children <- synapser::synGetChildren(dataset_id) + children <- synapser::as.list(children) if(!length(children)) return(list(result = NA, notes = "Empty dataset folder")) children <- first(children, 3) data_type <- c() for (entity in children) { - e <- .syn$getAnnotations(entity) + e <- synapser::synGetAnnotations(entity) data_type <- append(data_type, e$Component) } data_type <- unique(data_type) @@ -203,9 +203,9 @@ meta_qc_dataset <- function(dataset_id, schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld", cleanup = TRUE) { - dataset_name <- .syn$get(dataset_id)$properties$name + dataset_name <- synapser::synGet(dataset_id)$properties$name - files <- reticulate::iterate(.syn$getChildren(dataset_id)) + files <- synapser::as.list(synapser::synGetChildren(dataset_id)) if(!length(files)) { return(list(result = NA, notes = "Empty dataset with no files", From 430c4968269a47a6538d8f52c38e7431354414eb Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:45:50 -0700 Subject: [PATCH 15/37] Update login for relevant vignettes --- vignettes/access-utilities.Rmd | 2 +- vignettes/revalidation-workflows.Rmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd index 7506d414..c785cad9 100644 --- a/vignettes/access-utilities.Rmd +++ b/vignettes/access-utilities.Rmd @@ -23,7 +23,7 @@ This explains some access-related utilities and use cases where they're helpful. 
The usual setup: ```{r setup, eval=F} library(nfportalutils) -synapser::synLogin() +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) ``` ## Give selected access to an individual or team diff --git a/vignettes/revalidation-workflows.Rmd b/vignettes/revalidation-workflows.Rmd index aa3096c5..750a9a3f 100644 --- a/vignettes/revalidation-workflows.Rmd +++ b/vignettes/revalidation-workflows.Rmd @@ -17,7 +17,7 @@ First set up as usual. ```r library(nfportalutils) -syn_login(Sys.getenv("SYNAPSE_AUTH_TOKEN")) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) ``` ## Basics with Schematic API service From 5928e7dea8558e4a66c9a3de2b46bcdf49dea0aa Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 10:55:57 -0700 Subject: [PATCH 16/37] Fix vignette --- vignettes/access-utilities.Rmd | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd index c785cad9..3a0e4c86 100644 --- a/vignettes/access-utilities.Rmd +++ b/vignettes/access-utilities.Rmd @@ -7,7 +7,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} +```{r, include = FALSE } knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -20,10 +20,9 @@ This explains some access-related utilities and use cases where they're helpful. ## Set up -The usual setup: ```{r setup, eval=F} -library(nfportalutils) -synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) +# library(nfportalutils) +# synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) ``` ## Give selected access to an individual or team From a51c11dfa27b6297e9a422882876ff32067e772e Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 12:51:34 -0700 Subject: [PATCH 17/37] Pkgdown needs pip install for new vignettes --- .github/workflows/pkgdown.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index cacff559..b69f6b2c 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -27,7 +27,12 @@ jobs: - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - + + - name: Install synapseclient + run: | + pip install synapseclient + synapse --version + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::pkgdown, local::. From 112b44852498e803b1d57219e541ca7ab296fc89 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 15:55:58 -0700 Subject: [PATCH 18/37] Add small helper for tutorial --- R/tutorial.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 R/tutorial.R diff --git a/R/tutorial.R b/R/tutorial.R new file mode 100644 index 00000000..d234bbbb --- /dev/null +++ b/R/tutorial.R @@ -0,0 +1,16 @@ +# Functions especially useful for tutorials or testing + +#' Create n temp files +#' +#' Create some text files for upload +#' +#' @param n Integer number of files. +#' @return Paths to files in the temp directory. 
+mock_files <- function(n) { + paths <- vector(mode = "character", length = n) + for(i in 1:n) { + paths[i] <- filePath <- tempfile("data_", fileext = c(".txt")) + writeLines(text = sample(letters, 10), con = filePath) + } + paths +} \ No newline at end of file From 4480f6564d012f6c988549a3e206e32d5927424f Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 15:56:08 -0700 Subject: [PATCH 19/37] Fix --- R/access_utils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/access_utils.R b/R/access_utils.R index e4088e30..06fbf122 100644 --- a/R/access_utils.R +++ b/R/access_utils.R @@ -30,9 +30,10 @@ summarize_file_access <- function(principal_id, # 3378999 for NF-OSI ) { tryCatch({ - view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) + view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) %>% + synapser::as.data.frame() %>% + as.data.table() }, error = function(e) stop("Could not query view!")) - view <- synapser::as.data.frame(view) files_by_benefactor <- view[type == "file", .N, by = .(benefactorId)] access <- view[, check_access(benefactorId, principal_id, access_type), by = .(benefactorId)] # files_by_benefactor can be smaller than access because there are folders without files From fa38fcbf61019c611f23421db361746c4f838ab8 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 15:56:34 -0700 Subject: [PATCH 20/37] Clean up vignette --- vignettes/access-utilities.Rmd | 52 ++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd index 3a0e4c86..2e2f8ea3 100644 --- a/vignettes/access-utilities.Rmd +++ b/vignettes/access-utilities.Rmd @@ -16,13 +16,13 @@ knitr::opts_chunk$set( ## Intro -This explains some access-related utilities and use cases where they're helpful. +This explains several access-related utilities and use cases where they're helpful. ## Set up ```{r setup, eval=F} -# library(nfportalutils) -# synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) +library(nfportalutils) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) ``` ## Give selected access to an individual or team @@ -31,21 +31,46 @@ Sometimes a selected set of private (embargoed) data needs to made available to One way to go about this is to identify the file ids, add the individual/team to the (this creates local sharing settings), and then collect those files into a dataset so a single dataset link can be shared (this makes it easier especially if files are spread across multiple folders). The main convenience util to do this is `grant_specific_file_access`. -To test it out, we create some mock files below, and share with NF-OSI Team as the collaborator (replace example with appropriate project id that you own). +To test it out, we create a data folder, and then create and upload mock files. 
 
```{r, eval=F}
-file_ids <- c("syn123", "syn456")
-outside_collaborator <- "" # NF-OSI Team
-project_id <- "" # replace
+project_id <- "syn26462036" # this is the NF-dev-playground project, replace with your own dev project if needed
+folder <- synapser::Folder(name = paste("Demo Dataset -", Sys.Date()), parent = project_id)
+folder_id <- folder$properties$id
+
+# create some temp files with mock data
+file_paths <- mock_files(3)
+syn_files <- list()
+for(path in file_paths) {
+  file <- synapser::File(path = path, parent = folder_id)
+  sf <- synapser::synStore(file)
+  syn_files <- c(syn_files, sf)
+}
+
+file_ids <- sapply(syn_files, function(x) x$properties$id)
+```
+
+
+Now let's use the function to share with the NF-OSI Team as the collaborator (replace with another appropriate principal id if needed).
+
+```{r, eval=F}
+
+outside_collaborator <- "3342573" # use another appropriate example id if needed
 grant_specific_file_access(principal_id = outside_collaborator,
                            entity_ids = file_ids,
                            create_dataset = T,
-                           project_id = NULL,
+                           project_id = project_id,
                            dataset_name = NULL) # optional
 ```
 
+Clean up by removing the mock folder and files. You can delete the dataset through the UI.
+```{r}
+synapser::synDelete(folder_id)
+```
+
 ## Survey files downloadable for Synapse registered users
 
 There's often reference to "public" files, which usually means files that are viewable + downloadable to Synapse users.
@@ -57,17 +82,14 @@ public_access <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331")
 public_access
 ```
 
-Sometimes this data is only retrieved because someone wants a breakdown, both as absolute numbers and as proportions for "public" files.
-Getting the breakdown looks like this:
+The results show how `summarize_file_access` works: it first identifies all the unique benefactors (the parent containers that set the permissions on the child files) and their current permissions, then compiles a summary count of files under each benefactor.
+
+For an even more summarized breakdown as proportions:
 ```{r summarize-1, eval=F}
 public_access[, .(n_files = sum(N)), by = access][, .(access, n_files, proportion = n_files / sum(n_files))]
 ```
 
 ### Some nuances
-So `summarize_file_access` has a step where the file benefactor (a parent container that sets the permissions on the child files) is first identified.
-Since access is looked up most efficiently through the benefactor, the API is queried to get the current permissions for the benefactor and then uses that information to derive the permissions that must be on the files.
-The API **only returns access permission info at present**.
-If you use an older fileview to do the benefactor lookup, you might get inaccurate results, because files are moved around/inherit from different benefactors in time.
-
+Always use the most current fileview because of the benefactor lookup; inaccurate results might be returned otherwise.
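+
+To spot-check a single entity rather than summarize a whole view, the lower-level `check_access` helper can be called directly. A minimal sketch (note that the principal id must be numeric):
+
+```{r check-one, eval=F}
+# e.g. confirm that registered Synapse users (group 273948) can download a given entity
+check_access(id = "syn16858331", principal_id = 273948, access_type = "DOWNLOAD")
+```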
From 56e27736463b022c8263d0bc3f8f83f140b16ae6 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 17:00:49 -0700 Subject: [PATCH 21/37] Update other add_publication --- R/add_publication_from_unpaywall.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/add_publication_from_unpaywall.R b/R/add_publication_from_unpaywall.R index b4269b2d..e82279e9 100644 --- a/R/add_publication_from_unpaywall.R +++ b/R/add_publication_from_unpaywall.R @@ -40,11 +40,9 @@ add_publication_from_unpaywall <- function(publication_table_id, #TODO: Check schema up-front and convert metadata to json in correct format - .check_login() + schema <- synapser::synGet(entity = publication_table_id) - schema <- .syn$get(entity = publication_table_id) - - pub_table <- .syn$tableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>% + pub_table <- synapser::synTableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>% readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns if(doi %in% pub_table$doi){ From c3fd5be74be7bc851a0959b539653ef36f7c153e Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 18:13:28 -0700 Subject: [PATCH 22/37] Provide other copy fun bc no .syn reference for synapseutils --- R/basic_utils.R | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/R/basic_utils.R b/R/basic_utils.R index b80d51c7..ce9ea4dd 100644 --- a/R/basic_utils.R +++ b/R/basic_utils.R @@ -1,27 +1,19 @@ -#' Create copy of entity +#' Copy a table #' -#' Create a copy of syn entity; mostly used to create a copy on which to test out changes. -#' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy -#' @param entity Entity to copy. -#' @param destination_id Id of destination project/container that entity will be copied to. -#' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE. -#' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE. -#' @keywords internal -copy <- function(entity, - destination_id, - skip_copy_wiki_page = FALSE, - skip_copy_annotations = FALSE) { - - .check_login() - # load synapseutils as needed - - - synapseutils$copy(.syn, - entity = entity, - destinationId = destination_id, - skipCopyWikiPage = skip_copy_wiki_page, - skipCopyAnnotations = skip_copy_annotations) - +#' Copy a table. 
+#' @export +copy_table <- function(table_id, + destination_id) { + + message(glue::glue("Getting table {table_id}")) + schema <- synapser::synGet(table_id) + data <- synapser::synTableQuery(glue::glue("select * from {table_id}"), includeRowIdAndRowVersion = FALSE) + columns <- schema$columnIds + schema_copy <- synapser::Schema(name = schema$name, parent = destination_id, columns = columns) + table_copy <- synapser::Table(schema = schema_copy, values = data$filepath) + table_copy <- synapser::synStore(table_copy) + message(glue::glue("Copied table {table_id} to {table_copy$tableId}")) + table_copy$tableId } From 1b2f870f7cdd61ad96f592f25ee454a1bcf714a5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 18:24:03 -0700 Subject: [PATCH 23/37] Regen docs, NAMESPACE --- NAMESPACE | 1 + R/basic_utils.R | 5 ++++- man/copy.Rd | 27 --------------------------- man/copy_table.Rd | 16 ++++++++++++++++ man/mock_files.Rd | 17 +++++++++++++++++ 5 files changed, 38 insertions(+), 28 deletions(-) delete mode 100644 man/copy.Rd create mode 100644 man/copy_table.Rd create mode 100644 man/mock_files.Rd diff --git a/NAMESPACE b/NAMESPACE index 85ab7af7..a7442d2a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ export(check_readpair_validity) export(check_wiki_links) export(convert_to_stringlist) export(copy_annotations) +export(copy_table) export(data_curator_app_subpage) export(delete_provenance) export(dsp_dataset_mapping) diff --git a/R/basic_utils.R b/R/basic_utils.R index ce9ea4dd..26b56d77 100644 --- a/R/basic_utils.R +++ b/R/basic_utils.R @@ -1,6 +1,9 @@ #' Copy a table #' -#' Copy a table. +#' Copy a table. One of the most common use cases is testing, to avoid modifying a "production" table. +#' +#' @param table_id Id of table to copy. +#' @param destination_id Parent project id for the copy. #' @export copy_table <- function(table_id, destination_id) { diff --git a/man/copy.Rd b/man/copy.Rd deleted file mode 100644 index 99c80a08..00000000 --- a/man/copy.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/basic_utils.R -\name{copy} -\alias{copy} -\title{Create copy of entity} -\usage{ -copy( - entity, - destination_id, - skip_copy_wiki_page = FALSE, - skip_copy_annotations = FALSE -) -} -\arguments{ -\item{entity}{Entity to copy.} - -\item{destination_id}{Id of destination project/container that entity will be copied to.} - -\item{skip_copy_wiki_page}{Whether to skip copying wiki; defaults FALSE.} - -\item{skip_copy_annotations}{Whether to skip copying annotations; defaults FALSE.} -} -\description{ -Create a copy of syn entity; mostly used to create a copy on which to test out changes. -See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy -} -\keyword{internal} diff --git a/man/copy_table.Rd b/man/copy_table.Rd new file mode 100644 index 00000000..a916cc22 --- /dev/null +++ b/man/copy_table.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/basic_utils.R +\name{copy_table} +\alias{copy_table} +\title{Copy a table} +\usage{ +copy_table(table_id, destination_id) +} +\arguments{ +\item{table_id}{Id of table to copy.} + +\item{destination_id}{Parent project id for the copy.} +} +\description{ +Copy a table. One of the most common use cases is testing, to avoid modifying a "production" table. 
+} diff --git a/man/mock_files.Rd b/man/mock_files.Rd new file mode 100644 index 00000000..2b68a4f8 --- /dev/null +++ b/man/mock_files.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tutorial.R +\name{mock_files} +\alias{mock_files} +\title{Create n temp files} +\usage{ +mock_files(n) +} +\arguments{ +\item{n}{Integer number of files.} +} +\value{ +Paths to files in the temp directory. +} +\description{ +Create some text files for upload +} From 014b91c997084aa89cfe8158b188c4a446e07282 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 18:24:11 -0700 Subject: [PATCH 24/37] Update pkgdown index --- _pkgdown.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 8fe504be..d9a66af3 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -151,11 +151,12 @@ reference: - as_table_schema - make_folder - add_to_scope + - copy_table - new_view - list_project_datasets - latest_version + - mock_files - walk - - copy - convert_to_stringlist - bare_syn_id - bad_url From 77fe02cec9847626674a7fcd4371f4e78489dd42 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 23 Feb 2024 18:24:28 -0700 Subject: [PATCH 25/37] More vignettes --- vignettes/access-utilities.Rmd | 2 +- vignettes/portal-tables-utils.Rmd | 129 ++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 vignettes/portal-tables-utils.Rmd diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd index 2e2f8ea3..c21ea224 100644 --- a/vignettes/access-utilities.Rmd +++ b/vignettes/access-utilities.Rmd @@ -67,7 +67,7 @@ grant_specific_file_access(principal_id = outside_collaborator, ``` Clean up by removing the mock folder and files. You can delete the dataset through the UI. -```{r} +```{r, eval=F} synapser::synDelete(folder_id) ``` diff --git a/vignettes/portal-tables-utils.Rmd b/vignettes/portal-tables-utils.Rmd new file mode 100644 index 00000000..5d8262cc --- /dev/null +++ b/vignettes/portal-tables-utils.Rmd @@ -0,0 +1,129 @@ +--- +title: "Portal tables utils" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Portal tables} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, eval=F} +library(nfportalutils) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) +``` + +## NF Portal Tables Overview + + + + +These tables behind the NF Data Portal have grown over time and need maintenance like any other infrastructure. +This walks through the original use cases for these utils to accomplish tasks such as: + +- Add new publications to Portal - Publications +- Cleaning or migrating data, e.g. correcting an annotation "rnaSeq" to "RNA-seq" within Portal - Files +- Add new people to Portal - People +- Register a new study to Portal - Studies ** +- For each study in Portal - Studies, fill in related studies ** + +Some tasks marked ** are no longer done manually much of the time (i.e. they may be automated as part of larger workflows), so they'll be covered more briefly. + +### How this will work + +We'll make copies of or use a sufficiently good representation of the relevant tables for update operations. +This also provides an example workflow to follow for contributors who need to update one of the utils and test changes. + +If needed, update the project id, which will be the parent project to host table copies. 
+```{r, eval=F} +project_id <- "syn26462036" # this the NF-dev-playground project +``` + +### Portal - Publications updates + +There are actually two options for adding publications: `add_publication_from_pubmed` and `add_publication_from_unpaywall`. +The `pubmed` option is the default, and `unpaywall` is an additional option if `pubmed` is unreachable or fails. +In past experience, `add_publication_from_pubmed` works well enough in 98% of cases. + +First, create copy to work with. +```{r, eval=F} +PUBS_COPY <- copy_table("syn16857542", destination_id = project_id) +``` + +#### Adding 1-2 publications at a time + +The minimum information needed is `pmid` (the new pub to add) and `study_id` (the linked study). +This can use `add_publication_from_pubmed`. +The only arg that might need further explaination is `study_table_id` -- this is used to . + +```{r, eval=F} + +add_publication_from_pubmed(pmid = , + study_id = , + disease_focus = , + manifestation = , + publication_table_id = PUBS_COPY, + study_table_id = , + dry_run = F) +``` + +#### Adding publications for larger batch or from a spreadsheet + +Larger batches that are compiled in a spreadsheet should instead use `add_publications_from_file`. +The spreadsheet needs to have [`studyId`, `pmid`, `manifestation`, `diseaseFocus`] columns filled out. + +Let's create and show an example where we add a random publication to be associated with the NF-dev-playground project. +```{r, eval=F} + +pubs_example_file <- "" +``` + +Write this example to a temp `.csv` file. +```{r, eval=F} + + +``` + +```{r, eval=F} + +add_publications_from_file( + file = "new_pubs.csv", + publication_table_id = PUBS_COPY, + study_table_id = , + list_sep = "|", + dry_run = FALSE +) + +``` + + +Clean up the table copy. +```{r, eval=F} +synapser::synDelete(PUBS_COPY) + +``` + +### Portal - Files corrections + +TO DO. + +### Portal - People updates + +TO DO. + +### Portal - Studies updates + +#### Register new study + +TO DO. + +#### Augment with 'related studies' + +TO DO. + From 3eb0747ec4d1d580df52559763a5707aee505229 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 27 Feb 2024 08:21:45 -0700 Subject: [PATCH 26/37] Refactor for add_pub --- R/add_publication_from_pubmed.R | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/R/add_publication_from_pubmed.R b/R/add_publication_from_pubmed.R index 4f44380e..8e53fbb5 100644 --- a/R/add_publication_from_pubmed.R +++ b/R/add_publication_from_pubmed.R @@ -5,14 +5,18 @@ .add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch? pmids <- new_data <- NULL counter <- 0L - function(pmid, study_id, disease_focus, manifestation, + function(pmid, study_id, + disease_focus = c(""), manifestation = c(""), publication_table_id, study_table_id, dry_run = T) { counter <<- counter + 1L # cat("current record:", counter) # make verbose? # Query only for data needed, i.e. 
PMID to check non-dup; result can be cached if(is.null(pmids)) { - pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F) + pmids <- synapser::synTableQuery(glue::glue("select pmid from {publication_table_id}")) %>% + synapser::as.data.frame() %>% + unlist(use.names = F) + pmids <- gsub("PMID:", "", pmids) if(cache) pmids <<- pmids } @@ -23,9 +27,14 @@ if(!length(record)) return() study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ") - study <- synapser::as.data.frame(synapser::synTableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))) - record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)), - studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency))) + study <- synapser::synTableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"), includeRowIdAndRowVersion = F)%>% + synapser::as.data.frame() + record <- cbind(record, + diseaseFocus = I(list(disease_focus)), + manifestation = I(list(manifestation)), + studyId = I(list(study$studyId)), + studyName = I(list(study$studyName)), + fundingAgency = I(unique(sapply(study$fundingAgency, jsonlite::fromJSON)))) # If batch mode, rbind and defer table schemafication until all records processed if(batch) { @@ -36,7 +45,7 @@ } if(!dry_run) { new_data <- synapser::synStore(new_data) - message(glue::glue('PMID:{new_data$asDataFrame()$pmid} added!')) + message(glue::glue('Added new pmid(s)!')) } else { new_data } @@ -52,8 +61,8 @@ #' #' @param pmid PubMed ID (*not* PMCID) of the publication to be added. #' @param study_id Synapse id(s) of the study that are associated with the publication. -#' @param disease_focus The disease focus(s) that are associated with the publication. -#' @param manifestation The manifestation(s) that are associated with the publication. +#' @param disease_focus (Optional) The disease focus(s) that are associated with the publication. +#' @param manifestation (Optional) The manifestation(s) that are associated with the publication. #' @param publication_table_id Synapse id of the portal publication table. Must have write access. #' @param study_table_id Synapse id of the portal study table. Need read access. #' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata. From 281e34a50ff3f26860d5e8beaa20a70a3428609a Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 27 Feb 2024 08:22:15 -0700 Subject: [PATCH 27/37] Update internal copy util --- R/schema_utils.R | 76 +++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/R/schema_utils.R b/R/schema_utils.R index d28be750..30ad913e 100644 --- a/R/schema_utils.R +++ b/R/schema_utils.R @@ -2,45 +2,43 @@ # -- Synapse schema ------------------------------------------------------------# -#' Transform table data to target schema for Synapse storage -#' -#' **Currently implements list-schema features first and will do more later.** -#' Check and encode data values to expectations of Synapse target table schema for storage. -#' The target schema is more likely from an existing table, since new tables can take advantage of `build_table`. 
+#' Transform table data to target schema for Synapse storage +#' +#' **Currently implements list-schema features first and will do more later.** +#' Check and encode data values to expectations of Synapse target table schema for storage. +#' The target schema is more likely from an existing table, since new tables can take advantage of `build_table`. #' To get compatible list data, does JSON encoding and optionally `list_truncate` when running into length limits. -#' If truncation is not OK, then the incompatibility will have to be resolved by updating schema outside of this. +#' If truncation is not OK, then the incompatibility will have to be resolved by updating schema outside of this. #' Note that the setting applies to ALL list columns, though it would be desirable to be column-specific. -#' +#' #' @param df A table, i.e. `data.frame`. -#' @param schema Table [schema object](https://python-docs.synapse.org/build/html/Entity.html#synapseclient.table.Schema) or +#' @param schema Table [schema object](https://python-docs.synapse.org/build/html/Entity.html#synapseclient.table.Schema) or #' Synapse id of target table from which to get schema. #' @param list_truncate If length exceeds schema max for list columns, set `TRUE` to allow data truncation, `FALSE` to error only (default). #' @return Synapse Table object ready for storing. #' @export -as_table_schema <- function(df, - schema, +as_table_schema <- function(df, + schema, list_truncate = FALSE) { - - .check_login() + if("data.table" %in% class(df)) df <- as.data.frame(df) - if(!"synapseclient.table.Schema" %in% class(schema) && is_valid_syn_id(schema)) schema <- .syn$get(schema) - col_schema <- .syn$getTableColumns(schema) %>% reticulate::iterate() - + if(!"synapseclient.table.Schema" %in% class(schema) && is_valid_syn_id(schema)) schema <- synapser::synGet(schema) + col_schema <- synapser::synGetTableColumns(schema) %>% synapser::as.list() + # Basic checks of columns col_schema_names <- sapply(col_schema, `[[`, "name") if(length(col_schema_names) != length(df)) stop("Number of columns differs from schema.") tryCatch({ df <- df[col_schema_names] # enforce same order as schema while checking names }, error = function(e) stop("Column names don't match ones in schema.")) - + # https://docs.synapse.org/rest/org/sagebionetworks/repo/model/table/ColumnType.html col_type <- sapply(col_schema, `[[`, "columnType") for(i in seq_along(col_type)) { values <- df[[i]] if(grepl("STRING", col_type[i])) { maxsize <- col_schema[[i]]$maximumSize - if(anyNA(values)) stop("Please remove NA values from STRING column ", names(df)[i]) - size_fail <- sapply(values, function(x) any(nchar(x) > maxsize)) + size_fail <- sapply(values, function(x) any(sapply(x, function(s) if(is.na(s)) FALSE else nchar(s) > maxsize))) if(any(size_fail)) stop(paste("Characters in", names(df)[i], "exceeds max size of", maxsize)) } if(grepl("*_LIST", col_type[i])) { @@ -55,30 +53,30 @@ as_table_schema <- function(df, } } df[[i]] <- sapply(values, function(x) as.character(jsonlite::toJSON(unlist(x)))) # unlist in case x is derived from list - } + } } - table_data <- synapseclient$Table(schema, df) + table_data <- synapser::Table(schema, df) table_data } # -- Schematic (JSON-LD) schema ------------------------------------------------# #' Look up connected nodes by specified property in JSON-LD schema -#' +#' #' Use with schematic-generated JSON-LD schema: given `@id`, get connected nodes by specified prop (e.g. `sms:something`). 
-#' Intended to be a generic used to define more specific lookup utils. -#' Can do recursive lookup, though graph should be a tree/acyclic (!). +#' Intended to be a generic used to define more specific lookup utils. +#' Can do recursive lookup, though graph should be a tree/acyclic (!). #' (Useful for props such as `dependsOn`, doesn't make sense for props such as `rdfs:label`.) -#' -#' @param id Id (`@id`) for which to get range values; include prefix if needed. +#' +#' @param id Id (`@id`) for which to get range values; include prefix if needed. #' @param prop Property; include prefix if needed. #' @param schema Path (URL or local) to file from which schema will be read, or schema as list object. -#' @param return_labels Return labels (default), otherwise ids of connected nodes. +#' @param return_labels Return labels (default), otherwise ids of connected nodes. #' @param recursive Recursive lookup? #' @param result Vector of accumulated results; used for recursive lookup. #' @param rest Vector of remaining ids; used for recursive lookup. #' @export -get_by_prop_from_json_schema <- function(id, +get_by_prop_from_json_schema <- function(id, prop, schema = 'https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld', return_labels = TRUE, @@ -89,29 +87,29 @@ get_by_prop_from_json_schema <- function(id, schema <- jsonlite::read_json(schema) schema <- schema$`@graph` } - + matches <- Filter(function(x) x$`@id` == id, schema) if(!length(matches)) stop(glue::glue("Id `{id}` not found in schema!")) ids <- unlist(lapply(matches[[1]][[prop]], function(x) x$`@id`)) - + if(return_labels) { labels <- Filter(function(x) x$`@id` %in% ids, schema) %>% sapply(function(x) x$`sms:displayName`) %>% unlist() - result <- c(result, labels) + result <- c(result, labels) } else { result <- c(result, ids) } - + rest <- c(rest, ids) if(recursive && length(rest)) { id <- rest[1] rest <- rest[-1] - get_by_prop_from_json_schema(id, + get_by_prop_from_json_schema(id, prop, - schema, - return_labels, + schema, + return_labels, recursive, - result, + result, rest) } else { # result @@ -121,9 +119,9 @@ get_by_prop_from_json_schema <- function(id, #' Get dependencies for node in JSON-LD schema -#' +#' #' Shorthand for getting props defined in annotation template using `get_by_prop_from_json_schema` under the hood. 
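+#' @examples
+#' \dontrun{
+#' # A sketch using a hypothetical node id -- substitute a real `@id`
+#' # (with prefix) from the NF-metadata-dictionary schema
+#' get_dependency_from_json_schema(id = "bts:SomeTemplate")
+#' }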
-#'
+#'
 #' @inheritParams get_by_prop_from_json_schema
 #' @export
 get_dependency_from_json_schema <- function(id,
@@ -133,7 +131,7 @@ get_dependency_from_json_schema <- function(id,
                                             recursive = TRUE,
                                             result = NULL,
                                             rest = NULL) {
-
+
   get_by_prop_from_json_schema(id, prop, schema, return_labels, recursive, result, rest)
-
+
 }

From 7ebcb1809510f8d42f98fd1f9264189866ff25a0 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Tue, 27 Feb 2024 08:23:58 -0700
Subject: [PATCH 28/37] Update vignette

---
 inst/extdata/pubs_example.csv     |   4 ++
 vignettes/portal-tables-utils.Rmd | 115 ++++++++++++++++++++++--------
 2 files changed, 89 insertions(+), 30 deletions(-)
 create mode 100644 inst/extdata/pubs_example.csv

diff --git a/inst/extdata/pubs_example.csv b/inst/extdata/pubs_example.csv
new file mode 100644
index 00000000..8e9d5b73
--- /dev/null
+++ b/inst/extdata/pubs_example.csv
@@ -0,0 +1,4 @@
+pmid,studyId,diseaseFocus,manifestation,comments
+38383777,syn11672851,NA,,Drug-Target Explorer
+38383780,syn4939902,Neurofibromatosis type 1,MPNST,Johns Hopkins Biobank project
+38375882,syn51133914|syn51133929,Neurofibromatosis type 1,MPNST|Plexiform neurofibroma,DHART project 1 and DHART project 2 produced collaborative paper
diff --git a/vignettes/portal-tables-utils.Rmd b/vignettes/portal-tables-utils.Rmd
index 5d8262cc..00306304 100644
--- a/vignettes/portal-tables-utils.Rmd
+++ b/vignettes/portal-tables-utils.Rmd
@@ -1,6 +1,8 @@
 ---
 title: "Portal tables utils"
-output: rmarkdown::html_vignette
+output:
+  rmarkdown::html_vignette:
+    code_folding: show
 vignette: >
   %\VignetteIndexEntry{Portal tables}
   %\VignetteEngine{knitr::rmarkdown}
@@ -33,97 +35,152 @@ This walks through the original use cases for these utils to accomplish tasks su
 - Register a new study to Portal - Studies **
 - For each study in Portal - Studies, fill in related studies **
 
-Some tasks marked ** are no longer done manually much of the time (i.e. they may be automated as part of larger workflows), so they'll be covered more briefly.
+Some tasks marked ** are no longer done manually much of the time (having been automated as part of larger workflows), so they'll be covered more briefly.
+When requirements change, having a sense of what these utils do can help understand where, what, and how to update.
 
 ### How this will work
 
-We'll make copies of or use a sufficiently good representation of the relevant tables for update operations.
-This also provides an example workflow to follow for contributors who need to update one of the utils and test changes.
+We'll make copies of the relevant portal tables for update-type operations.
+What this also provides is an example workflow to follow for contributors who need to develop these utils and do testing.
 
-If needed, update the project id, which will be the parent project to host table copies.
+**First set your private development project id, which will be the parent project to host the table copies.**
 ```{r, eval=F}
-project_id <- "syn26462036" # this the NF-dev-playground project
+project_id <- "syn26462036" # the NF-dev-playground project
 ```
 
 ### Portal - Publications updates
 
 There are actually two options for adding publications: `add_publication_from_pubmed` and `add_publication_from_unpaywall`.
-The `pubmed` option is the default, and `unpaywall` is an additional option if `pubmed` is unreachable or fails.
-In past experience, `add_publication_from_pubmed` works well enough in 98% of cases.
+The `pubmed` option is the default, and `unpaywall` is an additional option if there is no `pmid`.
 
-First, create copy to work with.
+Start by creating a table copy to work with.
 ```{r, eval=F}
 PUBS_COPY <- copy_table("syn16857542", destination_id = project_id)
 ```
 
 #### Adding 1-2 publications at a time
 
 The minimum information needed is `pmid` (the new pub to add) and `study_id` (the linked study).
-This can use `add_publication_from_pubmed`.
-The only arg that might need further explaination is `study_table_id` -- this is used to .
+This can use `add_publication_from_pubmed`, which pulls in author, journal, etc. from PubMed.
+What might need further explanation is the involvement of `study_table_id` --
+this needs to be a table where `studyId`, `studyName`, `fundingAgency` can be looked up, to help fill in `fundingAgency` consistently.
+
+Since this is a demo, the papers are not actually related or accurately classified at all.
+Commands show adding papers with and without additional `disease_focus` and `manifestation` labels (which were once manually derived).
 
 ```{r, eval=F}
 
-add_publication_from_pubmed(pmid = ,
-                            study_id = ,
-                            disease_focus = ,
-                            manifestation = ,
-                            publication_table_id = PUBS_COPY,
-                            study_table_id = ,
-                            dry_run = F)
+STUDY_TABLE <- "syn52694652" # we will READ ONLY from this table
+nfportalutils::add_publication_from_pubmed(pmid = 38383787,
+                                           study_id = "syn11672851",
+                                           disease_focus = "Neurofibromatosis type 1",
+                                           manifestation = c("MPNST"),
+                                           publication_table_id = PUBS_COPY,
+                                           study_table_id = STUDY_TABLE,
+                                           dry_run = F)
+
+
+nfportalutils::add_publication_from_pubmed(pmid = 38383777,
+                                           study_id = "syn11672851",
+                                           publication_table_id = PUBS_COPY,
+                                           study_table_id = STUDY_TABLE,
+                                           dry_run = F)
 ```
 
-#### Adding publications for larger batch or from a spreadsheet
 
-Larger batches that are compiled in a spreadsheet should instead use `add_publications_from_file`.
-The spreadsheet needs to have [`studyId`, `pmid`, `manifestation`, `diseaseFocus`] columns filled out.
+#### Adding publications in a large batch, from a spreadsheet
 
-Let's create and show an example where we add a random publication to be associated with the NF-dev-playground project.
-```{r, eval=F}
+Large batches are often put in a spreadsheet and should instead use `add_publications_from_file`.
+The spreadsheet needs to have [`studyId`, `pmid`, `diseaseFocus`, `manifestation`] columns filled out.
+Other columns will be ignored.
+
+An example of this format comes with the package and is shown below.
+```{r, eval=T}
 
-pubs_example_file <- ""
+example_csv <- system.file("extdata", "pubs_example.csv", package = "nfportalutils")
+new_pubs <- read.csv(example_csv)
+knitr::kable(new_pubs)
 ```
 
-Write this example to a temp `.csv` file.
-```{r, eval=F}
+Several things to note:
 
+- Rarely, there may be multiple studies associated with one publication, so they need to be listed with a "|" (pipe) separator.
+- Indicating nulls is more nuanced for the spreadsheet version due to differences between STRING and STRING_LIST columns -- for `diseaseFocus`, use "NA", while `manifestation` can be left blank.
 
-```
 
 ```{r, eval=F}
 
 add_publications_from_file(
-  file = "new_pubs.csv",
+  file = example_csv,
   publication_table_id = PUBS_COPY,
-  study_table_id = ,
+  study_table_id = STUDY_TABLE,
   list_sep = "|",
   dry_run = FALSE
 )
 
 ```
 
+Check the new additions in the UI.
 
-Clean up the table copy.
+To conclude this part of the vignette, clean up the table copy.
 ```{r, eval=F}
 synapser::synDelete(PUBS_COPY)
 
 ```
 
+For become even more erudite, review the source code or try some experiments regarding these questions:
+
+1. What happens when trying to add a pmid that already exists in the table?
+2. What happens when the pmid is incorrect due to a typo?
+
+Toggle the code block below to show expected results.
+```{r, class.source = 'fold-hide'}
+#1. A publication that already exists should be skipped with a message saying so.
+#2. It fails.
+```
 
 ### Portal - Files corrections
 
 TO DO.
 
 ### Portal - People updates
 
-TO DO.
+Create the table copy.
+```{r, eval=F}
+PEOPLE_COPY <- copy_table("syn16857542", destination_id = project_id)
+```
+
+This relatively simple util finds new people that have made contributions and adds them to the people table.
+```{r, eval=F}
+
+add_people_from_table(people_table_id = PEOPLE_COPY,
+                      people_column = "ownerId",
+                      source_table_id = "syn16858331", # READ ONLY from the source table, which is Portal - Files
+                      source_column = "createdBy",
+                      dry_run = F)
+```
+
 
 ### Portal - Studies updates
 
 #### Register new study
 
 TO DO.
 
 #### Augment with 'related studies'
 
 TO DO.
 
+
+Here are some other things to dig into via the source code and/or docs:
+
+1. Why is there both `n_k` and `n_clust`? Why is `n_k` used instead of `n_clust`?
+
+Toggle the code block below to show answers.
+```{r, class.source = 'fold-hide'}
+# That's just because `n_clust` generates results as clusters with highly variable numbers of member studies, i.e. there could be 20 studies around this one mainstream topic vs 2-3 in this more arcane topic. The table breaks when list length exceeds a certain limit. Historically, clusters have been used and a max of four studies selected as a workaround. But using `n_k` can give better related results with more control.
+
+```

From ad0feb6ea140ce56ad236ed9d88ecf0daab772f3 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Tue, 27 Feb 2024 08:30:11 -0700
Subject: [PATCH 29/37] Update docs

---
 man/add_publication_from_pubmed.Rd | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/man/add_publication_from_pubmed.Rd b/man/add_publication_from_pubmed.Rd
index 18d51dae..f8b25a17 100644
--- a/man/add_publication_from_pubmed.Rd
+++ b/man/add_publication_from_pubmed.Rd
@@ -7,8 +7,8 @@
 add_publication_from_pubmed(
   pmid,
   study_id,
-  disease_focus,
-  manifestation,
+  disease_focus = c(""),
+  manifestation = c(""),
   publication_table_id,
   study_table_id,
   dry_run = T
@@ -19,9 +19,9 @@ add_publication_from_pubmed(
 
 \item{study_id}{Synapse id(s) of the study that are associated with the publication.}
 
-\item{disease_focus}{The disease focus(s) that are associated with the publication.}
+\item{disease_focus}{(Optional) The disease focus(s) that are associated with the publication.}
 
-\item{manifestation}{The manifestation(s) that are associated with the publication.}
+\item{manifestation}{(Optional) The manifestation(s) that are associated with the publication.}
 
 \item{publication_table_id}{Synapse id of the portal publication table. Must have write access.}
 

From 151f84e44e3630105c3b4d53f924dd8e29d707c8 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Tue, 27 Feb 2024 08:24:49 -0700
Subject: [PATCH 30/37] Update README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 586697ab..2f887f56 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,14 @@ Currently, `develop` branch is default so package install and docs refer to code
 
 ## Installation
 
-You can install `nfportalutils` from here:
+You should first install `synapser` following the instructions [here](https://github.com/Sage-Bionetworks/synapser?tab=readme-ov-file#installation).
+Then you can install `nfportalutils` with:
 
 ``` r
 remotes::install_github("nf-osi/nfportalutils")
 ```
 
-
 ## Additional Notes for Users
 
 - View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html).
@@ -30,7 +30,7 @@ remotes::install_github("nf-osi/nfportalutils") ## Additional Notes for Contributors ### Contrib workflow -- Branch of `develop` and make changes +- Branch off `develop` and make changes - Run `devtools::check(vignettes = FALSE)` early and often, and definitely before submitting a PR - Make a pull request to `develop`; this will run `R-CMD-CHECK` and `pkgdown` - Request a reviewer if both checks pass From ad0feb6ea140ce56ad236ed9d88ecf0daab772f3 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 27 Feb 2024 08:30:11 -0700 Subject: [PATCH 30/37] Update docs --- man/add_publication_from_pubmed.Rd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/add_publication_from_pubmed.Rd b/man/add_publication_from_pubmed.Rd index 18d51dae..f8b25a17 100644 --- a/man/add_publication_from_pubmed.Rd +++ b/man/add_publication_from_pubmed.Rd @@ -7,8 +7,8 @@ add_publication_from_pubmed( pmid, study_id, - disease_focus, - manifestation, + disease_focus = c(""), + manifestation = c(""), publication_table_id, study_table_id, dry_run = T @@ -19,9 +19,9 @@ add_publication_from_pubmed( \item{study_id}{Synapse id(s) of the study that are associated with the publication.} -\item{disease_focus}{The disease focus(s) that are associated with the publication.} +\item{disease_focus}{(Optional) The disease focus(s) that are associated with the publication.} -\item{manifestation}{The manifestation(s) that are associated with the publication.} +\item{manifestation}{(Optional) The manifestation(s) that are associated with the publication.} \item{publication_table_id}{Synapse id of the portal publication table. Must have write access.} From ba8252f4e3ddde258eabd1472b16b2124ce53e0c Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com> Date: Fri, 1 Mar 2024 16:09:41 -0700 Subject: [PATCH 31/37] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 2f887f56..d6dc3484 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,12 @@ The goal of `nfportalutils` is to provide convenience functions for project and (meta)data management in the NF-OSI data portal scope. Currently, `develop` branch is default so package install and docs refer to code in this branch. +> [!WARNING] +> For the last relatively stable version of `nfportalutils`, please install at https://github.com/nf-osi/nfportalutils/releases/tag/v0.9500-presynapser. +> +> Currently, the package is in a refactoring period where usage is complex because of the coexistence of both `synapser` and separate `synapseclient` import. +> This will be updated when everything is 100% refactored to `synapser`. 
+ ## Docs :point_right: [Package documentation!](https://nf-osi.github.io/nfportalutils/) From 69caa4fafc4c7f6175f805e54a436c1131ac4ddc Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 1 Mar 2024 16:54:54 -0700 Subject: [PATCH 32/37] Constrain synapser version --- .github/workflows/R-CMD-check.yaml | 13 ++++++++++--- DESCRIPTION | 5 +++-- README.md | 4 ++++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0cbf1669..6662d30c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -46,12 +46,19 @@ jobs: extra-packages: any::rcmdcheck needs: check - - name: If source build fails on macOS or Windows, fall back to typical install - if: steps.install-deps.outcome == 'failure' + - name: Install working version for macOS + if: steps.install-deps.outcome == 'failure' && runner.os == 'macOS' shell: Rscript {0} run: | reticulate::install_miniconda() - install.packages("synapser", repos="http://ran.synapse.org") + install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz", repos=NULL) + + - name: Install working version for Windows + if: steps.install-deps.outcome == 'failure' && runner.os == 'Windows' + shell: Rscript {0} + run: | + reticulate::install_miniconda() + install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.zip", repos=NULL) - uses: r-lib/actions/check-r-package@v2 with: diff --git a/DESCRIPTION b/DESCRIPTION index 526aaaba..53292ba5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,8 @@ Imports: plyr, readxl, yaml, - synapser (>= 1.0.0) + synapser (>= 1.0.0), + synapser (< 2.0.0) URL: https://github.com/nf-osi/nfportalutils BugReports: https://github.com/nf-osi/nfportalutils/issues Suggests: @@ -52,4 +53,4 @@ Suggests: Config/testthat/edition: 2 VignetteBuilder: knitr Remotes: - github::Sage-Bionetworks/synapser + github::Sage-Bionetworks/synapser@1.3.0 diff --git a/README.md b/README.md index d6dc3484..ce89700a 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,10 @@ Currently, `develop` branch is default so package install and docs refer to code > Currently, the package is in a refactoring period where usage is complex because of the coexistence of both `synapser` and separate `synapseclient` import. > This will be updated when everything is 100% refactored to `synapser`. +> [!NOTE] +> Underlying dependencies tested for this package are `synapser==1.3.0` and `synapseclient==3.1.1`. +> There are known breaking issues for MacOS and Windows with the newer versions for now. 
+ ## Docs :point_right: [Package documentation!](https://nf-osi.github.io/nfportalutils/) From 04c7011f97995784b3b60a568052191b9d5781a0 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 4 Mar 2024 15:18:12 -0700 Subject: [PATCH 33/37] Update CI with more specific versioning and installation methods --- .github/workflows/R-CMD-check.yaml | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 6662d30c..f7c066eb 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,11 +18,10 @@ jobs: fail-fast: false matrix: config: - - {os: macOS-13, r: 'release'} - - {os: windows-2022, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} + - {os: macOS-13, r: '4.2.1'} + - {os: windows-2022, r: '4.2.1'} + - {os: ubuntu-latest, r: '4.3.3'} + - {os: ubuntu-latest, r: '4.2.1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -46,19 +45,14 @@ jobs: extra-packages: any::rcmdcheck needs: check - - name: Install working version for macOS - if: steps.install-deps.outcome == 'failure' && runner.os == 'macOS' + - name: Install working archive version for macOS and Windows + if: runner.os == 'Windows' || runner.os == 'macOS' shell: Rscript {0} run: | reticulate::install_miniconda() - install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz", repos=NULL) - - - name: Install working version for Windows - if: steps.install-deps.outcome == 'failure' && runner.os == 'Windows' - shell: Rscript {0} - run: | - reticulate::install_miniconda() - install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.zip", repos=NULL) + reticulate::install_python("3.9.12") + reticulate::py_install("synapseclient==3.1.1", pip = TRUE) + install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz", repos=NULL, type="source") - uses: r-lib/actions/check-r-package@v2 with: From 4b1ccd49d67d0b8d1758e8c080661e16787a99d5 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 11 Mar 2024 20:19:52 -0600 Subject: [PATCH 34/37] Doc previews per #176 --- .github/workflows/pkgdown.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index b69f6b2c..4c027f32 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -42,6 +42,12 @@ jobs: run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} + - name: Deploy preview for PRs + if: github.event_name == 'pull_request' + uses: rossjrw/pr-preview-action@v1 + with: + source-dir: ./docs + - name: Deploy to GitHub pages πŸš€ if: github.event_name != 'pull_request' uses: JamesIves/github-pages-deploy-action@v4.4.1 From 5a94149e3383fa2e15e2579a64512513ff771cae Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 12 Mar 2024 14:27:19 -0600 Subject: [PATCH 35/37] Start installation vignette --- vignettes/package-installation.Rmd | 155 +++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 vignettes/package-installation.Rmd diff --git a/vignettes/package-installation.Rmd b/vignettes/package-installation.Rmd new file mode 100644 index 00000000..0d0bdd1d --- /dev/null +++ b/vignettes/package-installation.Rmd @@ -0,0 +1,155 @@ +--- +title: "Package Installation" +output: rmarkdown::html_vignette 
+vignette: > + %\VignetteIndexEntry{package-installation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Installation {.tabset} + +THIS VERSION: +This is the recommended installation for the [current release](https://github.com/nf-osi/nfportalutils/releases/tag/v0.9500-presynapser) of `nfportalutils` **that does not depend on `synapser`**. + + +There *will* be a **newer version of `nfportalutils` with `synapser`** (i.e. see [this issue](https://github.com/nf-osi/nfportalutils/issues/94) on the roadmap), and instructions will be accordingly revised when the refactor is completed. + +Make sure you are in a new R session (in RStudio, go to `Session > Restart R` if necessary). + +### Windows + +#### Outside R + +- Install a supported `python` version (recommend Python 3.11) by downloading installation binary at https://www.python.org/downloads/windows/. +- Install `synapseclient` following instructions at https://pypi.org/project/synapseclient/4.0.0/. + +#### With R +- Install `remotes` to help install other packages: +`install.packages("remotes")`. + +- Check that you have the correct `reticulate` version: +`packageVersion("reticulate")`. +If the result is anything other than `1.28`, then go to the `reticulate` install step, otherwise skip to `nfportalutils` install. + +- Install `reticulate`: +``` +> remotes::install_github("rstudio/reticulate@v1.28") +Downloading GitHub repo rstudio/reticulate@v1.28 +Running `R CMD build`... +* checking for file 'C:\Users\Erik\AppData\Local\Temp\Rtmporbql0\remotes322031ec7c50\rstudio-reticulate-3de77d1/DESCRIPTION' ... OK +* preparing 'reticulate': +* checking DESCRIPTION meta-information ... OK +* cleaning src +* checking for LF line-endings in source and make files and shell scripts +* checking for empty or unneeded directories +* building 'reticulate_1.28.tar.gz' +Installing package into β€˜C:/Users/Erik/AppData/Local/R/win-library/4.3’ +(as β€˜lib’ is unspecified) +* installing *source* package 'reticulate' ... 
+** using staged installation +** libs +using C++ compiler: 'G__~1.EXE (GCC) 12.3.0' +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c RcppExports.cpp -o RcppExports.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c event_loop.cpp -o event_loop.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c libpython.cpp -o libpython.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c output.cpp -o output.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c python.cpp -o python.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c readline.cpp -o readline.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c signals.cpp -o signals.o +g++ -std=gnu++17 -shared -s -static-libgcc -o reticulate.dll tmp.def RcppExports.o event_loop.o libpython.o output.o python.o readline.o signals.o -LC:/rtools43/x86_64-w64-mingw32.static.posix/lib/x64 -LC:/rtools43/x86_64-w64-mingw32.static.posix/lib -LC:/PROGRA~1/R/R-43~1.2/bin/x64 -lR +installing to C:/Users/Erik/AppData/Local/R/win-library/4.3/00LOCK-reticulate/00new/reticulate/libs/x64 +** R +** inst +** byte-compile and prepare package for lazy loading +** help +*** installing help indices +*** copying figures +** building package indices +** installing vignettes +** testing if installed package can be loaded from temporary location +** testing if installed package can be loaded from final location +** testing if installed package keeps a record of temporary installation path +* DONE (reticulate) +``` + +- Install `nfportalutils` (choose option `3 - None`): +``` +> remotes::install_github("nf-osi/nfportalutils@v0.9500-presynapser") +Downloading GitHub repo nf-osi/nfportalutils@v0.9500-presynapser +These packages have more recent versions available. +It is recommended to update all of them. +Which would you like to update? + +1: All +2: CRAN packages only +3: None +4: reticulate (1.28 -> 1.35.0) [CRAN] +``` + +``` +Running `R CMD build`... +* checking for file 'C:\Users\Erik\AppData\Local\Temp\RtmpQzT2JT\remotes158c6d2c7833\nf-osi-nfportalutils-61636cd/DESCRIPTION' ... OK +* preparing 'nfportalutils': +* checking DESCRIPTION meta-information ... 
+* checking for LF line-endings in source and make files and shell scripts
+* checking for empty or unneeded directories
+Omitted 'LazyData' from DESCRIPTION
+* building 'nfportalutils_0.9500.tar.gz'
+Installing package into ‘C:/Users/Erik/AppData/Local/R/win-library/4.3’
+(as ‘lib’ is unspecified)
+* installing *source* package 'nfportalutils' ...
+** using staged installation
+** R
+** inst
+** byte-compile and prepare package for lazy loading
+** help
+*** installing help indices
+** building package indices
+** installing vignettes
+** testing if installed package can be loaded from temporary location
+** testing if installed package can be loaded from final location
+** testing if installed package keeps a record of temporary installation path
+* DONE (nfportalutils)
+```
+
+
+#### Post-installation
+
+It is recommended to set your Synapse authentication token as an environment variable that R can access.
+You can create an `.Renviron` file in your home directory, for example `C:/Users/Erik/Documents/.Renviron`.
+
+In the file, add:
+```
+SYNAPSE_AUTH_TOKEN=xxx_your_token_here_xxx
+```
+
+### MacOS
+
+TODO
+
+
+
+## Test Your Installation
+
+Restart R after adding your token.
+Then to test that your installation is working, load the package and try to log in.
+If you set your authentication token successfully in the above, you should be greeted accordingly.
+
+### Login
+
+```{r}
+
+library(nfportalutils)
+syn_login()
+
+```
+
+
From 098e5273e69304c1abc13874b333017a1e76a6f5 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Tue, 12 Mar 2024 16:36:00 -0600
Subject: [PATCH 36/37] Update vignette

---
 vignettes/package-installation.Rmd | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vignettes/package-installation.Rmd b/vignettes/package-installation.Rmd
index 0d0bdd1d..e9923f2b 100644
--- a/vignettes/package-installation.Rmd
+++ b/vignettes/package-installation.Rmd
@@ -141,11 +141,12 @@
 
 Restart R after adding your token.
 Then to test that your installation is working, load the package and try to log in.
-If you set your authentication token successfully in the above, you should be greeted accordingly.
+If you have your authentication token set as recommended above, you should be greeted accordingly.
+Without a token, you are logged in as "Anonymous" and functionality will be limited.
 
 ### Login
 
-```{r}
+```{r, eval=FALSE}
 
 library(nfportalutils)
 syn_login()

From 9b87c115ddf87105cf73bad7c7e673f5dd10e25c Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu
Date: Tue, 12 Mar 2024 18:30:28 -0600
Subject: [PATCH 37/37] More instructions, clear cache

---
 vignettes/package-installation.Rmd | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/vignettes/package-installation.Rmd b/vignettes/package-installation.Rmd
index e9923f2b..b674dbe3 100644
--- a/vignettes/package-installation.Rmd
+++ b/vignettes/package-installation.Rmd
@@ -19,10 +19,10 @@ knitr::opts_chunk$set(
 THIS VERSION:
 This is the recommended installation for the [current release](https://github.com/nf-osi/nfportalutils/releases/tag/v0.9500-presynapser) of `nfportalutils` **that does not depend on `synapser`**.
 
-
 There *will* be a **newer version of `nfportalutils` with `synapser`** (see [this issue](https://github.com/nf-osi/nfportalutils/issues/94) on the roadmap), and instructions will be revised accordingly when the refactor is completed.
 
-Make sure you are in a new R session (in RStudio, go to `Session > Restart R` if necessary). 
+Make sure you are in a new R session (in RStudio, go to `Session > Restart R` if necessary).
+For most R users, the trickiest part of the installation and usage will be the Python interop.
 
 ### Windows
 
@@ -133,7 +133,17 @@ SYNAPSE_AUTH_TOKEN=xxx_your_token_here_xxx
 
 ### MacOS
 
-TODO
+#### Outside R
+
+- If Python is not already installed, obtain Python 3 via `brew` following the suggested method at https://docs.python-guide.org/starting/install3/osx/#doing-it-right.
+- Install `synapseclient` following the instructions at https://pypi.org/project/synapseclient/4.0.0/.
+
+### Linux (Ubuntu)
+
+#### Outside R
+- Python is already included, so you just need to install `synapseclient` following the instructions at https://pypi.org/project/synapseclient/4.0.0/.
+
+#### With R
 
 
 
@@ -146,11 +156,10 @@ Without a token, you are logged in as "Anonymous" and functionality will be limi
 
 ### Login
 
-```{r, eval=FALSE}
-
+```
 library(nfportalutils)
 syn_login()
-
 ```
+
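+#### Troubleshooting
+
+If `syn_login()` greets you as "Anonymous", first verify that R actually picked up the token.
+Below is a minimal sketch using only base R; it assumes the `SYNAPSE_AUTH_TOKEN` variable name recommended above.
+
+```
+# FALSE means R did not read the token from .Renviron -- check the file
+# location, restart R, and try syn_login() again.
+nzchar(Sys.getenv("SYNAPSE_AUTH_TOKEN"))
+```
+
+If instead loading the package fails with a Python-related error, one way to inspect which Python `reticulate` discovered (output varies by machine):
+
+```
+# Should point at the Python installation where synapseclient was installed
+reticulate::py_config()
+```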