diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 0cbf1669..f7c066eb 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,11 +18,10 @@ jobs: fail-fast: false matrix: config: - - {os: macOS-13, r: 'release'} - - {os: windows-2022, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - - {os: ubuntu-latest, r: 'release'} - - {os: ubuntu-latest, r: 'oldrel-1'} + - {os: macOS-13, r: '4.2.1'} + - {os: windows-2022, r: '4.2.1'} + - {os: ubuntu-latest, r: '4.3.3'} + - {os: ubuntu-latest, r: '4.2.1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -46,12 +45,14 @@ jobs: extra-packages: any::rcmdcheck needs: check - - name: If source build fails on macOS or Windows, fall back to typical install - if: steps.install-deps.outcome == 'failure' + - name: Install working archive version for macOS and Windows + if: runner.os == 'Windows' || runner.os == 'macOS' shell: Rscript {0} run: | reticulate::install_miniconda() - install.packages("synapser", repos="http://ran.synapse.org") + reticulate::install_python("3.9.12") + reticulate::py_install("synapseclient==3.1.1", pip = TRUE) + install.packages("https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz", repos=NULL, type="source") - uses: r-lib/actions/check-r-package@v2 with: diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index cacff559..4c027f32 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -27,7 +27,12 @@ jobs: - uses: r-lib/actions/setup-r@v2 with: use-public-rspm: true - + + - name: Install synapseclient + run: | + pip install synapseclient + synapse --version + - uses: r-lib/actions/setup-r-dependencies@v2 with: extra-packages: any::pkgdown, local::. 
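For contributors debugging the macOS or Windows jobs locally, the pinned toolchain installed by the workflow above can be reproduced interactively. This is only a convenience sketch of the CI's Rscript step (it assumes `reticulate` is already installed), not an extra requirement of the package:

```r
# Local mirror of the CI step: pin the Python backend and install synapser 1.3.0 from source
reticulate::install_miniconda()                      # one-time setup
reticulate::install_python("3.9.12")
reticulate::py_install("synapseclient==3.1.1", pip = TRUE)
install.packages(
  "https://github.com/Sage-Bionetworks/synapser/archive/refs/tags/1.3.0.tar.gz",
  repos = NULL, type = "source"
)
```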
@@ -37,6 +42,12 @@ jobs: run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} + - name: Deploy preview for PRs + if: github.event_name == 'pull_request' + uses: rossjrw/pr-preview-action@v1 + with: + source-dir: ./docs + - name: Deploy to GitHub pages πŸš€ if: github.event_name != 'pull_request' uses: JamesIves/github-pages-deploy-action@v4.4.1 diff --git a/DESCRIPTION b/DESCRIPTION index 526aaaba..53292ba5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,8 @@ Imports: plyr, readxl, yaml, - synapser (>= 1.0.0) + synapser (>= 1.0.0), + synapser (< 2.0.0) URL: https://github.com/nf-osi/nfportalutils BugReports: https://github.com/nf-osi/nfportalutils/issues Suggests: @@ -52,4 +53,4 @@ Suggests: Config/testthat/edition: 2 VignetteBuilder: knitr Remotes: - github::Sage-Bionetworks/synapser + github::Sage-Bionetworks/synapser@1.3.0 diff --git a/NAMESPACE b/NAMESPACE index 85ab7af7..a7442d2a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ export(check_readpair_validity) export(check_wiki_links) export(convert_to_stringlist) export(copy_annotations) +export(copy_table) export(data_curator_app_subpage) export(delete_provenance) export(dsp_dataset_mapping) diff --git a/R/access_utils.R b/R/access_utils.R index 25008acc..06fbf122 100644 --- a/R/access_utils.R +++ b/R/access_utils.R @@ -29,11 +29,11 @@ summarize_file_access <- function(principal_id, # 3378999 for NF-OSI fileview_id # "syn16858331" ) { - .check_login() tryCatch({ - view <- .syn$tableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) + view <- synapser::synTableQuery(glue::glue("SELECT id,type,benefactorId FROM {fileview_id}")) %>% + synapser::as.data.frame() %>% + as.data.table() }, error = function(e) stop("Could not query view!")) - view <- as.data.table(view$asDataFrame()) files_by_benefactor <- view[type == "file", .N, by = .(benefactorId)] access <- view[, check_access(benefactorId, principal_id, access_type), by = .(benefactorId)] # files_by_benefactor can be smaller than access because there are folders without files @@ -56,7 +56,7 @@ check_access <- function(id, stopifnot(is.numeric(principal_id)) acl_result <- tryCatch({ - .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>% + synapser::synRestGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{id}/acl"))$resourceAccess %>% rbindlist(.) }, error = function(e) stop(glue::glue("Error for {id}: {e$message}"))) @@ -69,26 +69,26 @@ check_access <- function(id, # -- SETTING ACCESS -------------------------------------------------------------# -#' Set public access to VIEW (READ) only for an entity -#' -#' Set both registered users and non-registered users to have VIEW-only permissions. +#' Set public access to VIEW (READ) only for an entity +#' +#' Set both registered users and non-registered users to have VIEW-only permissions. #' See code{link{make_public}} for more permissive permissions to download (for registered users), which is usually set later at data release time. -#' +#' #' @param id Synapse entity id. 
#' @export make_public_viewable <- function(id) { - .check_login() + ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948" PUBLIC_GROUP <- "273949" # set registered synapse users to view, download - .syn$setPermissions(entity = id, - principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, - accessType = list("READ")) - + synapser::synSetPermissions(entity = id, + principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, + accessType = list("READ")) + # set public to view - .syn$setPermissions(entity = id, - principalId = PUBLIC_GROUP, - accessType = list("READ")) + synapser::synSetPermissions(entity = id, + principalId = PUBLIC_GROUP, + accessType = list("READ")) } @@ -101,18 +101,18 @@ make_public_viewable <- function(id) { #' @param id Synapse entity id. #' @export make_public <- function(id) { - .check_login() + ALL_REGISTERED_SYNAPSE_USERS_GROUP <- "273948" PUBLIC_GROUP <- "273949" # set registered synapse users to view, download - .syn$setPermissions(entity = id, - principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, - accessType = list("READ","DOWNLOAD")) + synapser::synSetPermissions(entity = id, + principalId = ALL_REGISTERED_SYNAPSE_USERS_GROUP, + accessType = list("READ","DOWNLOAD")) # set public to view - .syn$setPermissions(entity = id, - principalId = PUBLIC_GROUP, - accessType = list("READ")) + synapser::synSetPermissions(entity = id, + principalId = PUBLIC_GROUP, + accessType = list("READ")) } @@ -130,25 +130,18 @@ make_public <- function(id) { #' @param dataset_name Optional name for dataset to be created #' @export grant_specific_file_access <- function(principal_id, entity_ids, create_dataset = F, project_id = NULL, dataset_name = NULL) { - # .check_login() - if(create_dataset & is.null(project_id)){ + if(create_dataset && is.null(project_id)){ stop("project_id must be provided if create_dataset = T") } # set registered synapse users to view, download sapply(entity_ids, function(id){ - .syn$setPermissions(entity = id, - principalId = principal_id, - accessType = list("READ","DOWNLOAD")) + synapser::synSetPermissions(entity = id, + principalId = principal_id, + accessType = list("READ","DOWNLOAD")) }) - ##need to grab the current versions for dataset creation - dataset_items <- lapply(entity_ids, function(id){ - vsn <- .syn$get(id, downloadFile = F)$versionNumber - list(entityId = id, versionNumber = vsn) - }) - if(is.null(dataset_name)){ dataset_name <- glue::glue("Dataset {Sys.Date()} for {principal_id}") } @@ -156,23 +149,31 @@ grant_specific_file_access <- function(principal_id, entity_ids, create_dataset if(create_dataset){ tryCatch({ # First attempt with addAnnotationColumns = TRUE - dataset <- .syn$store(synapseclient$Dataset(name = dataset_name, - parent = project_id, dataset_items = dataset_items, addAnnotationColumns = TRUE)) + dataset <- new_dataset(name = dataset_name, + parent = project_id, + items = entity_ids, + addAnnotationColumns = TRUE, + dry_run = FALSE) message(glue::glue("{emoji::emoji(\"thumbsup\")} Dataset created with annotation columns at {dataset$properties$id}")) }, error = function(e) { # If error, retry with addAnnotationColumns = FALSE - dataset <- .syn$store(synapseclient$Dataset(name = dataset_name, - parent = project_id, dataset_items = dataset_items, addAnnotationColumns = FALSE)) - .syn$setPermissions(entity = dataset$properties$id, principalId = principal_id, - accessType = list("READ", "DOWNLOAD")) - message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. 
Annotation columns will need to be added manually.")) + dataset <- new_dataset(name = dataset_name, + parent = project_id, + items = entity_ids, + addAnnotationColumns = FALSE, + dry_run = FALSE) + synapser::synSetPermissions(entity = dataset$properties$id, + principalId = principal_id, + accessType = list("READ", "DOWNLOAD")) + message(glue::glue("{emoji::emoji(\"warning\")} Dataset created without annotation columns at {dataset$properties$id}. + Annotation columns will need to be added manually.")) }) } message(glue::glue('{emoji::emoji("astonished")} Principal {principal_id} added to {length(entity_ids)} entities')) #TODO: set schema programmatically? might be easier to add annotations to schema in web client as needed to support principal_id... - ## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name. + ## Note Dec 2023; schema is automatically defined unless there is an error caused by the way synapse detects annotation schemas, e.g. a type collision that causes duplicate columns with the same name. } diff --git a/R/add_publication_from_pubmed.R b/R/add_publication_from_pubmed.R index 5f80cc49..8e53fbb5 100644 --- a/R/add_publication_from_pubmed.R +++ b/R/add_publication_from_pubmed.R @@ -5,16 +5,18 @@ .add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch? pmids <- new_data <- NULL counter <- 0L - function(pmid, study_id, disease_focus, manifestation, + function(pmid, study_id, + disease_focus = c(""), manifestation = c(""), publication_table_id, study_table_id, dry_run = T) { - .check_login() - counter <<- counter + 1L # cat("current record:", counter) # make verbose? # Query only for data needed, i.e. 
PMID to check non-dup; result can be cached if(is.null(pmids)) { - pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F) + pmids <- synapser::synTableQuery(glue::glue("select pmid from {publication_table_id}")) %>% + synapser::as.data.frame() %>% + unlist(use.names = F) + pmids <- gsub("PMID:", "", pmids) if(cache) pmids <<- pmids } @@ -25,9 +27,14 @@ if(!length(record)) return() study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ") - study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame() - record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)), - studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency))) + study <- synapser::synTableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"), includeRowIdAndRowVersion = F)%>% + synapser::as.data.frame() + record <- cbind(record, + diseaseFocus = I(list(disease_focus)), + manifestation = I(list(manifestation)), + studyId = I(list(study$studyId)), + studyName = I(list(study$studyName)), + fundingAgency = I(unique(sapply(study$fundingAgency, jsonlite::fromJSON)))) # If batch mode, rbind and defer table schemafication until all records processed if(batch) { @@ -37,8 +44,8 @@ new_data <- as_table_schema(record, publication_table_id) } if(!dry_run) { - new_data <- .syn$store(new_data) - message(glue::glue('PMID:{new_data$asDataFrame()$pmid} added!')) + new_data <- synapser::synStore(new_data) + message(glue::glue('Added new pmid(s)!')) } else { new_data } @@ -54,8 +61,8 @@ #' #' @param pmid PubMed ID (*not* PMCID) of the publication to be added. #' @param study_id Synapse id(s) of the study that are associated with the publication. -#' @param disease_focus The disease focus(s) that are associated with the publication. -#' @param manifestation The manifestation(s) that are associated with the publication. +#' @param disease_focus (Optional) The disease focus(s) that are associated with the publication. +#' @param manifestation (Optional) The manifestation(s) that are associated with the publication. #' @param publication_table_id Synapse id of the portal publication table. Must have write access. #' @param study_table_id Synapse id of the portal study table. Need read access. #' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata. 
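With `disease_focus` and `manifestation` now optional, a minimal dry run of the exported wrapper needs only the pmid, study id, and the two table ids. A sketch follows; the table ids are placeholders rather than real portal tables, and the pmid/study pair is borrowed from `inst/extdata/pubs_example.csv`:

```r
synapser::synLogin()  # explicit login in the session, since the internal .check_login() was removed

add_publication_from_pubmed(
  pmid = "38383777",
  study_id = "syn11672851",
  publication_table_id = "syn00000000",  # placeholder: portal publication table (write access)
  study_table_id = "syn00000001",        # placeholder: portal study table (read access)
  dry_run = TRUE                         # print the formatted record instead of storing it
)
```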
diff --git a/R/add_publication_from_unpaywall.R b/R/add_publication_from_unpaywall.R index b4269b2d..e82279e9 100644 --- a/R/add_publication_from_unpaywall.R +++ b/R/add_publication_from_unpaywall.R @@ -40,11 +40,9 @@ add_publication_from_unpaywall <- function(publication_table_id, #TODO: Check schema up-front and convert metadata to json in correct format - .check_login() + schema <- synapser::synGet(entity = publication_table_id) - schema <- .syn$get(entity = publication_table_id) - - pub_table <- .syn$tableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>% + pub_table <- synapser::synTableQuery(glue::glue('select * from {publication_table_id}'))$filepath %>% readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns if(doi %in% pub_table$doi){ diff --git a/R/annotation_qc.R b/R/annotation_qc.R index 6d6589d0..98ac5703 100644 --- a/R/annotation_qc.R +++ b/R/annotation_qc.R @@ -11,6 +11,7 @@ #' @param output_format Format of 'excel', 'google_sheet', or 'dataframe'. Defaults to 'excel'. #' @param use_annotations Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF. #' @param service Service endpoint to use. Defaults to the schematic production endpoint. +#' @param access_token Synapse auth token, defaults to `SYNAPSE_AUTH_TOKEN` set in env. #' @returns For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data. #' @export manifest_generate <- function(data_type, @@ -20,11 +21,11 @@ manifest_generate <- function(data_type, asset_view = "syn16858331", output_format = "excel", use_annotations = TRUE, - service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate") { + service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate", + access_token = Sys.getenv("SYNAPSE_AUTH_TOKEN")) { # yes, param needs to be re-encoded like this for 'dataframe' output_format_param <- if (output_format == "dataframe") "dataframe (only if getting existing manifests)" else output_format - access_token <- .syn$credentials$secret use_annotations <- tolower(as.character(use_annotations)) req <- httr::GET(service, @@ -157,13 +158,13 @@ manifest_passed <- function(result) { #' @export infer_data_type <- function(dataset_id) { - children <- .syn$getChildren(dataset_id) - children <- reticulate::iterate(children) + children <- synapser::synGetChildren(dataset_id) + children <- synapser::as.list(children) if(!length(children)) return(list(result = NA, notes = "Empty dataset folder")) children <- first(children, 3) data_type <- c() for (entity in children) { - e <- .syn$getAnnotations(entity) + e <- synapser::synGetAnnotations(entity) data_type <- append(data_type, e$Component) } data_type <- unique(data_type) @@ -202,9 +203,9 @@ meta_qc_dataset <- function(dataset_id, schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld", cleanup = TRUE) { - dataset_name <- .syn$get(dataset_id)$properties$name + dataset_name <- synapser::synGet(dataset_id)$properties$name - files <- reticulate::iterate(.syn$getChildren(dataset_id)) + files <- synapser::as.list(synapser::synGetChildren(dataset_id)) if(!length(files)) { return(list(result = NA, notes = "Empty dataset with no files", @@ -304,15 +305,15 @@ list_project_datasets <- function(project_id, } else { - in_data <- .syn$getChildren(data_root) - in_data <- reticulate::iterate(in_data) + in_data <- synapser::synGetChildren(data_root) + in_data 
<- synapser::as.list(in_data) datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.Folder", in_data) if(!length(datasets)) warning("No datasets found under data root.") datasets } } else { - children <- .syn$getChildren(project_id) - datasets <- reticulate::iterate(children) + children <- synapser::synGetChildren(project_id) + datasets <- synapser::as.list(children) datasets <- Filter(function(x) x$type == "org.sagebionetworks.repo.model.table.Dataset", datasets) if(!length(datasets)) warning("No dataset entities found in project.") datasets diff --git a/R/annotations.R b/R/annotations.R index 17d14b9d..7748331f 100644 --- a/R/annotations.R +++ b/R/annotations.R @@ -1,16 +1,16 @@ #' Set annotations from a manifest -#' -#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) -#' suggest doing batch annotations through a fileview. However, it is often simpler to -#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. +#' +#' The [Synapse docs](https://help.synapse.org/docs/Managing-Custom-Metadata-at-Scale.2004254976.html) +#' suggest doing batch annotations through a fileview. However, it is often simpler to +#' modify or set new annotations directly given a table of just the entities (rows) and props (cols) we want. #' This is like how schematic works, except without any validation (so works best for power-users who know the data model well). -#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. -#' +#' Some desired defaults are taken into account, such as not submitting key-values with `NA` and empty strings. +#' #' @param manifest A table manifest. Needs to contain `entityId`. #' @param ignore_na Whether to ignore annotations that are `NA`; default TRUE. #' @param ignore_blank Whether to ignore annotations that are that empty strings; default TRUE. #' @param verbose Be chatty, default FALSE. -#' @export +#' @export annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TRUE, verbose = FALSE) { # Split by `entityId` annotations <- as.data.table(manifest) @@ -21,31 +21,29 @@ annotate_with_manifest <- function(manifest, ignore_na = TRUE, ignore_blank = TR filterBlank <- if(ignore_blank) function(x) !any(x == "") else TRUE # same as above annotations <- lapply(annotations, function(x) Filter(function(x) filterNA(x) & filterBlank(x) & length(x), unlist(x, recursive = F))) for(entity in names(annotations)) { - .syn$setAnnotations(entity = entity, annotations = as.list(annotations[[entity]])) + synapser::synSetAnnotations(entity = entity, annotations = as.list(annotations[[entity]])) } if (verbose) message("Annotations submitted") } #' Copy annotations -#' +#' #' Copy annotations (all or selectively) from a source entity to one or more target entities. -#' If annotations already exist on target entities, the copy will replace the current values. -#' -#' @param entity_from Syn id from which to copy. -#' @param entity_to One or more syn ids to copy annotations to. -#' @param select Vector of properties to selectively copy if present on the entity. +#' If annotations already exist on target entities, the copy will replace the current values. +#' +#' @param entity_from Syn id from which to copy. +#' @param entity_to One or more syn ids to copy annotations to. +#' @param select Vector of properties to selectively copy if present on the entity. 
#' If not specified, will copy over everything, which may not be desirable. -#' @param update Whether to immediately update or return annotation objects only. +#' @param update Whether to immediately update or return annotation objects only. #' @export copy_annotations <- function(entity_from, entity_to, select = NULL, update = FALSE) { - - .check_login() - - annotations <- .syn$get_annotations(entity_from) + + annotations <- synapser::synGetAnnotations(entity_from) if(is.null(select)) { cp <- annotations } else { @@ -54,10 +52,10 @@ copy_annotations <- function(entity_from, if(k %in% select) cp[k] <- annotations[k] } } - + if(update) { for(e in entity_to) { - .syn$setAnnotations(e, annotations = cp) + synapser::synSetAnnotations(e, annotations = cp) } } else { return(cp) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index 6cb92a56..8f125e90 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -26,8 +26,6 @@ assign_study_data_types <- function(study_table_id, attribute = "dataType", dry_run = TRUE) { - .check_login() - # get studies within scope from study table studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist() @@ -72,7 +70,7 @@ summarize_attribute <- function(summary_query, dry_run = TRUE, check_fun = NULL) { - values <- .syn$tableQuery(summary_query,includeRowIdAndRowVersion = F)$asDataFrame() + values <- synapser::synTableQuery(summary_query,includeRowIdAndRowVersion = F) %>% synapser::as.data.frame() meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces if(is_valid_syn_id(entity_id)) { names(meta) <- entity_id @@ -82,14 +80,14 @@ summarize_attribute <- function(summary_query, result_list <- list() for(entity in names(meta)) { - entity_meta <- .syn$get_annotations(entity) + entity_meta <- synapser::synGetAnnotations(entity) entity_meta[attribute] <- meta[[entity]] result_list[[entity]] <- entity_meta if(!dry_run) { if(is.function(check_fun)) { - if(check_fun(meta[[entity]])) .syn$set_annotations(entity_meta) else message("Skipped update for {entity}.") + if(check_fun(meta[[entity]])) synapser::synSetAnnotations(entity_meta) else message("Skipped update for {entity}.") } else { - .syn$set_annotations(entity_meta) + synapser::synSetAnnotations(entity_meta) message(glue::glue("Updated {entity} {attribute}.")) } } diff --git a/R/basic_utils.R b/R/basic_utils.R index b80d51c7..26b56d77 100644 --- a/R/basic_utils.R +++ b/R/basic_utils.R @@ -1,27 +1,22 @@ -#' Create copy of entity +#' Copy a table #' -#' Create a copy of syn entity; mostly used to create a copy on which to test out changes. -#' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy -#' @param entity Entity to copy. -#' @param destination_id Id of destination project/container that entity will be copied to. -#' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE. -#' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE. -#' @keywords internal -copy <- function(entity, - destination_id, - skip_copy_wiki_page = FALSE, - skip_copy_annotations = FALSE) { - - .check_login() - # load synapseutils as needed - - - synapseutils$copy(.syn, - entity = entity, - destinationId = destination_id, - skipCopyWikiPage = skip_copy_wiki_page, - skipCopyAnnotations = skip_copy_annotations) - +#' Copy a table. One of the most common use cases is testing, to avoid modifying a "production" table. 
+#' +#' @param table_id Id of table to copy. +#' @param destination_id Parent project id for the copy. +#' @export +copy_table <- function(table_id, + destination_id) { + + message(glue::glue("Getting table {table_id}")) + schema <- synapser::synGet(table_id) + data <- synapser::synTableQuery(glue::glue("select * from {table_id}"), includeRowIdAndRowVersion = FALSE) + columns <- schema$columnIds + schema_copy <- synapser::Schema(name = schema$name, parent = destination_id, columns = columns) + table_copy <- synapser::Table(schema = schema_copy, values = data$filepath) + table_copy <- synapser::synStore(table_copy) + message(glue::glue("Copied table {table_id} to {table_copy$tableId}")) + table_copy$tableId } diff --git a/R/datasets.R b/R/datasets.R index c063c92f..4a954fb6 100644 --- a/R/datasets.R +++ b/R/datasets.R @@ -1,21 +1,21 @@ # -- Editing Collections -------------------------------------------------------# -# General helpers that should work for both datasets (collection of files) +# General helpers that should work for both datasets (collection of files) # and dataset collections (collection of datasets). #' Structure as collection items -#' +#' #' Helper taking entity ids to create records used for dataset items *or* dataset collection items. #' Collection items have the form `list(entityId = id, versionNumber = x)`. #' -#' Note: For item version, dataset items allow two meanings of literal or absolute "latest" +#' Note: For item version, dataset items allow two meanings of literal or absolute "latest" #' vs. "stable_latest", but with files either one can be used to mean the same thing #' since there will be correct interpretation done under the hood. #' See implementation in `latest_version`. #' #' @param ids Ids of entities to make into dataset items. -#' @param item_version Integer for version that will be used for all items, e.g. 1. +#' @param item_version Integer for version that will be used for all items, e.g. 1. #' Otherwise, "latest" or "stable_latest". See details. #' @keywords internal as_coll_items <- function(ids, item_version = c("abs", "stable")) { @@ -32,34 +32,34 @@ as_coll_items <- function(ids, item_version = c("abs", "stable")) { #' Apply updates to current collection of items -#' +#' #' This is essentially an internal transaction helper for trying to apply a changeset to a collection, -#' used in several higher-level collection utils. +#' used in several higher-level collection utils. #' Given the changeset that can represent updates of both types "replace" or "add", -#' this applies an update join keyed on `entityId` for the replace and +#' this applies an update join keyed on `entityId` for the replace and #' appends the new items to get the updated collection. -#' +#' #' @param current_items List of lists representing a collection of items. -#' @param update_items Collection of items to apply as updates to `current_items`. +#' @param update_items Collection of items to apply as updates to `current_items`. 
#' @keywords internal update_items <- function(current_coll, update_coll) { - + current_coll <- data.table::rbindlist(current_coll) update_coll <- data.table::rbindlist(update_coll) replaced <- current_coll[update_coll, on = .(entityId), versionNumber := i.versionNumber] added <- update_coll[!current_coll, on = .(entityId)] updated <- rbind(replaced, added) # reconversion; using pure apply as.list coerces versionNumber into char - updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) + updated <- apply(updated, 1, function(i) list(entityId = unname(i[1]), versionNumber = as.integer(i[2]))) updated } #' Update item versions to "latest" in a collection -#' +#' #' Update an _existing_ collection so that all items or a subset of items reference their latest version. #' Should work for both datasets (collection of files) and dataset collections (collection of datasets). -#' +#' #' @inheritParams latest_version #' @param collection_id Collection id. #' @param items Vector of dataset ids for which to update reference to latest version, or "all" (default) to update all. @@ -72,7 +72,7 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman if((length(items) == 1) && (items == "all")) { coll$items <- as_coll_items(current_items, item_version = version_semantics) } else { - + # Check subset; if no check, this becomes `add_to_collection` if(!all(items %in% current_items)) { warning("Subset given includes items not actually in collection: ", items[!items %in% current_items]) @@ -85,8 +85,8 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman updated_items <- update_items(coll$items, as_coll_items(items, item_version = version_semantics)) coll$items <- updated_items } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) - + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) + } @@ -96,24 +96,24 @@ use_latest_in_collection <- function(collection_id, items = "all", version_seman #' For datasets, the items should be files. For dataset collections, the items should be datasets. #' If an item attempting to be added happens to already be in the collection, #' this might lead to version conflicts, so the update will be rejected unless `force` is true. -#' -#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet +#' +#' This is implemented with lower-level REST API because the Python client (as of v2.7) doesn't yet #' implement dataset collection class and methods (but dataset and relevant methods like `add_item` method are available). -#' Thus, while this is generic enough to handle both datasets and dataset collections +#' Thus, while this is generic enough to handle both datasets and dataset collections #' it is expected to be used more for dataset collections given that the dataset method is provided. -#' +#' #' @param collection_id Collection id. #' @param items Character vector of one or more dataset entity ids to add. #' @param check_items Whether to check that ids are really appropriate item types and remove non-appropriate item types #' to help avoid Synapse errors (default `FALSE` because in most cases `items` are curated, and using check will be slower). 
-#' @param force If some items are currently in the collection with a different version, +#' @param force If some items are currently in the collection with a different version, #' should these items be force-added using current version? The safe default is `FALSE` to ensure any such updates are intentional. #' @export add_to_collection <- function(collection_id, items, check_items = FALSE, force = FALSE) { - - coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) + + coll <- synapser::synRestGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}")) coll_type <- which_coll_type(coll) - + if(check_items) { item_type_check <- if(coll_type == "dataset") is_file else is_dataset correct_item_type <- sapply(items, item_type_check) @@ -126,14 +126,14 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = } } } - + current_items <- sapply(coll$items, function(x) x$entityId) if(any(items %in% current_items) && !force) { stop("Some items to be added are already in collection. Use `force = TRUE` to allow replacing existing versions.") } else { coll$items <- update_items(coll$items, as_coll_items(items)) } - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{collection_id}"), body = jsonlite::toJSON(coll, auto_unbox = TRUE)) } @@ -141,20 +141,25 @@ add_to_collection <- function(collection_id, items, check_items = FALSE, force = #' Create new dataset with given items #' +#' Offers somewhat more convenient interface than the base `dataset` constructor: +#' needs only item ids and creates structure needed + uses the LATEST version for items by default. +#' #' @inheritParams as_coll_items #' @param name Name of the dataset. It should be unique within the `parent` project. #' @param parent Synapse id of parent project where the dataset will live. #' @param items Id(s) of items to include. #' Usually the same parent project storing the files, but in some cases it may be a different project. +#' @param addAnnotationColumns Whether to add annotation columns, default `FALSE`. #' @param dry_run If TRUE, don't actually store dataset, just return the data object for inspection or further modification. #' @export -new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE) { +new_dataset <- function(name, parent, items, item_version = NULL, addAnnotationColumns = FALSE, dry_run = TRUE) { dataset_items <- as_coll_items(items, item_version) - dataset <- synapseclient$Dataset(name = name, - parent = parent, - dataset_items = dataset_items) - if(dry_run) dataset else .syn$store(dataset) + dataset <- synapser::Dataset(name = name, + parent = parent, + dataset_items = dataset_items, + addAnnotationColumns = addAnnotationColumns) + if(dry_run) dataset else synapser::synStore(dataset) } @@ -176,7 +181,7 @@ new_dataset <- function(name, parent, items, item_version = NULL, dry_run = TRUE #' @param version_semantics Use "abs" for absolute latest version or "stable". Only used for collection entities. See details. 
latest_version <- function(id, version_semantics = c("abs", "stable")) { - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) version <- entity$properties$versionNumber if(entity$properties$concreteType %in% c("org.sagebionetworks.repo.model.table.Dataset", "org.sagebionetworks.repo.model.table.DatasetCollection") && version_semantics == "stable_latest") { @@ -188,152 +193,28 @@ latest_version <- function(id, version_semantics = c("abs", "stable")) { } -#' Create datasets for Sarek-called somatic or germline variants results -#' -#' Organize variant call files from Nextflow Sarek into 3-4 datasets, -#' grouping files by variant type and workflow with titles having the format: -#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". -#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. -#' This makes sense for NF because Germline calls can be treated differently. -#' This uses latest version of all files and creates a Draft version of the dataset. -#' -#' Since we basically just need the syn entity id, variant type, and workflow to group the files. -#' Instead of getting this info through running `map_*` as in the example, -#' you may prefer using a fileview, in which case you just need to download a table from a fileview -#' that has `id` => `output_id` + the `dataType` and `workflow` annotations. -#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_ -#' files are annotated, then you have to use `map_*`. -#' -#' Finally, datasets cannot use the same name if stored in the same project, -#' so if there are multiple batches, the names will have to be made unique by adding -#' the batch number, source data id, processing date, or whatever makes sense. -#' -#' @inheritParams new_dataset -#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives. -#' @param workflow One of workflows used. -#' @param verbose Optional, whether to be verbose -- defaults to TRUE. -#' @import data.table -#' @return A list of dataset objects. -#' @export -#' @examples -#'\dontrun{ -#' syn_out <- "syn26648589" -#' m <- map_sample_output_sarek(syn_out) -#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project -#'} -nf_sarek_datasets <- function(output_map, - parent, - workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), - verbose = TRUE, - dry_run = TRUE) { - - output_map <- as.data.table(output_map) - if(!is.null(output_map$dataType)) { - data_type <- unique(output_map$dataType) - if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.") - gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T) - if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.") - gvtype <- switch(gvtype, - SomaticVariants = "Somatic", - GermlineVariants = "Germline") - - } else { - # Detect genomic variants type from first path name - gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) { - "Somatic" - } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) { - "Germline" - } else { - stop("Could not assign either Germline or Somatic labels based on main output folder. 
- Check whether folder contains mixed types or is not the right one.") - } - } - pattern <- "vcf.gz(.tbi)?$" - workflow <- match.arg(workflow) - datasets <- list() - for(i in workflow) { - dataset <- output_map[workflow == i & grepl(pattern, output_name)] - if(nrow(dataset)) { - if(verbose) glue::glue("Creating {i} dataset with {nrow(dataset)} files") - name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline") - dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE) - if(dry_run) datasets[[i]] <- syn_dataset else datasets[[i]] <- .syn$store(syn_dataset) - } - } - - return(datasets) - -} - - -#' Create dataset for STAR-Salmon expression quantification results -#' -#' With a level-3 manifest that is created from `annotate_expression`, -#' calls `new_dataset` to make quantification files (.sf) into dataset. -#' Uses latest version of the files and creates a "Draft" dataset. -#' See `nf_sarek_datasets`. -#' -#' @inheritParams new_dataset -#' @inheritParams nf_sarek_datasets -#' @param manifest A table of annotated data manifest from `annotate_expression`. -#' @export -nf_star_salmon_datasets <- function(manifest, - parent, - dry_run = TRUE) { - - items <- manifest$entityId - new_dataset(name = "Gene Expression Quantification from RNA-seq", - parent = parent, - items = items, - dry_run = dry_run) -} - -#' Create dataset for CNVKit results -#' -#' Create dataset from all files in CNVKit output -#' -#' @inheritParams new_dataset -#' @param syn_out Output folder called 'cnvkit' -#' @export -nf_cnv_dataset <- function(syn_out, - parent, - dry_run = TRUE) { - - files <- walk(syn_out) - files <- unlist(files) - df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE)) - names(df) <- c("Filename", "id") - df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ] - items <- df$id - new_dataset(name = "Copy Number Variant - CNVkit", - parent = parent, - items = items, - dry_run = dry_run) -} - - # -- Checks------------- -------------------------------------------------------# # TODO Potentially move these type checks somewhere else like basic_utils # TODO Better composition to reduce code, esp. 
if more will be added #' Check whether entity is dataset -#' +#' #' @keywords internal is_dataset <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.table.Dataset" }, error = function(e) FALSE) } #' Check whether entity is dataset collection -#' +#' #' @keywords internal is_dataset_collection <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.table.DatasetCollection" }, error = function(e) FALSE) @@ -341,9 +222,9 @@ is_dataset_collection <- function(id) { #' Which collection type -#' +#' #' Checks for a valid collection type or returns error -#' +#' #' @keywords internal which_coll_type <- function(coll) { coll_type <- c("dataset", "dataset collection")[c(is_dataset(coll), is_dataset_collection(coll))] @@ -351,11 +232,11 @@ which_coll_type <- function(coll) { } #' Check whether entity is file -#' +#' #' @keywords internal is_file <- function(id) { tryCatch({ - entity <- .syn$get(id, downloadFile = FALSE) + entity <- synapser::synGet(id, downloadFile = FALSE) entity$properties$concreteType == "org.sagebionetworks.repo.model.FileEntity" }, error = function(e) FALSE) diff --git a/R/datasets_nf.R b/R/datasets_nf.R new file mode 100644 index 00000000..b4d00445 --- /dev/null +++ b/R/datasets_nf.R @@ -0,0 +1,122 @@ +#' Create datasets for Sarek-called somatic or germline variants results +#' +#' Organize variant call files from Nextflow Sarek into 3-4 datasets, +#' grouping files by variant type and workflow with titles having the format: +#' "{type} Genomic Variants - {workflow} Pipeline", e.g. "Somatic Genomic Variants - Strelka Pipeline". +#' As you can see, this assumes that you want to create datasets that segregate Somatic and Germline calls. +#' This makes sense for NF because Germline calls can be treated differently. +#' This uses latest version of all files and creates a Draft version of the dataset. +#' +#' Since we basically just need the syn entity id, variant type, and workflow to group the files. +#' Instead of getting this info through running `map_*` as in the example, +#' you may prefer using a fileview, in which case you just need to download a table from a fileview +#' that has `id` => `output_id` + the `dataType` and `workflow` annotations. +#' The fileview can be used _after_ the files are annotated. If you want to create datasets _before_ +#' files are annotated, then you have to use `map_*`. +#' +#' Finally, datasets cannot use the same name if stored in the same project, +#' so if there are multiple batches, the names will have to be made unique by adding +#' the batch number, source data id, processing date, or whatever makes sense. +#' +#' @inheritParams new_dataset +#' @param output_map The `data.table` returned from `map_sample_output_sarek`. See details for alternatives. +#' @param workflow One of workflows used. +#' @param verbose Optional, whether to be verbose -- defaults to TRUE. +#' @import data.table +#' @return A list of dataset objects. 
+#' @export +#' @examples +#'\dontrun{ +#' syn_out <- "syn26648589" +#' m <- map_sample_output_sarek(syn_out) +#' datasets <- nf_sarek_datasets(m, parent = "syn26462036", dry_run = F) # use a test project +#'} +nf_sarek_datasets <- function(output_map, + parent, + workflow = c("FreeBayes", "Mutect2", "Strelka", "DeepVariant"), + verbose = TRUE, + dry_run = TRUE) { + + output_map <- as.data.table(output_map) + if(!is.null(output_map$dataType)) { + data_type <- unique(output_map$dataType) + if(length(data_type) != 1) stop("Expecting one `dataType`, which does not appear to be the case.") + gvtype <- grep("(Germline|Somatic)Variants", data_type, value = T) + if(!length(gvtype)) stop("Data type does not look right, expecting either Germline or Somatic variants.") + gvtype <- switch(gvtype, + SomaticVariants = "Somatic", + GermlineVariants = "Germline") + + } else { + # Detect genomic variants type from first path name + gvtype <- if(grepl("SomaticVariantCalls", first(output_map$caller_path))) { + "Somatic" + } else if(grepl("GermlineVariantCalls", first(output_map$caller_path))) { + "Germline" + } else { + stop("Could not assign either Germline or Somatic labels based on main output folder. + Check whether folder contains mixed types or is not the right one.") + } + } + pattern <- "vcf.gz(.tbi)?$" + workflow <- match.arg(workflow) + datasets <- list() + for(i in workflow) { + dataset <- output_map[workflow == i & grepl(pattern, output_name)] + if(nrow(dataset)) { + if(verbose) message(glue::glue("Creating {i} dataset with {nrow(dataset)} files")) + name <- glue::glue("{gvtype} Genomic Variants - {i} Pipeline") + dataset <- new_dataset(name = name, parent = parent, items = dataset$output_id, dry_run = TRUE) + if(dry_run) datasets[[i]] <- dataset else datasets[[i]] <- synapser::synStore(dataset) + } + } + + return(datasets) + +} + + +#' Create dataset for STAR-Salmon expression quantification results +#' +#' With a level-3 manifest that is created from `annotate_expression`, +#' calls `new_dataset` to make quantification files (.sf) into dataset. +#' Uses latest version of the files and creates a "Draft" dataset. +#' See `nf_sarek_datasets`. +#' +#' @inheritParams new_dataset +#' @inheritParams nf_sarek_datasets +#' @param manifest A table of annotated data manifest from `annotate_expression`. 
+#' @export +nf_star_salmon_datasets <- function(manifest, + parent, + dry_run = TRUE) { + + items <- manifest$entityId + new_dataset(name = "Gene Expression Quantification from RNA-seq", + parent = parent, + items = items, + dry_run = dry_run) +} + +#' Create dataset for CNVKit results +#' +#' Create dataset from all files in CNVKit output +#' +#' @inheritParams new_dataset +#' @param syn_out Output folder called 'cnvkit' +#' @export +nf_cnv_dataset <- function(syn_out, + parent, + dry_run = TRUE) { + + files <- walk(syn_out) + files <- unlist(files) + df <- as.data.frame(matrix(files, ncol = 2, byrow = TRUE)) + names(df) <- c("Filename", "id") + df <- df[grepl("cnr$|cns$|cnn$|bed$|pdf$|png$", df$Filename), ] + items <- df$id + new_dataset(name = "Copy Number Variant - CNVkit", + parent = parent, + items = items, + dry_run = dry_run) +} \ No newline at end of file diff --git a/R/find.R b/R/find.R index 6cd91eb8..b3590e73 100644 --- a/R/find.R +++ b/R/find.R @@ -29,10 +29,10 @@ find_in <- function(scope, path) { #' @export find_child <- function(child_name, parent) { - q <- .syn$getChildren(parent) + q <- synapser::synGetChildren(parent) child_id <- NULL repeat { - x <- reticulate::iter_next(q) + x <- synapser::nextElem(q) if(is.null(x) || x$name == child_name) { child_id <- x$id break @@ -52,8 +52,8 @@ find_child <- function(child_name, parent) { #' @export find_child_type <- function(parent, child_type = list("file")) { - x <- .syn$getChildren(parent, includeTypes = child_type) - y <- reticulate::iterate(x) + x <- synapser::synGetChildren(parent, includeTypes = child_type) + y <- synapser::as.list(x) if(!length(y)) return() z <- setNames(sapply(y, `[[`, "id"), sapply(y, `[[`, "name")) return(z) diff --git a/R/schema_utils.R b/R/schema_utils.R index d28be750..30ad913e 100644 --- a/R/schema_utils.R +++ b/R/schema_utils.R @@ -2,45 +2,43 @@ # -- Synapse schema ------------------------------------------------------------# -#' Transform table data to target schema for Synapse storage -#' -#' **Currently implements list-schema features first and will do more later.** -#' Check and encode data values to expectations of Synapse target table schema for storage. -#' The target schema is more likely from an existing table, since new tables can take advantage of `build_table`. +#' Transform table data to target schema for Synapse storage +#' +#' **Currently implements list-schema features first and will do more later.** +#' Check and encode data values to expectations of Synapse target table schema for storage. +#' The target schema is more likely from an existing table, since new tables can take advantage of `build_table`. #' To get compatible list data, does JSON encoding and optionally `list_truncate` when running into length limits. -#' If truncation is not OK, then the incompatibility will have to be resolved by updating schema outside of this. +#' If truncation is not OK, then the incompatibility will have to be resolved by updating schema outside of this. #' Note that the setting applies to ALL list columns, though it would be desirable to be column-specific. -#' +#' #' @param df A table, i.e. `data.frame`. -#' @param schema Table [schema object](https://python-docs.synapse.org/build/html/Entity.html#synapseclient.table.Schema) or +#' @param schema Table [schema object](https://python-docs.synapse.org/build/html/Entity.html#synapseclient.table.Schema) or #' Synapse id of target table from which to get schema. 
#' @param list_truncate If length exceeds schema max for list columns, set `TRUE` to allow data truncation, `FALSE` to error only (default). #' @return Synapse Table object ready for storing. #' @export -as_table_schema <- function(df, - schema, +as_table_schema <- function(df, + schema, list_truncate = FALSE) { - - .check_login() + if("data.table" %in% class(df)) df <- as.data.frame(df) - if(!"synapseclient.table.Schema" %in% class(schema) && is_valid_syn_id(schema)) schema <- .syn$get(schema) - col_schema <- .syn$getTableColumns(schema) %>% reticulate::iterate() - + if(!"synapseclient.table.Schema" %in% class(schema) && is_valid_syn_id(schema)) schema <- synapser::synGet(schema) + col_schema <- synapser::synGetTableColumns(schema) %>% synapser::as.list() + # Basic checks of columns col_schema_names <- sapply(col_schema, `[[`, "name") if(length(col_schema_names) != length(df)) stop("Number of columns differs from schema.") tryCatch({ df <- df[col_schema_names] # enforce same order as schema while checking names }, error = function(e) stop("Column names don't match ones in schema.")) - + # https://docs.synapse.org/rest/org/sagebionetworks/repo/model/table/ColumnType.html col_type <- sapply(col_schema, `[[`, "columnType") for(i in seq_along(col_type)) { values <- df[[i]] if(grepl("STRING", col_type[i])) { maxsize <- col_schema[[i]]$maximumSize - if(anyNA(values)) stop("Please remove NA values from STRING column ", names(df)[i]) - size_fail <- sapply(values, function(x) any(nchar(x) > maxsize)) + size_fail <- sapply(values, function(x) any(sapply(x, function(s) if(is.na(s)) FALSE else nchar(s) > maxsize))) if(any(size_fail)) stop(paste("Characters in", names(df)[i], "exceeds max size of", maxsize)) } if(grepl("*_LIST", col_type[i])) { @@ -55,30 +53,30 @@ as_table_schema <- function(df, } } df[[i]] <- sapply(values, function(x) as.character(jsonlite::toJSON(unlist(x)))) # unlist in case x is derived from list - } + } } - table_data <- synapseclient$Table(schema, df) + table_data <- synapser::Table(schema, df) table_data } # -- Schematic (JSON-LD) schema ------------------------------------------------# #' Look up connected nodes by specified property in JSON-LD schema -#' +#' #' Use with schematic-generated JSON-LD schema: given `@id`, get connected nodes by specified prop (e.g. `sms:something`). -#' Intended to be a generic used to define more specific lookup utils. -#' Can do recursive lookup, though graph should be a tree/acyclic (!). +#' Intended to be a generic used to define more specific lookup utils. +#' Can do recursive lookup, though graph should be a tree/acyclic (!). #' (Useful for props such as `dependsOn`, doesn't make sense for props such as `rdfs:label`.) -#' -#' @param id Id (`@id`) for which to get range values; include prefix if needed. +#' +#' @param id Id (`@id`) for which to get range values; include prefix if needed. #' @param prop Property; include prefix if needed. #' @param schema Path (URL or local) to file from which schema will be read, or schema as list object. -#' @param return_labels Return labels (default), otherwise ids of connected nodes. +#' @param return_labels Return labels (default), otherwise ids of connected nodes. #' @param recursive Recursive lookup? #' @param result Vector of accumulated results; used for recursive lookup. #' @param rest Vector of remaining ids; used for recursive lookup. 
#' @export -get_by_prop_from_json_schema <- function(id, +get_by_prop_from_json_schema <- function(id, prop, schema = 'https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld', return_labels = TRUE, @@ -89,29 +87,29 @@ get_by_prop_from_json_schema <- function(id, schema <- jsonlite::read_json(schema) schema <- schema$`@graph` } - + matches <- Filter(function(x) x$`@id` == id, schema) if(!length(matches)) stop(glue::glue("Id `{id}` not found in schema!")) ids <- unlist(lapply(matches[[1]][[prop]], function(x) x$`@id`)) - + if(return_labels) { labels <- Filter(function(x) x$`@id` %in% ids, schema) %>% sapply(function(x) x$`sms:displayName`) %>% unlist() - result <- c(result, labels) + result <- c(result, labels) } else { result <- c(result, ids) } - + rest <- c(rest, ids) if(recursive && length(rest)) { id <- rest[1] rest <- rest[-1] - get_by_prop_from_json_schema(id, + get_by_prop_from_json_schema(id, prop, - schema, - return_labels, + schema, + return_labels, recursive, - result, + result, rest) } else { # result @@ -121,9 +119,9 @@ get_by_prop_from_json_schema <- function(id, #' Get dependencies for node in JSON-LD schema -#' +#' #' Shorthand for getting props defined in annotation template using `get_by_prop_from_json_schema` under the hood. -#' +#' #' @inheritParams get_by_prop_from_json_schema #' @export get_dependency_from_json_schema <- function(id, @@ -133,7 +131,7 @@ get_dependency_from_json_schema <- function(id, recursive = TRUE, result = NULL, rest = NULL) { - + get_by_prop_from_json_schema(id, prop, schema, return_labels, recursive, result, rest) - + } diff --git a/R/tutorial.R b/R/tutorial.R new file mode 100644 index 00000000..d234bbbb --- /dev/null +++ b/R/tutorial.R @@ -0,0 +1,16 @@ +# Functions especially useful for tutorials or testing + +#' Create n temp files +#' +#' Create some text files for upload +#' +#' @param n Integer number of files. +#' @return Paths to files in the temp directory. +mock_files <- function(n) { + paths <- vector(mode = "character", length = n) + for(i in 1:n) { + paths[i] <- filePath <- tempfile("data_", fileext = c(".txt")) + writeLines(text = sample(letters, 10), con = filePath) + } + paths +} \ No newline at end of file diff --git a/README.md b/README.md index 586697ab..ce89700a 100644 --- a/README.md +++ b/README.md @@ -8,20 +8,30 @@ The goal of `nfportalutils` is to provide convenience functions for project and (meta)data management in the NF-OSI data portal scope. Currently, `develop` branch is default so package install and docs refer to code in this branch. +> [!WARNING] +> For the last relatively stable version of `nfportalutils`, please install at https://github.com/nf-osi/nfportalutils/releases/tag/v0.9500-presynapser. +> +> Currently, the package is in a refactoring period where usage is complex because of the coexistence of both `synapser` and separate `synapseclient` import. +> This will be updated when everything is 100% refactored to `synapser`. + +> [!NOTE] +> Underlying dependencies tested for this package are `synapser==1.3.0` and `synapseclient==3.1.1`. +> There are known breaking issues for MacOS and Windows with the newer versions for now. + ## Docs :point_right: [Package documentation!](https://nf-osi.github.io/nfportalutils/) ## Installation -You can install `nfportalutils` from here: +You should first install `synapser` following the instructions [here](https://github.com/Sage-Bionetworks/synapser?tab=readme-ov-file#installation). 
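For reference, a typical `synapser` install from the Sage R archive (the repository the CI previously used) is sketched below; treat the linked instructions as the source of truth, and see the version note above if the newest release fails on macOS or Windows:

```r
# Sketch: install synapser from RAN plus a regular CRAN mirror for its dependencies
install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
```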
+Then you can install `nfportalutils` with: ``` r remotes::install_github("nf-osi/nfportalutils") ``` - ## Additional Notes for Users - View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html). @@ -30,7 +40,7 @@ remotes::install_github("nf-osi/nfportalutils") ## Additional Notes for Contributors ### Contrib workflow -- Branch of `develop` and make changes +- Branch off `develop` and make changes - Run `devtools::check(vignettes = FALSE)` early and often, and definitely before submitting a PR - Make a pull request to `develop`; this will run `R-CMD-CHECK` and `pkgdown` - Request a reviewer if both checks pass diff --git a/_pkgdown.yml b/_pkgdown.yml index 8fe504be..d9a66af3 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -151,11 +151,12 @@ reference: - as_table_schema - make_folder - add_to_scope + - copy_table - new_view - list_project_datasets - latest_version + - mock_files - walk - - copy - convert_to_stringlist - bare_syn_id - bad_url diff --git a/inst/extdata/pubs_example.csv b/inst/extdata/pubs_example.csv new file mode 100644 index 00000000..8e9d5b73 --- /dev/null +++ b/inst/extdata/pubs_example.csv @@ -0,0 +1,4 @@ +pmid,studyId,diseaseFocus,manifestation,comments +38383777,syn11672851,NA,,Drug-Target Explorer +38383780,syn4939902,Neufibromatosis type 1,MPNST,Johns Hopkins Biobank project +38375882,syn51133914|syn51133929,Neufibromatosis type 1,MPNST|Plexiform neurofibroma,DHART project 1and DHART project 2 produced collaborative paper diff --git a/man/add_publication_from_pubmed.Rd b/man/add_publication_from_pubmed.Rd index 18d51dae..f8b25a17 100644 --- a/man/add_publication_from_pubmed.Rd +++ b/man/add_publication_from_pubmed.Rd @@ -7,8 +7,8 @@ add_publication_from_pubmed( pmid, study_id, - disease_focus, - manifestation, + disease_focus = c(""), + manifestation = c(""), publication_table_id, study_table_id, dry_run = T @@ -19,9 +19,9 @@ add_publication_from_pubmed( \item{study_id}{Synapse id(s) of the study that are associated with the publication.} -\item{disease_focus}{The disease focus(s) that are associated with the publication.} +\item{disease_focus}{(Optional) The disease focus(s) that are associated with the publication.} -\item{manifestation}{The manifestation(s) that are associated with the publication.} +\item{manifestation}{(Optional) The manifestation(s) that are associated with the publication.} \item{publication_table_id}{Synapse id of the portal publication table. Must have write access.} diff --git a/man/copy.Rd b/man/copy.Rd deleted file mode 100644 index 99c80a08..00000000 --- a/man/copy.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/basic_utils.R -\name{copy} -\alias{copy} -\title{Create copy of entity} -\usage{ -copy( - entity, - destination_id, - skip_copy_wiki_page = FALSE, - skip_copy_annotations = FALSE -) -} -\arguments{ -\item{entity}{Entity to copy.} - -\item{destination_id}{Id of destination project/container that entity will be copied to.} - -\item{skip_copy_wiki_page}{Whether to skip copying wiki; defaults FALSE.} - -\item{skip_copy_annotations}{Whether to skip copying annotations; defaults FALSE.} -} -\description{ -Create a copy of syn entity; mostly used to create a copy on which to test out changes. 
-See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy -} -\keyword{internal} diff --git a/man/copy_table.Rd b/man/copy_table.Rd new file mode 100644 index 00000000..a916cc22 --- /dev/null +++ b/man/copy_table.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/basic_utils.R +\name{copy_table} +\alias{copy_table} +\title{Copy a table} +\usage{ +copy_table(table_id, destination_id) +} +\arguments{ +\item{table_id}{Id of table to copy.} + +\item{destination_id}{Parent project id for the copy.} +} +\description{ +Copy a table. One of the most common use cases is testing, to avoid modifying a "production" table. +} diff --git a/man/manifest_generate.Rd b/man/manifest_generate.Rd index b4791b87..4d3f075f 100644 --- a/man/manifest_generate.Rd +++ b/man/manifest_generate.Rd @@ -13,7 +13,8 @@ manifest_generate( asset_view = "syn16858331", output_format = "excel", use_annotations = TRUE, - service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate" + service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate", + access_token = Sys.getenv("SYNAPSE_AUTH_TOKEN") ) } \arguments{ @@ -32,6 +33,8 @@ manifest_generate( \item{use_annotations}{Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF.} \item{service}{Service endpoint to use. Defaults to the schematic production endpoint.} + +\item{access_token}{Synapse auth token, defaults to \code{SYNAPSE_AUTH_TOKEN} set in env.} } \value{ For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data. diff --git a/man/mock_files.Rd b/man/mock_files.Rd new file mode 100644 index 00000000..2b68a4f8 --- /dev/null +++ b/man/mock_files.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tutorial.R +\name{mock_files} +\alias{mock_files} +\title{Create n temp files} +\usage{ +mock_files(n) +} +\arguments{ +\item{n}{Integer number of files.} +} +\value{ +Paths to files in the temp directory. +} +\description{ +Create some text files for upload +} diff --git a/man/new_dataset.Rd b/man/new_dataset.Rd index 7023a7d8..9312345d 100644 --- a/man/new_dataset.Rd +++ b/man/new_dataset.Rd @@ -4,7 +4,14 @@ \alias{new_dataset} \title{Create new dataset with given items} \usage{ -new_dataset(name, parent, items, item_version = NULL, dry_run = TRUE) +new_dataset( + name, + parent, + items, + item_version = NULL, + addAnnotationColumns = FALSE, + dry_run = TRUE +) } \arguments{ \item{name}{Name of the dataset. It should be unique within the \code{parent} project.} @@ -17,8 +24,11 @@ Usually the same parent project storing the files, but in some cases it may be a \item{item_version}{Integer for version that will be used for all items, e.g. 1. Otherwise, "latest" or "stable_latest". See details.} +\item{addAnnotationColumns}{Whether to add annotation columns, default \code{FALSE}.} + \item{dry_run}{If TRUE, don't actually store dataset, just return the data object for inspection or further modification.} } \description{ -Create new dataset with given items +Offers somewhat more convenient interface than the base \code{dataset} constructor: +needs only item ids and creates structure needed + uses the LATEST version for items by default. 
} diff --git a/man/nf_cnv_dataset.Rd b/man/nf_cnv_dataset.Rd index 880df7d0..327ae332 100644 --- a/man/nf_cnv_dataset.Rd +++ b/man/nf_cnv_dataset.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_cnv_dataset} \alias{nf_cnv_dataset} \title{Create dataset for CNVKit results} diff --git a/man/nf_sarek_datasets.Rd b/man/nf_sarek_datasets.Rd index 009d3030..c57bf053 100644 --- a/man/nf_sarek_datasets.Rd +++ b/man/nf_sarek_datasets.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_sarek_datasets} \alias{nf_sarek_datasets} \title{Create datasets for Sarek-called somatic or germline variants results} diff --git a/man/nf_star_salmon_datasets.Rd b/man/nf_star_salmon_datasets.Rd index ca4a91c0..a1b38bfd 100644 --- a/man/nf_star_salmon_datasets.Rd +++ b/man/nf_star_salmon_datasets.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/datasets.R +% Please edit documentation in R/datasets_nf.R \name{nf_star_salmon_datasets} \alias{nf_star_salmon_datasets} \title{Create dataset for STAR-Salmon expression quantification results} diff --git a/tests/testthat/test-add_pubmed_publications.R b/tests/testthat/test-add_pubmed_publications.R deleted file mode 100644 index 8849056e..00000000 --- a/tests/testthat/test-add_pubmed_publications.R +++ /dev/null @@ -1,3 +0,0 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) diff --git a/tests/testthat/test_auth_login.R b/tests/testthat/test_auth_login.R index 525e95ab..b3d2d86f 100644 --- a/tests/testthat/test_auth_login.R +++ b/tests/testthat/test_auth_login.R @@ -1,7 +1,16 @@ -test_that("Implicit login SYNAPSE_AUTH_TOKEN works", { +# REMOVE this once every functionality has switched to second login method below +test_that("(Legacy) Implicit login SYNAPSE_AUTH_TOKEN works", { skip_if_no_synapseclient() skip_if_no_token() withr::local_envvar(SYNAPSE_AUTH_TOKEN = Sys.getenv("TEST_SYNAPSE_AUTH_TOKEN")) # Testing for .syn in the global environment expect_is(syn_login(), "synapseclient.client.Synapse") -}) \ No newline at end of file +}) + + +test_that("(synapser) Implicit authtoken login SYNAPSE_AUTH_TOKEN works", { + skip_if_no_synapseclient() + skip_if_no_token() + withr::local_envvar(SYNAPSE_AUTH_TOKEN = Sys.getenv("TEST_SYNAPSE_AUTH_TOKEN")) + testthat::expect_equal(synapser::synLogin(), NULL) +}) diff --git a/tests/testthat/test_dataset_utils.R b/tests/testthat/test_dataset_utils.R index 214e0f66..8715fd17 100644 --- a/tests/testthat/test_dataset_utils.R +++ b/tests/testthat/test_dataset_utils.R @@ -1,6 +1,8 @@ -# Create a basic draft dataset from some files at version 1; all files have a latest version 2 +# Create a basic draft dataset from some files at version 1; all files have a latest version 2 # Returns dataset id only create_dataset_fixture <- function(instance = 1) { + + skip_if_no_login() NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -12,9 +14,9 @@ create_dataset_fixture <- function(instance = 1) { test_that("Creating dataset with `new_dataset` works as expected when given valid parameters, defaulting to current item versions", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" # Note that files are all version 2 on Synapse items <- c("syn51239179", @@ -24,16 +26,16 @@ test_that("Creating 
dataset with `new_dataset` works as expected when given vali expected_items_in_dataset <- list( list(entityId = "syn51239179", versionNumber = 2L), list(entityId = "syn51239178", versionNumber = 2L), - list(entityId = "syn51239177", versionNumber = 2L)) + list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - .syn$delete(dataset) + synapser::synDelete(dataset) }) test_that("Creating dataset with `new_dataset` works as expected when given valid parameters and a specific item version is specified", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -44,7 +46,7 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali list(entityId = "syn51239178", versionNumber = 1L), list(entityId = "syn51239177", versionNumber = 1L)) testthat::expect_equal(expected_items_in_dataset, dataset$properties$datasetItems) - .syn$delete(dataset) + synapser::synDelete(dataset) }) @@ -54,9 +56,9 @@ test_that("Creating dataset with `new_dataset` works as expected when given vali # Currently, only files can be included in a dataset. syn27242487 is 'org.sagebionetworks.repo.model.table.TableEntity' # ``` test_that("Creating dataset with `new_dataset` will fail when trying to include a non-valid item (a table)", { - - skip_if_no_synapseclient() - skip_if_no_token() + + skip_if_no_login() + NF_test <- "syn26462036" items <- c("syn51239179", "syn51239178", @@ -66,10 +68,9 @@ test_that("Creating dataset with `new_dataset` will fail when trying to include test_that("Updating a dataset to make a subset of files reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_id <- create_dataset_fixture() items_to_update <- c("syn51239178", "syn51239177") # both should be updated to Version 2 updated <- use_latest_in_collection(collection_id = dataset_id, items = items_to_update) @@ -78,15 +79,14 @@ test_that("Updating a dataset to make a subset of files reference the latest ver list(entityId = "syn51239178", versionNumber = 2L), list(entityId = "syn51239177", versionNumber = 2L)) testthat::expect_identical(updated$items, expected_updated_items) - .syn$delete(dataset_id) + synapser::synDelete(dataset_id) }) test_that("Updating a dataset to make _all_ files reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_id <- create_dataset_fixture() expected_updated_items <- list( list(entityId = "syn51239179", versionNumber = 2L), @@ -94,17 +94,16 @@ test_that("Updating a dataset to make _all_ files reference the latest version w list(entityId = "syn51239177", versionNumber = 2L)) updated <- use_latest_in_collection(collection_id = dataset_id, items = "all") testthat::expect_identical(updated$items, expected_updated_items) - .syn$delete(dataset_id) + synapser::synDelete(dataset_id) }) # Dataset collections ---------------------------------------------------------# test_that("Updating a dataset collection to make a subset of datasets reference the latest version works", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_collection_id <- "syn51809938" dataset_item_to_update <- "syn51809898" .syn$create_snapshot_version(dataset_item_to_update) @@ -117,9 +116,8 @@ test_that("Updating a dataset collection to make a subset of datasets reference test_that("Adding new 
dataset to dataset collection works", { - skip_if_no_synapseclient() - skip_if_no_token() - + skip_if_no_login() + dataset_collection_id <- "syn51809938" coll_state <- coll <- .syn$restGET(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}")) one_more_item <- create_dataset_fixture() @@ -127,17 +125,16 @@ test_that("Adding new dataset to dataset collection works", { testthat::expect_equal(length(new_coll_state$items), length(coll_state$items) + 1L) # cleanup: set collection to previous items state new_coll_state$items <- coll_state$items - .syn$restPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) + synapser::synRestPUT(glue::glue("https://repo-prod.prod.sagebase.org/repo/v1/entity/{dataset_collection_id}"), body = jsonlite::toJSON(new_coll_state, auto_unbox = TRUE)) # delete dataset - .syn$delete(one_more_item) + synapser::synDelete(one_more_item) }) test_that("Adding non-datasets to dataset collection gives expected handling and warning", { - - skip_if_no_synapseclient() - skip_if_no_token() - + + skip_if_no_login() + dataset_collection_id <- "syn51809938" bad_items <- "syn51106349" # a folder testthat::expect_warning(add_to_collection(collection_id = dataset_collection_id, items = bad_items, check_items = TRUE), diff --git a/vignettes/access-utilities.Rmd b/vignettes/access-utilities.Rmd new file mode 100644 index 00000000..c21ea224 --- /dev/null +++ b/vignettes/access-utilities.Rmd @@ -0,0 +1,95 @@ +--- +title: "Access utilities" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Surveying public files in the portal} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE } +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Intro + +This explains several access-related utilities and use cases where they're helpful. + +## Set up + +```{r setup, eval=F} +library(nfportalutils) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) +``` + +## Give selected access to an individual or team + +Sometimes a selected set of private (embargoed) data needs to made available to a collaborator or other researcher outside the project. +One way to go about this is to identify the file ids, add the individual/team to the (this creates local sharing settings), and then collect those files into a dataset so a single dataset link can be shared (this makes it easier especially if files are spread across multiple folders). + +The main convenience util to do this is `grant_specific_file_access`. +To test it out, we create a data folder, and then create and upload mock files. + +```{r, eval=F} + +project_id <- "syn26462036" # this the NF-dev-playground project, replace with your own dev project if needed +folder <- synapser::Folder(name = paste("Demo Dataset -", Sys.Date()), parent = project_id) +folder <- synapser::synStore(folder) +folder_id <- folder$properties$id + +# create some temp files with same mock data +file_paths <- mock_files(3) +syn_files <- list() +for(path in file_paths) { + file <- synapser::File(path = path, parent = folder_id) + sf <- synapser::synStore(file) + syn_files <- c(syn_files, sf) +} + +file_ids <- sapply(syn_files, function(x) x$properties$id) +``` + + +Now let's use the function to share with NF-OSI Team as the collaborator (replace example with appropriate project id that you own). 
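+Before calling the util, it can help to confirm what a numeric principal id actually refers to. The lookups below are only a minimal sketch using standard `synapser` helpers; the team id and username shown are illustrative examples, not required values.

```{r, eval=F}
# Sanity check the principal before sharing -- for a team id:
synapser::synGetTeam("3342573")
# For an individual collaborator, look up by username or ownerId instead:
# synapser::synGetUserProfile("some-username")
```

With the principal confirmed, grant access and collect the files into a dataset: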
+ +```{r, eval=F} + +outside_collaborator <- "3342573" # use another appropriate example id if needed +grant_specific_file_access(principal_id = outside_collaborator, + entity_ids = file_ids, + create_dataset = T, + project_id = project_id, + dataset_name = NULL) # optional + +``` + +Clean up by removing the mock folder and files. You can delete the dataset through the UI. +```{r, eval=F} +synapser::synDelete(folder_id) +``` + +## Survey files downloadable for Synapse registered users + +There's often reference to "public" files, which usually means files that are viewable + downloadable to Synapse users. +If we just have a fileview with ids of the files, how do we know which ones are "public"? +The group of Synapse users has id `273948`, and we can use a util called `summarize_file_access`, passing in this group id, the permissions we're checking, and the fileview id. + +```{r query-1, eval=F} +public_access <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331") +public_access +``` + +The results should show that `summarize_file_access`'s first step is identifying all the unique benefactors (the parent container that sets the permissions on the child files) and the current permissions, then compiles a summary number of files under that benefactor. + +For an even more summarized breakdown as proportions: +```{r summarize-1, eval=F} +public_access[, .(n_files = sum(N)), by = access][, .(access, n_files, proportion = n_files / sum(n_files))] +``` + +### Some nuances + +Always use the most current fileview because of the benefactor lookup; inaccurate results might be returned otherwise. + diff --git a/vignettes/package-installation.Rmd b/vignettes/package-installation.Rmd new file mode 100644 index 00000000..b674dbe3 --- /dev/null +++ b/vignettes/package-installation.Rmd @@ -0,0 +1,165 @@ +--- +title: "Package Installation" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{package-installation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Installation {.tabset} + +THIS VERSION: +This is the recommended installation for the [current release](https://github.com/nf-osi/nfportalutils/releases/tag/v0.9500-presynapser) of `nfportalutils` **that does not depend on `synapser`**. + +There *will* be a **newer version of `nfportalutils` with `synapser`** (i.e. see [this issue](https://github.com/nf-osi/nfportalutils/issues/94) on the roadmap), and instructions will be accordingly revised when the refactor is completed. + +Make sure you are in a new R session (in RStudio, go to `Session > Restart R` if necessary). +The trickiest part of the installation and package usage, for most R users, will be the Python interop. + +### Windows + +#### Outside R + +- Install a supported `python` version (recommend Python 3.11) by downloading installation binary at https://www.python.org/downloads/windows/. +- Install `synapseclient` following instructions at https://pypi.org/project/synapseclient/4.0.0/. + +#### With R +- Install `remotes` to help install other packages: +`install.packages("remotes")`. + +- Check that you have the correct `reticulate` version: +`packageVersion("reticulate")`. +If the result is anything other than `1.28`, then go to the `reticulate` install step, otherwise skip to `nfportalutils` install. 
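Either way, before moving on it can save debugging time to confirm that `reticulate` will bind to the Python installation where `synapseclient` lives. This is only a sanity-check sketch; the path in the last comment is an example, not a required location.

```
reticulate::py_config()                           # shows which Python reticulate will use
reticulate::py_module_available("synapseclient")  # should be TRUE
# If the wrong Python is picked up, point reticulate at the right one before loading it, e.g.:
# Sys.setenv(RETICULATE_PYTHON = "C:/Path/To/Python311/python.exe")
```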
+ +- Install `reticulate`: +``` +> remotes::install_github("rstudio/reticulate@v1.28") +Downloading GitHub repo rstudio/reticulate@v1.28 +Running `R CMD build`... +* checking for file 'C:\Users\Erik\AppData\Local\Temp\Rtmporbql0\remotes322031ec7c50\rstudio-reticulate-3de77d1/DESCRIPTION' ... OK +* preparing 'reticulate': +* checking DESCRIPTION meta-information ... OK +* cleaning src +* checking for LF line-endings in source and make files and shell scripts +* checking for empty or unneeded directories +* building 'reticulate_1.28.tar.gz' +Installing package into β€˜C:/Users/Erik/AppData/Local/R/win-library/4.3’ +(as β€˜lib’ is unspecified) +* installing *source* package 'reticulate' ... +** using staged installation +** libs +using C++ compiler: 'G__~1.EXE (GCC) 12.3.0' +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c RcppExports.cpp -o RcppExports.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c event_loop.cpp -o event_loop.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c libpython.cpp -o libpython.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c output.cpp -o output.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c python.cpp -o python.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c readline.cpp -o readline.o +g++ -std=gnu++17 -I"C:/PROGRA~1/R/R-43~1.2/include" -DNDEBUG -I'C:/Users/Erik/AppData/Local/R/win-library/4.3/Rcpp/include' -I"C:/rtools43/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c signals.cpp -o signals.o +g++ -std=gnu++17 -shared -s -static-libgcc -o reticulate.dll tmp.def RcppExports.o event_loop.o libpython.o output.o python.o readline.o signals.o -LC:/rtools43/x86_64-w64-mingw32.static.posix/lib/x64 -LC:/rtools43/x86_64-w64-mingw32.static.posix/lib -LC:/PROGRA~1/R/R-43~1.2/bin/x64 -lR +installing to C:/Users/Erik/AppData/Local/R/win-library/4.3/00LOCK-reticulate/00new/reticulate/libs/x64 +** R +** inst +** byte-compile and prepare package for lazy loading +** help +*** installing help indices +*** copying figures +** building package indices +** installing vignettes +** testing if installed package can be loaded from temporary location +** testing if installed package can be loaded from final location +** testing if installed package keeps a record of temporary installation path +* DONE (reticulate) +``` + +- Install `nfportalutils` (choose option `3 - None`): +``` +> remotes::install_github("nf-osi/nfportalutils@v0.9500-presynapser") +Downloading GitHub repo 
nf-osi/nfportalutils@v0.9500-presynapser +These packages have more recent versions available. +It is recommended to update all of them. +Which would you like to update? + +1: All +2: CRAN packages only +3: None +4: reticulate (1.28 -> 1.35.0) [CRAN] +``` + +``` +Running `R CMD build`... +* checking for file 'C:\Users\Erik\AppData\Local\Temp\RtmpQzT2JT\remotes158c6d2c7833\nf-osi-nfportalutils-61636cd/DESCRIPTION' ... OK +* preparing 'nfportalutils': +* checking DESCRIPTION meta-information ... OK +* checking for LF line-endings in source and make files and shell scripts +* checking for empty or unneeded directories +Omitted 'LazyData' from DESCRIPTION +* building 'nfportalutils_0.9500.tar.gz' +Installing package into β€˜C:/Users/Erik/AppData/Local/R/win-library/4.3’ +(as β€˜lib’ is unspecified) +* installing *source* package 'nfportalutils' ... +** using staged installation +** R +** inst +** byte-compile and prepare package for lazy loading +** help +*** installing help indices +** building package indices +** installing vignettes +** testing if installed package can be loaded from temporary location +** testing if installed package can be loaded from final location +** testing if installed package keeps a record of temporary installation path +* DONE (nfportalutils) +``` + + +#### Post-installation + +It is recommended to set your Synapse authentication token as an environment variable that R can access. +You can create an `.Renviron` in your home. For example, create 'C:/Users/Erik/Documents/.Renviron'. + +In the file, add +``` +SYNAPSE_AUTH_TOKEN=xxx_your_token_here_xxx +``` + +### MacOS + +#### Outside R + +- If Python is not already installed, obtain Python3 via `brew` following suggested method here: https://docs.python-guide.org/starting/install3/osx/#doing-it-right. +- Install `synapseclient` following instructions at https://pypi.org/project/synapseclient/4.0.0/. + +### Linux (Ubuntu) + +#### Outside R +- Python is already included, so just need https://pypi.org/project/synapseclient/4.0.0/. + +#### With R + + + +## Test Your Installation + +Restart R after adding your token. +Then to test that your installation is working, load package and try to log in. +If you have your authentication token set in the above recommended steps, you should be greeted accordingly. +Without a token, you are logged in as "Anonymous" and functionality will be limited. + +### Login + +``` +library(nfportalutils) +syn_login() +``` + + + diff --git a/vignettes/portal-tables-utils.Rmd b/vignettes/portal-tables-utils.Rmd new file mode 100644 index 00000000..00306304 --- /dev/null +++ b/vignettes/portal-tables-utils.Rmd @@ -0,0 +1,184 @@ +--- +title: "Portal tables utils" +output: + rmarkdown::html_vignette: + code_folding: show +vignette: > + %\VignetteIndexEntry{Portal tables} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, eval=F} +library(nfportalutils) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) +``` + +## NF Portal Tables Overview + + + + +These tables behind the NF Data Portal have grown over time and need maintenance like any other infrastructure. +This walks through the original use cases for these utils to accomplish tasks such as: + +- Add new publications to Portal - Publications +- Cleaning or migrating data, e.g. 
correcting an annotation "rnaSeq" to "RNA-seq" within Portal - Files +- Add new people to Portal - People +- Register a new study to Portal - Studies ** +- For each study in Portal - Studies, fill in related studies ** + +Some tasks marked ** are no longer done manually much of the time (having been automated as part of larger workflows), so they'll be covered more briefly. +When requirements change, having a sense of what these utils do can help understand where, what, and how to update. + +### How this will work + +We'll make copies of the relevant portal tables for update-type operations. +What this also provides is an example workflow to follow for contributors who need to develop these utils and do testing. + +**First set your private development project id, which will be the parent project to host the table copies.** +```{r, eval=F} +project_id <- "syn26462036" # the NF-dev-playground project +``` + +### Portal - Publications updates + +There are actually two options for adding publications: `add_publication_from_pubmed` and `add_publication_from_unpaywall`. +The `pubmed` option is the default, and `unpaywall` is an additional option if there is no `pmid`. + +Start with creating a table copy to work with. +```{r, eval=F} +PUBS_COPY <- copy_table("syn16857542", destination_id = project_id) +``` + +#### Adding 1-2 publications at a time + +The minimum information needed is `pmid` (the new pub to add) and `study_id` (the linked study). +This can use `add_publication_from_pubmed`, which pulls in author, journal, etc. from PubMed. +What might need further explanation is the involvement of `study_table_id` -- +this needs to be a table where `studyId`, `studyName`, `fundingAgency` can be looked up to help fill `fundingAgency` with consistency. + +Since this is a demo, the papers are not actually related or accurately classified at all. +Commands show adding papers with and without additional `disease_focus` and `manifestation` labels (which were once manually derived). + +```{r, eval=F} + +STUDY_TABLE <- "syn52694652" # we will READ ONLY from this table +nfportalutils::add_publication_from_pubmed(pmid = 38383787, + study_id = "syn11672851", + disease_focus = "Neurofibromatosis type 1", + manifestation = c("MPNST"), + publication_table_id = PUBS_COPY, + study_table_id = STUDY_TABLE, + dry_run = F) + + +nfportalutils::add_publication_from_pubmed(pmid = 38383777, + study_id = "syn11672851", + publication_table_id = PUBS_COPY, + study_table_id = STUDY_TABLE, + dry_run = F) +``` + + +#### Adding publications in large batch, from a spreadsheet + +Large batches are often put in a spreadsheet and should instead use `add_publications_from_file`. +The spreadsheet needs to have [`studyId`, `pmid`, `diseaseFocus`, `manifestation`] columns filled out. +Other columns will be ignored. + +An example of this format comes with the package and is shown below. +```{r, eval=T} + +example_csv <- system.file("extdata", "pubs_example.csv", package = "nfportalutils") +new_pubs <- read.csv(example_csv) +knitr::kable(new_pubs) +``` + +Several things to note: + +- Rarely, there may be multiple studies associated with one publication, so they need to be listed with a "|" (pipe) separator. +- Indicating nulls is more nuanced for the spreadsheet version due to differences for STRING vs STRING_LIST -- for diseaseFocus, use "NA" while for manifestation can leave blank. 
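+To see concretely how those multi-valued cells will be interpreted, here is a quick base-R preview of the example sheet (just a sketch; the `"|"` separator mirrors the `list_sep` argument used in the call right after):

```{r, eval=F}
# Preview how pipe-separated values split into lists
strsplit(new_pubs$studyId, "|", fixed = TRUE)
strsplit(new_pubs$manifestation, "|", fixed = TRUE)
```

Then run the batch add: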
+ + +```{r, eval=F} + +add_publications_from_file( + file = example_csv, + publication_table_id = PUBS_COPY, + study_table_id = STUDY_TABLE, + list_sep = "|", + dry_run = FALSE +) + +``` + +Check the new additions in the UI. + +To conclude this part of the vignette, clean up the table copy. +```{r, eval=F} +synapser::synDelete(PUBS_COPY) +``` + + +For become even more erudite, review the source code or try some experiments regarding these concerns: + +1. What happens with trying to add a pmid that already exists in the table? +2. What happens when the pmid is incorrect due to typo? + + +Toggle the code block below to show expected results. +```{r, class.source = 'fold-hide'} +#1. A publication that already exists should be skipped with a message saying so. +#2. It fails. +``` + +### Portal - Files corrections + +TO DO. + +### Portal - People updates + +Create the table copy. +```{r, eval=F} +PEOPLE_COPY <- copy_table("syn16857542", destination_id = project_id) +``` + +This relatively simple util finds new people that have made contributions and adds them to the people table. +```{r, eval=F} + +add_people_from_table(people_table_id = PEOPLE_COPY, + people_column = "ownerId", + source_table_id = "syn16858331", # READ ONLY from the source table, which is Portal - Files + source_column = "createdBy", + dry_run = F) +``` + + +### Portal - Studies updates + +#### Register new study + +TO DO. + +#### Augment with 'related studies' + +TO DO. + + +Here are some other things to get deeper via the source code and/or docs: + +1. Why is there both `n_k` and and `n_clust`. Why is this used with `n_k` instead of `n_clust`? + +Toggle the code block below to show asnwers. +```{r, class.source = 'fold-hide'} +# That's just because `n_clust` generates results as clusters with highly variable numbers of member studies, i.e. there could be 20 studies around this one mainstream topic vs 2-3 in this more arcane topic. The table breaks when list length exceeds a certain limit. Historically, clusters have been used and a max of four studies selected as a workaround. But using `n_k` can give better related results with more control. + +``` diff --git a/vignettes/revalidation-workflows.Rmd b/vignettes/revalidation-workflows.Rmd index aa3096c5..750a9a3f 100644 --- a/vignettes/revalidation-workflows.Rmd +++ b/vignettes/revalidation-workflows.Rmd @@ -17,7 +17,7 @@ First set up as usual. ```r library(nfportalutils) -syn_login(Sys.getenv("SYNAPSE_AUTH_TOKEN")) +synapser::synLogin(authToken = Sys.getenv("SYNAPSE_AUTH_TOKEN")) ``` ## Basics with Schematic API service diff --git a/vignettes/survey-public-files.Rmd b/vignettes/survey-public-files.Rmd deleted file mode 100644 index a3041013..00000000 --- a/vignettes/survey-public-files.Rmd +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Surveying public files in the portal" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Surveying public files in the portal} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -## Intro - -This quick makes use of some functions to survey files in the portal and their access. - -## Set up - -The usual setup: -```{r setup, eval=F} -library(nfportalutils) -syn_login() -``` - -## Files downloadable for Synapse registered users - -When talking about "public" files, this usually means files that are viewable and downloadable to Synapse users. 
-This group has id `273948`, so we use in the query below: - -```{r query-1, eval=F} -public_access <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331") -public_access -``` - -Breakdown as absolute number and as proportions: -```{r summarize-1, eval=F} -public_access[, .(n_files = sum(N)), by = access][, .(access, n_files, proportion = n_files / sum(n_files))] -``` - -## Some Nuances - -While it would be nice to see the file access restrictions at different points in time, note that the underlying API only returns access control info at present. -A file may have inherited from a benefactor at an earlier point, but then becomes its own benefactor later (i.e. more granular access control), -so queries based on a past state will likely not work. -Don't try something like: -```{r query-bad, eval=F} -public_access_q3_2022 <- summarize_file_access(principal_id = 273948, "DOWNLOAD", "syn16858331.47") -``` \ No newline at end of file