Feat/view fix (#134)

* Add view adaptation utils (wip) * Ignore failing test that needs to be rewritten * Reduce code duplication * Implement budgeting (wip) * Export and small fixes * Update pkgdown index
nf-osi · Oct 12, 2023 · 09ba648 · 09ba648
1 parent f4867cd
commit 09ba648
Show file tree

Hide file tree

Showing 12 changed files with 383 additions and 66 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,6 +17,7 @@ export(add_publication_from_pubmed)
 export(add_publication_from_unpaywall)
 export(add_publications_from_file)
 export(add_to_collection)
+export(adjust_view)
 export(annotate_aligned_reads)
 export(annotate_called_variants)
 export(annotate_cnv)
@@ -26,6 +27,7 @@ export(annotate_with_tool_stats)
 export(as_table_schema)
 export(assign_study_data_types)
 export(bad_url)
+export(byte_budget)
 export(calculate_related_studies)
 export(cbp_add_cna)
 export(cbp_add_expression)
@@ -75,6 +77,7 @@ export(register_study_files)
 export(remove_button)
 export(remove_wiki_subpage)
 export(summarize_file_access)
+export(swap_col)
 export(syn_login)
 export(table_query)
 export(update_study_annotations)

diff --git a/R/view_fix.R b/R/view_fix.R
@@ -0,0 +1,215 @@
+#' Adjust view
+#'
+#' When a view schema and data are mismatched, the view cannot be built and therefore cannot be queried.
+#' The most common causes (likely accounting for 98%+ of instances combined) are the max size/length issues.
+#' This iteratively updates the schema based exactly on whatever the server says to do* for the
+#' sizing and list length issues, until the view is functional and querying works.
+#' If the issue is not one of these, this stops with an error because handlers for other problems are currently not implemented.
+#'
+#' *Note: Fixes are applied iteratively because that's how the server currently surfaces repair recommendations.
+#'
+#' @param view Synapse view id.
+#' @param max_tries Max number of repair attempts. Vast majority of views should only have accumulated 1-2 bad data mutations, so a default of 5 should be reasonable.
+#' @param check_byte_budget Check if an adjustment will lead to exceeding the table budget before applying it.
+#' @return `NULL`, invisibly. Called for its side effects on the view schema.
+#' @export
+adjust_view <- function(view, max_tries = 5L, check_byte_budget = TRUE) {
+
+  stopifnot(is_valid_syn_id(view))
+
+  # counts repairs actually applied, so `max_tries = 1` still allows one repair
+  attempts <- 0L
+
+  repeat {
+
+    # probe the view with a minimal query; the error object (if any) is the diagnostic
+    res <- tryCatch(.syn$tableQuery(glue::glue("SELECT * FROM {view} LIMIT 1")), error = function(e) e)
+    need_fix <- inherits(res, "synapseclient.core.exceptions.SynapseHTTPError")
+    if(!need_fix) {
+      message("No applicable error found with view.")
+      break
+    }
+
+    # checked after the probe so that the last repair is always verified by a query
+    if(attempts >= max_tries) {
+      warning("View ", view, " still cannot be queried after ", attempts, " repair attempt(s).", call. = FALSE)
+      break
+    }
+
+    # the repair hint is read from the `reason` field of the JSON response body
+    msg <- tryCatch(jsonlite::fromJSON(res$response$text), error = function(e) NULL)
+    reason <- if(is.list(msg)) msg$reason else NULL
+    if(!length(reason)) stop("Server returned response that can't be handled: ", res$response$text, call. = FALSE)
+    reason <- as.character(reason)[1]
+
+    handled <- FALSE
+
+    if(grepl("too small", reason, fixed = TRUE)) {
+      view <- adjust_string_size(view, hint = reason, check_byte_budget = check_byte_budget)
+      attempts <- attempts + 1L
+      handled <- TRUE
+    }
+
+    if(grepl("maximumListLength", reason, fixed = TRUE)) {
+      view <- adjust_list_length(view, hint = reason, check_byte_budget = check_byte_budget)
+      attempts <- attempts + 1L
+      handled <- TRUE
+    }
+
+    # without this the loop would re-query the server forever on an unrecognized reason
+    if(!handled) stop("No handler implemented for server response: ", reason, call. = FALSE)
+  }
+
+  invisible(NULL)
+}
+
+
+#' Adjust schema max size based on hint
+#'
+#' Parses the column name (text between the first pair of single quotes) and the
+#' recommended size (digits preceding " characters") out of the hint, creates a
+#' copy of the matching column with the new `maximumSize`, and swaps it into the schema.
+#'
+#' Note: For STRING cols, the hard limit is 1000 char size,
+#' though at 250 using LARGETEXT is officially recommended, so possibly at that breakpoint
+#' this should just create the different column type instead of increasing size.
+#'
+#' @param view Synapse view id.
+#' @param hint Server message containing the column name and recommended size.
+#' @param check_byte_budget Check that the resized column fits the table width budget before creating it.
+#' @return Id of the stored schema.
+#' @keywords internal
+adjust_string_size <- function(view, hint, check_byte_budget = TRUE) {
+
+  # parse to extract column and resizing recommendation
+  col_name <- regmatches(hint, regexpr("(?<=\')(.*?)(?=\')", hint, perl = TRUE))
+  size <- regmatches(hint, regexpr("[0-9]+(?= characters)", hint, perl = TRUE))
+
+  # a failed match yields character(0), which as.integer() converts without any warning,
+  # so the no-match case has to be checked explicitly
+  if(length(col_name) != 1L) stop("Unable to parse column name from ", hint)
+  if(length(size) != 1L) stop("Unable to parse size from ", hint)
+
+  # convert warn to error if this is not a num
+  size <- tryCatch(as.integer(size), warning = function(w) stop("Unable to parse size from ", hint))
+  if(is.na(size)) stop("Unable to parse size from ", hint)
+  message(glue::glue("Adjusting size for '{col_name}'..."))
+
+  # get schema and apply changes; must create new immutable column with diff size
+  schema <- .syn$get(view)
+  ref_col <- match_col(schema, col_name)
+  col_def <- ref_col
+  col_def$id <- NULL
+  col_def$maximumSize <- size
+  if(check_byte_budget) check_byte_budget_col_swap(schema, ref_col, col_def) # errors and will not proceed to create new col if fail
+  created <- new_col(col_def)
+
+  # then update schema
+  schema <- swap_col(schema, old = ref_col$id, new = created$id)
+
+  return(schema$properties$id)
+}
+
+
+#' Adjust schema max list length based on hint
+#'
+#' Parses the column name (text between the first pair of double quotes) and the
+#' recommended length (digits at the very end of the hint), creates a copy of the
+#' matching column with the new `maximumListLength`, and swaps it into the schema.
+#'
+#' @param view Synapse view id.
+#' @param hint Server message containing the column name and recommended list length.
+#' @param check_byte_budget Check that the adjusted column fits the table width budget before creating it.
+#' @return Id of the stored schema.
+#' @keywords internal
+adjust_list_length <- function(view, hint, check_byte_budget = TRUE) {
+
+  # parse to extract column length recommendation
+  col_name <- regmatches(hint, regexpr("(?<=\")(.*?)(?=\")", hint, perl = TRUE))
+  len <- regmatches(hint, regexpr("[0-9]+$", hint, perl = TRUE))
+
+  # a failed match yields character(0), which as.integer() converts without any warning,
+  # so the no-match case has to be checked explicitly
+  if(length(col_name) != 1L) stop("Unable to parse column name from ", hint)
+  if(length(len) != 1L) stop("Unable to parse length from ", hint)
+
+  len <- tryCatch(as.integer(len), warning = function(w) stop("Unable to parse length from ", hint))
+  if(is.na(len)) stop("Unable to parse length from ", hint)
+  message(glue::glue("adjusting list length for '{col_name}'..."))
+
+  # get schema and apply changes; new col def with diff maximumListLength
+  schema <- .syn$get(view)
+  ref_col <- match_col(schema, col_name)
+  col_def <- ref_col
+  col_def$id <- NULL
+  col_def$maximumListLength <- len
+  if(check_byte_budget) check_byte_budget_col_swap(schema, ref_col, col_def) # errors before any column is created
+  created <- new_col(col_def)
+
+  schema <- swap_col(schema, old = ref_col$id, new = created$id)
+
+  return(schema$properties$id)
+
+}
+
+#- Helpers ---------------------------------------------------------------------#
+
+#' Check byte budget when swapping cols in schema
+#'
+#' Errors when the bytes required by `new_col` exceed what the schema has remaining
+#' once the bytes currently allocated to `ref_col` are counted as freed.
+#'
+#' @param schema Table schema, passed on to `byte_budget`.
+#' @param ref_col Definition of the column being replaced.
+#' @param new_col Definition of the replacement column.
+#' @keywords internal
+check_byte_budget_col_swap <- function(schema, ref_col, new_col) {
+
+  # what is left today, plus what removing the old column gives back
+  remaining_now <- byte_budget(schema)
+  freed <- byte_budget(schema_cols = list(ref_col), result = "allocated")
+  adj_remaining <- remaining_now + freed
+
+  # what the replacement column will take
+  req_allocate <- byte_budget(schema_cols = list(new_col), result = "allocated")
+
+  fits <- req_allocate <= adj_remaining
+  if(!fits) {
+    stop(glue::glue("Will have {adj_remaining} bytes in table width budget but adjustment requires {req_allocate} bytes"))
+  }
+}
+
+#' Find matching col in schema based on name
+#'
+#' Synapse doesn't allow schemas to have columns of same name; this should never return more than one.
+#'
+#' @param schema Table schema, passed to `getTableColumns`.
+#' @param col_name Name of the column to look up; must be a single string.
+#' @return The matching column definition. Errors if no column has that name.
+#' @keywords internal
+match_col <- function(schema, col_name) {
+
+  # a zero-length or NA name (e.g. from a failed regex parse upstream) would otherwise
+  # surface as an unrelated "argument is of length zero" error in the check below
+  if(!is.character(col_name) || length(col_name) != 1L || is.na(col_name)) {
+    stop("Encountered issue finding relevant column in schema")
+  }
+
+  schema_cols <- .syn$getTableColumns(schema) %>% reticulate::iterate()
+
+  # assumes every column definition carries a single `name` -- TODO confirm
+  col_names <- vapply(schema_cols, function(col) as.character(col[["name"]]), character(1))
+  index <- match(col_name, col_names)
+  if(is.na(index)) stop("Encountered issue finding relevant column in schema")
+
+  schema_cols[[index]]
+
+}
+
+
+#' Create new col
+#'
+#' Serializes the column definition to JSON and POSTs it to the column endpoint.
+#'
+#' @param col Column definition represented as a list.
+#' @return The response of the POST request, whose `id` is reported in a message.
+#' @keywords internal
+new_col <- function(col) {
+
+  payload <- jsonlite::toJSON(col, auto_unbox = TRUE)
+  created <- .syn$restPOST(
+    uri = "https://repo-prod.prod.sagebase.org/repo/v1/column",
+    body = payload
+  )
+
+  # the response is bound under the name used by the message template
+  message(glue::glue("Created new column {new_col$id}", new_col = created))
+
+  created
+
+}
+
+
+#' Swap out old column for a new column in a schema
+#'
+#' Removes `old` from the schema, adds `new`, and stores the result.
+#'
+#' @param schema A table schema.
+#' @param old Id of old col.
+#' @param new Id of new col.
+#' @return The schema as returned by the store call.
+#' @export
+swap_col <- function(schema, old, new) {
+
+  # edit the column list on the schema object first ...
+  schema$removeColumn(old)
+  schema$addColumn(new)
+
+  # ... then persist it and report what was exchanged
+  stored <- .syn$store(schema)
+  message(glue::glue("Updated schema to replace old column {old} with new column {new}"))
+
+  stored
+}
+
+
+#' Calculate byte budget for a schema
+#'
+#' Tables have a hard width limit of 64KB.
+#' Given a current table schema, this does math for how many bytes remain or are already allocated.
+#' Useful as an austerity measure if indeed one has a very large table,
+#' or in other cases when philosophically being more principled in schema configuration.
+#'
+#' See also:
+#' - https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/table/ColumnType.html
+#'
+#' @param table Existing Synapse table id or the table schema object used to retrieve the column types.
+#' @param schema_cols Optional, this also can take a list of column characteristics; use when building from scratch and columns are not yet stored.
+#' If given, `table` will be ignored. An empty list counts as zero bytes allocated.
+#' @param result Return the summary number for "remaining" or "allocated", or return a TRUE/FALSE for "within" budget.
+#' @return A number of bytes for "remaining" or "allocated", a logical for "within", and `NA` for any other `result`.
+#' Errors if a column has a type with no known byte allocation.
+#' @export
+byte_budget <- function(table, schema_cols = NULL, result = "remaining") {
+
+  if(is.null(schema_cols)) schema_cols <- .syn$getTableColumns(table) %>% reticulate::iterate()
+
+  WIDTH_LIMIT <- 64000
+
+  # bytes for col type; size-dependent types are kept as unevaluated expressions
+  # and evaluated against each column's own `maximumSize` / `maximumListLength`
+  BYTES_BY_TYPE <- list(
+    STRING = quote(4L * maximumSize),
+    DOUBLE = 23L,
+    INTEGER = 20L,
+    BOOLEAN = 5L,
+    DATE = 20L,
+    FILEHANDLEID = 20L,
+    ENTITYID = 44L,
+    SUBMISSIONID = 20L,
+    EVALUATIONID = 20L,
+    LINK = quote(4L * maximumSize),
+    MEDIUMTEXT = 421L,
+    LARGETEXT = 2133L,
+    USERID = 20L,
+    STRING_LIST = quote(4L * maximumSize * maximumListLength),
+    INTEGER_LIST = quote(20L * maximumListLength),
+    BOOLEAN_LIST = quote(5L * maximumListLength),
+    DATE_LIST = quote(20L * maximumListLength),
+    ENTITYID_LIST = quote(44L * maximumListLength),
+    USERID_LIST = quote(20L * maximumListLength),
+    JSON = 2133L
+  )
+
+  # vapply keeps the result numeric even for zero columns; sapply would return list()
+  # there and make sum() fail
+  col_bytes <- vapply(schema_cols, function(x) {
+    col_type <- x$columnType
+    # explicit table lookup so an unknown type cannot resolve to some unrelated object
+    cost <- if(is.character(col_type) && length(col_type) == 1L) BYTES_BY_TYPE[[col_type]] else NULL
+    if(is.null(cost)) stop("Unable to determine byte allocation for column type: ", toString(col_type), call. = FALSE)
+    as.numeric(eval(cost, envir = x))
+  }, numeric(1))
+
+  allocated <- sum(col_bytes)
+
+  if(result == "remaining") {
+    WIDTH_LIMIT - allocated
+  } else if(result == "allocated") {
+    allocated
+  } else if(result == "within") {
+    allocated < WIDTH_LIMIT
+  } else {
+    NA
+  }
+}
+
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -3,19 +3,25 @@ template:
   bootstrap: 5
 
 reference:
-- title: Portal Tables Utils
+- title: Table Utils
+- subtitle: Main PORTAL table data update and management
   desc: Augment and update one of the main portal tables (e.g. Portal - Studies)
-- contents: 
+- contents:
   - starts_with("add_publication")
   - assign_study_data_types
   - calculate_related_studies
   - register_study
   - add_people_from_table
   - register_study_files
+- subtitle: Lower-level table maintenance
+- contents:
+  - adjust_view
+  - swap_col
+  - byte_budget
 
-- title: Project Configuration and Management 
+- title: Project Configuration and Management
   desc: Create or retrofit an NF project to expected default structure and assets
-- contents: 
+- contents:
   - new_project
   - add_default_fileview
   - add_default_folders
@@ -27,15 +33,15 @@ reference:
 - contents:
   - update_study_annotations
   - annotate_with_manifest
-  - copy_annotations 
+  - copy_annotations
   - .modify_annotation
   - meta_qc_dataset
   - meta_qc_project
   - manifest_generate
   - manifest_validate
   - infer_data_type
-- subtitle: Special annotation of nextflow processed data 
-  desc: Special annotation of nextflow processed data 
+- subtitle: Special annotation of nextflow processed data
+  desc: Special annotation of nextflow processed data
 - contents:
   - map_reports_sarek
   - map_sample_input_ss
@@ -87,18 +93,18 @@ reference:
 
 - title: Search Utils
   desc: Help locate Synapse accessions, etc.
-- contents: 
+- contents:
   - contains("find")
 
 - title: Provenance Utils
   desc: Manage provenance metadata
 - contents:
   - contains("activity")
   - delete_provenance
-  
+
 - title: Content Utils
   desc: Create and manage Wiki-type content for projects and pages
-- contents: 
+- contents:
   - add_default_wiki
   - wiki_mod
   - remove_wiki_subpage
@@ -107,9 +113,9 @@ reference:
   - check_wiki_links
   - remove_button
   - processing_flowchart
-  
+
 - title: Export Data to Other Platforms
-  desc: Helpers to export/release NF data to other platforms/databases. 
+  desc: Helpers to export/release NF data to other platforms/databases.
 - subtitle: cBioPortal
   desc: Export data as a cBioPortal study
 - contents:
@@ -126,7 +132,7 @@ reference:
   - identify_read_pair
   - test_failed
   - test_passed
-  
+
 - title: Basic Utils
   desc: Low-level functions
 - contents:
@@ -146,7 +152,7 @@ reference:
   - from_pubmed
 
 - title: Internal/experimental
-  description: Mostly meant to be internal or experimental stuff 
+  description: Mostly meant to be internal or experimental stuff
 - contents:
   - .delim_string_to_vector
   - .replace_string_column_with_stringlist_column

diff --git a/man/adjust_list_length.Rd b/man/adjust_list_length.Rd
diff --git a/man/adjust_string_size.Rd b/man/adjust_string_size.Rd
diff --git a/man/adjust_view.Rd b/man/adjust_view.Rd