Skip to content

Commit

Permalink
Feat/view fix (#134)
Browse files Browse the repository at this point in the history
* Add view adaptation utils (wip)

* Ignore failing test that needs to be rewritten

* Reduce code duplication

* Implement budgeting (wip)

* Export and small fixes

* Update pkgdown index
  • Loading branch information
anngvu authored Oct 12, 2023
1 parent f4867cd commit 09ba648
Show file tree
Hide file tree
Showing 12 changed files with 383 additions and 66 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export(add_publication_from_pubmed)
export(add_publication_from_unpaywall)
export(add_publications_from_file)
export(add_to_collection)
export(adjust_view)
export(annotate_aligned_reads)
export(annotate_called_variants)
export(annotate_cnv)
Expand All @@ -26,6 +27,7 @@ export(annotate_with_tool_stats)
export(as_table_schema)
export(assign_study_data_types)
export(bad_url)
export(byte_budget)
export(calculate_related_studies)
export(cbp_add_cna)
export(cbp_add_expression)
Expand Down Expand Up @@ -75,6 +77,7 @@ export(register_study_files)
export(remove_button)
export(remove_wiki_subpage)
export(summarize_file_access)
export(swap_col)
export(syn_login)
export(table_query)
export(update_study_annotations)
Expand Down
215 changes: 215 additions & 0 deletions R/view_fix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
#' Adjust view
#'
#' When a view schema and data are mismatched, the view cannot be built and
#' therefore cannot be queried. The most common causes (likely accounting for
#' 98%+ of instances combined) are the max size/length issues.
#' This iteratively updates the schema based exactly on whatever the server is
#' saying to do* for the sizing and list length issues, until the view is
#' functional and querying works or `max_tries` fixes have been applied.
#' If the issue is not one of these, this stops with an error because handlers
#' for other problems are currently not implemented.
#'
#' *Note: Fixes are applied iteratively because that's how the server currently
#' surfaces repair recommendations.
#'
#' @param view Synapse view id.
#' @param max_tries Max number of tries. Vast majority of views should only
#'   have accumulated 1-2 bad data mutations, so a default of 5 should be
#'   reasonable.
#' @param check_byte_budget Check if this will lead to exceeding table budget.
#'   Passed on to the column adjustment helpers.
#' @return The view id, invisibly.
#' @export
adjust_view <- function(view, max_tries = 5L, check_byte_budget = TRUE) {

  stopifnot(is_valid_syn_id(view))

  # Every pass either returns (no applicable error), stops (unhandled problem)
  # or applies exactly one fix, so the loop is bounded by `max_tries`.
  for (attempt in seq_len(max_tries)) {

    # A test query is the way to get the server to report what is wrong
    res <- tryCatch(
      .syn$tableQuery(glue::glue("SELECT * FROM {view} LIMIT 1")),
      error = function(e) e
    )
    need_fix <- inherits(res, "synapseclient.core.exceptions.SynapseHTTPError")
    if (!need_fix) {
      message("No applicable error found with view.")
      return(invisible(view))
    }

    msg <- jsonlite::fromJSON(res$response$text)
    reason <- msg$reason
    if (!length(reason)) {
      # Report the raw response text; the parsed `msg` is a list and does not
      # interpolate into a single readable string
      stop("Server returned response that can't be handled: ", res$response$text)
    }

    if (grepl("too small", reason)) {
      view <- adjust_string_size(view, hint = reason, check_byte_budget = check_byte_budget)
    } else if (grepl("maximumListLength", reason)) {
      view <- adjust_list_length(view, hint = reason, check_byte_budget = check_byte_budget)
    } else {
      # Without this the same failing query would be repeated indefinitely
      stop("No handler implemented for this view issue: ", reason, call. = FALSE)
    }
  }

  message("Reached max_tries (", max_tries, ") fixes; view may need further adjustment.")
  invisible(view)
}


#' Adjust schema max size based on hint
#'
#' Note: For STRING cols, the hard limit is 1000 char size,
#' though at 250 using LARGETEXT is officially recommended, so possibly at that breakpoint
#' this should just create the different column type instead of increasing size.
#'
#' @param view Synapse view id.
#' @param hint Server message naming the column in single quotes and giving
#'   the recommended size as "<n> characters".
#' @param check_byte_budget Check that the resized column fits in the table
#'   width budget before creating it.
#' @return Id of the updated schema.
#' @keywords internal
adjust_string_size <- function(view, hint, check_byte_budget = TRUE) {

  # parse to extract column and resizing recommendation
  col_name <- regmatches(hint, regexpr("(?<=\')(.*?)(?=\')", hint, perl = TRUE))
  size <- regmatches(hint, regexpr("[0-9]+(?= characters)", hint, perl = TRUE))
  # convert warn to error if this is not a num
  size <- tryCatch(as.integer(size), warning = function(w) stop("Unable to parse size from ", hint))
  # a hint that does not match yields zero-length results without any warning,
  # so check explicitly before touching the schema
  if (length(col_name) != 1L) stop("Unable to parse column name from ", hint, call. = FALSE)
  if (length(size) != 1L || is.na(size)) stop("Unable to parse size from ", hint, call. = FALSE)
  message(glue::glue("Adjusting size for '{col_name}'..."))

  # get schema and apply changes; must create new immutable column with diff size
  schema <- .syn$get(view)
  ref_col <- match_col(schema, col_name)
  col_def <- ref_col
  col_def$id <- NULL
  col_def$maximumSize <- size
  # errors and will not proceed to create new col if fail
  if (check_byte_budget) check_byte_budget_col_swap(schema, ref_col, col_def)
  created <- new_col(col_def)

  # then update schema
  schema <- swap_col(schema, old = ref_col$id, new = created$id)

  return(schema$properties$id)
}


#' Adjust schema max list length based on hint
#'
#' @param view Synapse view id.
#' @param hint Server message naming the column in double quotes and ending
#'   with the recommended list length.
#' @param check_byte_budget Check that the adjusted column fits in the table
#'   width budget before creating it.
#' @return Id of the updated schema.
#' @keywords internal
adjust_list_length <- function(view, hint, check_byte_budget = TRUE) {

  # parse to extract column length recommendation
  col_name <- regmatches(hint, regexpr("(?<=\")(.*?)(?=\")", hint, perl = TRUE))
  len <- regmatches(hint, regexpr("[0-9]+$", hint, perl = TRUE))
  len <- tryCatch(as.integer(len), warning = function(w) stop("Unable to parse length from ", hint))
  # a hint that does not match yields zero-length results without any warning,
  # so check explicitly before touching the schema
  if (length(col_name) != 1L) stop("Unable to parse column name from ", hint, call. = FALSE)
  if (length(len) != 1L || is.na(len)) stop("Unable to parse length from ", hint, call. = FALSE)
  message(glue::glue("adjusting list length for '{col_name}'..."))

  # get schema and apply changes; new col def with diff maximumListLength
  schema <- .syn$get(view)
  ref_col <- match_col(schema, col_name)
  col_def <- ref_col
  col_def$id <- NULL
  col_def$maximumListLength <- len
  # errors and will not proceed to create new col if fail
  if (check_byte_budget) check_byte_budget_col_swap(schema, ref_col, col_def)
  created <- new_col(col_def)

  schema <- swap_col(schema, old = ref_col$id, new = created$id)

  return(schema$properties$id)

}

#- Helpers ---------------------------------------------------------------------#

#' Check byte budget when swapping cols in schema
#'
#' Stops with an error when the bytes needed by `new_col` exceed what the
#' schema has remaining once the bytes held by `ref_col` are given back.
#'
#' @param schema Table schema passed to `byte_budget`.
#' @param ref_col Definition of the column being replaced.
#' @param new_col Definition of the replacement column.
#' @keywords internal
check_byte_budget_col_swap <- function(schema, ref_col, new_col) {
  remaining_now <- byte_budget(schema)
  freed_by_swap <- byte_budget(schema_cols = list(ref_col), result = "allocated")
  adj_remaining <- remaining_now + freed_by_swap
  req_allocate <- byte_budget(schema_cols = list(new_col), result = "allocated")
  if (req_allocate > adj_remaining) {
    stop(glue::glue("Will have {adj_remaining} bytes in table width budget but adjustment requires {req_allocate} bytes"))
  }
}

#' Find matching col in schema based on name
#'
#' Synapse doesn't allow schemas to have columns of same name; this should never return more than one.
#'
#' @param schema Table schema whose columns are retrieved.
#' @param col_name Name of the column to look up.
#' @return The column whose name matches `col_name`; errors if none does.
#' @keywords internal
match_col <- function(schema, col_name) {

  cols <- reticulate::iterate(.syn$getTableColumns(schema))
  col_names <- sapply(cols, function(col) col[["name"]])
  hit <- match(col_name, col_names)
  if (is.na(hit)) {
    stop("Encountered issue finding relevant column in schema")
  }
  cols[[hit]]

}


#' Create new col
#'
#' Serializes the column definition to JSON and POSTs it to the column
#' endpoint.
#'
#' @param col Column definition represented as a list.
#' @return The response from the POST, whose `id` is reported in a message.
#' @keywords internal
new_col <- function(col) {

  payload <- jsonlite::toJSON(col, auto_unbox = TRUE)
  # the result keeps the name `new_col` because the message template below
  # refers to it
  new_col <- .syn$restPOST(
    uri = "https://repo-prod.prod.sagebase.org/repo/v1/column",
    body = payload
  )
  message(glue::glue("Created new column {new_col$id}"))
  new_col

}


#' Swap out old column for a new column in a schema
#'
#' Removes `old` from the schema, adds `new`, and stores the result.
#'
#' @param schema A table schema.
#' @param old Id of old col.
#' @param new Id of new col.
#' @return The schema as returned by the store call.
#' @export
swap_col <- function(schema, old, new) {

  schema$removeColumn(old)
  schema$addColumn(new)
  stored <- .syn$store(schema)
  message(glue::glue("Updated schema to replace old column {old} with new column {new}"))
  stored
}


#' Calculate byte budget for a schema
#'
#' Tables have a hard width limit of 64KB.
#' Given a current table schema, this does math for how many bytes remain or are already allocated.
#' Useful as an austerity measure if indeed one has a very large table,
#' or in other cases when philosophically being more principled in schema configuration.
#'
#' See also:
#' - https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/table/ColumnType.html
#'
#' @param table Existing Synapse table id or the table schema object used to retrieve the column types.
#' @param schema_cols Optional, this also can take a list of column characteristics; use when building from scratch and columns are not yet stored.
#' If given, `table` will be ignored.
#' @param result Return the summary number for "remaining" or "allocated", or return a TRUE/FALSE for "within" budget.
#' @return A number of bytes for "remaining" or "allocated", a logical for
#'   "within", and `NA` for any other value of `result`.
#'   An empty column list counts as 0 bytes allocated.
#' @export
byte_budget <- function(table, schema_cols = NULL, result = "remaining") {

  if (is.null(schema_cols)) {
    schema_cols <- reticulate::iterate(.syn$getTableColumns(table))
  }

  WIDTH_LIMIT <- 64000

  # bytes for col type; size-dependent types are unevaluated expressions that
  # get evaluated against each column's own fields
  col_bytes <- list(
    STRING = quote(4L * maximumSize),
    DOUBLE = 23L,
    INTEGER = 20L,
    BOOLEAN = 5L,
    DATE = 20L,
    FILEHANDLEID = 20L,
    ENTITYID = 44L,
    SUBMISSIONID = 20L,
    EVALUATIONID = 20L,
    LINK = quote(4L * maximumSize),
    MEDIUMTEXT = 421L,
    LARGETEXT = 2133L,
    USERID = 20L,
    STRING_LIST = quote(4L * maximumSize * maximumListLength),
    INTEGER_LIST = quote(20L * maximumListLength),
    BOOLEAN_LIST = quote(5L * maximumListLength),
    DATE_LIST = quote(20L * maximumListLength),
    ENTITYID_LIST = quote(44L * maximumListLength),
    USERID_LIST = quote(20L * maximumListLength),
    JSON = 2133L
  )

  # vapply (unlike sapply) returns numeric(0) for an empty column list, so the
  # sum is 0 rather than an error
  allocated <- sum(vapply(schema_cols, function(x) {
    type <- x$columnType
    cost <- if (is.character(type) && length(type) == 1L) col_bytes[[type]] else NULL
    if (is.null(cost)) {
      stop("Unknown or missing column type: ", paste(type, collapse = ", "), call. = FALSE)
    }
    eval(cost, envir = x)
  }, numeric(1)))

  if (result == "remaining") {
    WIDTH_LIMIT - allocated
  } else if (result == "allocated") {
    allocated
  } else if (result == "within") {
    allocated < WIDTH_LIMIT
  } else {
    NA
  }
}

34 changes: 20 additions & 14 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@ template:
bootstrap: 5

reference:
- title: Portal Tables Utils
- title: Table Utils
- subtitle: Main PORTAL table data update and management
desc: Augment and update one of the main portal tables (e.g. Portal - Studies)
- contents:
- contents:
- starts_with("add_publication")
- assign_study_data_types
- calculate_related_studies
- register_study
- add_people_from_table
- register_study_files
- subtitle: Lower-level table maintenance
- contents:
- adjust_view
- swap_col
- byte_budget

- title: Project Configuration and Management
- title: Project Configuration and Management
desc: Create or retrofit an NF project to expected default structure and assets
- contents:
- contents:
- new_project
- add_default_fileview
- add_default_folders
Expand All @@ -27,15 +33,15 @@ reference:
- contents:
- update_study_annotations
- annotate_with_manifest
- copy_annotations
- copy_annotations
- .modify_annotation
- meta_qc_dataset
- meta_qc_project
- manifest_generate
- manifest_validate
- infer_data_type
- subtitle: Special annotation of nextflow processed data
desc: Special annotation of nextflow processed data
- subtitle: Special annotation of nextflow processed data
desc: Special annotation of nextflow processed data
- contents:
- map_reports_sarek
- map_sample_input_ss
Expand Down Expand Up @@ -87,18 +93,18 @@ reference:

- title: Search Utils
desc: Help locate Synapse accessions, etc.
- contents:
- contents:
- contains("find")

- title: Provenance Utils
desc: Manage provenance metadata
- contents:
- contains("activity")
- delete_provenance

- title: Content Utils
desc: Create and manage Wiki-type content for projects and pages
- contents:
- contents:
- add_default_wiki
- wiki_mod
- remove_wiki_subpage
Expand All @@ -107,9 +113,9 @@ reference:
- check_wiki_links
- remove_button
- processing_flowchart

- title: Export Data to Other Platforms
desc: Helpers to export/release NF data to other platforms/databases.
desc: Helpers to export/release NF data to other platforms/databases.
- subtitle: cBioPortal
desc: Export data as a cBioPortal study
- contents:
Expand All @@ -126,7 +132,7 @@ reference:
- identify_read_pair
- test_failed
- test_passed

- title: Basic Utils
desc: Low-level functions
- contents:
Expand All @@ -146,7 +152,7 @@ reference:
- from_pubmed

- title: Internal/experimental
description: Mostly meant to be internal or experimental stuff
description: Mostly meant to be internal or experimental stuff
- contents:
- .delim_string_to_vector
- .replace_string_column_with_stringlist_column
Expand Down
12 changes: 12 additions & 0 deletions man/adjust_list_length.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/adjust_string_size.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/adjust_view.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 09ba648

Please sign in to comment.