Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch/cbioportal utils #119

Merged
merged 27 commits into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
Version: 0.0.0.9320
Version: 0.0.0.9400
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
6 changes: 4 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ export(as_table_schema)
export(assign_study_data_types)
export(bad_url)
export(calculate_related_studies)
export(cbp_add_cna)
export(cbp_add_expression)
export(cbp_add_maf)
export(cbp_new_study)
export(check_access)
export(check_readpair_validity)
export(check_wiki_links)
Expand All @@ -43,7 +47,6 @@ export(grant_specific_file_access)
export(key_label_to_id)
export(make_admin)
export(make_folder)
export(make_meta_study)
export(make_public)
export(make_public_viewable)
export(map_sample_input_ss)
Expand All @@ -64,7 +67,6 @@ export(remove_button)
export(remove_wiki_subpage)
export(summarize_file_access)
export(syn_login)
export(syncBP_maf)
export(table_query)
export(update_study_annotations)
export(use_latest_in_collection)
Expand Down
242 changes: 242 additions & 0 deletions R/cbioportal.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
# Export data from Synapse as a cBioPortal dataset, where different data types can be added to the package one-by-one,
# much in the spirit of https://github.com/r-lib/usethis.
# This file contains the higher-level wrappers that are user-facing.
# See `cboilerplate.R` for the lower-level utils, which allows more control.
# All functions should start with `cbp_*` so that it's clear this is cBioPortal-relevant functionality.

#' Enumerate combinations of valid cBP data types and data subtypes and helper utils if available
#'
#' https://docs.cbioportal.org/file-formats/
#'
#' @keywords internal
cbp_datatypes <- function() {

types <- data.table::rbindlist(list(
list("CLINICAL", "SAMPLE_ATTRIBUTES", "cbp_add_clinical", ""),
list("CLINICAL", "PATIENT_ATTRIBUTES", "cbp_add_clinical", ""),
list("COPY_NUMBER_ALTERATION", "SEG", "cbp_add_cna", "default"),
list("COPY_NUMBER_ALTERATION", "DISCRETE", "", ""),
list("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "", ""),
list("COPY_NUMBER_ALTERATION", "CONTINUOUS", "", ""),
list("COPY_NUMBER_ALTERATION", "LOG2-VALUE", "", ""),
list("MRNA_EXPRESSION", "CONTINUOUS", "cbp_add_expression", ""),
list("MRNA_EXPRESSION", "Z-SCORE", "cbp_add_expression", ""),
list("MUTATION_EXTENDED", "MAF", "cbp_add_maf", ""),
list("METHYLATION", "CONTINUOUS", "", ""),
list("PROTEIN_LEVEL", "LOG2-VALUE", "", ""),
list("STRUCTURAL_VARIANT", "SV", "", "")))

setnames(types, c("dataType", "dataSubtype", "util", "note"))
return(types)
}

#' Initialize a new cBioPortal study dataset
#'
#' Create a new directory with a basic required [study meta file](https://docs.cbioportal.org/file-formats/#meta-file),
#' much like how we'd create a new R package and put a DESCRIPTION file in it.
#'
#' @param cancer_study_identifier Cancer study identifier in format such as `nst_nfosi_ntap_2022`.
#' @param name Name of the study, e.g. "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
#' @param type_of_cancer Id for type of cancer. If `validate` is TRUE, this is one of the things validated with warning if mismatched.
#' @param description Description of the study, defaults to a generic description that can be edited later.
#' @param short_name (Optional) Short name for the study.
#' @param citation (Optional) A relevant citation, e.g. "TCGA, Nature 2012".
#' @param pmid (Optional) One or more relevant pubmed ids (comma-separated, no whitespace); if used, citation cannot be `NULL`.
#' @param groups (Optional) Defaults to "PUBLIC" for use with public cBioPortal;
#' otherwise, use group names that makes sense with the configuration of your cBioPortal instance.
#' @param add_global_case_list (Optional) Use `NULL` to ignore, default is `TRUE` for an "All samples" case list( to be generated automatically.
#' @param validate Validate against public cBioPortal configuration. Default `TRUE`,
#' but might want to set to `FALSE` especially if using a custom cBioPortal instance with different configuration.
#' @param verbose Whether to be chatty.
#'
#' @export
cbp_new_study <- function(cancer_study_identifier,
name,
type_of_cancer,
description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
short_name = NULL,
citation = NULL,
pmid = NULL,
groups = "PUBLIC",
add_global_case_list = TRUE,
validate = TRUE,
verbose = TRUE) {

# TODO Validate study id
study_dir <- cancer_study_identifier
if(!dir.exists(study_dir)) {
if(verbose) checked_message(glue::glue("Creating {study_dir} study directory"))
dir.create(glue::glue("./{study_dir}"))
}

checked_message("Setting dataset directory as working directory")
setwd(study_dir)

if(validate) {
tryCatch({
cancer_ids <- httr::GET("https://www.cbioportal.org/api/cancer-types?direction=ASC&pageNumber=0&pageSize=10000&projection=SUMMARY") %>%
httr::content() %>%
sapply(`[[`, "cancerTypeId")
if(!type_of_cancer %in% cancer_ids) {
warning("Specified `type_of_cancer id` isn't in the public list.
Check for typos or include additional meta for this cancer type -- see https://docs.cbioportal.org/file-formats/#cancer-type")
}
}, error = function(e) warning("Note: Other issue with validation via public API...skipping."))
}

df_file <- make_meta_study_generic(cancer_study_identifier = cancer_study_identifier,
type_of_cancer = type_of_cancer,
name = name,
description = description,
citation = citation,
pmid = pmid,
groups = groups,
short_name = short_name,
add_global_case_list = add_global_case_list)

write_meta(df_file, "meta_study.txt", verbose = verbose)
if(verbose) checked_message("Study meta added")
}

# ------------------------------------------------------------------------------- #


#' Export and add clinical data to cBioPortal dataset
#'
#' This should be run in an existing dataset package root.
#'
#' Clinical data are mapped and exported according to a reference mapping.
#' Also reformatting of `PATIENT_ID`, `SAMPLE_ID` to contain only letters, numbers, points, underscores, hyphens;
#' in Nextflow processing any spaces gets replaced with underscores so that's the default here.
#' Does *not* check for missing samples, as final validation via cBioPortal tool is still expected for that.
#'
#' @param ref_view A view that contains all clinical data for the study.
#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
#' @param verbose Whether to provide informative messages throughout.
cbp_add_clinical <- function(ref_view,
ref_map,
verbose = TRUE) {

cancer_study_identifier <- check_cbp_study_id()

if(verbose) checked_message("Pulling the clinical data from Synapse")
df <- get_clinical_data_for_cbp_study(ref_view)

if(verbose) checked_message("Formatting and making clinical data file(s)")
df$specimenID <- gsub(" ", "_", clinical_data$specimenID)
write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)

if(verbose) checked_message("Making sample clinical meta file")
make_meta_sample(cancer_study_identifier, verbose = verbose)

# Before making meta, check that the optional patient data file was written
if(file.exists("data_clinical_patient.txt")) {
if(verbose) checked_message("Making patient clinical meta file")
make_meta_patient(cancer_study_identifier, verbose = verbose)
}

if(verbose) checked_message("Done with adding clinical data")

}

#' Check that in valid cBioPortal study dataset root
#'
#' The `cbp_add*` functions need to be run while in the study package root.
#' This checks in valid study directory and returns the `cancer_study_id`.
#'
#' @keywords internal
#' @return `cancer_study_id` for the current cBioPortal cancer study.
check_cbp_study_id <- function() {

tryCatch({

suppressWarnings({
study <- yaml::read_yaml("meta_study.txt")
study$cancer_study_identifier
})

}, error = function(e) stop("The path ", getwd(),
" does not appear to be a valid cBioPortal study.",
call. = FALSE))
}

# ------------------------------------------------------------------------------- #

#' Export and add mutations data to cBioPortal dataset
#'
#' This should be run in an existing dataset package root.
#'
#' Get merged maf file that represents filtered subset of `maf`s containing only (non-germline) data OK to release publicly.
#' This needs to be packaged with other files like this
#' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020).
#'
#' @inheritParams cbp_new_study
#' @param maf_data Synapse id of `merged maf` file for public release.
#' @export
cbp_add_maf <- function(maf_data, verbose = TRUE) {

.check_login()
cancer_study_identifier <- check_cbp_study_id()

if(verbose) checked_message("Getting `maf_data` file from Synapse")
file <- .syn$get(maf_data, downloadLocation = ".")
data_mutations <- sub(file$name, "data_mutations.txt", file$path)
file.rename(file$path, data_mutations)

if(verbose) checked_message("Making maf meta file")
make_meta_maf(cancer_study_identifier, verbose = verbose)

if(verbose) checked_message("Done with adding MAF data")

}


# ------------------------------------------------------------------------------- #

#' Export and add CNA (seg) data to cBioPortal dataset
#'
#' This should be run in an existing dataset package root.
#'
#' @inheritParams cbp_new_study
#' @param cna_data Synapse id of CNA data file, currently only handles `.seg` file.
#' @export
cbp_add_cna <- function(cna_data, verbose = TRUE) {

cancer_study_identifier <- check_cbp_study_id()

if(verbose) checked_message("Getting the CNA (.seg) data file from Synapse")
file <- .syn$get(cna_data, downloadLocation = ".")
data_cna <- sub(file$name, "data_cna.seg", file$path)
file.rename(file$path, data_cna)

if(verbose) checked_message("Making the meta file")
make_meta_cna(cancer_study_identifier)

if(verbose) checked_message("Done with adding CNA data")

}


#' Export and add expression data to cBioPortal dataset
#'
#' This should be run in an existing dataset package root.
#'
#' @inheritParams cbp_new_study
#' @param expression_data Syn id of gene expression data.
#' @export
cbp_add_expression <- function(expression_data, verbose = TRUE) {

cancer_study_identifier <- check_cbp_study_id()

if(verbose) checked_message("Getting the mRNA expression data file from Synapse")
file <- .syn$get(expression_data, downloadLocation = ".")
data_expression <- sub(file$name, "data_expression.txt", file$path)
file.rename(file$path, data_expression)

if(verbose) checked_message("Making the meta file")
make_meta_expression(cancer_study_identifier)

if(verbose) checked_message("Done with adding expression data")

}

Loading
Loading