From c07b494224783d97f12499315378b78def202c67 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com> Date: Fri, 9 Feb 2024 17:01:56 -0700 Subject: [PATCH] Patch/cbp and exports (#160) * Export functions * Update docs, namespace * Update version * Update cBP utils * Fix ref * Update clinical data prep * Update docs * Fix default sep * Fix expression * New cancer type helper * More changes * Update doc * Fix test --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/cbioportal.R | 52 +++++++++++-- R/cboilerplate.R | 77 +++++++++++-------- _pkgdown.yml | 1 + man/cbp_add_clinical.Rd | 4 +- man/cbp_new_cancer_type.Rd | 21 +++++ man/format_gene_expression_data.Rd | 12 +++ man/make_cbio_clinical_header.Rd | 11 +-- tests/testthat/test_register_study.R | 3 +- ...tal-data-to-other-platforms-cbioportal.Rmd | 45 +++++++---- 11 files changed, 164 insertions(+), 66 deletions(-) create mode 100644 man/cbp_new_cancer_type.Rd create mode 100644 man/format_gene_expression_data.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 80039137..9e74f721 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: nfportalutils Title: NF Portal Utilities -Version: 0.0.0.946 +Version: 0.9500 Authors@R: c( person(given = "Robert", family = "Allaway", role = c("aut", "cre"), email = "robert.allaway@sagebionetworks.org", diff --git a/NAMESPACE b/NAMESPACE index 84e65a2c..85ab7af7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,9 +31,11 @@ export(bad_url) export(bipartite_mmd_template) export(byte_budget) export(calculate_related_studies) +export(cbp_add_clinical) export(cbp_add_cna) export(cbp_add_expression) export(cbp_add_maf) +export(cbp_new_cancer_type) export(cbp_new_study) export(check_access) export(check_readpair_validity) diff --git a/R/cbioportal.R b/R/cbioportal.R index e77184bf..13504f30 100644 --- a/R/cbioportal.R +++ b/R/cbioportal.R @@ -110,20 +110,23 @@ cbp_new_study <- function(cancer_study_identifier, #' in Nextflow processing any spaces gets replaced with underscores so that's the default here. #' Does *not* check for missing samples, as final validation via cBioPortal tool is still expected for that. #' -#' @param ref_view A view that contains all clinical data for the study. +#' @param clinical_data Clinical table query. #' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details. #' @param verbose Whether to provide informative messages throughout. -cbp_add_clinical <- function(ref_view, +#' +#' @export +cbp_add_clinical <- function(clinical_data, ref_map, verbose = TRUE) { cancer_study_identifier <- check_cbp_study_id() - if(verbose) checked_message("Pulling the clinical data from Synapse") - df <- get_clinical_data_for_cbp_study(ref_view) + df <- .syn$tableQuery(clinical_data, includeRowIdAndRowVersion = FALSE)$asDataFrame() + if(verbose) checked_message("Retrieved clinical data from Synapse") if(verbose) checked_message("Formatting and making clinical data file(s)") - df$specimenID <- gsub(" ", "_", clinical_data$specimenID) + checked_message("Spaces in specimen IDs will be replaced with _ per cBioPortal specifications") + df$specimenID <- gsub(" ", "_", df$specimenID) write_cbio_clinical(df, ref_map = ref_map, verbose = verbose) if(verbose) checked_message("Making sample clinical meta file") @@ -238,6 +241,7 @@ cbp_add_expression <- function(expression_data, file <- .syn$get(expression_data, downloadLocation = ".") data_expression <- sub(file$name, "data_expression_tpm.txt", file$path) file.rename(file$path, data_expression) + format_gene_expression_data("data_expression_tpm.txt") if(verbose) checked_message("Making the meta file") make_meta_expression(cancer_study_identifier, type = "tpm") @@ -247,6 +251,7 @@ cbp_add_expression <- function(expression_data, file <- .syn$get(expression_data_raw, downloadLocation = ".") data_expression_supp <- sub(file$name, "data_expression_raw.txt", file$path) file.rename(file$path, data_expression_supp) + format_gene_expression_data("data_expression_raw.txt") if(verbose) checked_message("Making the meta file for supplemental raw mRNA expression data file") make_meta_expression(cancer_study_identifier, type = "raw") @@ -257,3 +262,40 @@ cbp_add_expression <- function(expression_data, } +#' Format gene expression +#' +#' @keywords internal +#' @import data.table +format_gene_expression_data <- function(file) { + data_expression <- fread(file) + data_expression[, gene_id := NULL] # Ensembl ids not used in cBioPortal + setnames(data_expression, old = c("gene_name"), new = c("Hugo_Symbol")) + fwrite(data_expression, file = file, sep = "\t") +} + +#' Create reference file for new cancer type +#' +#' Helper for creating reference for new cancer subtype which does not already exist. +#' https://docs.cbioportal.org/file-formats/#cancer-type +#' +#' @param type_of_cancer Id for new cancer type, e.g. "cnf". +#' @param name Full name for new cancer type, e.g. "Cutaneous Neurofibroma" +#' @param color Color name for new cancer; https://en.wikipedia.org/wiki/Web_colors#X11_color_names. +#' @param parent_type_of_cancer Id of existing parent, e.g. "nfib" for Neurofibroma. +#' @export +cbp_new_cancer_type <- function(type_of_cancer, + name, + color, + parent_type_of_cancer) { + + cat("genetic_alteration_type: CANCER_TYPE", + "datatype: CANCER_TYPE", + "data_filename: cancer_type.txt", + sep = "\n", + file = "meta_cancer_type.txt") + + cat(glue::glue("{type_of_cancer}\t{name}\t{color}\t{parent_type_of_cancer}"), + file = "cancer_type.txt") + + checked_message("Created new cancer type meta and data") +} diff --git a/R/cboilerplate.R b/R/cboilerplate.R index 08f80f8f..b4791302 100644 --- a/R/cboilerplate.R +++ b/R/cboilerplate.R @@ -19,16 +19,18 @@ #' Make header for cBioPortal clinical data file #' #' This is called from the wrapper `write_cbio_clinical`. -#' Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396. +#' Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396. +#' Needs a data table of clinical data and a reference providing `label`, `description`, and `data_type`. #' #' @param df A `data.frame` representing clinical dataset to publicize. -#' @param label Character vector representing a short label for each column in the dataset -#' @param description Character vector representing a long descriptions for each column in the dataset -#' @param data_type Character vector representing the data type of each column in the dataset +#' @param mapping A reference table providing `label`, `description`, and `data_type` for each `source` attribute in `df`. #' @keywords internal -make_cbio_clinical_header <- function(df, label, description, data_type) { +make_cbio_clinical_header <- function(df, mapping) { + + label <- mapping[match(names(df), source), label] + description <- mapping[match(names(df), source), description] + data_type <- mapping[match(names(df), source), data_type] - # Original code assigns a default priority = 1 to all; this is kept until we need more complex configuration header <- rbind(label, description, data_type, rep(1)) header <- t(apply(header, 1, function(x) { return(c(paste0("#", x[1]), x[2:length(x)]))})) header <- rbind(header, label) # use harmonized name as row-5 attribute names @@ -83,38 +85,45 @@ write_cbio_clinical <- function(df, verbose = TRUE) { m <- use_ref_map(ref_map) - attributes <- m$source - m <- split(m, by = "attribute_type") - - # Move/factor out these checks? - if(!all(attributes %in% names(df))) stop(glue::glue_collapse(setdiff(attributes, names(df)), ","), " specified in mapping but not available in data. Check data.") - if(!"SAMPLE" %in% names(m)) stop("According to mapping, no SAMPLE clinical file will be created. Check mapping.") + present <- names(df) + required <- m$source[m$required] + attributes <- unique(m$source) + + # Attribute checks + message("Clinical attributes present are: ", paste(present, collapse = ", ")) + if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", ")) + if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ",")) + + # Take care of list columns and NA + .df <- data.table::copy(df) + for(col in names(.df)) { + if(class(.df[[col]]) == "list") { + .df[[col]] <- sapply(.df[[col]], function(x) paste0(x, collapse = "-")) + warning(glue::glue("The {col} field was stored as a list has been coerced for export, you may want to check output."), call. = F) + } + # Use actual NA's so that `write.table` can write out "" consistently + .df[.df[[col]] %in% na_recode, col ] <- NA_character_ + } files <- list() - for(clinical_type in names(m)) { - .df <- df[, m[[clinical_type]]$source ] - # cBioPortal does not allow list columns - for(col in names(.df)) { - if(class(.df[[col]]) == "list") { - .df[[col]] <- paste(.df[[col]], sep = ",") - warning(glue::glue("Coerced {col} data from list for export, you may want to check output."), call. = F) - } - # Use actual NA's so that `write.table` can write out "" consistently - .df[.df[[col]] %in% na_recode, col ] <- NA_character_ + m <- split(m, by = "attribute_type") + if("individualID" %in% names(.df)) { + patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)]) + header <- make_cbio_clinical_header(patient_df, m$PATIENT) + patient_df <- rbind(header, patient_df) + files[["PATIENT"]] <- patient_df + } + { + sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)] + header <- make_cbio_clinical_header(sample_df, m$SAMPLE) + sample_df <- rbind(header, sample_df) + files[["SAMPLE"]] <- sample_df + } - } - if(clinical_type == "PATIENT") { - .df <- unique(.df) - } + for(clinical_type in names(files)) { filename <- get_cbio_filename(clinical_type) - header <- make_cbio_clinical_header(.df, - m[[clinical_type]]$label, - m[[clinical_type]]$description, - m[[clinical_type]]$data_type) - - df_out <- rbind(header, .df) path <- glue::glue("{publish_dir}/{filename}") - write.table(df_out, + write.table(files[[clinical_type]], file = path, sep = delim, na = "", @@ -122,8 +131,8 @@ write_cbio_clinical <- function(df, row.names = F, quote = F) if(verbose) message(glue::glue("{clinical_type} data written to: {path}")) - files[[clinical_type]] <- df_out } + invisible(files) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 5b37ee74..8fe504be 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -129,6 +129,7 @@ reference: desc: Export data as a cBioPortal study - contents: - cbp_new_study + - cbp_new_cancer_type - cbp_add_maf - cbp_add_clinical - cbp_add_expression diff --git a/man/cbp_add_clinical.Rd b/man/cbp_add_clinical.Rd index ca06b3d7..4168b5a2 100644 --- a/man/cbp_add_clinical.Rd +++ b/man/cbp_add_clinical.Rd @@ -4,10 +4,10 @@ \alias{cbp_add_clinical} \title{Export and add clinical data to cBioPortal dataset} \usage{ -cbp_add_clinical(ref_view, ref_map, verbose = TRUE) +cbp_add_clinical(clinical_data, ref_map, verbose = TRUE) } \arguments{ -\item{ref_view}{A view that contains all clinical data for the study.} +\item{clinical_data}{Clinical table query.} \item{ref_map}{YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.} diff --git a/man/cbp_new_cancer_type.Rd b/man/cbp_new_cancer_type.Rd new file mode 100644 index 00000000..1a999720 --- /dev/null +++ b/man/cbp_new_cancer_type.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cbioportal.R +\name{cbp_new_cancer_type} +\alias{cbp_new_cancer_type} +\title{Create reference file for new cancer type} +\usage{ +cbp_new_cancer_type(type_of_cancer, name, color, parent_type_of_cancer) +} +\arguments{ +\item{type_of_cancer}{Id for new cancer type, e.g. "cnf".} + +\item{name}{Full name for new cancer type, e.g. "Cutaneous Neurofibroma"} + +\item{color}{Color name for new cancer; https://en.wikipedia.org/wiki/Web_colors#X11_color_names.} + +\item{parent_type_of_cancer}{Id of existing parent, e.g. "nfib" for Neurofibroma.} +} +\description{ +Helper for creating reference for new cancer subtype which does not already exist. +https://docs.cbioportal.org/file-formats/#cancer-type +} diff --git a/man/format_gene_expression_data.Rd b/man/format_gene_expression_data.Rd new file mode 100644 index 00000000..39654d67 --- /dev/null +++ b/man/format_gene_expression_data.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cbioportal.R +\name{format_gene_expression_data} +\alias{format_gene_expression_data} +\title{Format gene expression} +\usage{ +format_gene_expression_data(file) +} +\description{ +Format gene expression +} +\keyword{internal} diff --git a/man/make_cbio_clinical_header.Rd b/man/make_cbio_clinical_header.Rd index c200531d..41fa6ebe 100644 --- a/man/make_cbio_clinical_header.Rd +++ b/man/make_cbio_clinical_header.Rd @@ -4,19 +4,16 @@ \alias{make_cbio_clinical_header} \title{Make header for cBioPortal clinical data file} \usage{ -make_cbio_clinical_header(df, label, description, data_type) +make_cbio_clinical_header(df, mapping) } \arguments{ \item{df}{A \code{data.frame} representing clinical dataset to publicize.} -\item{label}{Character vector representing a short label for each column in the dataset} - -\item{description}{Character vector representing a long descriptions for each column in the dataset} - -\item{data_type}{Character vector representing the data type of each column in the dataset} +\item{mapping}{A reference table providing \code{label}, \code{description}, and \code{data_type} for each \code{source} attribute in \code{df}.} } \description{ This is called from the wrapper \code{write_cbio_clinical}. -Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396. +Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396. +Needs a data table of clinical data and a reference providing \code{label}, \code{description}, and \code{data_type}. } \keyword{internal} diff --git a/tests/testthat/test_register_study.R b/tests/testthat/test_register_study.R index 1b777416..87ebb17d 100644 --- a/tests/testthat/test_register_study.R +++ b/tests/testthat/test_register_study.R @@ -20,7 +20,8 @@ test_that("Add study meta works", { expected <- c(study_meta, studyStatus = "Active") - testthat::expect_mapequal(s, expected) + testthat::expect_output(print(s), "\\{'studyName': \\['NF Dev Playground'\\], 'dataStatus': \\['Data Not Expected'\\], 'initiative': \\['Other'\\], 'studyLeads': \\['Robert Allaway', 'Anh Nguyet Vu'\\], 'studyStatus': \\['Active'\\], 'diseaseFocus': \\['Multiple'\\], 'institutions': \\['Sage Bionetworks'\\], 'fundingAgency': \\['Sage Bionetworks'\\]\\}") + }) diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd index eadc1e79..f360b759 100644 --- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd +++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd @@ -29,7 +29,7 @@ The current API covers creating a cBioPortal study with a subset of data types r The design has been inspired by and should feel somewhat like working with the R package [usethis](https://github.com/r-lib/usethis), and data types can be added to the study package interactively. -Though there is some checking depending on the data type, final validation with the official cBioPortal validation tools/scripts should still be run (see last section). +Though there is some checking depending on the data type, final [validation](#Validation) with the official cBioPortal validation tools/scripts should still be run. Breaking changes are possible as the API is still in development. @@ -46,7 +46,8 @@ syn_login() ## Create a new study dataset -First create the study before we can put together the data. +First create the study dataset "package" where we can put together the data. +Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc. ```{r cbp_new_study, eval=FALSE} @@ -58,15 +59,19 @@ cbp_new_study(cancer_study_identifier = "npst_nfosi_ntap_2022", ## Add data types to study Data types can be most easily added _in any order_ using the `cbp_add*` functions. -These download data files and create the meta for them, using **defaults for NF-OSI processed data**. -Sometimes if these defaults don't match what you want, take a look at the lower-level utils `make_meta_*` or edit the files manually after. +These functions download data files and create the meta for them. -Important: user is expected to be in a valid cBioPortal study directory as set up in the previous step. +Note that: + +- These should be run with the working directory set to the study dataset directory as set up above to ensure consistent metadata. +- **Defaults are for known NF-OSI processed data outputs**. +- If these defaults don't apply because of changes in the scenario, take a look at the lower-level utils `make_meta_*` or edit the files manually after. +- Data types can vary in how much additional work is needed in remapping, reformatting, custom sanity checks, etc. ### Add mutations data - `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release. -No further modifications are done except renaming it. +- This data file type requires no further modifications except renaming. ```{r add_maf, eval=FALSE} @@ -88,27 +93,32 @@ cbp_add_cna(cna_data) ### Add expression data -- `expression_data` is expected to be a `.txt` file on Synapse. +- `expression_data` is expected to be a `.txt` called `gene_tpm.tsv` file on Synapse. +- The NF-OSI default includes including the raw expression data as well, called `gene_counts.tsv`, but this can be omitted. +- These NF-OSI outputs will be somewhat modified in translation to have the required headers. ```{r add_expression, eval=FALSE} mrna_data <- "syn********" +mrna_data_raw <- "syn********" -cbp_add_expression(mrna_data) +cbp_add_expression(mrna_data, + expression_data_raw = mrna_data_raw) ``` - ### Add clinical data -- `ref_view` is a fileview that contains clinical data for the data released in the study. -- `ref_map` maps clinical variables from the NF-OSI data dictionary to cBioPortal's +- `clinical_data` is a prepared clinical data table already subsetted to those released in this study, or pass in a query that can be used for subsetting if using a full clinical database table. For example, the full clinical cohort comprises patients 1-50, but this study dataset consists of available and releasable data only for patients 1-20 for expression data and data patients 15-20 for cna data. Here, `clinical_data` can be a smaller table of just those 1-30, or it can be the original table but pass in a suitable additional filter, e.g. `where release = 'batch1'`. +- Clinical data requires mapping to be as consistent with other public datasets as possible. `ref_map` defines the mapping of clinical variables from the NF-OSI data dictionary to cBioPortal's. Only variables in the mapping are exported to cBioPortal. Follow link below to inspect the default file and format used. +- Clinical data should be added last for overall sample checks to work. For example, if there is expression data for patients 1-20 and cna data patients 15-20, +it can more informatively warn about any missing/mismatches. ```{r add_clinical, eval=FALSE} -ref_view <- "syn43278088" +clinical_data <- "select * from syn43278088" ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml" -cbp_add_clinical(ref_view, ref_map) +cbp_add_clinical(clinical_data, ref_map) ``` ## Validation @@ -118,7 +128,10 @@ See the [general docs for dataset validation](https://docs.cbioportal.org/using- For the _public_ portal, the suggested step using the public server is given below. -Assuming your present working directory is `~/datahub/public` and a study folder called `mixed_nfosi_2022` has been placed into it, mount the dataset into the container and run validation with: -`docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:4.1.13 validateStudies.py -d /datahub -l mixed_nfosi_2022 -u http://cbioportal.org -html /datahub/mixed_nfosi_2022/html_report` +Assuming your present working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like: +``` +STUDY=npst_nfosi_ntap_2022 +sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:5.4.7 validateStudies.py -d /datahub -l $STUDY -u http://cbioportal.org -html /datahub/$STUDY/html_report +``` - \ No newline at end of file +The html report will list issues by data types to help with any corrections needed.