From c07b494224783d97f12499315378b78def202c67 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com>
Date: Fri, 9 Feb 2024 17:01:56 -0700
Subject: [PATCH] Patch/cbp and exports (#160)

* Export functions

* Update docs, namespace

* Update version

* Update cBP utils

* Fix ref

* Update clinical data prep

* Update docs

* Fix default sep

* Fix expression

* New cancer type helper

* More changes

* Update doc

* Fix test
---
 DESCRIPTION                                   |  2 +-
 NAMESPACE                                     |  2 +
 R/cbioportal.R                                | 52 +++++++++++--
 R/cboilerplate.R                              | 77 +++++++++++--------
 _pkgdown.yml                                  |  1 +
 man/cbp_add_clinical.Rd                       |  4 +-
 man/cbp_new_cancer_type.Rd                    | 21 +++++
 man/format_gene_expression_data.Rd            | 12 +++
 man/make_cbio_clinical_header.Rd              | 11 +--
 tests/testthat/test_register_study.R          |  3 +-
 ...tal-data-to-other-platforms-cbioportal.Rmd | 45 +++++++----
 11 files changed, 164 insertions(+), 66 deletions(-)
 create mode 100644 man/cbp_new_cancer_type.Rd
 create mode 100644 man/format_gene_expression_data.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 80039137..9e74f721 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nfportalutils
 Title: NF Portal Utilities
-Version: 0.0.0.946
+Version: 0.9500
 Authors@R: c(
     person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
            email = "robert.allaway@sagebionetworks.org",
diff --git a/NAMESPACE b/NAMESPACE
index 84e65a2c..85ab7af7 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,9 +31,11 @@ export(bad_url)
 export(bipartite_mmd_template)
 export(byte_budget)
 export(calculate_related_studies)
+export(cbp_add_clinical)
 export(cbp_add_cna)
 export(cbp_add_expression)
 export(cbp_add_maf)
+export(cbp_new_cancer_type)
 export(cbp_new_study)
 export(check_access)
 export(check_readpair_validity)
diff --git a/R/cbioportal.R b/R/cbioportal.R
index e77184bf..13504f30 100644
--- a/R/cbioportal.R
+++ b/R/cbioportal.R
@@ -110,20 +110,23 @@ cbp_new_study <- function(cancer_study_identifier,
 #' in Nextflow processing any spaces gets replaced with underscores so that's the default here.
 #' Does *not* check for missing samples, as final validation via cBioPortal tool is still expected for that.
 #'
-#' @param ref_view A view that contains all clinical data for the study.
+#' @param clinical_data Clinical table query.
 #' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
 #' @param verbose Whether to provide informative messages throughout.
-cbp_add_clinical <- function(ref_view,
+#'
+#' @export
+cbp_add_clinical <- function(clinical_data,
                              ref_map,
                              verbose = TRUE) {
 
   cancer_study_identifier <- check_cbp_study_id()
 
-  if(verbose) checked_message("Pulling the clinical data from Synapse")
-  df <- get_clinical_data_for_cbp_study(ref_view)
+  df <- .syn$tableQuery(clinical_data, includeRowIdAndRowVersion = FALSE)$asDataFrame()
+  if(verbose) checked_message("Retrieved clinical data from Synapse")
 
   if(verbose) checked_message("Formatting and making clinical data file(s)")
-  df$specimenID <- gsub(" ", "_", clinical_data$specimenID)
+  checked_message("Spaces in specimen IDs will be replaced with _ per cBioPortal specifications")
+  df$specimenID <- gsub(" ", "_", df$specimenID)
   write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)
 
   if(verbose) checked_message("Making sample clinical meta file")
@@ -238,6 +241,7 @@ cbp_add_expression <- function(expression_data,
   file <- .syn$get(expression_data, downloadLocation = ".")
   data_expression <- sub(file$name, "data_expression_tpm.txt", file$path)
   file.rename(file$path, data_expression)
+  format_gene_expression_data("data_expression_tpm.txt")
 
   if(verbose) checked_message("Making the meta file")
   make_meta_expression(cancer_study_identifier, type = "tpm")
@@ -247,6 +251,7 @@ cbp_add_expression <- function(expression_data,
     file <- .syn$get(expression_data_raw, downloadLocation = ".")
     data_expression_supp <- sub(file$name, "data_expression_raw.txt", file$path)
     file.rename(file$path, data_expression_supp)
+    format_gene_expression_data("data_expression_raw.txt")
 
     if(verbose) checked_message("Making the meta file for supplemental raw mRNA expression data file")
     make_meta_expression(cancer_study_identifier, type = "raw")
@@ -257,3 +262,40 @@ cbp_add_expression <- function(expression_data,
 
 }
 
+#' Format gene expression
+#'
+#' @keywords internal
+#' @import data.table
+format_gene_expression_data <- function(file) {
+  data_expression <- fread(file)
+  data_expression[, gene_id := NULL] # Ensembl ids not used in cBioPortal
+  setnames(data_expression, old = c("gene_name"), new = c("Hugo_Symbol"))
+  fwrite(data_expression, file = file, sep = "\t")
+}
+
+#' Create reference file for new cancer type
+#'
+#' Helper for creating reference for new cancer subtype which does not already exist.
+#' https://docs.cbioportal.org/file-formats/#cancer-type
+#'
+#' @param type_of_cancer Id for new cancer type, e.g. "cnf".
+#' @param name Full name for new cancer type, e.g. "Cutaneous Neurofibroma"
+#' @param color Color name for new cancer; https://en.wikipedia.org/wiki/Web_colors#X11_color_names.
+#' @param parent_type_of_cancer Id of existing parent, e.g. "nfib" for Neurofibroma.
+#' @export
+cbp_new_cancer_type <- function(type_of_cancer,
+                                name,
+                                color,
+                                parent_type_of_cancer) {
+
+  cat("genetic_alteration_type: CANCER_TYPE",
+      "datatype: CANCER_TYPE",
+      "data_filename: cancer_type.txt",
+      sep = "\n",
+      file = "meta_cancer_type.txt")
+
+  cat(glue::glue("{type_of_cancer}\t{name}\t{color}\t{parent_type_of_cancer}"),
+      file = "cancer_type.txt")
+
+  checked_message("Created new cancer type meta and data")
+}
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 08f80f8f..b4791302 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -19,16 +19,18 @@
 #' Make header for cBioPortal clinical data file
 #'
 #' This is called from the wrapper `write_cbio_clinical`.
-#' Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
+#' Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
+#' Needs a data table of clinical data and a reference providing `label`, `description`, and `data_type`.
 #'
 #' @param df A `data.frame` representing clinical dataset to publicize.
-#' @param label Character vector representing a short label for each column in the dataset
-#' @param description Character vector representing a long descriptions for each column in the dataset
-#' @param data_type Character vector representing the data type of each column in the dataset
+#' @param mapping A reference table providing `label`, `description`, and `data_type` for each `source` attribute in `df`.
 #' @keywords internal
-make_cbio_clinical_header <- function(df, label, description, data_type) {
+make_cbio_clinical_header <- function(df, mapping) {
+
+  label <- mapping[match(names(df), source), label]
+  description <- mapping[match(names(df), source), description]
+  data_type <- mapping[match(names(df), source), data_type]
 
-  # Original code assigns a default priority = 1 to all; this is kept until we need more complex configuration
   header <- rbind(label, description, data_type, rep(1))
   header <- t(apply(header, 1, function(x) { return(c(paste0("#", x[1]), x[2:length(x)]))}))
   header <- rbind(header, label) # use harmonized name as row-5 attribute names
@@ -83,38 +85,45 @@ write_cbio_clinical <- function(df,
                                 verbose = TRUE) {
 
   m <- use_ref_map(ref_map)
-  attributes <- m$source
-  m <- split(m, by = "attribute_type")
-
-  # Move/factor out these checks?
-  if(!all(attributes %in% names(df))) stop(glue::glue_collapse(setdiff(attributes, names(df)), ","), " specified in mapping but not available in data. Check data.")
-  if(!"SAMPLE" %in% names(m)) stop("According to mapping, no SAMPLE clinical file will be created. Check mapping.")
+  present <- names(df)
+  required <- m$source[m$required]
+  attributes <- unique(m$source)
+
+  # Attribute checks
+  message("Clinical attributes present are: ", paste(present, collapse = ", "))
+  if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", "))
+  if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ","))
+
+  # Take care of list columns and NA
+  .df <- data.table::copy(df)
+  for(col in names(.df)) {
+    if(class(.df[[col]]) == "list") {
+      .df[[col]] <- sapply(.df[[col]], function(x) paste0(x, collapse = "-"))
+      warning(glue::glue("The {col} field was stored as a list has been coerced for export, you may want to check output."), call. = F)
+    }
+    # Use actual NA's so that `write.table` can write out "" consistently
+    .df[.df[[col]] %in% na_recode, col ] <- NA_character_
+  }
 
   files <- list()
-  for(clinical_type in names(m)) {
-    .df <- df[, m[[clinical_type]]$source ]
-    # cBioPortal does not allow list columns
-    for(col in names(.df)) {
-      if(class(.df[[col]]) == "list") {
-        .df[[col]] <- paste(.df[[col]], sep = ",")
-        warning(glue::glue("Coerced {col} data from list for export, you may want to check output."), call. = F)
-      }
-      # Use actual NA's so that `write.table` can write out "" consistently
-      .df[.df[[col]] %in% na_recode, col ] <- NA_character_
+  m <- split(m, by = "attribute_type")
+  if("individualID" %in% names(.df)) {
+    patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)])
+    header <- make_cbio_clinical_header(patient_df, m$PATIENT)
+    patient_df <- rbind(header, patient_df)
+    files[["PATIENT"]] <- patient_df
+  }
+  {
+    sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)]
+    header <- make_cbio_clinical_header(sample_df, m$SAMPLE)
+    sample_df <- rbind(header, sample_df)
+    files[["SAMPLE"]] <- sample_df
+  }
 
-    }
-    if(clinical_type == "PATIENT") {
-      .df <- unique(.df)
-    }
+  for(clinical_type in names(files)) {
     filename <- get_cbio_filename(clinical_type)
-    header <- make_cbio_clinical_header(.df,
-                                        m[[clinical_type]]$label,
-                                        m[[clinical_type]]$description,
-                                        m[[clinical_type]]$data_type)
-
-    df_out <- rbind(header, .df)
     path <- glue::glue("{publish_dir}/{filename}")
-    write.table(df_out,
+    write.table(files[[clinical_type]],
                 file = path,
                 sep = delim,
                 na = "",
@@ -122,8 +131,8 @@ write_cbio_clinical <- function(df,
                 row.names = F,
                 quote = F)
     if(verbose) message(glue::glue("{clinical_type} data written to: {path}"))
-    files[[clinical_type]] <- df_out
   }
+
   invisible(files)
 }
 
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 5b37ee74..8fe504be 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -129,6 +129,7 @@ reference:
   desc: Export data as a cBioPortal study
 - contents:
   - cbp_new_study
+  - cbp_new_cancer_type
   - cbp_add_maf
   - cbp_add_clinical
   - cbp_add_expression
diff --git a/man/cbp_add_clinical.Rd b/man/cbp_add_clinical.Rd
index ca06b3d7..4168b5a2 100644
--- a/man/cbp_add_clinical.Rd
+++ b/man/cbp_add_clinical.Rd
@@ -4,10 +4,10 @@
 \alias{cbp_add_clinical}
 \title{Export and add clinical data to cBioPortal dataset}
 \usage{
-cbp_add_clinical(ref_view, ref_map, verbose = TRUE)
+cbp_add_clinical(clinical_data, ref_map, verbose = TRUE)
 }
 \arguments{
-\item{ref_view}{A view that contains all clinical data for the study.}
+\item{clinical_data}{Clinical table query.}
 
 \item{ref_map}{YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.}
 
diff --git a/man/cbp_new_cancer_type.Rd b/man/cbp_new_cancer_type.Rd
new file mode 100644
index 00000000..1a999720
--- /dev/null
+++ b/man/cbp_new_cancer_type.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal.R
+\name{cbp_new_cancer_type}
+\alias{cbp_new_cancer_type}
+\title{Create reference file for new cancer type}
+\usage{
+cbp_new_cancer_type(type_of_cancer, name, color, parent_type_of_cancer)
+}
+\arguments{
+\item{type_of_cancer}{Id for new cancer type, e.g. "cnf".}
+
+\item{name}{Full name for new cancer type, e.g. "Cutaneous Neurofibroma"}
+
+\item{color}{Color name for new cancer; https://en.wikipedia.org/wiki/Web_colors#X11_color_names.}
+
+\item{parent_type_of_cancer}{Id of existing parent, e.g. "nfib" for Neurofibroma.}
+}
+\description{
+Helper for creating reference for new cancer subtype which does not already exist.
+https://docs.cbioportal.org/file-formats/#cancer-type
+}
diff --git a/man/format_gene_expression_data.Rd b/man/format_gene_expression_data.Rd
new file mode 100644
index 00000000..39654d67
--- /dev/null
+++ b/man/format_gene_expression_data.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal.R
+\name{format_gene_expression_data}
+\alias{format_gene_expression_data}
+\title{Format gene expression}
+\usage{
+format_gene_expression_data(file)
+}
+\description{
+Format gene expression
+}
+\keyword{internal}
diff --git a/man/make_cbio_clinical_header.Rd b/man/make_cbio_clinical_header.Rd
index c200531d..41fa6ebe 100644
--- a/man/make_cbio_clinical_header.Rd
+++ b/man/make_cbio_clinical_header.Rd
@@ -4,19 +4,16 @@
 \alias{make_cbio_clinical_header}
 \title{Make header for cBioPortal clinical data file}
 \usage{
-make_cbio_clinical_header(df, label, description, data_type)
+make_cbio_clinical_header(df, mapping)
 }
 \arguments{
 \item{df}{A \code{data.frame} representing clinical dataset to publicize.}
 
-\item{label}{Character vector representing a short label for each column in the dataset}
-
-\item{description}{Character vector representing a long descriptions for each column in the dataset}
-
-\item{data_type}{Character vector representing the data type of each column in the dataset}
+\item{mapping}{A reference table providing \code{label}, \code{description}, and \code{data_type} for each \code{source} attribute in \code{df}.}
 }
 \description{
 This is called from the wrapper \code{write_cbio_clinical}.
-Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
+Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
+Needs a data table of clinical data and a reference providing \code{label}, \code{description}, and \code{data_type}.
 }
 \keyword{internal}
diff --git a/tests/testthat/test_register_study.R b/tests/testthat/test_register_study.R
index 1b777416..87ebb17d 100644
--- a/tests/testthat/test_register_study.R
+++ b/tests/testthat/test_register_study.R
@@ -20,7 +20,8 @@ test_that("Add study meta works", {
 
   expected <- c(study_meta, studyStatus = "Active")
 
-  testthat::expect_mapequal(s, expected)
+  testthat::expect_output(print(s), "\\{'studyName': \\['NF Dev Playground'\\], 'dataStatus': \\['Data Not Expected'\\], 'initiative': \\['Other'\\], 'studyLeads': \\['Robert Allaway', 'Anh Nguyet Vu'\\], 'studyStatus': \\['Active'\\], 'diseaseFocus': \\['Multiple'\\], 'institutions': \\['Sage Bionetworks'\\], 'fundingAgency': \\['Sage Bionetworks'\\]\\}")
+
 })
 
 
diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index eadc1e79..f360b759 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -29,7 +29,7 @@ The current API covers creating a cBioPortal study with a subset of data types r
 The design has been inspired by and should feel somewhat like working with the R package [usethis](https://github.com/r-lib/usethis), 
 and data types can be added to the study package interactively.
 
-Though there is some checking depending on the data type, final validation with the official cBioPortal validation tools/scripts should still be run (see last section).
+Though there is some checking depending on the data type, final [validation](#Validation) with the official cBioPortal validation tools/scripts should still be run.
 
 Breaking changes are possible as the API is still in development.
 
@@ -46,7 +46,8 @@ syn_login()
 
 ## Create a new study dataset
 
-First create the study before we can put together the data.
+First create the study dataset "package" where we can put together the data. 
+Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc.
 
 ```{r cbp_new_study, eval=FALSE}
 
@@ -58,15 +59,19 @@ cbp_new_study(cancer_study_identifier = "npst_nfosi_ntap_2022",
 ## Add data types to study
 
 Data types can be most easily added _in any order_ using the `cbp_add*` functions.
-These download data files and create the meta for them, using **defaults for NF-OSI processed data**. 
-Sometimes if these defaults don't match what you want, take a look at the lower-level utils `make_meta_*` or edit the files manually after.
+These functions download data files and create the meta for them. 
 
-Important: user is expected to be in a valid cBioPortal study directory as set up in the previous step.
+Note that:
+
+- These should be run with the working directory set to the study dataset directory as set up above to ensure consistent metadata.
+- **Defaults are for known NF-OSI processed data outputs**. 
+- If these defaults don't apply because of changes in the scenario, take a look at the lower-level utils `make_meta_*` or edit the files manually after.
+- Data types can vary in how much additional work is needed in remapping, reformatting, custom sanity checks, etc.
 
 ### Add mutations data
 
 - `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release. 
-No further modifications are done except renaming it.
+- This data file type requires no further modifications except renaming.
 
 ```{r add_maf, eval=FALSE}
 
@@ -88,27 +93,32 @@ cbp_add_cna(cna_data)
 
 ### Add expression data
 
-- `expression_data` is expected to be a `.txt` file on Synapse.
+- `expression_data` is expected to be a `.txt` called `gene_tpm.tsv` file on Synapse.
+- The NF-OSI default includes including the raw expression data as well, called `gene_counts.tsv`, but this can be omitted.
+- These NF-OSI outputs will be somewhat modified in translation to have the required headers.
 
 ```{r add_expression, eval=FALSE}
 
 mrna_data <- "syn********"
+mrna_data_raw <- "syn********"
 
-cbp_add_expression(mrna_data)
+cbp_add_expression(mrna_data,
+                   expression_data_raw = mrna_data_raw)
 ```
 
-
 ### Add clinical data
 
-- `ref_view` is a fileview that contains clinical data for the data released in the study.
-- `ref_map` maps clinical variables from the NF-OSI data dictionary to cBioPortal's
+- `clinical_data` is a prepared clinical data table already subsetted to those released in this study, or pass in a query that can be used for subsetting if using a full clinical database table. For example, the full clinical cohort comprises patients 1-50, but this study dataset consists of available and releasable data only for patients 1-20 for expression data and data patients 15-20 for cna data. Here, `clinical_data` can be a smaller table of just those 1-30, or it can be the original table but pass in a suitable additional filter, e.g. `where release = 'batch1'`.
+- Clinical data requires mapping to be as consistent with other public datasets as possible. `ref_map` defines the mapping of clinical variables from the NF-OSI data dictionary to cBioPortal's. Only variables in the mapping are exported to cBioPortal. Follow link below to inspect the default file and format used.
+- Clinical data should be added last for overall sample checks to work. For example, if there is expression data for patients 1-20 and cna data patients 15-20,
+it can more informatively warn about any missing/mismatches. 
 
 ```{r add_clinical, eval=FALSE}
 
-ref_view <- "syn43278088"
+clinical_data <- "select * from syn43278088"
 ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml"
 
-cbp_add_clinical(ref_view, ref_map)
+cbp_add_clinical(clinical_data, ref_map)
 ```
 
 ## Validation
@@ -118,7 +128,10 @@ See the [general docs for dataset validation](https://docs.cbioportal.org/using-
 
 For the _public_ portal, the suggested step using the public server is given below.  
 
-Assuming your present working directory is `~/datahub/public` and a study folder called `mixed_nfosi_2022` has been placed into it, mount the dataset into the container and run validation with:  
-`docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:4.1.13 validateStudies.py -d /datahub -l mixed_nfosi_2022 -u http://cbioportal.org -html /datahub/mixed_nfosi_2022/html_report`
+Assuming your present working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like:  
+```
+STUDY=npst_nfosi_ntap_2022
+sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:5.4.7 validateStudies.py -d /datahub -l $STUDY -u http://cbioportal.org -html /datahub/$STUDY/html_report
+```
 
- 
\ No newline at end of file
+The html report will list issues by data types to help with any corrections needed.