From 1b076463170d26199edd09f539ce05f7f43851a8 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 17 Aug 2023 11:49:23 -0600
Subject: [PATCH 01/26] Add cboilerplate for CNA meta

---
 R/cboilerplate.R | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 06520af9..17357695 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -1,11 +1,10 @@
 # ------------------------------------------------------------------------------ #
-# Attribution: The following utils to make cBioPortal files are adapted from some code written by the awesome @hhunterzinck
+# Attribution: Following utils for packaging to cBioPortal are adapted from some code written by the awesome @hhunterzinck
 # in the repo https://github.com/Sage-Bionetworks/genie-erbb2-cbio/
 
 # TO DO / DEV NOTES: 
-# 1. Many of functions for making meta files according to the data type might
-# benefit from implementation using S3 classes esp. if will be using more different types in the future,
-# but that is currently not worth the rewrite.
+# 1. Many of functions for making meta files according to the data type might benefit 
+# from reimplementation using S3 classes esp. if different types explode, but currently not worth the rewrite.
 
 # 2. It may make sense to write meta files automatically whenever writing a 
 # data file is called. This might be mainly updating the main wrapper or creating more wrappers.
@@ -281,6 +280,31 @@ make_meta_maf <- function(cancer_study_identifier,
   invisible(df_file)
 }
 
+
+#' Make meta file for seg (copy number variation) data
+#' 
+#' See https://docs.cbioportal.org/file-formats/#segmented-data
+#' @keywords internal
+make_meta_seg <- function(cancer_study_identifier, 
+                          data_filename = "data_cna.seg",
+                          reference_genome = "hg19",
+                          publish_dir = ".",
+                          write = TRUE,
+                          verbose = TRUE) {
+    
+  meta_filename <- "meta_seg.txt"
+  df_file <- make_meta_genomic_generic(cancer_study_identifier = cancer_study_identifier,
+                                       genetic_alteration_type = "COPY_NUMBER_ALTERATION",
+                                       datatype = "SEG",
+                                       reference_genome_id = reference_genome_id,
+                                       description = "Somatic CNA from NF-OSI processing.",
+                                       data_filename = data_filename)
+  
+  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
+  invisible(df_file)
+
+}
+
 # --- Meta study --------------------------------------------------------------- #
 
 #' Template for meta study file

From bffcb6f14067bf3e5aaa71577faa9511f7402538 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 17 Aug 2023 11:55:46 -0600
Subject: [PATCH 02/26] Reorganize/update notes

---
 R/syncBP.R | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/R/syncBP.R b/R/syncBP.R
index 1fd65e9e..66c2017f 100644
--- a/R/syncBP.R
+++ b/R/syncBP.R
@@ -1,3 +1,10 @@
+#' TO DO enhancements
+#' 
+#' When a dataset has already been created, one might want to only add or update a 
+#' data type (bc clinical data is the same). Having an `add_*` or `update_*` 
+#' type util might be useful, e.g. to add expression data to mutations data.
+
+
 #' Make cBioPortal mutations dataset from Synapse assets
 #' 
 #' The NF-OSI workflow produces a single merged maf file that represents a filtered subset of the `maf`s,
@@ -189,13 +196,4 @@ match_maf_sample_id <- function(clinical_data, merged_maf = NULL) {
   return(clinical_data)
 }
 
-#' TO DO enhancements
-#' 
-#' When a dataset has already been created, one might want to only add or update a 
-#' data type (where clinical data is the same). Having an `add_*` or `update_*` 
-#' type util might be useful, e.g. to add expression data to mutation data.
-#'
-#' Additionally, while meta files can be easily edited by hand after they are created, 
-#' some fields such as `cancer_study_identifier` will need to be changed across multiple files,
-#' so if someone can't use something like `sed` it might be helpful to have an R util for this.
 

From cb2ad9d650105dfe1c1e7b59ee54d48368014407 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 17 Aug 2023 16:01:49 -0600
Subject: [PATCH 03/26] Large refactoring of cbioportal utils (wip)

---
 R/{syncBP.R => cbioportal_export.R} | 207 +++++++++++++++++-----------
 R/cboilerplate.R                    |   6 +-
 2 files changed, 131 insertions(+), 82 deletions(-)
 rename R/{syncBP.R => cbioportal_export.R} (59%)

diff --git a/R/syncBP.R b/R/cbioportal_export.R
similarity index 59%
rename from R/syncBP.R
rename to R/cbioportal_export.R
index 66c2017f..5a4345f7 100644
--- a/R/syncBP.R
+++ b/R/cbioportal_export.R
@@ -1,24 +1,107 @@
-#' TO DO enhancements
+# These are mainly wrappers. See `cboilerplate.R` for lower-level formatting.
+# Think of this as making a new R package using `devtools`, except making a new package for cBioPortal.
+
+#' Enumerate combinations of valid cBP data types
+#' 
+#' https://docs.cbioportal.org/file-formats/
+cbp_data_types <- function() {
+  
+  list(c("CLINICAL", "SAMPLE_ATTRIBUTES"),
+       c("CLINICAL", "PATIENT_ATTRIBUTES"),
+       c("COPY_NUMBER_ALTERATION", "DISCRETE"),
+       c("COPY_NUMBER_ALTERATION", "DISCRETE_LONG"),
+       c("COPY_NUMBER_ALTERATION", "CONTINUOUS"),
+       c("COPY_NUMBER_ALTERATION", "LOG2-VALUE"),
+       c("COPY_NUMBER_ALTERATION", "SEG"),
+       c("MRNA_EXPRESSION", "CONTINUOUS"),
+       c("MRNA_EXPRESSION", "Z-SCORE"),
+       c("MUTATION_EXTENDED", "MAF"),
+       c("METHYLATION", "CONTINUOUS"),
+       c("PROTEIN_LEVEL", "LOG2-VALUE"),
+       c("STRUCTURAL_VARIANT", "SV"))
+  
+}
+
+#' Create cBioPortal study
 #' 
-#' When a dataset has already been created, one might want to only add or update a 
-#' data type (bc clinical data is the same). Having an `add_*` or `update_*` 
-#' type util might be useful, e.g. to add expression data to mutations data.
+#' @param cancer_study_identifier Cancer study identifier. See cBioPortal standards for ids.
+#' @param name of the cancer study, e.g. something following convention is "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
+#' @param publish_dir Directory to create for study if doesn't exist, same as `cancer_study_identifier` by default.
+#' @inheritParams make_meta_study
+cbp_new_study <- function(...) {
+  
+  if(!dir.exists(publish_dir)) {
+    if(verbose) message(glue::glue("Creating {publish_dir} dataset directory"))
+    dir.create(publish_dir)
+  }
+  
+  message("Setting new dataset directory as working directory.")
+  setwd(publish_dir)
+  
+  # If a single value in tumorType, use that, otherwise "mixed" as the catch-all
+  type_of_cancer <- unique(df$tumorType)
+  type_of_cancer <- type_of_cancer[type_of_cancer != ""]
+  if(length(type_of_cancer) != 1) {
+    type_of_cancer <- "mixed" 
+    if(verbose) message("More than one cancer type detected in data, using `mixed` for study.")
+  }
+  
+  # Uses defaults for `groups` and `global_case_list`
+  make_meta_study(cancer_study_identifier = cancer_study_identifier,
+                  type_of_cancer = type_of_cancer,
+                  name = name,
+                  description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). 
+                    The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).", 
+                  citation = citation,
+                  pmid = pmid,
+                  short_name = short_name,
+                  verbose = verbose)
+}
 
+# ------------------------------------------------------------------------------- #
 
-#' Make cBioPortal mutations dataset from Synapse assets
-#' 
-#' The NF-OSI workflow produces a single merged maf file that represents a filtered subset of the `maf`s,
-#' containing only the (non-germline) data that _can_ be released for cBioPortal.
-#' However, this data file by itself is not immediately loadable into an instance of a cBioPortal server
-#' and needs to be packaged with other files, such as this 
-#' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020). 
-#' This is a wrapper that goes through several steps needed to create said bundle of cBioPortal files more conveniently.
-#' 
-#' 1. A simple sanity check that this is the version of the maf release file that we want, based on the samplesheet.
-#' For example, version 1 of the samplesheet will generate a version 1 of the merged maf, 
-#' but if there is a later correction that retracts a sample (`is_releasable`=FALSE), 
-#' that step of the workflow to generate merged maf will be rerun (or it should be), so we'll want to make sure the latest versions of these files are used.  
-#' The latest versions of the samplesheets tied to each release are currently stored in `syn38793855`. 
+#' Export and add clinical data to cBioPortal dataset
+#' 
+#' #' This should be run in an existing dataset package root.
+#' 
+#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
+#' @param ref_view A view that contains all clinical data for the study.
+cbp_add_clinical <- function(ref_map,
+                             ref_view) {
+  
+  if(verbose) message("--- Pulling the clinical data ---")
+  df <- get_data_for_releasable(ss, 
+                                ref_view, 
+                                verbose = verbose)
+  write_cbio_clinical(df, 
+                      ref_map = ref_map, 
+                      verbose = verbose)
+  
+  if(verbose) message("--- Making clinical meta file ---")
+  # before making meta, check that data file was actually written 
+  if(file.exists("data_clinical_patient.txt")) {
+    make_meta_patient(cancer_study_identifier, verbose = verbose)
+  }
+  
+  make_meta_sample(cancer_study_identifier, verbose = verbose)
+  
+  if(verbose) message("Clinical data added.")
+  
+}
+
+# ------------------------------------------------------------------------------- #
+
+#' Export and add mutations data to cBioPortal dataset
+#' 
+#' This should be run in an existing dataset package root.
+#' 
+#' Get merged maf file that represents filtered subset of `maf`s containing only (non-germline) data OK to release publicly.
+#' This needs to be packaged with other files like this
+#' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020).
+#' 
+#' 1. Sanity check that this is the version of the maf release file that we want, based on the samplesheet.
+#' Sometimes a later version retracts a sample (`is_releasable`=FALSE) so merged maf being exported should match samplesheet.  
+#' Latest versions of the samplesheets tied to each release are currently stored in `syn38793855`.
 #' (Note: Please file issue to update this doc if this changes.)
 #' 
 #' 2. Make the clinical data files. 
@@ -31,88 +114,35 @@
 #' 
 #' 3. Make meta files. Meta files are needed for describing the study, mutations data file, clinical data files.
 #' 
-#' @inheritParams make_meta_study
 #' @param merged_maf Synapse id of `merged maf` file for public release.
 #' @param samplesheet Synapse id or local path to samplesheet with release info.
-#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
-#' @param ref_view A view that contains clinical data for the release files.
-#' @param name Name of the cancer study, e.g. something following convention is "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
 #' @param cancer_study_identifier Study identifier, convention is `{tumorType}_{institution}_{year}`, so for example "mpnst_nfosi_2022".
-#' @param publish_dir Where to output the set of files. 
-#' Defaults to (creating if necessary) a folder with same name as `cancer_study_identifier`.
 #' @param verbose Whether to provide informative messages throughout.
 #' @export
-syncBP_maf <- function(merged_maf,
-                       samplesheet,
-                       ref_map,
-                       ref_view, 
-                       name,
-                       cancer_study_identifier,
-                       citation = NULL,
-                       pmid = NULL,
-                       short_name = NULL,
-                       publish_dir = cancer_study_identifier,
-                       verbose = TRUE) {
+cbp_add_maf <- function(merged_maf,
+                        samplesheet,
+                        verbose = TRUE) {
   
   .check_login()
   
-  if(!dir.exists(publish_dir)) {
-    if(verbose) message(glue::glue("Creating {publish_dir} dataset directory"))
-    dir.create(publish_dir)
-  }
-  
   if(verbose) message("--- Getting `merged_maf` file ---")
-  file <- .syn$get(merged_maf, downloadLocation = publish_dir)
+  file <- .syn$get(merged_maf, downloadLocation = ".")
   
-  if(verbose) message("--- Standardizing `maf` release filename ---")
+  if(verbose) message("--- Standardizing `maf` release data file name ---")
   data_mutations_extended <- sub(file$name, "data_mutations_extended.txt", file$path)
   file.rename(file$path, data_mutations_extended)
   
   if(verbose) message("--- Checking the `maf` release file against samplesheet ---")
-  mm <- dt_read(data_mutations_extended) # don't bother with maftools dependency as doing basic check
+  mm <- dt_read(data_mutations_extended)
   ss <- dt_read(samplesheet)
   check_result <- check_maf_release(mm, ss)
   
   if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue.")
   
-  if(verbose) message("--- Pulling the clinical data ---")
-  df <- get_data_for_releasable(ss, ref_view, verbose = verbose)
-  
-  if(verbose) message("--- Matching clinical data sample ids to `maf` ---")
-  df <- match_maf_sample_id(df)
-  
-  if(verbose) message("--- Making the clinical data files ---")
-  write_cbio_clinical(df, ref_map = ref_map, publish_dir = publish_dir, verbose = verbose)
+  if(verbose) message("--- Making maf meta file ---")
+  make_meta_maf(cancer_study_identifier, verbose = verbose)
   
-  if(verbose) message("--- Making the meta files ---")
-  # only make meta patient if see that it has been outputted; 
-  # meta sample is required and previous step will error otherwise
-  if(file.exists(glue::glue("{publish_dir}/data_clinical_patient.txt"))) {
-    make_meta_patient(cancer_study_identifier, publish_dir = publish_dir, verbose = verbose)
-  }
-  make_meta_sample(cancer_study_identifier, publish_dir = publish_dir, verbose = verbose)
-  make_meta_maf(cancer_study_identifier, publish_dir = publish_dir, verbose = verbose)
-  
-  # If a single value in tumorType, use that, otherwise "mixed" as the catch-all
-  type_of_cancer <- unique(df$tumorType)
-  type_of_cancer <- type_of_cancer[type_of_cancer != ""]
-  if(length(type_of_cancer) != 1) {
-    type_of_cancer <- "mixed" 
-    if(verbose) message("More than one cancer type detected in data, using `mixed` for study.")
-  } 
- 
-  # Uses defaults for `groups` and `global_case_list`
-  make_meta_study(cancer_study_identifier = cancer_study_identifier,
-                  type_of_cancer = type_of_cancer,
-                  name = name,
-                  description = "The mutation data were processed using the `sarek` nf-core pipeline. The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).", 
-                  citation = citation,
-                  pmid = pmid,
-                  short_name = short_name,
-                  publish_dir = publish_dir, 
-                  verbose = verbose)
-  
-  if(verbose) message("All files have been added successfully.")
+  if(verbose) message("Maf data added.")
   
 }
 
@@ -180,6 +210,7 @@ check_maf_release <- function(merged_maf,
   invisible(result)
 }
 
+
 #' Match clinical data with maf sample ids
 #' 
 #' PATIENT_ID and SAMPLE_ID can only contain letters, numbers, points, underscores and/or hyphens.
@@ -196,4 +227,20 @@ match_maf_sample_id <- function(clinical_data, merged_maf = NULL) {
   return(clinical_data)
 }
 
+# ------------------------------------------------------------------------------- #
 
+#' Export and add CNA (seg) data to cBioPortal dataset
+#' 
+#' This should be run in an existing dataset package root.
+#' 
+#' @export
+#' 
+#' @param cna_release Syn id of CNA release data, expected to be a .seg file.
+cbp_add_cna <- function(cna_release) {
+  
+  if(verbose) message("--- Getting the data file ---")
+  file <- .syn$get(cna_release, downloadLocation = ".")
+  
+  if(verbose) message("--- Making the meta file ---")
+  make_meta_cna(...)
+} 
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 17357695..959f820b 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -281,11 +281,13 @@ make_meta_maf <- function(cancer_study_identifier,
 }
 
 
-#' Make meta file for seg (copy number variation) data
+#' Make meta file for cBioPortal copy number alteration data
+#' 
+#' Currently assumes seg data and should be extended later.
 #' 
 #' See https://docs.cbioportal.org/file-formats/#segmented-data
 #' @keywords internal
-make_meta_seg <- function(cancer_study_identifier, 
+make_meta_cna <- function(cancer_study_identifier,
                           data_filename = "data_cna.seg",
                           reference_genome = "hg19",
                           publish_dir = ".",

From 1e65147c4b3e69fbfcc9c2ac8c31df07ddded6b6 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Fri, 18 Aug 2023 08:12:19 -0600
Subject: [PATCH 04/26] Refactoring (wip)

---
 R/cbioportal_export.R | 114 +++++++++++++++++++-----------------------
 R/cboilerplate.R      |  47 -----------------
 2 files changed, 51 insertions(+), 110 deletions(-)

diff --git a/R/cbioportal_export.R b/R/cbioportal_export.R
index 5a4345f7..f8633f00 100644
--- a/R/cbioportal_export.R
+++ b/R/cbioportal_export.R
@@ -1,9 +1,12 @@
-# These are mainly wrappers. See `cboilerplate.R` for lower-level formatting.
-# Think of this as making a new R package using `devtools`, except making a new package for cBioPortal.
+# Export data from Synapse as a cBioPortal dataset.
+# This tries to have the spirit of making a new R package with `devtools`, so dataset is a "package" for cBioPortal.
+# This file contain the higher-level wrappers. See `cboilerplate.R` for the true lower-level utils.
 
 #' Enumerate combinations of valid cBP data types
 #' 
 #' https://docs.cbioportal.org/file-formats/
+#' 
+#' @keywords internal
 cbp_data_types <- function() {
   
   list(c("CLINICAL", "SAMPLE_ATTRIBUTES"),
@@ -19,16 +22,39 @@ cbp_data_types <- function() {
        c("METHYLATION", "CONTINUOUS"),
        c("PROTEIN_LEVEL", "LOG2-VALUE"),
        c("STRUCTURAL_VARIANT", "SV"))
-  
 }
 
-#' Create cBioPortal study
+#' Create a cBioPortal study
+#' 
+#' See specifications for a study at https://docs.cbioportal.org/file-formats/#meta-file.
 #' 
 #' @param cancer_study_identifier Cancer study identifier. See cBioPortal standards for ids.
 #' @param name of the cancer study, e.g. something following convention is "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
+#' @param type_of_cancer Type of cancer, defaults to "mixed". See also http://oncotree.mskcc.org/#/home.
+#' @param name Name of the study.
+#' @param description Description of the study. A default generic description is provided.
+#' @param citation (Optional) A relevant citation, e.g. "TCGA, Nature 2012".
+#' @param pmid (Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be `NULL`.
+#' @param groups (Optional) Defaults to "PUBLIC" for use with public cBioPortal; 
+#' otherwise, use group names that make sense for your instance.
+#' @param add_global_case_list (Optional) Use `NULL` to ignore, but default is `TRUE` for an "All samples" case list to be generated automatically.
+#' @param short_name (Optional) Short name for the study.
 #' @param publish_dir Directory to create for study if doesn't exist, same as `cancer_study_identifier` by default.
-#' @inheritParams make_meta_study
-cbp_new_study <- function(...) {
+#' @param verbose Verbosity level.
+#' 
+#' @export
+cbp_new_study <- function(cancer_study_identifier,
+                          name,
+                          type_of_cancer = "mixed",
+                          description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). 
+                          The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
+                          citation = NULL,
+                          pmid = NULL,
+                          groups = "PUBLIC",
+                          short_name = NULL,
+                          add_global_case_list = TRUE,
+                          publish_dir = cancer_study_identifier,
+                          verbose = TRUE) {
   
   if(!dir.exists(publish_dir)) {
     if(verbose) message(glue::glue("Creating {publish_dir} dataset directory"))
@@ -38,24 +64,18 @@ cbp_new_study <- function(...) {
   message("Setting new dataset directory as working directory.")
   setwd(publish_dir)
   
-  # If a single value in tumorType, use that, otherwise "mixed" as the catch-all
-  type_of_cancer <- unique(df$tumorType)
-  type_of_cancer <- type_of_cancer[type_of_cancer != ""]
-  if(length(type_of_cancer) != 1) {
-    type_of_cancer <- "mixed" 
-    if(verbose) message("More than one cancer type detected in data, using `mixed` for study.")
-  }
-  
-  # Uses defaults for `groups` and `global_case_list`
-  make_meta_study(cancer_study_identifier = cancer_study_identifier,
-                  type_of_cancer = type_of_cancer,
-                  name = name,
-                  description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). 
-                    The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).", 
-                  citation = citation,
-                  pmid = pmid,
-                  short_name = short_name,
-                  verbose = verbose)
+  df_file <- make_meta_study_generic(cancer_study_identifier = cancer_study_identifier,
+                                     type_of_cancer = type_of_cancer, 
+                                     name = name, 
+                                     description = description, 
+                                     citation = citation,
+                                     pmid = pmid,
+                                     groups = groups, 
+                                     short_name = short_name,
+                                     add_global_case_list = add_global_case_list)
+  
+  write_meta(df_file, "meta_study.txt", verbose = verbose)
+  if(verbose) message("--- Study meta added ---")
 }
 
 # ------------------------------------------------------------------------------- #
@@ -69,16 +89,13 @@ cbp_new_study <- function(...) {
 cbp_add_clinical <- function(ref_map,
                              ref_view) {
   
-  if(verbose) message("--- Pulling the clinical data ---")
-  df <- get_data_for_releasable(ss, 
-                                ref_view, 
-                                verbose = verbose)
-  write_cbio_clinical(df, 
-                      ref_map = ref_map, 
-                      verbose = verbose)
+  if(verbose) message("--- Pulling the clinical data from Synapse ---")
+  df <- get_clinical_data_for_cbp_study(ref_view)
+  
+  write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)
   
   if(verbose) message("--- Making clinical meta file ---")
-  # before making meta, check that data file was actually written 
+  # before making meta, check that data file was actually written, since patient data is optional 
   if(file.exists("data_clinical_patient.txt")) {
     make_meta_patient(cancer_study_identifier, verbose = verbose)
   }
@@ -129,8 +146,8 @@ cbp_add_maf <- function(merged_maf,
   file <- .syn$get(merged_maf, downloadLocation = ".")
   
   if(verbose) message("--- Standardizing `maf` release data file name ---")
-  data_mutations_extended <- sub(file$name, "data_mutations_extended.txt", file$path)
-  file.rename(file$path, data_mutations_extended)
+  data_mutations <- sub(file$name, "data_mutations.txt", file$path)
+  file.rename(file$path, data_mutations)
   
   if(verbose) message("--- Checking the `maf` release file against samplesheet ---")
   mm <- dt_read(data_mutations_extended)
@@ -147,35 +164,6 @@ cbp_add_maf <- function(merged_maf,
 }
 
 
-#' Download data from a view for releasable samples in samplesheet 
-#' 
-#' This tries to check that complete data could be retrieved from said view. 
-#' Note: Since the view is typically denormalized, not all data might be clinical. 
-#' A downstream step will do some of the additional processing/subsetting needed.
-#'  
-#' @param samplesheet Samplesheet `data.table`.
-#' @param ref_view View to get data from. 
-#' @param verbose Output details.
-#' @keywords internal
-get_data_for_releasable <- function(samplesheet, 
-                                    ref_view, 
-                                    verbose = TRUE) {
-  
-  # bc specimen ids in samplesheet are different than actual ids on the files to avoid spaces, use file ids
-  # ids <- samplesheet[is_releasable == TRUE, biospecimen_id]
-  ss_key <- "synapse_id"
-  rv_key <- "id"
-  ids <- samplesheet[is_releasable == TRUE, get(ss_key)]
-  ls <- glue::glue_collapse(glue::single_quote(ids), sep = ",")
-  n <- length(ids)
-  if(verbose) message(glue::glue("Retrieving from {ref_view} data for {n} releasable ids"))
-  ref_view <- .syn$tableQuery(glue::glue("SELECT * FROM {ref_view} WHERE {rv_key} in ({ls})"))
-  df <- ref_view$asDataFrame()
-  if(nrow(df) != n) stop(glue::glue("Data retrieved for {nrow(df)} of {n} release ids. Is this right `ref_view`?"))
-  return(df)
-}
-
-
 #' Check maf file for release
 #' 
 #' Currently, this is a simple check to make sure released samples are expected. 
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 959f820b..d4627eda 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -348,53 +348,6 @@ make_meta_study_generic <- function(type_of_cancer,
   return(rows)
 }
 
-#' Make meta study file
-#' 
-#' Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_meta.R#L179
-#' See specifications at https://docs.cbioportal.org/file-formats/#meta-file.
-#' 
-#' @inheritParams make_meta_genomic_generic
-#' @inheritParams write_meta
-#' @inheritParams make_meta_patient
-#' @param type_of_cancer Type of cancer, defaults to "mixed". See also http://oncotree.mskcc.org/#/home.
-#' @param name Name of the study.
-#' @param description Description of the study.
-#' @param citation (Optional) A relevant citation, e.g. "TCGA, Nature 2012".
-#' @param pmid (Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be `NULL`.
-#' @param groups (Optional) Defaults to "PUBLIC" for use with public cBioPortal; 
-#' otherwise, use group names that make sense for your instance.
-#' @param add_global_case_list (Optional) Use `NULL` to ignore, but default is `TRUE` for an "All samples" case list to be generated automatically.
-#' @param short_name (Optional) Short name for the study.
-#' @export
-make_meta_study <- function(cancer_study_identifier,
-                            type_of_cancer = "mixed",
-                            name, 
-                            description, 
-                            citation = NULL,
-                            pmid = NULL,
-                            groups = "PUBLIC",
-                            short_name = NULL,
-                            add_global_case_list = TRUE,
-                            publish_dir = ".",
-                            write = TRUE,
-                            verbose = TRUE) {
-  
-  meta_filename <- "meta_study.txt"
-
-  df_file <- make_meta_study_generic(cancer_study_identifier = cancer_study_identifier,
-                                     type_of_cancer = type_of_cancer, 
-                                     name = name, 
-                                     description = description, 
-                                     citation = citation,
-                                     pmid = pmid,
-                                     groups = groups, 
-                                     short_name = short_name,
-                                     add_global_case_list = add_global_case_list)
-  
-  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
-  invisible(df_file)
-}
-
 
 # --- Other utils -------------------------------------------------------------- #
 

From df308f09703f039886d24f8f18c1796281b2f24c Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 10:30:42 -0600
Subject: [PATCH 05/26] Recomposition (wip)

---
 R/{cbioportal_export.R => cbioportal_study.R} | 201 ++++++++++--------
 1 file changed, 117 insertions(+), 84 deletions(-)
 rename R/{cbioportal_export.R => cbioportal_study.R} (53%)

diff --git a/R/cbioportal_export.R b/R/cbioportal_study.R
similarity index 53%
rename from R/cbioportal_export.R
rename to R/cbioportal_study.R
index f8633f00..62e67a5e 100644
--- a/R/cbioportal_export.R
+++ b/R/cbioportal_study.R
@@ -1,68 +1,73 @@
-# Export data from Synapse as a cBioPortal dataset.
-# This tries to have the spirit of making a new R package with `devtools`, so dataset is a "package" for cBioPortal.
-# This file contain the higher-level wrappers. See `cboilerplate.R` for the true lower-level utils.
+# Export data from Synapse as a cBioPortal dataset, where different data types can be added to the package one-by-one,
+# much in the spirit of https://github.com/r-lib/usethis.
+# This file contains the higher-level wrappers that are exported; see `cboilerplate.R` for the non-exported lower-level utils.
+# All functions should start with `cbp_*` so that it's clear this is cBioPortal-relevant functionality.
 
-#' Enumerate combinations of valid cBP data types
+#' Enumerate combinations of valid cBP data types and data subtypes
 #' 
 #' https://docs.cbioportal.org/file-formats/
 #' 
 #' @keywords internal
-cbp_data_types <- function() {
-  
-  list(c("CLINICAL", "SAMPLE_ATTRIBUTES"),
-       c("CLINICAL", "PATIENT_ATTRIBUTES"),
-       c("COPY_NUMBER_ALTERATION", "DISCRETE"),
-       c("COPY_NUMBER_ALTERATION", "DISCRETE_LONG"),
-       c("COPY_NUMBER_ALTERATION", "CONTINUOUS"),
-       c("COPY_NUMBER_ALTERATION", "LOG2-VALUE"),
-       c("COPY_NUMBER_ALTERATION", "SEG"),
-       c("MRNA_EXPRESSION", "CONTINUOUS"),
-       c("MRNA_EXPRESSION", "Z-SCORE"),
-       c("MUTATION_EXTENDED", "MAF"),
-       c("METHYLATION", "CONTINUOUS"),
-       c("PROTEIN_LEVEL", "LOG2-VALUE"),
-       c("STRUCTURAL_VARIANT", "SV"))
-}
+cbp_datatypes <- function() {
 
-#' Create a cBioPortal study
+  types <- data.table::rbindlist(list(
+    list("CLINICAL", "SAMPLE_ATTRIBUTES"),
+    list("CLINICAL", "PATIENT_ATTRIBUTES"),
+    list("COPY_NUMBER_ALTERATION", "DISCRETE"),
+    list("COPY_NUMBER_ALTERATION", "DISCRETE_LONG"),
+    list("COPY_NUMBER_ALTERATION", "CONTINUOUS"),
+    list("COPY_NUMBER_ALTERATION", "LOG2-VALUE"),
+    list("COPY_NUMBER_ALTERATION", "SEG"),
+    list("MRNA_EXPRESSION", "CONTINUOUS"),
+    list("MRNA_EXPRESSION", "Z-SCORE"),
+    list("MUTATION_EXTENDED", "MAF"),
+    list("METHYLATION", "CONTINUOUS"),
+    list("PROTEIN_LEVEL", "LOG2-VALUE"),
+    list("STRUCTURAL_VARIANT", "SV")))
+  
+  setnames(types, c("dataType", "dataSubtype"))
+  return(types)
+  }
+
+#' Initialize a new cBioPortal study dataset
 #' 
-#' See specifications for a study at https://docs.cbioportal.org/file-formats/#meta-file.
+#' Create a new directory with a basic required [study meta file](https://docs.cbioportal.org/file-formats/#meta-file),
+#' much like how we'd create a new R package and put a DESCRIPTION file in it.
 #' 
-#' @param cancer_study_identifier Cancer study identifier. See cBioPortal standards for ids.
-#' @param name of the cancer study, e.g. something following convention is "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
-#' @param type_of_cancer Type of cancer, defaults to "mixed". See also http://oncotree.mskcc.org/#/home.
-#' @param name Name of the study.
-#' @param description Description of the study. A default generic description is provided.
+#' @param cancer_study_identifier Cancer study identifier in format such as `nst_nfosi_ntap_2022`.
+#' @param name Name of the study, e.g. "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
+#' @param type_of_cancer Type of cancer, see http://oncotree.mskcc.org/#/home.
+#' @param description Description of the study, defaults to a generic description that can be edited later.
+#' @param short_name (Optional) Short name for the study.
 #' @param citation (Optional) A relevant citation, e.g. "TCGA, Nature 2012".
 #' @param pmid (Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be `NULL`.
 #' @param groups (Optional) Defaults to "PUBLIC" for use with public cBioPortal; 
-#' otherwise, use group names that make sense for your instance.
-#' @param add_global_case_list (Optional) Use `NULL` to ignore, but default is `TRUE` for an "All samples" case list to be generated automatically.
-#' @param short_name (Optional) Short name for the study.
-#' @param publish_dir Directory to create for study if doesn't exist, same as `cancer_study_identifier` by default.
+#' otherwise, use group names that makes sense with the configuration of your cBioPortal instance.
+#' @param add_global_case_list( (Optional) Use `NULL` to ignore, default is `TRUE` for an "All samples" case list( to be generated automatically.
 #' @param verbose Verbosity level.
 #' 
 #' @export
 cbp_new_study <- function(cancer_study_identifier,
                           name,
-                          type_of_cancer = "mixed",
+                          type_of_cancer,
                           description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). 
                           The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
+                          short_name = NULL,
                           citation = NULL,
                           pmid = NULL,
                           groups = "PUBLIC",
-                          short_name = NULL,
                           add_global_case_list = TRUE,
-                          publish_dir = cancer_study_identifier,
                           verbose = TRUE) {
   
-  if(!dir.exists(publish_dir)) {
-    if(verbose) message(glue::glue("Creating {publish_dir} dataset directory"))
-    dir.create(publish_dir)
+  # TODO Validate study id
+  study_dir <- cancer_study_identifier
+  if(!dir.exists(study_dir)) {
+    if(verbose) message(glue::glue("✔ Creating {study_dir} study directory"))
+    dir.create(glue::glue("./{study_dir}"))
   }
   
-  message("Setting new dataset directory as working directory.")
-  setwd(publish_dir)
+  message("✔ Setting dataset directory as working directory")
+  setwd(study_dir)
   
   df_file <- make_meta_study_generic(cancer_study_identifier = cancer_study_identifier,
                                      type_of_cancer = type_of_cancer, 
@@ -75,34 +80,47 @@ cbp_new_study <- function(cancer_study_identifier,
                                      add_global_case_list = add_global_case_list)
   
   write_meta(df_file, "meta_study.txt", verbose = verbose)
-  if(verbose) message("--- Study meta added ---")
+  if(verbose) message("✔ Study meta added")
 }
 
 # ------------------------------------------------------------------------------- #
 
+
 #' Export and add clinical data to cBioPortal dataset
 #' 
-#' #' This should be run in an existing dataset package root.
+#' This should be run in an existing dataset package root. 
+#'
+#' Clinical data are mapped and exported according to a reference mapping. 
+#' Also reformatting of `PATIENT_ID`, `SAMPLE_ID` to contain only letters, numbers, points, underscores, hyphens;
+#' in Nextflow processing any spaces gets replaced with underscores so that's the default here.
+#' Does *not* check for missing samples, as final validation via cBioPortal tool is still expected for that.
 #' 
-#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
 #' @param ref_view A view that contains all clinical data for the study.
-cbp_add_clinical <- function(ref_map,
-                             ref_view) {
+#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
+#' @param verbose Whether to provide informative messages throughout.
+cbp_add_clinical <- function(ref_view,
+                             ref_map,
+                             verbose = TRUE) {
+  
+  cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("--- Pulling the clinical data from Synapse ---")
+  if(verbose) message("✔ Pulling the clinical data from Synapse")
   df <- get_clinical_data_for_cbp_study(ref_view)
   
+  if(verbose) message("✔ Formatting and making clinical data file(s)")
+  df$specimenID <- gsub(" ", "_", clinical_data$specimenID)
   write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)
   
-  if(verbose) message("--- Making clinical meta file ---")
-  # before making meta, check that data file was actually written, since patient data is optional 
+  if(verbose) message("✔ Making sample clinical meta file")
+  make_meta_sample(cancer_study_identifier, verbose = verbose)
+  
+  # Before making meta, check that the optional patient data file was written 
   if(file.exists("data_clinical_patient.txt")) {
+    if(verbose) message("✔ Making patient clinical meta file")
     make_meta_patient(cancer_study_identifier, verbose = verbose)
   }
   
-  make_meta_sample(cancer_study_identifier, verbose = verbose)
-  
-  if(verbose) message("Clinical data added.")
+  if(verbose) message("✔ Done with adding clinical data")
   
 }
 
@@ -131,35 +149,34 @@ cbp_add_clinical <- function(ref_map,
 #' 
 #' 3. Make meta files. Meta files are needed for describing the study, mutations data file, clinical data files.
 #' 
-#' @param merged_maf Synapse id of `merged maf` file for public release.
+#' @param maf_data Synapse id of `merged maf` file for public release.
 #' @param samplesheet Synapse id or local path to samplesheet with release info.
 #' @param cancer_study_identifier Study identifier, convention is `{tumorType}_{institution}_{year}`, so for example "mpnst_nfosi_2022".
 #' @param verbose Whether to provide informative messages throughout.
 #' @export
-cbp_add_maf <- function(merged_maf,
+cbp_add_maf <- function(maf_data,
                         samplesheet,
                         verbose = TRUE) {
   
   .check_login()
+  cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("--- Getting `merged_maf` file ---")
-  file <- .syn$get(merged_maf, downloadLocation = ".")
-  
-  if(verbose) message("--- Standardizing `maf` release data file name ---")
+  if(verbose) message("✔ Getting `maf_data` file from Synapse")
+  file <- .syn$get(maf_data, downloadLocation = ".")
   data_mutations <- sub(file$name, "data_mutations.txt", file$path)
   file.rename(file$path, data_mutations)
   
-  if(verbose) message("--- Checking the `maf` release file against samplesheet ---")
+  if(verbose) message("✔ Checking the `maf` release file against samplesheet")
   mm <- dt_read(data_mutations_extended)
   ss <- dt_read(samplesheet)
   check_result <- check_maf_release(mm, ss)
   
-  if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue.")
+  if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue. Please update data and retry.")
   
-  if(verbose) message("--- Making maf meta file ---")
+  if(verbose) message("✔ Making maf meta file")
   make_meta_maf(cancer_study_identifier, verbose = verbose)
   
-  if(verbose) message("Maf data added.")
+  if(verbose) message("✔ Done with adding MAF data")
   
 }
 
@@ -169,15 +186,15 @@ cbp_add_maf <- function(merged_maf,
 #' Currently, this is a simple check to make sure released samples are expected. 
 #' It may be extended later on as needed. 
 #' 
-#' @param merged_maf Maf data as a `data.table`.
+#' @param maf_data Maf data as a `data.table`.
 #' @param samplesheet Samplesheet as a `data.table`.
 #' @return Returns `NULL` if everything OK, else the sample ids that don't match expectations.
-check_maf_release <- function(merged_maf, 
+check_maf_release <- function(maf_data, 
                               samplesheet) {
   
   # samplesheet[is_releasable == TRUE, .N]
   ss_samples <- samplesheet[is_releasable == TRUE, biospecimen_id]
-  mm_samples <- merged_maf[, unique(Tumor_Sample_Barcode)]
+  mm_samples <- maf_data[, unique(Tumor_Sample_Barcode)]
   
   result <- NULL
   missing_release <- setdiff(ss_samples, mm_samples)
@@ -193,42 +210,58 @@ check_maf_release <- function(merged_maf,
     ids_2 <- data.frame(sample = no_release, type = "no_release")
     result <- rbind(result, ids_2)
   }
-  if(!length(missing_release) && !length(no_release)) test_passed("Samples for release look as expected.")
+  if(!length(missing_release) && !length(no_release)) test_passed("✔ Samples for release look as expected.")
   
   invisible(result)
 }
 
 
-#' Match clinical data with maf sample ids
+# ------------------------------------------------------------------------------- #
+
+#' Export and add CNA (seg) data to cBioPortal dataset
 #' 
-#' PATIENT_ID and SAMPLE_ID can only contain letters, numbers, points, underscores and/or hyphens.
-#' In the nf processing, sample id spaces are replaced with underscores in the `maf`,
-#' so this is applied to clinical data to match.
+#' This should be run in an existing dataset package root.
 #' 
-#' @param clinical_data Clinical data as a `data.table`.
-#' @param merged_maf Maf data as a `data.table`.
-#' @keywords internal
-match_maf_sample_id <- function(clinical_data, merged_maf = NULL) {
-  clinical_data$specimenID <- gsub(" ", "_", clinical_data$specimenID)
-  # TODO check mafs and clinical data after reformatting ids
-  # maf_samples <- unique(merged_maf$Tumor_Sample_Barcode)
-  return(clinical_data)
-}
+#' @export
+#' 
+#' @param cna_release Syn id of CNA release data, currently only handles `.seg` file.
+cbp_add_cna <- function(cna_data) {
+  
+  cancer_study_identifier <- check_cbp_study_id()
+  
+  if(verbose) message("✔ Getting the CNA (.seg) data file from Synapse")
+  file <- .syn$get(cna_data, downloadLocation = ".")
+  data_cna <- sub(file$name, "data_cna.seg", file$path)
+  file.rename(file$path, data_cna)
+  
+  if(verbose) message("✔ Making the meta file")
+  make_meta_cna(...)
+  
+  if(verbose) message("✔ Done with adding CNA data")
+  
+} 
 
-# ------------------------------------------------------------------------------- #
 
-#' Export and add CNA (seg) data to cBioPortal dataset
+#' Export and add expression data to cBioPortal dataset
 #' 
 #' This should be run in an existing dataset package root.
 #' 
 #' @export
 #' 
-#' @param cna_release Syn id of CNA release data, expected to be a .seg file.
-cbp_add_cna <- function(cna_release) {
+#' @param mrna_data Syn id of gene expression data.
+cbp_add_mrna <- function(mrna_data) {
   
-  if(verbose) message("--- Getting the data file ---")
+  cancer_study_identifier <- check_cbp_study_id()
+  
+  if(verbose) message("✔ Getting the mRNA expression data file from Synapse")
   file <- .syn$get(cna_release, downloadLocation = ".")
+  data_cna <- sub(file$name, "data_cna.seg", file$path)
+  file.rename(file$path, data_cna)
+  
+  if(verbose) message("✔ Making the meta file")
+  make_meta_cna(cancer_study_identifier)
+  
+  if(verbose) message("✔ Done with adding CNA data")
   
-  if(verbose) message("--- Making the meta file ---")
-  make_meta_cna(...)
 } 
+

From 1c770ac9d4421f193df9b720cbb7a002763fd858 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 10:31:05 -0600
Subject: [PATCH 06/26] Update boilerplate (wip)

---
 R/cboilerplate.R | 58 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index d4627eda..9d6244ff 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -1,5 +1,5 @@
 # ------------------------------------------------------------------------------ #
-# Attribution: Following utils for packaging to cBioPortal are adapted from some code written by the awesome @hhunterzinck
+# Attribution: Many utils for cBioPortal below are adapted from code written by the awesome @hhunterzinck
 # in the repo https://github.com/Sage-Bionetworks/genie-erbb2-cbio/
 
 # TO DO / DEV NOTES: 
@@ -9,6 +9,23 @@
 # 2. It may make sense to write meta files automatically whenever writing a 
 # data file is called. This might be mainly updating the main wrapper or creating more wrappers.
 
+
+#' Check that in valid cBioPortal study dataset root
+#' 
+#' The `cbp_add*` functions need to be run while in the study package root.  
+#' This checks in valid study directory and returns the `cancer_study_id`.
+#' 
+#' @keywords internal
+#' @return `cancer_study_id` for the current cBioPortal cancer study.
+check_cbp_study_id <- function() {
+  
+  tryCatch({
+    study <- yaml::read_yaml("meta_study.txt")
+    study$cancer_study_identifier
+  }, 
+  error = function(e) stop(getwd(), "does not appear to be a valid cBioPortal study."))
+}
+
 # -- DATA FILES ---------------------------------------------------------------- #
 # Data files store data... cBioPortal has format specifications specific to the data type. 
 # The only data type that we need to script for is the clinical data type, 
@@ -185,13 +202,12 @@ make_meta_patient <- function(cancer_study_identifier,
                               publish_dir = ".",
                               verbose = TRUE) {
   
-  meta_filename <- "meta_clinical_patient.txt"
   df_file <- make_meta_clinical_generic(cancer_study_identifier = cancer_study_identifier,
                                         genetic_alteration_type = "CLINICAL",
                                         datatype = "PATIENT_ATTRIBUTES",
                                         data_filename = data_filename)
   
-  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
+  if(write) write_meta(df_file, "meta_clinical_patient.txt", publish_dir, verbose)
   invisible(df_file)
 }
 
@@ -208,13 +224,12 @@ make_meta_sample <- function(cancer_study_identifier,
                              write = TRUE,
                              verbose = TRUE) {
   
-  meta_filename <- "meta_clinical_sample.txt"
   df_file <- make_meta_clinical_generic(cancer_study_identifier = cancer_study_identifier, 
                                         genetic_alteration_type = "CLINICAL",
                                         datatype = "SAMPLE_ATTRIBUTES", 
                                         data_filename = data_filename)
   
-  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
+  if(write) write_meta(df_file, "meta_clinical_sample.txt", publish_dir, verbose)
   invisible(df_file)
 }
 
@@ -267,7 +282,6 @@ make_meta_maf <- function(cancer_study_identifier,
                           write = TRUE,
                           verbose = TRUE) {
   
-  meta_filename <- "meta_mutations.txt"
   df_file <- make_meta_genomic_generic(cancer_study_identifier = cancer_study_identifier, 
                                        genetic_alteration_type = "MUTATION_EXTENDED", 
                                        datatype = "MAF", 
@@ -276,7 +290,7 @@ make_meta_maf <- function(cancer_study_identifier,
                                        profile_description = "Mutation data from NF-OSI processing.",
                                        data_filename = data_filename)
   
-  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
+  if(write) write_meta(df_file, "meta_mutations.txt", publish_dir, verbose)
   invisible(df_file)
 }
 
@@ -293,8 +307,7 @@ make_meta_cna <- function(cancer_study_identifier,
                           publish_dir = ".",
                           write = TRUE,
                           verbose = TRUE) {
-    
-  meta_filename <- "meta_seg.txt"
+  
   df_file <- make_meta_genomic_generic(cancer_study_identifier = cancer_study_identifier,
                                        genetic_alteration_type = "COPY_NUMBER_ALTERATION",
                                        datatype = "SEG",
@@ -302,11 +315,35 @@ make_meta_cna <- function(cancer_study_identifier,
                                        description = "Somatic CNA from NF-OSI processing.",
                                        data_filename = data_filename)
   
-  if(write) write_meta(df_file, meta_filename, publish_dir, verbose)
+  if(write) write_meta(df_file, "meta_seg.txt", publish_dir, verbose)
   invisible(df_file)
 
 }
 
+
+#' Make meta file for cBioPortal expression data
+#' 
+#' https://docs.cbioportal.org/file-formats/#expression-data
+#' @keywords internal
+make_meta_expression <- function(cancer_study_identifier,
+                                 data_filename = "data_expression.txt",
+                                 publish_dir = ".",
+                                 write = TRUE,
+                                 verbose = TRUE) {
+  
+  df_file <- make_meta_genomic_generic(cancer_study_identifier = cancer_study_identifier,
+                                       genetic_alteration_type = "MRNA_EXPRESSION",
+                                       datatype = "CONTINUOUS",
+                                       stable_id = "rna_seq_mrna",
+                                       description = "Expression levels",
+                                       data_filename = data_filename)
+  
+  if(write) write_meta(df_file, "meta_expression.txt", publish_dir, verbose)
+  invisible(df_file)
+  
+}
+
+
 # --- Meta study --------------------------------------------------------------- #
 
 #' Template for meta study file
@@ -368,6 +405,7 @@ make_meta_study_generic <- function(type_of_cancer,
 #' or a `data.table` representation.
 #' @keywords internal
 use_ref_map <- function(ref_map, as_dt = TRUE) {
+  
   ref_map_ls <- yaml::read_yaml(ref_map) # this can read JSON
   mapping <- ref_map_ls$mapping
   

From 468453db5ca5cc32bae7e3f261de0df140c10ac8 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 10:37:56 -0600
Subject: [PATCH 07/26] Make check optional

---
 R/cbioportal_study.R | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/R/cbioportal_study.R b/R/cbioportal_study.R
index 62e67a5e..ba95af59 100644
--- a/R/cbioportal_study.R
+++ b/R/cbioportal_study.R
@@ -150,12 +150,11 @@ cbp_add_clinical <- function(ref_view,
 #' 3. Make meta files. Meta files are needed for describing the study, mutations data file, clinical data files.
 #' 
 #' @param maf_data Synapse id of `merged maf` file for public release.
-#' @param samplesheet Synapse id or local path to samplesheet with release info.
-#' @param cancer_study_identifier Study identifier, convention is `{tumorType}_{institution}_{year}`, so for example "mpnst_nfosi_2022".
+#' @param samplesheet Optional Synapse id or local path to samplesheet with release info for cross-ref check.
 #' @param verbose Whether to provide informative messages throughout.
 #' @export
 cbp_add_maf <- function(maf_data,
-                        samplesheet,
+                        samplesheet = NULL,
                         verbose = TRUE) {
   
   .check_login()
@@ -166,12 +165,14 @@ cbp_add_maf <- function(maf_data,
   data_mutations <- sub(file$name, "data_mutations.txt", file$path)
   file.rename(file$path, data_mutations)
   
-  if(verbose) message("✔ Checking the `maf` release file against samplesheet")
-  mm <- dt_read(data_mutations_extended)
-  ss <- dt_read(samplesheet)
-  check_result <- check_maf_release(mm, ss)
+  if(!is.null(samplesheet)) {
+    if(verbose) message("✔ Checking the `maf` release file against samplesheet")
+    mm <- dt_read(data_mutations_extended)
+    ss <- dt_read(samplesheet)
+    check_result <- check_maf_release(mm, ss)
   
-  if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue. Please update data and retry.")
+    if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue. Please update data and retry.")
+  }
   
   if(verbose) message("✔ Making maf meta file")
   make_meta_maf(cancer_study_identifier, verbose = verbose)
@@ -248,20 +249,20 @@ cbp_add_cna <- function(cna_data) {
 #' 
 #' @export
 #' 
-#' @param mrna_data Syn id of gene expression data.
-cbp_add_mrna <- function(mrna_data) {
+#' @param expression_data Syn id of gene expression data.
+cbp_add_expression <- function(expression_data) {
   
   cancer_study_identifier <- check_cbp_study_id()
   
   if(verbose) message("✔ Getting the mRNA expression data file from Synapse")
-  file <- .syn$get(cna_release, downloadLocation = ".")
-  data_cna <- sub(file$name, "data_cna.seg", file$path)
-  file.rename(file$path, data_cna)
+  file <- .syn$get(expression_data, downloadLocation = ".")
+  data_expression <- sub(file$name, "data_expression.txt", file$path)
+  file.rename(file$path, data_expression)
   
   if(verbose) message("✔ Making the meta file")
-  make_meta_cna(cancer_study_identifier)
+  make_meta_expression(cancer_study_identifier)
   
-  if(verbose) message("✔ Done with adding CNA data")
+  if(verbose) message("✔ Done with adding expression data")
   
 } 
 

From 56eb4237add0d34e11b0afd5af1a51ed916f580f Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 10:40:10 -0600
Subject: [PATCH 08/26] Update docs

---
 R/cbioportal_study.R | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/R/cbioportal_study.R b/R/cbioportal_study.R
index ba95af59..34909b07 100644
--- a/R/cbioportal_study.R
+++ b/R/cbioportal_study.R
@@ -134,21 +134,11 @@ cbp_add_clinical <- function(ref_view,
 #' This needs to be packaged with other files like this
 #' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020).
 #' 
-#' 1. Sanity check that this is the version of the maf release file that we want, based on the samplesheet.
+#' There is an optional cross-check with samplesheet to confirm that this is the version of the maf release file that we want.
 #' Sometimes a later version retracts a sample (`is_releasable`=FALSE) so merged maf being exported should match samplesheet.  
 #' Latest versions of the samplesheets tied to each release are currently stored in `syn38793855`.
 #' (Note: Please file issue to update this doc if this changes.)
 #' 
-#' 2. Make the clinical data files. 
-#' For NF, the clinical metadata are annotations on the files/surfaced in a view and are pretty basic. 
-#' In the future, it would be preferable to store clinical metadata in a real normalized table.
-#' For now, this clinical data is pulled in from the view. To map NF clinical variables to the 
-#' [cBioPortal dictionary](https://github.com/cBioPortal/clinical-data-dictionary/blob/e9ec08f48bd57aabf193da70cdb5b88bdef5d01d/docs/resource_uri_to_clinical_attribute_mapping.txt)
-#' [as recommended](https://docs.cbioportal.org/file-formats/#custom-columns-in-clinical-data), 
-#' this step requires a `ref_map`, which is a YAML file.
-#' 
-#' 3. Make meta files. Meta files are needed for describing the study, mutations data file, clinical data files.
-#' 
 #' @param maf_data Synapse id of `merged maf` file for public release.
 #' @param samplesheet Optional Synapse id or local path to samplesheet with release info for cross-ref check.
 #' @param verbose Whether to provide informative messages throughout.

From 9b72e740397c4911625dbe261e8853f2d7e81e6b Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 22:52:52 -0600
Subject: [PATCH 09/26] Cleanup helpers and docs

---
 R/cbioportal_study.R       | 143 ++++++++++++++-----------------------
 R/cboilerplate.R           |  76 +++++++++++---------
 R/nextflow_testing_utils.R |  10 +++
 3 files changed, 105 insertions(+), 124 deletions(-)

diff --git a/R/cbioportal_study.R b/R/cbioportal_study.R
index 34909b07..c13c4875 100644
--- a/R/cbioportal_study.R
+++ b/R/cbioportal_study.R
@@ -3,7 +3,7 @@
 # This file contains the higher-level wrappers that are exported; see `cboilerplate.R` for the non-exported lower-level utils.
 # All functions should start with `cbp_*` so that it's clear this is cBioPortal-relevant functionality.
 
-#' Enumerate combinations of valid cBP data types and data subtypes
+#' Enumerate combinations of valid cBP data types and data subtypes and helper utils if available
 #' 
 #' https://docs.cbioportal.org/file-formats/
 #' 
@@ -11,21 +11,21 @@
 cbp_datatypes <- function() {
 
   types <- data.table::rbindlist(list(
-    list("CLINICAL", "SAMPLE_ATTRIBUTES"),
-    list("CLINICAL", "PATIENT_ATTRIBUTES"),
-    list("COPY_NUMBER_ALTERATION", "DISCRETE"),
-    list("COPY_NUMBER_ALTERATION", "DISCRETE_LONG"),
-    list("COPY_NUMBER_ALTERATION", "CONTINUOUS"),
-    list("COPY_NUMBER_ALTERATION", "LOG2-VALUE"),
-    list("COPY_NUMBER_ALTERATION", "SEG"),
-    list("MRNA_EXPRESSION", "CONTINUOUS"),
-    list("MRNA_EXPRESSION", "Z-SCORE"),
-    list("MUTATION_EXTENDED", "MAF"),
-    list("METHYLATION", "CONTINUOUS"),
-    list("PROTEIN_LEVEL", "LOG2-VALUE"),
-    list("STRUCTURAL_VARIANT", "SV")))
-  
-  setnames(types, c("dataType", "dataSubtype"))
+    list("CLINICAL", "SAMPLE_ATTRIBUTES", "cbp_add_clinical", ""),
+    list("CLINICAL", "PATIENT_ATTRIBUTES", "cbp_add_clinical", ""),
+    list("COPY_NUMBER_ALTERATION", "SEG", "cbp_add_cna", ""),
+    list("COPY_NUMBER_ALTERATION", "DISCRETE", "", ""),
+    list("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "", ""),
+    list("COPY_NUMBER_ALTERATION", "CONTINUOUS", "", ""),
+    list("COPY_NUMBER_ALTERATION", "LOG2-VALUE", "", ""),
+    list("MRNA_EXPRESSION", "CONTINUOUS", "cbp_add_expression", ""),
+    list("MRNA_EXPRESSION", "Z-SCORE", "cbp_add_expression", ""),
+    list("MUTATION_EXTENDED", "MAF", "cbp_add_maf", ""),
+    list("METHYLATION", "CONTINUOUS", "", ""),
+    list("PROTEIN_LEVEL", "LOG2-VALUE", "", ""),
+    list("STRUCTURAL_VARIANT", "SV", "", "")))
+  
+  setnames(types, c("dataType", "dataSubtype", "fun", "note"))
   return(types)
   }
 
@@ -36,14 +36,16 @@ cbp_datatypes <- function() {
 #' 
 #' @param cancer_study_identifier Cancer study identifier in format such as `nst_nfosi_ntap_2022`.
 #' @param name Name of the study, e.g. "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".
-#' @param type_of_cancer Type of cancer, see http://oncotree.mskcc.org/#/home.
+#' @param type_of_cancer Id for type of cancer. If `validate` is TRUE, this is one of the things validated with warning if mismatched.
 #' @param description Description of the study, defaults to a generic description that can be edited later.
 #' @param short_name (Optional) Short name for the study.
 #' @param citation (Optional) A relevant citation, e.g. "TCGA, Nature 2012".
-#' @param pmid (Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be `NULL`.
+#' @param pmid (Optional) One or more relevant pubmed ids (comma-separated, no whitespace); if used, citation cannot be `NULL`.
 #' @param groups (Optional) Defaults to "PUBLIC" for use with public cBioPortal; 
 #' otherwise, use group names that makes sense with the configuration of your cBioPortal instance.
-#' @param add_global_case_list( (Optional) Use `NULL` to ignore, default is `TRUE` for an "All samples" case list( to be generated automatically.
+#' @param add_global_case_list (Optional) Use `NULL` to ignore, default is `TRUE` for an "All samples" case list( to be generated automatically.
+#' @param validate Validate against public cBioPortal configuration. Default `TRUE`, 
+#' but might want to set to `FALSE` especially if using a custom cBioPortal instance with different configuration. 
 #' @param verbose Verbosity level.
 #' 
 #' @export
@@ -57,18 +59,31 @@ cbp_new_study <- function(cancer_study_identifier,
                           pmid = NULL,
                           groups = "PUBLIC",
                           add_global_case_list = TRUE,
+                          validate = TRUE,
                           verbose = TRUE) {
   
   # TODO Validate study id
   study_dir <- cancer_study_identifier
   if(!dir.exists(study_dir)) {
-    if(verbose) message(glue::glue("✔ Creating {study_dir} study directory"))
+    if(verbose) checked_message(glue::glue("Creating {study_dir} study directory"))
     dir.create(glue::glue("./{study_dir}"))
   }
   
-  message("✔ Setting dataset directory as working directory")
+  checked_message("Setting dataset directory as working directory")
   setwd(study_dir)
   
+  if(validate) {
+    tryCatch({
+      cancer_ids <- httr::GET("https://www.cbioportal.org/api/cancer-types?direction=ASC&pageNumber=0&pageSize=10000&projection=SUMMARY") %>% 
+        httr::content() %>%
+        sapply(`[[`, "cancerTypeId")
+      if(!type_of_cancer %in% cancer_ids) {
+        warning("Specified `type_of_cancer id` isn't in the public list. 
+                Check for typos or include additional meta for this cancer type -- see https://docs.cbioportal.org/file-formats/#cancer-type")
+      }
+    }, error = function(e) warning("Note: Other issue with validation via public API...skipping."))
+  }
+  
   df_file <- make_meta_study_generic(cancer_study_identifier = cancer_study_identifier,
                                      type_of_cancer = type_of_cancer, 
                                      name = name, 
@@ -80,7 +95,7 @@ cbp_new_study <- function(cancer_study_identifier,
                                      add_global_case_list = add_global_case_list)
   
   write_meta(df_file, "meta_study.txt", verbose = verbose)
-  if(verbose) message("✔ Study meta added")
+  if(verbose) checked_message("Study meta added")
 }
 
 # ------------------------------------------------------------------------------- #
@@ -104,23 +119,23 @@ cbp_add_clinical <- function(ref_view,
   
   cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("✔ Pulling the clinical data from Synapse")
+  if(verbose) checked_message("Pulling the clinical data from Synapse")
   df <- get_clinical_data_for_cbp_study(ref_view)
   
-  if(verbose) message("✔ Formatting and making clinical data file(s)")
+  if(verbose) checked_message("Formatting and making clinical data file(s)")
   df$specimenID <- gsub(" ", "_", clinical_data$specimenID)
   write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)
   
-  if(verbose) message("✔ Making sample clinical meta file")
+  if(verbose) checked_message("Making sample clinical meta file")
   make_meta_sample(cancer_study_identifier, verbose = verbose)
   
   # Before making meta, check that the optional patient data file was written 
   if(file.exists("data_clinical_patient.txt")) {
-    if(verbose) message("✔ Making patient clinical meta file")
+    if(verbose) checked_message("Making patient clinical meta file")
     make_meta_patient(cancer_study_identifier, verbose = verbose)
   }
   
-  if(verbose) message("✔ Done with adding clinical data")
+  if(verbose) checked_message("Done with adding clinical data")
   
 }
 
@@ -134,101 +149,49 @@ cbp_add_clinical <- function(ref_view,
 #' This needs to be packaged with other files like this
 #' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020).
 #' 
-#' There is an optional cross-check with samplesheet to confirm that this is the version of the maf release file that we want.
-#' Sometimes a later version retracts a sample (`is_releasable`=FALSE) so merged maf being exported should match samplesheet.  
-#' Latest versions of the samplesheets tied to each release are currently stored in `syn38793855`.
-#' (Note: Please file issue to update this doc if this changes.)
-#' 
 #' @param maf_data Synapse id of `merged maf` file for public release.
-#' @param samplesheet Optional Synapse id or local path to samplesheet with release info for cross-ref check.
 #' @param verbose Whether to provide informative messages throughout.
 #' @export
 cbp_add_maf <- function(maf_data,
-                        samplesheet = NULL,
                         verbose = TRUE) {
   
   .check_login()
   cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("✔ Getting `maf_data` file from Synapse")
+  if(verbose) checked_message("Getting `maf_data` file from Synapse")
   file <- .syn$get(maf_data, downloadLocation = ".")
   data_mutations <- sub(file$name, "data_mutations.txt", file$path)
   file.rename(file$path, data_mutations)
   
-  if(!is.null(samplesheet)) {
-    if(verbose) message("✔ Checking the `maf` release file against samplesheet")
-    mm <- dt_read(data_mutations_extended)
-    ss <- dt_read(samplesheet)
-    check_result <- check_maf_release(mm, ss)
-  
-    if(!is.null(check_result)) stop("Unfortunately, check of `maf` release failed so will not continue. Please update data and retry.")
-  }
-  
-  if(verbose) message("✔ Making maf meta file")
+  if(verbose) checked_message("Making maf meta file")
   make_meta_maf(cancer_study_identifier, verbose = verbose)
   
-  if(verbose) message("✔ Done with adding MAF data")
+  if(verbose) checked_message("Done with adding MAF data")
   
 }
 
 
-#' Check maf file for release
-#' 
-#' Currently, this is a simple check to make sure released samples are expected. 
-#' It may be extended later on as needed. 
-#' 
-#' @param maf_data Maf data as a `data.table`.
-#' @param samplesheet Samplesheet as a `data.table`.
-#' @return Returns `NULL` if everything OK, else the sample ids that don't match expectations.
-check_maf_release <- function(maf_data, 
-                              samplesheet) {
-  
-  # samplesheet[is_releasable == TRUE, .N]
-  ss_samples <- samplesheet[is_releasable == TRUE, biospecimen_id]
-  mm_samples <- maf_data[, unique(Tumor_Sample_Barcode)]
-  
-  result <- NULL
-  missing_release <- setdiff(ss_samples, mm_samples)
-  no_release <- setdiff(mm_samples, ss_samples)
-  
-  if(length(missing_release)) {
-    test_failed("Maf file seems to be missing samples specified for release.")
-    ids_1 <- data.frame(sample = missing_release, type = "missing")
-    result <- rbind(result, ids_1)
-  }
-  if(length(no_release)) {
-    test_failed("Maf file contains samples that should not be released!")
-    ids_2 <- data.frame(sample = no_release, type = "no_release")
-    result <- rbind(result, ids_2)
-  }
-  if(!length(missing_release) && !length(no_release)) test_passed("✔ Samples for release look as expected.")
-  
-  invisible(result)
-}
-
-
 # ------------------------------------------------------------------------------- #
 
 #' Export and add CNA (seg) data to cBioPortal dataset
 #' 
 #' This should be run in an existing dataset package root.
 #' 
+#' @param cna_data Synapse id of CNA data file, currently only handles `.seg` file.
 #' @export
-#' 
-#' @param cna_release Syn id of CNA release data, currently only handles `.seg` file.
 cbp_add_cna <- function(cna_data) {
   
   cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("✔ Getting the CNA (.seg) data file from Synapse")
+  if(verbose) checked_message("Getting the CNA (.seg) data file from Synapse")
   file <- .syn$get(cna_data, downloadLocation = ".")
   data_cna <- sub(file$name, "data_cna.seg", file$path)
   file.rename(file$path, data_cna)
   
-  if(verbose) message("✔ Making the meta file")
-  make_meta_cna(...)
+  if(verbose) checked_message("Making the meta file")
+  make_meta_cna(cancer_study_identifier)
   
-  if(verbose) message("✔ Done with adding CNA data")
+  if(verbose) checked_message("Done with adding CNA data")
   
 } 
 
@@ -244,15 +207,15 @@ cbp_add_expression <- function(expression_data) {
   
   cancer_study_identifier <- check_cbp_study_id()
   
-  if(verbose) message("✔ Getting the mRNA expression data file from Synapse")
+  if(verbose) checked_message("Getting the mRNA expression data file from Synapse")
   file <- .syn$get(expression_data, downloadLocation = ".")
   data_expression <- sub(file$name, "data_expression.txt", file$path)
   file.rename(file$path, data_expression)
   
-  if(verbose) message("✔ Making the meta file")
+  if(verbose) checked_message("Making the meta file")
   make_meta_expression(cancer_study_identifier)
   
-  if(verbose) message("✔ Done with adding expression data")
+  if(verbose) checked_message("Done with adding expression data")
   
 } 
 
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 9d6244ff..94a30380 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -235,12 +235,23 @@ make_meta_sample <- function(cancer_study_identifier,
 
 # -- Data meta files ---------------------------------------------------------- #
 
+#' Append key-value pair dependent on value being given
+#' 
+#' @keywords internal
+append_kv <- function(x, key, value) {
+ 
+  if(!is.null(value)) append(x, glue::glue("{key}: {value}")) else x
+}
+
 #' Generic template for genomic-type data file
 #'
-#' Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/make_meta.R#L65
-#' 
+#' Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/make_meta.R#L65
+#' Internal workhorse union of _all_ the properties used for a genomic-type data file -- 
+#' the sensible defaults/specific combination should be passed in by a higher-level fun, e.g. `make_meta_maf`. 
+#'  
 #' @inheritParams make_meta_clinical_generic
 #' @param stable_id Stable id.
+#' @param reference_genome_id Reference genome id, e.g. 'hg19'.
 #' @param profile_name Name of the genomic profiling. This is set by the more specific `make_meta` utility. 
 #' For example, "Mutations" for `make_*_maf` and "Copy-number alterations" for `make_*_cna`.  
 #' @param profile_description Brief description for the genomic profiling. 
@@ -249,21 +260,24 @@ make_meta_sample <- function(cancer_study_identifier,
 make_meta_genomic_generic <- function(cancer_study_identifier,
                                       genetic_alteration_type, 
                                       datatype, 
+                                      reference_genome_id,
                                       stable_id, 
                                       profile_name, 
                                       profile_description, 
                                       data_filename) {
   
-  rows <- rep(NA, 8)
-  rows[1] <- glue::glue("cancer_study_identifier: {cancer_study_identifier}")
-  rows[2] <- glue::glue("genetic_alteration_type: {genetic_alteration_type}")
-  rows[3] <- glue::glue("datatype: {datatype}")
-  rows[4] <- glue::glue("stable_id: {stable_id}")
-  rows[5] <- glue::glue("show_profile_in_analysis_tab: true")
-  rows[6] <- glue::glue("profile_name: {profile_name}")
-  rows[7] <- glue::glue("profile_description: {profile_description}")
-  rows[8] <- glue::glue("data_filename: {data_filename}")
-  return(rows)
+  
+  meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
+    append_kv("genetic_alteration_type", genetic_alteration_type) %>%
+    append_kv("datatype", datatype) %>%
+    append_kv("reference_genome_id", reference_genome_id) %>%
+    append_kv("stable_id", stable_id) %>% 
+    append_kv("show_profile_in_analysis_tab", "true") %>% 
+    append_kv("profile_name", profile_name) %>%
+    append_kv("profile_description", profile_description) %>%
+    append_kv("data_filename", data_filename)
+  
+  return(meta)
 }
 
 
@@ -303,7 +317,7 @@ make_meta_maf <- function(cancer_study_identifier,
 #' @keywords internal
 make_meta_cna <- function(cancer_study_identifier,
                           data_filename = "data_cna.seg",
-                          reference_genome = "hg19",
+                          reference_genome_id = "hg19",
                           publish_dir = ".",
                           write = TRUE,
                           verbose = TRUE) {
@@ -312,7 +326,7 @@ make_meta_cna <- function(cancer_study_identifier,
                                        genetic_alteration_type = "COPY_NUMBER_ALTERATION",
                                        datatype = "SEG",
                                        reference_genome_id = reference_genome_id,
-                                       description = "Somatic CNA from NF-OSI processing.",
+                                       profile_description = "Somatic CNA from NF-OSI processing.",
                                        data_filename = data_filename)
   
   if(write) write_meta(df_file, "meta_seg.txt", publish_dir, verbose)
@@ -335,7 +349,7 @@ make_meta_expression <- function(cancer_study_identifier,
                                        genetic_alteration_type = "MRNA_EXPRESSION",
                                        datatype = "CONTINUOUS",
                                        stable_id = "rna_seq_mrna",
-                                       description = "Expression levels",
+                                       profile_description = "Expression levels",
                                        data_filename = data_filename)
   
   if(write) write_meta(df_file, "meta_expression.txt", publish_dir, verbose)
@@ -352,37 +366,31 @@ make_meta_expression <- function(cancer_study_identifier,
 #' Low-level internal function for the tedious templating. 
 #' 
 #' @keywords internal
-make_meta_study_generic <- function(type_of_cancer,
-                                    cancer_study_identifier,
+make_meta_study_generic <- function(cancer_study_identifier,
+                                    type_of_cancer,
                                     name,
                                     description,
                                     citation = NULL,
                                     pmid = NULL,
                                     groups = NULL,
                                     short_name = NULL,
-                                    add_global_case_list = NULL) {
+                                    add_global_case_list = TRUE) {
   
   # Check meta params -- there probably should just be JSON schemas for all of these meta configs  
   if(!is.null(pmid) && is.null(citation)) stop("If `pmid` is used, `citation` has to be filled in.")
   if(!is.null(add_global_case_list) && !is.logical(add_global_case_list)) stop("Nonsensical value used for `add_global_case_list`.")
   
-  # TO DO: If type of cancer does not match the ONCOTREE vocab, it will fail validation later on
-  # This check can be done in a more upfront manner here, though will need to download the data from GitHub 
+  meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
+    append_kv("type_of_cancer", type_of_cancer) %>%
+    append_kv("name", name) %>%
+    append_kv("description", description) %>%
+    append_kv("citation", citation) %>%
+    append_kv("pmid", pmid) %>%
+    append_kv("groups", groups) %>%
+    append_kv("short_name", short_name) %>%
+    append_kv("add_global_case_list", to.lower(as.character(add_global_case_list)))
   
-  rows <- c()
-  rows <- append(rows, glue::glue("cancer_study_identifier: {cancer_study_identifier}"))
-  rows <- append(rows, glue::glue("type_of_cancer: {type_of_cancer}"))
-  rows <- append(rows, glue::glue("name: {name}"))
-  rows <- append(rows, glue::glue("description: {description}"))
-  if(!is.null(citation)) rows <- append(rows, glue::glue("citation: {citation}"))
-  if(!is.null(pmid)) rows <- append(rows, glue::glue("pmid: {pmid}"))
-  if(!is.null(groups)) rows <- append(rows, glue::glue("groups: {groups}"))
-  if(!is.null(short_name)) rows <- append(rows, glue::glue("short_name: {short_name}"))
-  if(!is.null(add_global_case_list)) {
-    add_global_case_list <- tolower(as.character(add_global_case_list)) 
-    rows <- append(rows, glue::glue("add_global_case_list: {add_global_case_list}"))
-  }
-  return(rows)
+  return(meta)
 }
 
 
diff --git a/R/nextflow_testing_utils.R b/R/nextflow_testing_utils.R
index 33678496..e0c67364 100644
--- a/R/nextflow_testing_utils.R
+++ b/R/nextflow_testing_utils.R
@@ -88,3 +88,13 @@ test_passed <- function(display_string){
   message(glue::glue("{emoji::emoji('green_heart')} {crayon::bold(crayon::green('Test passed:'))} {crayon::green(display_string)}"))
 }
 
+#' Format checked message notification
+#' 
+#' For nicely displaying that some step was successfully completed. 
+#' 
+#' @param string Character string with check prepended.
+#' @keywords internal
+#' 
+checked_message <- function(string) {
+  message(glue::glue("{crayon::green(emoji::emoji('check_mark'))} {crayon::green(string)}"))
+}

From f3b911917de8453b9b4c7971da734b322944b359 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 22 Aug 2023 22:54:11 -0600
Subject: [PATCH 10/26] Cleanup docs

---
 NAMESPACE                        |  6 ++-
 man/append_kv.Rd                 | 12 ++++++
 man/cbp_add_clinical.Rd          | 24 +++++++++++
 man/cbp_add_cna.Rd               | 14 +++++++
 man/cbp_add_expression.Rd        | 14 +++++++
 man/cbp_add_maf.Rd               | 21 ++++++++++
 man/cbp_datatypes.Rd             | 12 ++++++
 man/cbp_new_study.Rd             | 50 +++++++++++++++++++++++
 man/check_cbp_study_id.Rd        | 16 ++++++++
 man/check_maf_release.Rd         | 20 ---------
 man/checked_message.Rd           | 15 +++++++
 man/get_data_for_releasable.Rd   | 21 ----------
 man/make_meta_cna.Rd             | 22 ++++++++++
 man/make_meta_expression.Rd      | 18 ++++++++
 man/make_meta_genomic_generic.Rd |  7 +++-
 man/make_meta_study.Rd           | 51 -----------------------
 man/make_meta_study_generic.Rd   |  4 +-
 man/match_maf_sample_id.Rd       | 19 ---------
 man/syncBP_maf.Rd                | 70 --------------------------------
 19 files changed, 230 insertions(+), 186 deletions(-)
 create mode 100644 man/append_kv.Rd
 create mode 100644 man/cbp_add_clinical.Rd
 create mode 100644 man/cbp_add_cna.Rd
 create mode 100644 man/cbp_add_expression.Rd
 create mode 100644 man/cbp_add_maf.Rd
 create mode 100644 man/cbp_datatypes.Rd
 create mode 100644 man/cbp_new_study.Rd
 create mode 100644 man/check_cbp_study_id.Rd
 delete mode 100644 man/check_maf_release.Rd
 create mode 100644 man/checked_message.Rd
 delete mode 100644 man/get_data_for_releasable.Rd
 create mode 100644 man/make_meta_cna.Rd
 create mode 100644 man/make_meta_expression.Rd
 delete mode 100644 man/make_meta_study.Rd
 delete mode 100644 man/match_maf_sample_id.Rd
 delete mode 100644 man/syncBP_maf.Rd

diff --git a/NAMESPACE b/NAMESPACE
index f51c9fd2..dc3f3a8a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -27,6 +27,10 @@ export(as_table_schema)
 export(assign_study_data_types)
 export(bad_url)
 export(calculate_related_studies)
+export(cbp_add_cna)
+export(cbp_add_expression)
+export(cbp_add_maf)
+export(cbp_new_study)
 export(check_access)
 export(check_readpair_validity)
 export(check_wiki_links)
@@ -43,7 +47,6 @@ export(grant_specific_file_access)
 export(key_label_to_id)
 export(make_admin)
 export(make_folder)
-export(make_meta_study)
 export(make_public)
 export(make_public_viewable)
 export(map_sample_input_ss)
@@ -64,7 +67,6 @@ export(remove_button)
 export(remove_wiki_subpage)
 export(summarize_file_access)
 export(syn_login)
-export(syncBP_maf)
 export(table_query)
 export(update_study_annotations)
 export(use_latest_in_collection)
diff --git a/man/append_kv.Rd b/man/append_kv.Rd
new file mode 100644
index 00000000..7be2ddb8
--- /dev/null
+++ b/man/append_kv.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cboilerplate.R
+\name{append_kv}
+\alias{append_kv}
+\title{Append key-value pair dependent on value being given}
+\usage{
+append_kv(x, key, value)
+}
+\description{
+Append key-value pair dependent on value being given
+}
+\keyword{internal}
diff --git a/man/cbp_add_clinical.Rd b/man/cbp_add_clinical.Rd
new file mode 100644
index 00000000..642a5ee5
--- /dev/null
+++ b/man/cbp_add_clinical.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_add_clinical}
+\alias{cbp_add_clinical}
+\title{Export and add clinical data to cBioPortal dataset}
+\usage{
+cbp_add_clinical(ref_view, ref_map, verbose = TRUE)
+}
+\arguments{
+\item{ref_view}{A view that contains all clinical data for the study.}
+
+\item{ref_map}{YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.}
+
+\item{verbose}{Whether to provide informative messages throughout.}
+}
+\description{
+This should be run in an existing dataset package root.
+}
+\details{
+Clinical data are mapped and exported according to a reference mapping.
+Also reformatting of \code{PATIENT_ID}, \code{SAMPLE_ID} to contain only letters, numbers, points, underscores, hyphens;
+in Nextflow processing any spaces gets replaced with underscores so that's the default here.
+Does \emph{not} check for missing samples, as final validation via cBioPortal tool is still expected for that.
+}
diff --git a/man/cbp_add_cna.Rd b/man/cbp_add_cna.Rd
new file mode 100644
index 00000000..346fa656
--- /dev/null
+++ b/man/cbp_add_cna.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_add_cna}
+\alias{cbp_add_cna}
+\title{Export and add CNA (seg) data to cBioPortal dataset}
+\usage{
+cbp_add_cna(cna_data)
+}
+\arguments{
+\item{cna_data}{Synapse id of CNA data file, currently only handles \code{.seg} file.}
+}
+\description{
+This should be run in an existing dataset package root.
+}
diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd
new file mode 100644
index 00000000..8e9586ef
--- /dev/null
+++ b/man/cbp_add_expression.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_add_expression}
+\alias{cbp_add_expression}
+\title{Export and add expression data to cBioPortal dataset}
+\usage{
+cbp_add_expression(expression_data)
+}
+\arguments{
+\item{expression_data}{Syn id of gene expression data.}
+}
+\description{
+This should be run in an existing dataset package root.
+}
diff --git a/man/cbp_add_maf.Rd b/man/cbp_add_maf.Rd
new file mode 100644
index 00000000..0760f4ae
--- /dev/null
+++ b/man/cbp_add_maf.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_add_maf}
+\alias{cbp_add_maf}
+\title{Export and add mutations data to cBioPortal dataset}
+\usage{
+cbp_add_maf(maf_data, verbose = TRUE)
+}
+\arguments{
+\item{maf_data}{Synapse id of \verb{merged maf} file for public release.}
+
+\item{verbose}{Whether to provide informative messages throughout.}
+}
+\description{
+This should be run in an existing dataset package root.
+}
+\details{
+Get merged maf file that represents filtered subset of \code{maf}s containing only (non-germline) data OK to release publicly.
+This needs to be packaged with other files like this
+\href{https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020}{example of a public mutations dataset}.
+}
diff --git a/man/cbp_datatypes.Rd b/man/cbp_datatypes.Rd
new file mode 100644
index 00000000..70bcf6fd
--- /dev/null
+++ b/man/cbp_datatypes.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_datatypes}
+\alias{cbp_datatypes}
+\title{Enumerate combinations of valid cBP data types and data subtypes and helper utils if available}
+\usage{
+cbp_datatypes()
+}
+\description{
+https://docs.cbioportal.org/file-formats/
+}
+\keyword{internal}
diff --git a/man/cbp_new_study.Rd b/man/cbp_new_study.Rd
new file mode 100644
index 00000000..84197282
--- /dev/null
+++ b/man/cbp_new_study.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cbioportal_study.R
+\name{cbp_new_study}
+\alias{cbp_new_study}
+\title{Initialize a new cBioPortal study dataset}
+\usage{
+cbp_new_study(
+  cancer_study_identifier,
+  name,
+  type_of_cancer,
+  description =
+    "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). \\n                          The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
+  short_name = NULL,
+  citation = NULL,
+  pmid = NULL,
+  groups = "PUBLIC",
+  add_global_case_list = TRUE,
+  validate = TRUE,
+  verbose = TRUE
+)
+}
+\arguments{
+\item{cancer_study_identifier}{Cancer study identifier in format such as \code{nst_nfosi_ntap_2022}.}
+
+\item{name}{Name of the study, e.g. "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".}
+
+\item{type_of_cancer}{Id for type of cancer. If \code{validate} is TRUE, this is one of the things validated with warning if mismatched.}
+
+\item{description}{Description of the study, defaults to a generic description that can be edited later.}
+
+\item{short_name}{(Optional) Short name for the study.}
+
+\item{citation}{(Optional) A relevant citation, e.g. "TCGA, Nature 2012".}
+
+\item{pmid}{(Optional) One or more relevant pubmed ids (comma-separated, no whitespace); if used, citation cannot be \code{NULL}.}
+
+\item{groups}{(Optional) Defaults to "PUBLIC" for use with public cBioPortal;
+otherwise, use group names that makes sense with the configuration of your cBioPortal instance.}
+
+\item{add_global_case_list}{(Optional) Use \code{NULL} to ignore, default is \code{TRUE} for an "All samples" case list( to be generated automatically.}
+
+\item{validate}{Validate against public cBioPortal configuration. Default \code{TRUE},
+but might want to set to \code{FALSE} especially if using a custom cBioPortal instance with different configuration.}
+
+\item{verbose}{Verbosity level.}
+}
+\description{
+Create a new directory with a basic required \href{https://docs.cbioportal.org/file-formats/#meta-file}{study meta file},
+much like how we'd create a new R package and put a DESCRIPTION file in it.
+}
diff --git a/man/check_cbp_study_id.Rd b/man/check_cbp_study_id.Rd
new file mode 100644
index 00000000..5e791052
--- /dev/null
+++ b/man/check_cbp_study_id.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cboilerplate.R
+\name{check_cbp_study_id}
+\alias{check_cbp_study_id}
+\title{Check that in valid cBioPortal study dataset root}
+\usage{
+check_cbp_study_id()
+}
+\value{
+\code{cancer_study_id} for the current cBioPortal cancer study.
+}
+\description{
+The \verb{cbp_add*} functions need to be run while in the study package root.
+This checks in valid study directory and returns the \code{cancer_study_id}.
+}
+\keyword{internal}
diff --git a/man/check_maf_release.Rd b/man/check_maf_release.Rd
deleted file mode 100644
index 6bc91ea0..00000000
--- a/man/check_maf_release.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/syncBP.R
-\name{check_maf_release}
-\alias{check_maf_release}
-\title{Check maf file for release}
-\usage{
-check_maf_release(merged_maf, samplesheet)
-}
-\arguments{
-\item{merged_maf}{Maf data as a \code{data.table}.}
-
-\item{samplesheet}{Samplesheet as a \code{data.table}.}
-}
-\value{
-Returns \code{NULL} if everything OK, else the sample ids that don't match expectations.
-}
-\description{
-Currently, this is a simple check to make sure released samples are expected.
-It may be extended later on as needed.
-}
diff --git a/man/checked_message.Rd b/man/checked_message.Rd
new file mode 100644
index 00000000..53bc26b9
--- /dev/null
+++ b/man/checked_message.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/nextflow_testing_utils.R
+\name{checked_message}
+\alias{checked_message}
+\title{Format checked message notification}
+\usage{
+checked_message(string)
+}
+\arguments{
+\item{string}{Character string with check prepended.}
+}
+\description{
+For nicely displaying that some step was successfully completed.
+}
+\keyword{internal}
diff --git a/man/get_data_for_releasable.Rd b/man/get_data_for_releasable.Rd
deleted file mode 100644
index b692dd9f..00000000
--- a/man/get_data_for_releasable.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/syncBP.R
-\name{get_data_for_releasable}
-\alias{get_data_for_releasable}
-\title{Download data from a view for releasable samples in samplesheet}
-\usage{
-get_data_for_releasable(samplesheet, ref_view, verbose = TRUE)
-}
-\arguments{
-\item{samplesheet}{Samplesheet \code{data.table}.}
-
-\item{ref_view}{View to get data from.}
-
-\item{verbose}{Output details.}
-}
-\description{
-This tries to check that complete data could be retrieved from said view.
-Note: Since the view is typically denormalized, not all data might be clinical.
-A downstream step will do some of the additional processing/subsetting needed.
-}
-\keyword{internal}
diff --git a/man/make_meta_cna.Rd b/man/make_meta_cna.Rd
new file mode 100644
index 00000000..a26301bf
--- /dev/null
+++ b/man/make_meta_cna.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cboilerplate.R
+\name{make_meta_cna}
+\alias{make_meta_cna}
+\title{Make meta file for cBioPortal copy number alteration data}
+\usage{
+make_meta_cna(
+  cancer_study_identifier,
+  data_filename = "data_cna.seg",
+  reference_genome_id = "hg19",
+  publish_dir = ".",
+  write = TRUE,
+  verbose = TRUE
+)
+}
+\description{
+Currently assumes seg data and should be extended later.
+}
+\details{
+See https://docs.cbioportal.org/file-formats/#segmented-data
+}
+\keyword{internal}
diff --git a/man/make_meta_expression.Rd b/man/make_meta_expression.Rd
new file mode 100644
index 00000000..6d8a4464
--- /dev/null
+++ b/man/make_meta_expression.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cboilerplate.R
+\name{make_meta_expression}
+\alias{make_meta_expression}
+\title{Make meta file for cBioPortal expression data}
+\usage{
+make_meta_expression(
+  cancer_study_identifier,
+  data_filename = "data_expression.txt",
+  publish_dir = ".",
+  write = TRUE,
+  verbose = TRUE
+)
+}
+\description{
+https://docs.cbioportal.org/file-formats/#expression-data
+}
+\keyword{internal}
diff --git a/man/make_meta_genomic_generic.Rd b/man/make_meta_genomic_generic.Rd
index 383d19d5..1f73cc17 100644
--- a/man/make_meta_genomic_generic.Rd
+++ b/man/make_meta_genomic_generic.Rd
@@ -8,6 +8,7 @@ make_meta_genomic_generic(
   cancer_study_identifier,
   genetic_alteration_type,
   datatype,
+  reference_genome_id,
   stable_id,
   profile_name,
   profile_description,
@@ -21,6 +22,8 @@ make_meta_genomic_generic(
 
 \item{datatype}{The cBioPortal data type of \code{data_filename}.}
 
+\item{reference_genome_id}{Reference genome id, e.g. 'hg19'.}
+
 \item{stable_id}{Stable id.}
 
 \item{profile_name}{Name of the genomic profiling. This is set by the more specific \code{make_meta} utility.
@@ -32,6 +35,8 @@ This is set by the more specific \code{make_meta} utility.}
 \item{data_filename}{Name of the data file that this meta file describes.}
 }
 \description{
-Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/make_meta.R#L65
+Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/make_meta.R#L65
+Internal workhorse union of \emph{all} the properties used for a genomic-type data file --
+the sensible defaults/specific combination should be passed in by a higher-level fun, e.g. \code{make_meta_maf}.
 }
 \keyword{internal}
diff --git a/man/make_meta_study.Rd b/man/make_meta_study.Rd
deleted file mode 100644
index 773c017a..00000000
--- a/man/make_meta_study.Rd
+++ /dev/null
@@ -1,51 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cboilerplate.R
-\name{make_meta_study}
-\alias{make_meta_study}
-\title{Make meta study file}
-\usage{
-make_meta_study(
-  cancer_study_identifier,
-  type_of_cancer = "mixed",
-  name,
-  description,
-  citation = NULL,
-  pmid = NULL,
-  groups = "PUBLIC",
-  short_name = NULL,
-  add_global_case_list = TRUE,
-  publish_dir = ".",
-  write = TRUE,
-  verbose = TRUE
-)
-}
-\arguments{
-\item{cancer_study_identifier}{The study identifier.}
-
-\item{type_of_cancer}{Type of cancer, defaults to "mixed". See also http://oncotree.mskcc.org/#/home.}
-
-\item{name}{Name of the study.}
-
-\item{description}{Description of the study.}
-
-\item{citation}{(Optional) A relevant citation, e.g. "TCGA, Nature 2012".}
-
-\item{pmid}{(Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be \code{NULL}.}
-
-\item{groups}{(Optional) Defaults to "PUBLIC" for use with public cBioPortal;
-otherwise, use group names that make sense for your instance.}
-
-\item{short_name}{(Optional) Short name for the study.}
-
-\item{add_global_case_list}{(Optional) Use \code{NULL} to ignore, but default is \code{TRUE} for an "All samples" case list to be generated automatically.}
-
-\item{publish_dir}{Directory path to write to, defaults to current.}
-
-\item{write}{Whether to write the meta file for the clinical data file.}
-
-\item{verbose}{Report where file has been written.}
-}
-\description{
-Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_meta.R#L179
-See specifications at https://docs.cbioportal.org/file-formats/#meta-file.
-}
diff --git a/man/make_meta_study_generic.Rd b/man/make_meta_study_generic.Rd
index bc8ba196..e793370b 100644
--- a/man/make_meta_study_generic.Rd
+++ b/man/make_meta_study_generic.Rd
@@ -5,15 +5,15 @@
 \title{Template for meta study file}
 \usage{
 make_meta_study_generic(
-  type_of_cancer,
   cancer_study_identifier,
+  type_of_cancer,
   name,
   description,
   citation = NULL,
   pmid = NULL,
   groups = NULL,
   short_name = NULL,
-  add_global_case_list = NULL
+  add_global_case_list = TRUE
 )
 }
 \description{
diff --git a/man/match_maf_sample_id.Rd b/man/match_maf_sample_id.Rd
deleted file mode 100644
index e39ca8ca..00000000
--- a/man/match_maf_sample_id.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/syncBP.R
-\name{match_maf_sample_id}
-\alias{match_maf_sample_id}
-\title{Match clinical data with maf sample ids}
-\usage{
-match_maf_sample_id(clinical_data, merged_maf = NULL)
-}
-\arguments{
-\item{clinical_data}{Clinical data as a \code{data.table}.}
-
-\item{merged_maf}{Maf data as a \code{data.table}.}
-}
-\description{
-PATIENT_ID and SAMPLE_ID can only contain letters, numbers, points, underscores and/or hyphens.
-In the nf processing, sample id spaces are replaced with underscores in the \code{maf},
-so this is applied to clinical data to match.
-}
-\keyword{internal}
diff --git a/man/syncBP_maf.Rd b/man/syncBP_maf.Rd
deleted file mode 100644
index f9b03593..00000000
--- a/man/syncBP_maf.Rd
+++ /dev/null
@@ -1,70 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/syncBP.R
-\name{syncBP_maf}
-\alias{syncBP_maf}
-\title{Make cBioPortal mutations dataset from Synapse assets}
-\usage{
-syncBP_maf(
-  merged_maf,
-  samplesheet,
-  ref_map,
-  ref_view,
-  name,
-  cancer_study_identifier,
-  citation = NULL,
-  pmid = NULL,
-  short_name = NULL,
-  publish_dir = cancer_study_identifier,
-  verbose = TRUE
-)
-}
-\arguments{
-\item{merged_maf}{Synapse id of \verb{merged maf} file for public release.}
-
-\item{samplesheet}{Synapse id or local path to samplesheet with release info.}
-
-\item{ref_map}{YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.}
-
-\item{ref_view}{A view that contains clinical data for the release files.}
-
-\item{name}{Name of the cancer study, e.g. something following convention is "Malignant Peripheral Nerve Sheath Tumor (NF-OSI, 2022)".}
-
-\item{cancer_study_identifier}{Study identifier, convention is \verb{\{tumorType\}_\{institution\}_\{year\}}, so for example "mpnst_nfosi_2022".}
-
-\item{citation}{(Optional) A relevant citation, e.g. "TCGA, Nature 2012".}
-
-\item{pmid}{(Optional) One or more relevant pubmed ids (comma separated without whitespace); if used, citation cannot be \code{NULL}.}
-
-\item{short_name}{(Optional) Short name for the study.}
-
-\item{publish_dir}{Where to output the set of files.
-Defaults to (creating if necessary) a folder with same name as \code{cancer_study_identifier}.}
-
-\item{verbose}{Whether to provide informative messages throughout.}
-}
-\description{
-The NF-OSI workflow produces a single merged maf file that represents a filtered subset of the \code{maf}s,
-containing only the (non-germline) data that \emph{can} be released for cBioPortal.
-However, this data file by itself is not immediately loadable into an instance of a cBioPortal server
-and needs to be packaged with other files, such as this
-\href{https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020}{example of a public mutations dataset}.
-This is a wrapper that goes through several steps needed to create said bundle of cBioPortal files more conveniently.
-}
-\details{
-\enumerate{
-\item A simple sanity check that this is the version of the maf release file that we want, based on the samplesheet.
-For example, version 1 of the samplesheet will generate a version 1 of the merged maf,
-but if there is a later correction that retracts a sample (\code{is_releasable}=FALSE),
-that step of the workflow to generate merged maf will be rerun (or it should be), so we'll want to make sure the latest versions of these files are used.
-The latest versions of the samplesheets tied to each release are currently stored in \code{syn38793855}.
-(Note: Please file issue to update this doc if this changes.)
-\item Make the clinical data files.
-For NF, the clinical metadata are annotations on the files/surfaced in a view and are pretty basic.
-In the future, it would be preferable to store clinical metadata in a real normalized table.
-For now, this clinical data is pulled in from the view. To map NF clinical variables to the
-\href{https://github.com/cBioPortal/clinical-data-dictionary/blob/e9ec08f48bd57aabf193da70cdb5b88bdef5d01d/docs/resource_uri_to_clinical_attribute_mapping.txt}{cBioPortal dictionary}
-\href{https://docs.cbioportal.org/file-formats/#custom-columns-in-clinical-data}{as recommended},
-this step requires a \code{ref_map}, which is a YAML file.
-\item Make meta files. Meta files are needed for describing the study, mutations data file, clinical data files.
-}
-}

From f9529cd88b2c0b209f6074bb2f34eb9bb3f182d9 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:22:40 -0600
Subject: [PATCH 11/26] Update internal fun

---
 R/cbioportal_study.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/cbioportal_study.R b/R/cbioportal_study.R
index c13c4875..ab8d617a 100644
--- a/R/cbioportal_study.R
+++ b/R/cbioportal_study.R
@@ -13,7 +13,7 @@ cbp_datatypes <- function() {
   types <- data.table::rbindlist(list(
     list("CLINICAL", "SAMPLE_ATTRIBUTES", "cbp_add_clinical", ""),
     list("CLINICAL", "PATIENT_ATTRIBUTES", "cbp_add_clinical", ""),
-    list("COPY_NUMBER_ALTERATION", "SEG", "cbp_add_cna", ""),
+    list("COPY_NUMBER_ALTERATION", "SEG", "cbp_add_cna", "default"),
     list("COPY_NUMBER_ALTERATION", "DISCRETE", "", ""),
     list("COPY_NUMBER_ALTERATION", "DISCRETE_LONG", "", ""),
     list("COPY_NUMBER_ALTERATION", "CONTINUOUS", "", ""),
@@ -25,9 +25,9 @@ cbp_datatypes <- function() {
     list("PROTEIN_LEVEL", "LOG2-VALUE", "", ""),
     list("STRUCTURAL_VARIANT", "SV", "", "")))
   
-  setnames(types, c("dataType", "dataSubtype", "fun", "note"))
+  setnames(types, c("dataType", "dataSubtype", "util", "note"))
   return(types)
-  }
+}
 
 #' Initialize a new cBioPortal study dataset
 #' 

From 711538f68c23b2f767a1183f326a103145243d63 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:34:12 -0600
Subject: [PATCH 12/26] Update internal boilerplate

---
 R/cboilerplate.R | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 94a30380..d86ee721 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -162,7 +162,7 @@ write_meta <- function(data,
   
   path <- glue::glue("{publish_dir}/{filename}")
   writeLines(data, con = path)
-  if(verbose) message(glue::glue("Meta file written to: {path}"))
+  if(verbose) checked_message("Meta file written to: {path}")
 }
 
 # -- Clinical meta files ------------------------------------------------------- #
@@ -181,12 +181,12 @@ make_meta_clinical_generic <- function(cancer_study_identifier,
                                        datatype, 
                                        data_filename) {
   
-  rows <- rep("", 4)
-  rows[1] <- c(glue::glue("cancer_study_identifier: {cancer_study_identifier}"))
-  rows[2] <- c(glue::glue("genetic_alteration_type: {genetic_alteration_type}"))
-  rows[3] <- c(glue::glue("datatype: {datatype}"))
-  rows[4] <- c(glue::glue("data_filename: {data_filename}"))
-  return(rows)
+  meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
+    append_kv("genetic_alteration_type", genetic_alteration_type) %>%
+    append_kv("datatype", datatype) %>%
+    append_kv("data_filename", data_filename) %>%
+  
+    return(meta)
 }
 
 #' Make patient meta file
@@ -388,7 +388,7 @@ make_meta_study_generic <- function(cancer_study_identifier,
     append_kv("pmid", pmid) %>%
     append_kv("groups", groups) %>%
     append_kv("short_name", short_name) %>%
-    append_kv("add_global_case_list", to.lower(as.character(add_global_case_list)))
+    append_kv("add_global_case_list", tolower(as.character(add_global_case_list)))
   
   return(meta)
 }

From 45db74abb76abdd31c4fc7154496ebd96dcd7d35 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:34:39 -0600
Subject: [PATCH 13/26] Remove line break

---
 R/cbioportal_study.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/cbioportal_study.R b/R/cbioportal_study.R
index ab8d617a..08bfd40c 100644
--- a/R/cbioportal_study.R
+++ b/R/cbioportal_study.R
@@ -1,6 +1,7 @@
 # Export data from Synapse as a cBioPortal dataset, where different data types can be added to the package one-by-one,
 # much in the spirit of https://github.com/r-lib/usethis.
-# This file contains the higher-level wrappers that are exported; see `cboilerplate.R` for the non-exported lower-level utils.
+# This file contains the higher-level wrappers that are user-facing.
+# See `cboilerplate.R` for the lower-level utils, which allows more control.
 # All functions should start with `cbp_*` so that it's clear this is cBioPortal-relevant functionality.
 
 #' Enumerate combinations of valid cBP data types and data subtypes and helper utils if available
@@ -52,8 +53,7 @@ cbp_datatypes <- function() {
 cbp_new_study <- function(cancer_study_identifier,
                           name,
                           type_of_cancer,
-                          description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). 
-                          The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
+                          description = "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
                           short_name = NULL,
                           citation = NULL,
                           pmid = NULL,

From 79e503a48d72ffacef3be463d2fe71a078fc36b9 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:35:06 -0600
Subject: [PATCH 14/26] Update vignette

---
 ...tal-data-to-other-platforms-cbioportal.Rmd | 98 ++++++++++++++-----
 1 file changed, 71 insertions(+), 27 deletions(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index d39925c3..24a938d8 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "Bringing Portal Data to Other Platforms (cBioPortal)"
+title: "Bringing Portal Data to Other Platforms: cBioPortal"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Bringing Portal Data to Other Platforms (cBioPortal)}
+  %\VignetteIndexEntry{Bringing Portal Data to Other Platforms: cBioPortal}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---
@@ -15,7 +15,7 @@ knitr::opts_chunk$set(
 ```
 
 **Document Status:** Draft  
-**Estimated Reading Time:** 5 min
+**Estimated Reading Time:** 8 min
 
 ## Special acknowledgments 
 
@@ -23,50 +23,93 @@ Functionality demonstrated in this vignette benefited greatly from code original
 
 ## Intro
 
-This briefly describes usage for bringing data from NF-OSI (mainly NF-OSI processed data) is brought to other platforms (mainly cBioPortal). 
+This describes how to package some Synapse processed data as a cBioPortal study dataset.
+A cBioPortal study contains one or more data types, see [cBioPortal docs](https://docs.cbioportal.org/file-formats/). 
+The current API covers creating a cBioPortal study with a subset of data types relevant to the NF workflow (so not all data types). 
+The design has been inspired by and should feel somewhat like working with the R package [usethis](https://github.com/r-lib/usethis), 
+and data types can be added to the study package interactively.
+
+Though there is some checking depending on the data type, final validation with the official cBioPortal validation tools/scripts should still be run (see last section).
+
+Breaking changes are possible as the API is still in development.
 
 ## Set up
 
 First load the `nfportalutils` package and log in. 
 The recommended default usage of `syn_login` is to use it without directly passing in credentials. 
 Instead, have available the `SYNAPSE_AUTH_TOKEN` environment variable with your token stored therein. 
+
 ```{r setup, eval=FALSE}
 library(nfportalutils)
 syn_login()
 ```
 
-## Creating a mutations dataset
+## Create a new study dataset
 
-Compilation of a cBioPortal dataset does require putting together a number of assets on Synapse and elsewhere, but the package should make it seem as straightforward as possible.
+First create the study before we can put together the data.
 
-- `merged_maf` references a final output file from the NF-OSI processing pipeline that is directly ready for public release. No modifications are needed for this file (except renaming it).
-- `ref_view` is a fileview that contains annotations for the files released.
-- `samplesheet` is a the samplesheet used for the data processing and ultimately creating `merged_maf`
-- `ref_map` maps clinical variables from the NF-OSI data dictionary to cBioPortal's
+```{r cbp_new_study, eval=FALSE}
 
-```{r files}
-merged_maf <- "syn36553188"
-ref_view <- "syn43278088"
-samplesheet <- "syn41830510"
-ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml"
+cbp_new_study(cancer_study_identifier = "npst_nfosi_ntap_2022",
+              name = "Plexiform Neurofibroma and Neurofibroma (Pratilas 2022)",
+              citation = "TBD")
+```
+
+## Add data types to study
+
+Data types can be added _in any order_ using the `cbp_add*` functions.
+It is expected that the user is currently in a valid cBioPortal study directory as set up in the previous step.
+
+### Add mutations data
+
+- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release. 
+No further modifications are done except renaming it.
+
+```{r add_maf, eval=FALSE}
+
+maf_data <- "syn36553188"
+
+add_cbp_maf(maf_data)
+```
+
+### Add copy number alterations (CNA) data
+
+- `cna_data` is expected to be a `.seg` file on Synapse.
+
+```{r add_cna, eval=FALSE}
+
+cna_data <- "syn********"
+
+cbp_add_cna(cna_data)
 ```
 
+### Add expression data
 
-This will create a folder with the set of files needed.
-```{r syncBP_maf, eval=FALSE}
+- `expression_data` is expected to be a `.txt` file on Synapse. 
 
-syncBP_maf(merged_maf,
-           samplesheet,
-           ref_map,
-           ref_view,
-           cancer_study_identifier = "mixed_nfosi_2022",
-           name = "Plexiform Neurofibroma and Neurofibroma (Pratilas 2022)",
-           citation = "TBD")
+Defaults are set for mRNA expression, use `?cbp_add_expression` for how to specify microRNA expression. 
+This will try to infer whether it is e.g. continuous or discrete expression data and update data properties appropriately.
+
+```{r add_expression, eval=FALSE}
+
+mrna_data <- "syn********"
+
+cbp_add_expression(mrna_data)
 ```
 
-## Updating or adding to a dataset
 
-This is a TO-DO section.
+### Add clinical data
+
+- `ref_view` is a fileview that contains clinical data for the data released.
+- `ref_map` maps clinical variables from the NF-OSI data dictionary to cBioPortal's
+
+```{r add_clinical, eval=FALSE}
+
+ref_view <- "syn43278088"
+ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml"
+
+cbp_add_clinical(ref_view, ref_map)
+```
 
 ## Validation
 
@@ -74,7 +117,8 @@ There are additional steps such as generating case lists and validation that hav
 See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator/).
 
 For the _public_ portal, the suggested step using the public server is given below.  
-Assuming your present working directory is `~/datahub/public` and a study folder called `mixed_nfosi_2022` has been placed into it, mount the dataset into the container and run validation like so:
+
+Assuming your present working directory is `~/datahub/public` and a study folder called `mixed_nfosi_2022` has been placed into it, mount the dataset into the container and run validation with:  
 `docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:4.1.13 validateStudies.py -d /datahub -l mixed_nfosi_2022 -u http://cbioportal.org -html /datahub/mixed_nfosi_2022/html_report`
 
  
\ No newline at end of file

From 705b763b0120f5e4f4f4fe1949195bb6668dccf5 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:40:55 -0600
Subject: [PATCH 15/26] Bump version, update one doc

---
 DESCRIPTION          | 2 +-
 man/cbp_new_study.Rd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 18b0089a..07a90f63 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nfportalutils
 Title: NF Portal Utilities
-Version: 0.0.0.9320
+Version: 0.0.0.9400
 Authors@R: c(
     person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
            email = "robert.allaway@sagebionetworks.org",
diff --git a/man/cbp_new_study.Rd b/man/cbp_new_study.Rd
index 84197282..0fcd8470 100644
--- a/man/cbp_new_study.Rd
+++ b/man/cbp_new_study.Rd
@@ -9,7 +9,7 @@ cbp_new_study(
   name,
   type_of_cancer,
   description =
-    "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). \\n                          The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
+    "The data are contributed by researchers funded by the Neurofibromatosis Therapeutic Acceleration Program (NTAP). The reprocessing of the raw data is managed by the NF Open Science Initiative (https://nf.synapse.org/).",
   short_name = NULL,
   citation = NULL,
   pmid = NULL,

From 826014aca3e0b09e2fdc9d3e72831b096b78138f Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:45:17 -0600
Subject: [PATCH 16/26] Update pkgdown reference

---
 _pkgdown.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index bf429e88..77688b17 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -98,14 +98,16 @@ reference:
   - remove_button
   - processing_flowchart
   
-- title: Export Data to Other Portals
-  desc: Helpers to export/release NF data to other portal/database formats
+- title: Export Data to Other Platforms
+  desc: Helpers to export/release NF data to other platforms/databases. 
 - subtitle: cBioPortal
-  desc: Export data to cBioPortal format files
+  desc: Export data as a cBioPortal study
 - contents:
-  - syncBP_maf
-  - check_maf_release
-  - make_meta_study
+  - cbp_new_study
+  - cbp_add_maf
+  - cbp_add_clinical
+  - cbp_add_expression
+  - cbp_add_cna
 
 - title: Quality Control and Testing Utils
   desc: QC data

From 1b66e48722d3b861000c979320b54b827e7a1225 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 08:49:31 -0600
Subject: [PATCH 17/26] Rename file

---
 R/{cbioportal_study.R => cbioportal.R} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename R/{cbioportal_study.R => cbioportal.R} (100%)

diff --git a/R/cbioportal_study.R b/R/cbioportal.R
similarity index 100%
rename from R/cbioportal_study.R
rename to R/cbioportal.R

From ed94a1534db2d10f5c99d39eb20d430f72168274 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:01:44 -0600
Subject: [PATCH 18/26] Update docs

---
 man/cbp_add_clinical.Rd   | 2 +-
 man/cbp_add_cna.Rd        | 2 +-
 man/cbp_add_expression.Rd | 2 +-
 man/cbp_add_maf.Rd        | 2 +-
 man/cbp_datatypes.Rd      | 2 +-
 man/cbp_new_study.Rd      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/man/cbp_add_clinical.Rd b/man/cbp_add_clinical.Rd
index 642a5ee5..ca06b3d7 100644
--- a/man/cbp_add_clinical.Rd
+++ b/man/cbp_add_clinical.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_add_clinical}
 \alias{cbp_add_clinical}
 \title{Export and add clinical data to cBioPortal dataset}
diff --git a/man/cbp_add_cna.Rd b/man/cbp_add_cna.Rd
index 346fa656..bd71d07d 100644
--- a/man/cbp_add_cna.Rd
+++ b/man/cbp_add_cna.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_add_cna}
 \alias{cbp_add_cna}
 \title{Export and add CNA (seg) data to cBioPortal dataset}
diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd
index 8e9586ef..528f17ba 100644
--- a/man/cbp_add_expression.Rd
+++ b/man/cbp_add_expression.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_add_expression}
 \alias{cbp_add_expression}
 \title{Export and add expression data to cBioPortal dataset}
diff --git a/man/cbp_add_maf.Rd b/man/cbp_add_maf.Rd
index 0760f4ae..bec8cfe9 100644
--- a/man/cbp_add_maf.Rd
+++ b/man/cbp_add_maf.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_add_maf}
 \alias{cbp_add_maf}
 \title{Export and add mutations data to cBioPortal dataset}
diff --git a/man/cbp_datatypes.Rd b/man/cbp_datatypes.Rd
index 70bcf6fd..4a298e79 100644
--- a/man/cbp_datatypes.Rd
+++ b/man/cbp_datatypes.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_datatypes}
 \alias{cbp_datatypes}
 \title{Enumerate combinations of valid cBP data types and data subtypes and helper utils if available}
diff --git a/man/cbp_new_study.Rd b/man/cbp_new_study.Rd
index 0fcd8470..7504cdfd 100644
--- a/man/cbp_new_study.Rd
+++ b/man/cbp_new_study.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cbioportal_study.R
+% Please edit documentation in R/cbioportal.R
 \name{cbp_new_study}
 \alias{cbp_new_study}
 \title{Initialize a new cBioPortal study dataset}

From e68303958048788ff773e9e8d4d540a9d58106f3 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:02:11 -0600
Subject: [PATCH 19/26] Add note in vignette

---
 ...ringing-portal-data-to-other-platforms-cbioportal.Rmd | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index 24a938d8..1884d335 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -57,8 +57,11 @@ cbp_new_study(cancer_study_identifier = "npst_nfosi_ntap_2022",
 
 ## Add data types to study
 
-Data types can be added _in any order_ using the `cbp_add*` functions.
-It is expected that the user is currently in a valid cBioPortal study directory as set up in the previous step.
+Data types can be most easily added _in any order_ using the `cbp_add*` functions.
+These download data and create the meta for them, relying on **defaults that apply to NF-OSI processed data**. 
+Sometimes if these defaults don't match what you want, use the lower-level utils or edit the files manually after creation.
+
+Important: user is expected to be in a valid cBioPortal study directory as set up in the previous step.
 
 ### Add mutations data
 
@@ -67,7 +70,7 @@ No further modifications are done except renaming it.
 
 ```{r add_maf, eval=FALSE}
 
-maf_data <- "syn36553188"
+maf_data <- "syn36553188"t
 
 add_cbp_maf(maf_data)
 ```

From 4877cbd03688eef26a497be3a7cd5dd704bf0d30 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:06:17 -0600
Subject: [PATCH 20/26] Wording

---
 .../bringing-portal-data-to-other-platforms-cbioportal.Rmd  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index 1884d335..2dc61551 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -58,8 +58,8 @@ cbp_new_study(cancer_study_identifier = "npst_nfosi_ntap_2022",
 ## Add data types to study
 
 Data types can be most easily added _in any order_ using the `cbp_add*` functions.
-These download data and create the meta for them, relying on **defaults that apply to NF-OSI processed data**. 
-Sometimes if these defaults don't match what you want, use the lower-level utils or edit the files manually after creation.
+These download data files and create the meta for them, using **defaults for NF-OSI processed data**. 
+Sometimes if these defaults don't match what you want, take a look at the lower-level utils `make_meta_*` or edit the files manually after.
 
 Important: user is expected to be in a valid cBioPortal study directory as set up in the previous step.
 
@@ -103,7 +103,7 @@ cbp_add_expression(mrna_data)
 
 ### Add clinical data
 
-- `ref_view` is a fileview that contains clinical data for the data released.
+- `ref_view` is a fileview that contains clinical data for the data released in the study.
 - `ref_map` maps clinical variables from the NF-OSI data dictionary to cBioPortal's
 
 ```{r add_clinical, eval=FALSE}

From f99beb2a7239a1c60b3f9a46e550151965c8e555 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:06:50 -0600
Subject: [PATCH 21/26] Fix stray word

---
 .../bringing-portal-data-to-other-platforms-cbioportal.Rmd      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index 2dc61551..98094860 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -70,7 +70,7 @@ No further modifications are done except renaming it.
 
 ```{r add_maf, eval=FALSE}
 
-maf_data <- "syn36553188"t
+maf_data <- "syn36553188"
 
 add_cbp_maf(maf_data)
 ```

From 22b1a1f04e63aead266617005fdaa84c93ed285d Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:09:34 -0600
Subject: [PATCH 22/26] Clarify other note

---
 .../bringing-portal-data-to-other-platforms-cbioportal.Rmd   | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index 98094860..eadc1e79 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -88,10 +88,7 @@ cbp_add_cna(cna_data)
 
 ### Add expression data
 
-- `expression_data` is expected to be a `.txt` file on Synapse. 
-
-Defaults are set for mRNA expression, use `?cbp_add_expression` for how to specify microRNA expression. 
-This will try to infer whether it is e.g. continuous or discrete expression data and update data properties appropriately.
+- `expression_data` is expected to be a `.txt` file on Synapse.
 
 ```{r add_expression, eval=FALSE}
 

From 9eda86f9080eae4bc4b894830e16147f43bd9a5e Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 09:17:53 -0600
Subject: [PATCH 23/26] Fix message

---
 R/cboilerplate.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index d86ee721..e13b7462 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -162,7 +162,7 @@ write_meta <- function(data,
   
   path <- glue::glue("{publish_dir}/{filename}")
   writeLines(data, con = path)
-  if(verbose) checked_message("Meta file written to: {path}")
+  if(verbose) checked_message(glue::glue("Meta file written to: {path}"))
 }
 
 # -- Clinical meta files ------------------------------------------------------- #
@@ -408,7 +408,7 @@ make_meta_study_generic <- function(cancer_study_identifier,
 #'
 #' @param ref_map YAML or JSON mapping. See details.
 #' @param as_dt Return as `data.table`, the default, 
-#' otherwise do checking but just return the list representation, which retains some metadata. 
+#' otherwise do checking but just return the list representation, which retains some metadata.
 #' @return Either a list of lists storing `source`, `label`, `description`, `data_type`, `attribute_type`
 #' or a `data.table` representation.
 #' @keywords internal

From 41190f3f8f50f843f5ffc72877c718d678db0fdf Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 16:11:21 -0600
Subject: [PATCH 24/26] Fix some params

---
 R/cbioportal.R   | 37 +++++++++++++++++++++++++++++--------
 R/cboilerplate.R | 25 +++++--------------------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/R/cbioportal.R b/R/cbioportal.R
index 08bfd40c..d16f449a 100644
--- a/R/cbioportal.R
+++ b/R/cbioportal.R
@@ -47,7 +47,7 @@ cbp_datatypes <- function() {
 #' @param add_global_case_list (Optional) Use `NULL` to ignore, default is `TRUE` for an "All samples" case list( to be generated automatically.
 #' @param validate Validate against public cBioPortal configuration. Default `TRUE`, 
 #' but might want to set to `FALSE` especially if using a custom cBioPortal instance with different configuration. 
-#' @param verbose Verbosity level.
+#' @param verbose Whether to be chatty.
 #' 
 #' @export
 cbp_new_study <- function(cancer_study_identifier,
@@ -139,6 +139,27 @@ cbp_add_clinical <- function(ref_view,
   
 }
 
+#' Check that in valid cBioPortal study dataset root
+#' 
+#' The `cbp_add*` functions need to be run while in the study package root.  
+#' This checks in valid study directory and returns the `cancer_study_id`.
+#' 
+#' @keywords internal
+#' @return `cancer_study_id` for the current cBioPortal cancer study.
+check_cbp_study_id <- function() {
+  
+  tryCatch({
+    
+    suppressWarnings({
+      study <- yaml::read_yaml("meta_study.txt")
+      study$cancer_study_identifier
+    })
+    
+  }, error = function(e) stop("The path ", getwd(), 
+                              " does not appear to be a valid cBioPortal study.", 
+                              call. = FALSE))
+}
+
 # ------------------------------------------------------------------------------- #
 
 #' Export and add mutations data to cBioPortal dataset
@@ -149,11 +170,10 @@ cbp_add_clinical <- function(ref_view,
 #' This needs to be packaged with other files like this
 #' [example of a public mutations dataset](https://github.com/cBioPortal/datahub/tree/1e03ea6ab5e0ddd497ecf349cbee7d50aeebcd5e/public/msk_ch_2020).
 #' 
+#' @inheritParams cbp_new_study
 #' @param maf_data Synapse id of `merged maf` file for public release.
-#' @param verbose Whether to provide informative messages throughout.
 #' @export
-cbp_add_maf <- function(maf_data,
-                        verbose = TRUE) {
+cbp_add_maf <- function(maf_data, verbose = TRUE) {
   
   .check_login()
   cancer_study_identifier <- check_cbp_study_id()
@@ -177,9 +197,10 @@ cbp_add_maf <- function(maf_data,
 #' 
 #' This should be run in an existing dataset package root.
 #' 
+#' @inheritParams cbp_new_study
 #' @param cna_data Synapse id of CNA data file, currently only handles `.seg` file.
 #' @export
-cbp_add_cna <- function(cna_data) {
+cbp_add_cna <- function(cna_data, verbose = TRUE) {
   
   cancer_study_identifier <- check_cbp_study_id()
   
@@ -200,10 +221,10 @@ cbp_add_cna <- function(cna_data) {
 #' 
 #' This should be run in an existing dataset package root.
 #' 
-#' @export
-#' 
+#' @inheritParams cbp_new_study
 #' @param expression_data Syn id of gene expression data.
-cbp_add_expression <- function(expression_data) {
+#' @export
+cbp_add_expression <- function(expression_data, verbose = TRUE) {
   
   cancer_study_identifier <- check_cbp_study_id()
   
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index e13b7462..da1c512e 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -10,22 +10,6 @@
 # data file is called. This might be mainly updating the main wrapper or creating more wrappers.
 
 
-#' Check that in valid cBioPortal study dataset root
-#' 
-#' The `cbp_add*` functions need to be run while in the study package root.  
-#' This checks in valid study directory and returns the `cancer_study_id`.
-#' 
-#' @keywords internal
-#' @return `cancer_study_id` for the current cBioPortal cancer study.
-check_cbp_study_id <- function() {
-  
-  tryCatch({
-    study <- yaml::read_yaml("meta_study.txt")
-    study$cancer_study_identifier
-  }, 
-  error = function(e) stop(getwd(), "does not appear to be a valid cBioPortal study."))
-}
-
 # -- DATA FILES ---------------------------------------------------------------- #
 # Data files store data... cBioPortal has format specifications specific to the data type. 
 # The only data type that we need to script for is the clinical data type, 
@@ -260,10 +244,10 @@ append_kv <- function(x, key, value) {
 make_meta_genomic_generic <- function(cancer_study_identifier,
                                       genetic_alteration_type, 
                                       datatype, 
-                                      reference_genome_id,
-                                      stable_id, 
-                                      profile_name, 
-                                      profile_description, 
+                                      stable_id = NULL, 
+                                      reference_genome_id = NULL,
+                                      profile_name = NULL, 
+                                      profile_description = NULL, 
                                       data_filename) {
   
   
@@ -349,6 +333,7 @@ make_meta_expression <- function(cancer_study_identifier,
                                        genetic_alteration_type = "MRNA_EXPRESSION",
                                        datatype = "CONTINUOUS",
                                        stable_id = "rna_seq_mrna",
+                                       profile_name = " mRNA expression",
                                        profile_description = "Expression levels",
                                        data_filename = data_filename)
   

From 1bdbb336b479d1db1cf84063c92d8ba1836b4d9d Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 16:11:41 -0600
Subject: [PATCH 25/26] Update docs

---
 man/cbp_add_cna.Rd        | 4 +++-
 man/cbp_add_expression.Rd | 2 +-
 man/cbp_add_maf.Rd        | 2 +-
 man/cbp_new_study.Rd      | 2 +-
 man/check_cbp_study_id.Rd | 2 +-
 5 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/man/cbp_add_cna.Rd b/man/cbp_add_cna.Rd
index bd71d07d..bcb356b4 100644
--- a/man/cbp_add_cna.Rd
+++ b/man/cbp_add_cna.Rd
@@ -4,10 +4,12 @@
 \alias{cbp_add_cna}
 \title{Export and add CNA (seg) data to cBioPortal dataset}
 \usage{
-cbp_add_cna(cna_data)
+cbp_add_cna(cna_data, verbose = TRUE)
 }
 \arguments{
 \item{cna_data}{Synapse id of CNA data file, currently only handles \code{.seg} file.}
+
+\item{verbose}{Whether to be chatty.}
 }
 \description{
 This should be run in an existing dataset package root.
diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd
index 528f17ba..3b930650 100644
--- a/man/cbp_add_expression.Rd
+++ b/man/cbp_add_expression.Rd
@@ -4,7 +4,7 @@
 \alias{cbp_add_expression}
 \title{Export and add expression data to cBioPortal dataset}
 \usage{
-cbp_add_expression(expression_data)
+cbp_add_expression(expression_data, verbose = TRUE)
 }
 \arguments{
 \item{expression_data}{Syn id of gene expression data.}
diff --git a/man/cbp_add_maf.Rd b/man/cbp_add_maf.Rd
index bec8cfe9..7ae41f9a 100644
--- a/man/cbp_add_maf.Rd
+++ b/man/cbp_add_maf.Rd
@@ -9,7 +9,7 @@ cbp_add_maf(maf_data, verbose = TRUE)
 \arguments{
 \item{maf_data}{Synapse id of \verb{merged maf} file for public release.}
 
-\item{verbose}{Whether to provide informative messages throughout.}
+\item{verbose}{Whether to be chatty.}
 }
 \description{
 This should be run in an existing dataset package root.
diff --git a/man/cbp_new_study.Rd b/man/cbp_new_study.Rd
index 7504cdfd..121ef004 100644
--- a/man/cbp_new_study.Rd
+++ b/man/cbp_new_study.Rd
@@ -42,7 +42,7 @@ otherwise, use group names that makes sense with the configuration of your cBioP
 \item{validate}{Validate against public cBioPortal configuration. Default \code{TRUE},
 but might want to set to \code{FALSE} especially if using a custom cBioPortal instance with different configuration.}
 
-\item{verbose}{Verbosity level.}
+\item{verbose}{Whether to be chatty.}
 }
 \description{
 Create a new directory with a basic required \href{https://docs.cbioportal.org/file-formats/#meta-file}{study meta file},
diff --git a/man/check_cbp_study_id.Rd b/man/check_cbp_study_id.Rd
index 5e791052..c3b889bc 100644
--- a/man/check_cbp_study_id.Rd
+++ b/man/check_cbp_study_id.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/cboilerplate.R
+% Please edit documentation in R/cbioportal.R
 \name{check_cbp_study_id}
 \alias{check_cbp_study_id}
 \title{Check that in valid cBioPortal study dataset root}

From 17d126e50bc63841859214c9b25ebff646a49128 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 23 Aug 2023 16:44:56 -0600
Subject: [PATCH 26/26] Update docs

---
 man/cbp_add_expression.Rd        |  2 ++
 man/make_meta_genomic_generic.Rd | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd
index 3b930650..25a16c34 100644
--- a/man/cbp_add_expression.Rd
+++ b/man/cbp_add_expression.Rd
@@ -8,6 +8,8 @@ cbp_add_expression(expression_data, verbose = TRUE)
 }
 \arguments{
 \item{expression_data}{Syn id of gene expression data.}
+
+\item{verbose}{Whether to be chatty.}
 }
 \description{
 This should be run in an existing dataset package root.
diff --git a/man/make_meta_genomic_generic.Rd b/man/make_meta_genomic_generic.Rd
index 1f73cc17..0538b7df 100644
--- a/man/make_meta_genomic_generic.Rd
+++ b/man/make_meta_genomic_generic.Rd
@@ -8,10 +8,10 @@ make_meta_genomic_generic(
   cancer_study_identifier,
   genetic_alteration_type,
   datatype,
-  reference_genome_id,
-  stable_id,
-  profile_name,
-  profile_description,
+  stable_id = NULL,
+  reference_genome_id = NULL,
+  profile_name = NULL,
+  profile_description = NULL,
   data_filename
 )
 }
@@ -22,10 +22,10 @@ make_meta_genomic_generic(
 
 \item{datatype}{The cBioPortal data type of \code{data_filename}.}
 
-\item{reference_genome_id}{Reference genome id, e.g. 'hg19'.}
-
 \item{stable_id}{Stable id.}
 
+\item{reference_genome_id}{Reference genome id, e.g. 'hg19'.}
+
 \item{profile_name}{Name of the genomic profiling. This is set by the more specific \code{make_meta} utility.
 For example, "Mutations" for \verb{make_*_maf} and "Copy-number alterations" for \verb{make_*_cna}.}