Skip to content

Commit

Permalink
Patch/cbp and exports (#160)
Browse files Browse the repository at this point in the history
* Export functions

* Update docs, namespace

* Update version

* Update cBP utils

* Fix ref

* Update clinical data prep

* Update docs

* Fix default sep

* Fix expression

* New cancer type helper

* More changes

* Update doc

* Fix test
  • Loading branch information
anngvu authored Feb 10, 2024
1 parent f639164 commit c07b494
Show file tree
Hide file tree
Showing 11 changed files with 164 additions and 66 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: nfportalutils
Title: NF Portal Utilities
Version: 0.0.0.946
Version: 0.9500
Authors@R: c(
person(given = "Robert", family = "Allaway", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ export(bad_url)
export(bipartite_mmd_template)
export(byte_budget)
export(calculate_related_studies)
export(cbp_add_clinical)
export(cbp_add_cna)
export(cbp_add_expression)
export(cbp_add_maf)
export(cbp_new_cancer_type)
export(cbp_new_study)
export(check_access)
export(check_readpair_validity)
Expand Down
52 changes: 47 additions & 5 deletions R/cbioportal.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,20 +110,23 @@ cbp_new_study <- function(cancer_study_identifier,
#' in Nextflow processing any spaces get replaced with underscores, so that's the default here.
#' Does *not* check for missing samples, as final validation via cBioPortal tool is still expected for that.
#'
#' @param ref_view A view that contains all clinical data for the study.
#' @param clinical_data Clinical table query.
#' @param ref_map YAML file specifying the mapping of (NF) clinical metadata to cBioPortal model. See details.
#' @param verbose Whether to provide informative messages throughout.
cbp_add_clinical <- function(ref_view,
#'
#' @export
cbp_add_clinical <- function(clinical_data,
ref_map,
verbose = TRUE) {

cancer_study_identifier <- check_cbp_study_id()

if(verbose) checked_message("Pulling the clinical data from Synapse")
df <- get_clinical_data_for_cbp_study(ref_view)
df <- .syn$tableQuery(clinical_data, includeRowIdAndRowVersion = FALSE)$asDataFrame()
if(verbose) checked_message("Retrieved clinical data from Synapse")

if(verbose) checked_message("Formatting and making clinical data file(s)")
df$specimenID <- gsub(" ", "_", clinical_data$specimenID)
checked_message("Spaces in specimen IDs will be replaced with _ per cBioPortal specifications")
df$specimenID <- gsub(" ", "_", df$specimenID)
write_cbio_clinical(df, ref_map = ref_map, verbose = verbose)

if(verbose) checked_message("Making sample clinical meta file")
Expand Down Expand Up @@ -238,6 +241,7 @@ cbp_add_expression <- function(expression_data,
file <- .syn$get(expression_data, downloadLocation = ".")
data_expression <- sub(file$name, "data_expression_tpm.txt", file$path)
file.rename(file$path, data_expression)
format_gene_expression_data("data_expression_tpm.txt")

if(verbose) checked_message("Making the meta file")
make_meta_expression(cancer_study_identifier, type = "tpm")
Expand All @@ -247,6 +251,7 @@ cbp_add_expression <- function(expression_data,
file <- .syn$get(expression_data_raw, downloadLocation = ".")
data_expression_supp <- sub(file$name, "data_expression_raw.txt", file$path)
file.rename(file$path, data_expression_supp)
format_gene_expression_data("data_expression_raw.txt")

if(verbose) checked_message("Making the meta file for supplemental raw mRNA expression data file")
make_meta_expression(cancer_study_identifier, type = "raw")
Expand All @@ -257,3 +262,40 @@ cbp_add_expression <- function(expression_data,

}

#' Format gene expression
#'
#' Reads the expression table at `file`, drops the `gene_id` column,
#' renames `gene_name` to `Hugo_Symbol`, and writes the result back to the
#' same path as a tab-delimited file (the input file is overwritten).
#'
#' @param file Path to the expression data file to reformat in place.
#' @keywords internal
#' @import data.table
format_gene_expression_data <- function(file) {
  expression_dt <- fread(file)
  # Ensembl ids not used in cBioPortal
  expression_dt[, "gene_id" := NULL]
  setnames(expression_dt, "gene_name", "Hugo_Symbol")
  fwrite(expression_dt, file = file, sep = "\t")
}

#' Create reference file for new cancer type
#'
#' Helper for creating reference for new cancer subtype which does not already exist.
#' Writes two files into the current working directory: `meta_cancer_type.txt`
#' (the meta file) and `cancer_type.txt` (the tab-delimited data file it points to).
#' Existing files with these names are overwritten.
#' https://docs.cbioportal.org/file-formats/#cancer-type
#'
#' @param type_of_cancer Id for new cancer type, e.g. "cnf".
#' @param name Full name for new cancer type, e.g. "Cutaneous Neurofibroma"
#' @param color Color name for new cancer; https://en.wikipedia.org/wiki/Web_colors#X11_color_names.
#' @param parent_type_of_cancer Id of existing parent, e.g. "nfib" for Neurofibroma.
#' @export
cbp_new_cancer_type <- function(type_of_cancer,
                                name,
                                color,
                                parent_type_of_cancer) {

  # writeLines() terminates every line, including the last one, with a newline.
  # cat(..., sep = "\n") only separates elements, which left the final line
  # of the file unterminated.
  writeLines(c("genetic_alteration_type: CANCER_TYPE",
               "datatype: CANCER_TYPE",
               "data_filename: cancer_type.txt"),
             con = "meta_cancer_type.txt")

  # One tab-delimited row per cancer type. glue() is vectorised, so if vectors
  # are supplied each element becomes its own line instead of being joined by
  # spaces on a single line.
  writeLines(glue::glue("{type_of_cancer}\t{name}\t{color}\t{parent_type_of_cancer}"),
             con = "cancer_type.txt")

  checked_message("Created new cancer type meta and data")
}
77 changes: 43 additions & 34 deletions R/cboilerplate.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,18 @@
#' Make header for cBioPortal clinical data file
#'
#' This is called from the wrapper `write_cbio_clinical`.
#' Reused from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
#' Adapted from https://github.com/Sage-Bionetworks/genie-erbb2-cbio/blob/develop/create_clinical.R#L396.
#' Needs a data table of clinical data and a reference providing `label`, `description`, and `data_type`.
#'
#' @param df A `data.frame` representing clinical dataset to publicize.
#' @param label Character vector representing a short label for each column in the dataset
#' @param description Character vector representing a long descriptions for each column in the dataset
#' @param data_type Character vector representing the data type of each column in the dataset
#' @param mapping A reference table providing `label`, `description`, and `data_type` for each `source` attribute in `df`.
#' @keywords internal
make_cbio_clinical_header <- function(df, label, description, data_type) {
make_cbio_clinical_header <- function(df, mapping) {

label <- mapping[match(names(df), source), label]
description <- mapping[match(names(df), source), description]
data_type <- mapping[match(names(df), source), data_type]

# Original code assigns a default priority = 1 to all; this is kept until we need more complex configuration
header <- rbind(label, description, data_type, rep(1))
header <- t(apply(header, 1, function(x) { return(c(paste0("#", x[1]), x[2:length(x)]))}))
header <- rbind(header, label) # use harmonized name as row-5 attribute names
Expand Down Expand Up @@ -83,47 +85,54 @@ write_cbio_clinical <- function(df,
verbose = TRUE) {

m <- use_ref_map(ref_map)
attributes <- m$source
m <- split(m, by = "attribute_type")

# Move/factor out these checks?
if(!all(attributes %in% names(df))) stop(glue::glue_collapse(setdiff(attributes, names(df)), ","), " specified in mapping but not available in data. Check data.")
if(!"SAMPLE" %in% names(m)) stop("According to mapping, no SAMPLE clinical file will be created. Check mapping.")
present <- names(df)
required <- m$source[m$required]
attributes <- unique(m$source)

# Attribute checks
message("Clinical attributes present are: ", paste(present, collapse = ", "))
if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", "))
if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ","))

# Take care of list columns and NA
.df <- data.table::copy(df)
for(col in names(.df)) {
if(class(.df[[col]]) == "list") {
.df[[col]] <- sapply(.df[[col]], function(x) paste0(x, collapse = "-"))
warning(glue::glue("The {col} field was stored as a list has been coerced for export, you may want to check output."), call. = F)
}
# Use actual NA's so that `write.table` can write out "" consistently
.df[.df[[col]] %in% na_recode, col ] <- NA_character_
}

files <- list()
for(clinical_type in names(m)) {
.df <- df[, m[[clinical_type]]$source ]
# cBioPortal does not allow list columns
for(col in names(.df)) {
if(class(.df[[col]]) == "list") {
.df[[col]] <- paste(.df[[col]], sep = ",")
warning(glue::glue("Coerced {col} data from list for export, you may want to check output."), call. = F)
}
# Use actual NA's so that `write.table` can write out "" consistently
.df[.df[[col]] %in% na_recode, col ] <- NA_character_
m <- split(m, by = "attribute_type")
if("individualID" %in% names(.df)) {
patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)])
header <- make_cbio_clinical_header(patient_df, m$PATIENT)
patient_df <- rbind(header, patient_df)
files[["PATIENT"]] <- patient_df
}
{
sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)]
header <- make_cbio_clinical_header(sample_df, m$SAMPLE)
sample_df <- rbind(header, sample_df)
files[["SAMPLE"]] <- sample_df
}

}
if(clinical_type == "PATIENT") {
.df <- unique(.df)
}
for(clinical_type in names(files)) {
filename <- get_cbio_filename(clinical_type)
header <- make_cbio_clinical_header(.df,
m[[clinical_type]]$label,
m[[clinical_type]]$description,
m[[clinical_type]]$data_type)

df_out <- rbind(header, .df)
path <- glue::glue("{publish_dir}/{filename}")
write.table(df_out,
write.table(files[[clinical_type]],
file = path,
sep = delim,
na = "",
col.names = F,
row.names = F,
quote = F)
if(verbose) message(glue::glue("{clinical_type} data written to: {path}"))
files[[clinical_type]] <- df_out
}

invisible(files)
}

Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ reference:
desc: Export data as a cBioPortal study
- contents:
- cbp_new_study
- cbp_new_cancer_type
- cbp_add_maf
- cbp_add_clinical
- cbp_add_expression
Expand Down
4 changes: 2 additions & 2 deletions man/cbp_add_clinical.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/cbp_new_cancer_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/format_gene_expression_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 4 additions & 7 deletions man/make_cbio_clinical_header.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion tests/testthat/test_register_study.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ test_that("Add study meta works", {

expected <- c(study_meta, studyStatus = "Active")

testthat::expect_mapequal(s, expected)
testthat::expect_output(print(s), "\\{'studyName': \\['NF Dev Playground'\\], 'dataStatus': \\['Data Not Expected'\\], 'initiative': \\['Other'\\], 'studyLeads': \\['Robert Allaway', 'Anh Nguyet Vu'\\], 'studyStatus': \\['Active'\\], 'diseaseFocus': \\['Multiple'\\], 'institutions': \\['Sage Bionetworks'\\], 'fundingAgency': \\['Sage Bionetworks'\\]\\}")

})


Expand Down
Loading

0 comments on commit c07b494

Please sign in to comment.