From 806f9d5166fb4f37b9cd944cad452904be4bb96f Mon Sep 17 00:00:00 2001 From: Ramiro Magno Date: Mon, 24 Jun 2024 18:56:35 +0100 Subject: [PATCH] Add reader MRK_ENSEMBL --- R/read_report.R | 100 ++++++++++++++++++++++++++++++++++- R/report_column_processing.R | 33 +++++++++++- man/read_report.Rd | 6 ++- 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/R/read_report.R b/R/read_report.R index 4bf7f2a..bc749a0 100644 --- a/R/read_report.R +++ b/R/read_report.R @@ -34,6 +34,9 @@ #' # MGI Marker associations to Gene Trap IDs #' # read_report(file.path(base_url, "MRK_GeneTrap.rpt"), "MRK_GeneTrap") #' +#' # MGI Marker associations to Ensembl sequence information +#' # read_report(file.path(base_url, "MRK_ENSEMBL.rpt"), "MRK_ENSEMBL") +#' #' @returns A [tibble][tibble::tibble-package] with the report data in tidy #' format. #' @@ -47,7 +50,8 @@ read_report <- function(report_file, "MRK_Sequence", "MRK_SwissProt_TrEMBL", "MRK_SwissProt", - "MRK_GeneTrap"), + "MRK_GeneTrap", + "MRK_ENSEMBL"), n_max = Inf) { report_type <- match.arg(report_type) @@ -58,7 +62,8 @@ read_report <- function(report_file, MRK_Sequence = read_mrk_sequence_rpt, MRK_SwissProt_TrEMBL = read_mrk_swissprot_tr_embl_rpt, MRK_SwissProt = read_mrk_swissprot_rpt, - MRK_GeneTrap = read_mrk_genetrap_rpt) + MRK_GeneTrap = read_mrk_genetrap_rpt, + MRK_ENSEMBL = read_mrk_ensembl_rpt) read[[report_type]](file = report_file, n_max = n_max) } @@ -81,6 +86,45 @@ read_tsv <- function(file, } +# A sort of drop-in replacement of `read_tsv()` which is backed up by +# `vroom::vroom()`, while `read_tsv2()` is backed up `data.table::fread()`which +# has the useful `fill` parameter for when we have missing columns. +read_tsv2 <- function(file, + col_names, + col_types = "c", + skip = 1L, + n_max = Inf, + na = c("null", "NULL", "N/A", "")) { + + col_types_mapping <- c( + `c` = "character", + `i` = "integer", + `n` = "numeric", + `d` = "numeric", + `l` = "logical", + `f` = "factor", + `D` = "Date", + `-` = "NULL" + ) + + col_types <- unlist(strsplit(col_types, split = "")) + col_classes <- unname(col_types_mapping[col_types]) + + data.table::fread( + input = file, + sep = "\t", + col.names = col_names, + colClasses = col_classes, + # header = TRUE, + nrows = n_max, + na.strings = na, + fill = TRUE, + showProgress = FALSE + ) |> + tibble::as_tibble() + +} + #' Read a marker list report #' #' [read_mrk_list_rpt()] imports either a `MRK_List1.rpt` or a `MRK_List2.rpt` @@ -490,3 +534,55 @@ read_mrk_genetrap_rpt <- function(file, n_max = Inf) { .data$cell_line ) } + +read_mrk_ensembl_rpt <- function(file, n_max = Inf) { + col_names <- + c( + "marker_id", + "marker_symbol", + "marker_name", + "cM_pos", + "chr", + "ensembl_id", + "ensembl_trp_id", + "ensembl_prt_id", + "feature_type", + "start", + "end", + "strand", + "biotype" + ) + + col_types <- "ccccccccciicc" + # Import data + read_tsv2( + file = file, + col_names = col_names, + col_types = col_types, + n_max = n_max + ) |> + dplyr::mutate( + cM_pos = cM_pos_col(.data$cM_pos), + chr = chr_col(.data$chr), + strand = strand_col(.data$strand), + feature_type = special_feature_type_col(.data$feature_type), + biotype = biotype_col(.data$biotype), + ensembl_trp_id = ensembl_trp_id_col(.data$ensembl_trp_id), + ensembl_prt_id = ensembl_trp_id_col(.data$ensembl_prt_id) + ) |> + dplyr::relocate( + .data$marker_id, + .data$marker_symbol, + .data$marker_name, + .data$cM_pos, + .data$chr, + .data$start, + .data$end, + .data$strand, + .data$ensembl_id, + .data$ensembl_trp_id, + .data$ensembl_prt_id, + .data$feature_type, + .data$biotype + ) +} diff --git a/R/report_column_processing.R b/R/report_column_processing.R index ba71e9f..2f02db5 100644 --- a/R/report_column_processing.R +++ b/R/report_column_processing.R @@ -14,7 +14,7 @@ chr_col <- function(chr) { } cM_pos_col <- function(cM_pos) { - as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A"), NA_character_, cM_pos)) + as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A", "-1.0"), NA_character_, cM_pos)) } status_col <- function(status) { @@ -50,3 +50,34 @@ cell_line_col <- function(cell_line) { # Convert single NA values to empty character vectors in the list-column. dplyr::if_else(sapply(cell_line, \(x) length(x) == 1L && is.na(x)), list(character()), cell_line) } + +special_feature_type_col <- function(feature_type) { + + # In cases like: + # "lncRNA gene|lncRNA gene|lncRNA gene" + # keep only one instance. This assumes the values pipe-separated are repeated, + # this is the case in MRK_ENSEMBL.rpt. + feature_type <- sub("\\|.+", "", feature_type) + + factor(feature_type, levels = feature_types$feature_type) +} + +biotype_col <- function(biotype) { + biotype <- strsplit(biotype, "|", fixed = TRUE) + # Convert single NA values to empty character vectors in the list-column. + dplyr::if_else(sapply(biotype, \(x) length(x) == 1L && is.na(x)), list(character()), biotype) |> + sapply(\(x) unique(x)) +} + + +ensembl_trp_id_col <- function(ensembl_trp_id) { + ensembl_trp_id <- strsplit(ensembl_trp_id, " ", fixed = TRUE) + # Convert single NA values to empty character vectors in the list-column. + dplyr::if_else(sapply(ensembl_trp_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_trp_id) +} + +ensembl_prt_id_col <- function(ensembl_prt_id) { + ensembl_prt_id <- strsplit(ensembl_prt_id, " ", fixed = TRUE) + # Convert single NA values to empty character vectors in the list-column. + dplyr::if_else(sapply(ensembl_prt_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_prt_id) +} diff --git a/man/read_report.Rd b/man/read_report.Rd index 2ed6e27..e6f5626 100644 --- a/man/read_report.Rd +++ b/man/read_report.Rd @@ -7,7 +7,8 @@ read_report( report_file, report_type = c("MRK_List1", "MRK_List2", "MGI_MRK_Coord", "MGI_Gene_Model_Coord", - "MGI_GTGUP", "MRK_Sequence", "MRK_SwissProt_TrEMBL", "MRK_SwissProt", "MRK_GeneTrap"), + "MGI_GTGUP", "MRK_Sequence", "MRK_SwissProt_TrEMBL", "MRK_SwissProt", "MRK_GeneTrap", + "MRK_ENSEMBL"), n_max = Inf ) } @@ -53,4 +54,7 @@ read_report(file.path(base_url, "MRK_List1.rpt"), "MRK_List1", n_max = n) # MGI Marker associations to Gene Trap IDs # read_report(file.path(base_url, "MRK_GeneTrap.rpt"), "MRK_GeneTrap") +# MGI Marker associations to Ensembl sequence information +# read_report(file.path(base_url, "MRK_ENSEMBL.rpt"), "MRK_ENSEMBL") + }