From 806f9d5166fb4f37b9cd944cad452904be4bb96f Mon Sep 17 00:00:00 2001
From: Ramiro Magno <rmagno@pattern.institute>
Date: Mon, 24 Jun 2024 18:56:35 +0100
Subject: [PATCH] Add reader MRK_ENSEMBL

---
 R/read_report.R              | 100 ++++++++++++++++++++++++++++++++++-
 R/report_column_processing.R |  33 +++++++++++-
 man/read_report.Rd           |   6 ++-
 3 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/R/read_report.R b/R/read_report.R
index 4bf7f2a..bc749a0 100644
--- a/R/read_report.R
+++ b/R/read_report.R
@@ -34,6 +34,9 @@
 #' # MGI Marker associations to Gene Trap IDs
 #' # read_report(file.path(base_url, "MRK_GeneTrap.rpt"), "MRK_GeneTrap")
 #'
+#' # MGI Marker associations to Ensembl sequence information
+#' # read_report(file.path(base_url, "MRK_ENSEMBL.rpt"), "MRK_ENSEMBL")
+#'
 #' @returns A [tibble][tibble::tibble-package] with the report data in tidy
 #'   format.
 #'
@@ -47,7 +50,8 @@ read_report <- function(report_file,
                                         "MRK_Sequence",
                                         "MRK_SwissProt_TrEMBL",
                                         "MRK_SwissProt",
-                                        "MRK_GeneTrap"),
+                                        "MRK_GeneTrap",
+                                        "MRK_ENSEMBL"),
                         n_max = Inf) {
 
   report_type <- match.arg(report_type)
@@ -58,7 +62,8 @@ read_report <- function(report_file,
                MRK_Sequence = read_mrk_sequence_rpt,
                MRK_SwissProt_TrEMBL = read_mrk_swissprot_tr_embl_rpt,
                MRK_SwissProt = read_mrk_swissprot_rpt,
-               MRK_GeneTrap = read_mrk_genetrap_rpt)
+               MRK_GeneTrap = read_mrk_genetrap_rpt,
+               MRK_ENSEMBL = read_mrk_ensembl_rpt)
 
   read[[report_type]](file = report_file, n_max = n_max)
 }
@@ -81,6 +86,45 @@ read_tsv <- function(file,
 
 }
 
+# A sort of drop-in replacement of `read_tsv()` which is backed up by
+# `vroom::vroom()`, while `read_tsv2()` is backed up `data.table::fread()`which
+# has the useful `fill` parameter for when we have missing columns.
+read_tsv2 <- function(file,
+                     col_names,
+                     col_types = "c",
+                     skip = 1L,
+                     n_max = Inf,
+                     na = c("null", "NULL", "N/A", "")) {
+
+  col_types_mapping <- c(
+    `c` = "character",
+    `i` = "integer",
+    `n` = "numeric",
+    `d` = "numeric",
+    `l` = "logical",
+    `f` = "factor",
+    `D` = "Date",
+    `-` = "NULL"
+  )
+
+  col_types <- unlist(strsplit(col_types, split = ""))
+  col_classes <- unname(col_types_mapping[col_types])
+
+  data.table::fread(
+    input = file,
+    sep = "\t",
+    col.names	= col_names,
+    colClasses = col_classes,
+    # header = TRUE,
+    nrows = n_max,
+    na.strings	= na,
+    fill = TRUE,
+    showProgress = FALSE
+  ) |>
+    tibble::as_tibble()
+
+}
+
 #' Read a marker list report
 #'
 #' [read_mrk_list_rpt()] imports either a `MRK_List1.rpt` or a `MRK_List2.rpt`
@@ -490,3 +534,55 @@ read_mrk_genetrap_rpt <- function(file, n_max = Inf) {
       .data$cell_line
     )
 }
+
+read_mrk_ensembl_rpt <- function(file, n_max = Inf) {
+  col_names <-
+    c(
+      "marker_id",
+      "marker_symbol",
+      "marker_name",
+      "cM_pos",
+      "chr",
+      "ensembl_id",
+      "ensembl_trp_id",
+      "ensembl_prt_id",
+      "feature_type",
+      "start",
+      "end",
+      "strand",
+      "biotype"
+    )
+
+  col_types <- "ccccccccciicc"
+  # Import data
+  read_tsv2(
+    file = file,
+    col_names = col_names,
+    col_types = col_types,
+    n_max = n_max
+  ) |>
+    dplyr::mutate(
+      cM_pos = cM_pos_col(.data$cM_pos),
+      chr = chr_col(.data$chr),
+      strand = strand_col(.data$strand),
+      feature_type = special_feature_type_col(.data$feature_type),
+      biotype = biotype_col(.data$biotype),
+      ensembl_trp_id = ensembl_trp_id_col(.data$ensembl_trp_id),
+      ensembl_prt_id = ensembl_trp_id_col(.data$ensembl_prt_id)
+    ) |>
+    dplyr::relocate(
+      .data$marker_id,
+      .data$marker_symbol,
+      .data$marker_name,
+      .data$cM_pos,
+      .data$chr,
+      .data$start,
+      .data$end,
+      .data$strand,
+      .data$ensembl_id,
+      .data$ensembl_trp_id,
+      .data$ensembl_prt_id,
+      .data$feature_type,
+      .data$biotype
+    )
+}
diff --git a/R/report_column_processing.R b/R/report_column_processing.R
index ba71e9f..2f02db5 100644
--- a/R/report_column_processing.R
+++ b/R/report_column_processing.R
@@ -14,7 +14,7 @@ chr_col <- function(chr) {
 }
 
 cM_pos_col <- function(cM_pos) {
-  as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A"), NA_character_, cM_pos))
+  as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A", "-1.0"), NA_character_, cM_pos))
 }
 
 status_col <- function(status) {
@@ -50,3 +50,34 @@ cell_line_col <- function(cell_line) {
   # Convert single NA values to empty character vectors in the list-column.
   dplyr::if_else(sapply(cell_line, \(x) length(x) == 1L && is.na(x)), list(character()), cell_line)
 }
+
+special_feature_type_col <- function(feature_type) {
+
+  # In cases like:
+  # "lncRNA gene|lncRNA gene|lncRNA gene"
+  # keep only one instance. This assumes the values pipe-separated are repeated,
+  # this is the case in MRK_ENSEMBL.rpt.
+  feature_type <- sub("\\|.+", "", feature_type)
+
+  factor(feature_type, levels = feature_types$feature_type)
+}
+
+biotype_col <- function(biotype) {
+  biotype <- strsplit(biotype, "|", fixed = TRUE)
+  # Convert single NA values to empty character vectors in the list-column.
+  dplyr::if_else(sapply(biotype, \(x) length(x) == 1L && is.na(x)), list(character()), biotype) |>
+    sapply(\(x) unique(x))
+}
+
+
+ensembl_trp_id_col <- function(ensembl_trp_id) {
+  ensembl_trp_id <- strsplit(ensembl_trp_id, " ", fixed = TRUE)
+  # Convert single NA values to empty character vectors in the list-column.
+  dplyr::if_else(sapply(ensembl_trp_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_trp_id)
+}
+
+ensembl_prt_id_col <- function(ensembl_prt_id) {
+  ensembl_prt_id <- strsplit(ensembl_prt_id, " ", fixed = TRUE)
+  # Convert single NA values to empty character vectors in the list-column.
+  dplyr::if_else(sapply(ensembl_prt_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_prt_id)
+}
diff --git a/man/read_report.Rd b/man/read_report.Rd
index 2ed6e27..e6f5626 100644
--- a/man/read_report.Rd
+++ b/man/read_report.Rd
@@ -7,7 +7,8 @@
 read_report(
   report_file,
   report_type = c("MRK_List1", "MRK_List2", "MGI_MRK_Coord", "MGI_Gene_Model_Coord",
-    "MGI_GTGUP", "MRK_Sequence", "MRK_SwissProt_TrEMBL", "MRK_SwissProt", "MRK_GeneTrap"),
+    "MGI_GTGUP", "MRK_Sequence", "MRK_SwissProt_TrEMBL", "MRK_SwissProt", "MRK_GeneTrap",
+    "MRK_ENSEMBL"),
   n_max = Inf
 )
 }
@@ -53,4 +54,7 @@ read_report(file.path(base_url, "MRK_List1.rpt"), "MRK_List1", n_max = n)
 # MGI Marker associations to Gene Trap IDs
 # read_report(file.path(base_url, "MRK_GeneTrap.rpt"), "MRK_GeneTrap")
 
+# MGI Marker associations to Ensembl sequence information
+# read_report(file.path(base_url, "MRK_ENSEMBL.rpt"), "MRK_ENSEMBL")
+
 }