Skip to content

Commit

Permalink
Add reader MRK_ENSEMBL
Browse files Browse the repository at this point in the history
  • Loading branch information
ramiromagno committed Jun 24, 2024
1 parent bda4b14 commit 806f9d5
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 4 deletions.
100 changes: 98 additions & 2 deletions R/read_report.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
#' # MGI Marker associations to Gene Trap IDs
#' # read_report(file.path(base_url, "MRK_GeneTrap.rpt"), "MRK_GeneTrap")
#'
#' # MGI Marker associations to Ensembl sequence information
#' # read_report(file.path(base_url, "MRK_ENSEMBL.rpt"), "MRK_ENSEMBL")
#'
#' @returns A [tibble][tibble::tibble-package] with the report data in tidy
#' format.
#'
Expand All @@ -47,7 +50,8 @@ read_report <- function(report_file,
"MRK_Sequence",
"MRK_SwissProt_TrEMBL",
"MRK_SwissProt",
"MRK_GeneTrap"),
"MRK_GeneTrap",
"MRK_ENSEMBL"),
n_max = Inf) {

report_type <- match.arg(report_type)
Expand All @@ -58,7 +62,8 @@ read_report <- function(report_file,
MRK_Sequence = read_mrk_sequence_rpt,
MRK_SwissProt_TrEMBL = read_mrk_swissprot_tr_embl_rpt,
MRK_SwissProt = read_mrk_swissprot_rpt,
MRK_GeneTrap = read_mrk_genetrap_rpt)
MRK_GeneTrap = read_mrk_genetrap_rpt,
MRK_ENSEMBL = read_mrk_ensembl_rpt)

read[[report_type]](file = report_file, n_max = n_max)
}
Expand All @@ -81,6 +86,45 @@ read_tsv <- function(file,

}

# A sort of drop-in replacement of `read_tsv()` which is backed up by
# `vroom::vroom()`, while `read_tsv2()` is backed up `data.table::fread()`which
# has the useful `fill` parameter for when we have missing columns.
read_tsv2 <- function(file,
col_names,
col_types = "c",
skip = 1L,
n_max = Inf,
na = c("null", "NULL", "N/A", "")) {

col_types_mapping <- c(
`c` = "character",
`i` = "integer",
`n` = "numeric",
`d` = "numeric",
`l` = "logical",
`f` = "factor",
`D` = "Date",
`-` = "NULL"
)

col_types <- unlist(strsplit(col_types, split = ""))
col_classes <- unname(col_types_mapping[col_types])

data.table::fread(
input = file,
sep = "\t",
col.names = col_names,
colClasses = col_classes,
# header = TRUE,
nrows = n_max,
na.strings = na,
fill = TRUE,
showProgress = FALSE
) |>
tibble::as_tibble()

}

#' Read a marker list report
#'
#' [read_mrk_list_rpt()] imports either a `MRK_List1.rpt` or a `MRK_List2.rpt`
Expand Down Expand Up @@ -490,3 +534,55 @@ read_mrk_genetrap_rpt <- function(file, n_max = Inf) {
.data$cell_line
)
}

read_mrk_ensembl_rpt <- function(file, n_max = Inf) {
col_names <-
c(
"marker_id",
"marker_symbol",
"marker_name",
"cM_pos",
"chr",
"ensembl_id",
"ensembl_trp_id",
"ensembl_prt_id",
"feature_type",
"start",
"end",
"strand",
"biotype"
)

col_types <- "ccccccccciicc"
# Import data
read_tsv2(
file = file,
col_names = col_names,
col_types = col_types,
n_max = n_max
) |>
dplyr::mutate(
cM_pos = cM_pos_col(.data$cM_pos),
chr = chr_col(.data$chr),
strand = strand_col(.data$strand),
feature_type = special_feature_type_col(.data$feature_type),
biotype = biotype_col(.data$biotype),
ensembl_trp_id = ensembl_trp_id_col(.data$ensembl_trp_id),
ensembl_prt_id = ensembl_trp_id_col(.data$ensembl_prt_id)
) |>
dplyr::relocate(
.data$marker_id,
.data$marker_symbol,
.data$marker_name,
.data$cM_pos,
.data$chr,
.data$start,
.data$end,
.data$strand,
.data$ensembl_id,
.data$ensembl_trp_id,
.data$ensembl_prt_id,
.data$feature_type,
.data$biotype
)
}
33 changes: 32 additions & 1 deletion R/report_column_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ chr_col <- function(chr) {
}

cM_pos_col <- function(cM_pos) {
as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A"), NA_character_, cM_pos))
as.double(dplyr::if_else(cM_pos %in% c("syntenic", "N/A", "-1.0"), NA_character_, cM_pos))
}

status_col <- function(status) {
Expand Down Expand Up @@ -50,3 +50,34 @@ cell_line_col <- function(cell_line) {
# Convert single NA values to empty character vectors in the list-column.
dplyr::if_else(sapply(cell_line, \(x) length(x) == 1L && is.na(x)), list(character()), cell_line)
}

special_feature_type_col <- function(feature_type) {

# In cases like:
# "lncRNA gene|lncRNA gene|lncRNA gene"
# keep only one instance. This assumes the values pipe-separated are repeated,
# this is the case in MRK_ENSEMBL.rpt.
feature_type <- sub("\\|.+", "", feature_type)

factor(feature_type, levels = feature_types$feature_type)
}

biotype_col <- function(biotype) {
biotype <- strsplit(biotype, "|", fixed = TRUE)
# Convert single NA values to empty character vectors in the list-column.
dplyr::if_else(sapply(biotype, \(x) length(x) == 1L && is.na(x)), list(character()), biotype) |>
sapply(\(x) unique(x))
}


ensembl_trp_id_col <- function(ensembl_trp_id) {
ensembl_trp_id <- strsplit(ensembl_trp_id, " ", fixed = TRUE)
# Convert single NA values to empty character vectors in the list-column.
dplyr::if_else(sapply(ensembl_trp_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_trp_id)
}

ensembl_prt_id_col <- function(ensembl_prt_id) {
ensembl_prt_id <- strsplit(ensembl_prt_id, " ", fixed = TRUE)
# Convert single NA values to empty character vectors in the list-column.
dplyr::if_else(sapply(ensembl_prt_id, \(x) length(x) == 1L && is.na(x)), list(character()), ensembl_prt_id)
}
6 changes: 5 additions & 1 deletion man/read_report.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 806f9d5

Please sign in to comment.