Merge pull request #132 from yixuan-chen-elisa/update-test-data

update modbam2bed test data & NEWS.Rd
hansenlab · Nov 27, 2023 · f2a24de · f2a24de
2 parents 161f323 + 46f5ffa
commit f2a24de
Show file tree

Hide file tree

Showing 18 changed files with 139 additions and 131 deletions.
diff --git a/R/read.modbam2bed.R b/R/read.modbam2bed.R
@@ -1,44 +1,45 @@
 read.modbam2bed <- function(files, colData = NULL, rmZeroCov = FALSE,
                             strandCollapse = TRUE) {
-  gr_list <- list()
-  sampleNames <- sub("\\.bed$","",basename(files))
-  if (!is.null(colData)) {
-      rownames(colData) <- sampleNames
-  }
+    gr_list <- list()
+    sampleNames <- sub("\\.bed$","",basename(files))
+    if (!is.null(colData)) {
+        rownames(colData) <- sampleNames
+        }
 
-  for (i in seq_along(files)) {
-      data <- read.table(files[i],header = FALSE, sep="\t",
-                         stringsAsFactors=FALSE, quote="")
-      gr <- GRanges(seqnames = data$V1,
-                    ranges = IRanges(start = data$V2+1, end = data$V3))
-      mcols(gr)$m <- data$V13
-      mcols(gr)$cov <- data$V12 + data$V13
-      mcols(gr)$filter <- data$V14
-      names(gr) <- sampleNames[i]
-      gr_list[[sampleNames[i]]] <- gr
-  }
+    for (i in seq_along(files)) {
+        data <- read.table(files[i],header = FALSE, sep="\t",
+                    stringsAsFactors=FALSE, quote="")
+        gr <- GRanges(seqnames = data$V1,
+            ranges = IRanges(start = data$V2+1, end = data$V3))
+        mcols(gr)$m <- data$V13
+        mcols(gr)$cov <- data$V12 + data$V13
+        mcols(gr)$filter <- data$V14
+        names(gr) <- sampleNames[i]
+        gr_list[[sampleNames[i]]] <- gr
+        }
 
-  overlap_gr <- Reduce(subsetByOverlaps, gr_list)
-
-  m_cov_list <- lapply(gr_list, function(gr) {
-      overlap_data = gr[gr %over% overlap_gr]
-      data.frame(m = overlap_data$m, cov = overlap_data$cov,
-                 filter = overlap_data$filter)})
+    overlap_gr <- Reduce(subsetByOverlaps, gr_list)
 
-  m <- do.call(cbind,lapply(m_cov_list,`[[`, "m"))
-  cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
-  filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
+    m_cov_list <- lapply(gr_list, function(gr) {
+        overlap_data <- gr[gr %over% overlap_gr]
+        data.frame(m = overlap_data$m, cov = overlap_data$cov,
+            filter = overlap_data$filter)})
 
-  bsseq_obj <- BSseq(M = as.matrix(m), Cov = as.matrix(cov),
+    m <- do.call(cbind,lapply(m_cov_list,`[[`, "m"))
+    cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
+    filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
+
+    bsseq_obj <- BSseq(M = as.matrix(m), Cov = as.matrix(cov),
                      Filtered = as.matrix(filter),
                      coef = NULL,se.coef = NULL,
                      pos = start(overlap_gr),trans = NULL,
                      parameters = NULL, pData = colData, gr = NULL,
                      chr = as.vector(seqnames(overlap_gr)),
                      sampleNames = sampleNames,rmZeroCov = rmZeroCov)
 
-  if (strandCollapse) {
-      strandCollapse(bsseq_obj)
-  }
-  return(bsseq_obj)
+    if (strandCollapse) {
+        strandCollapse(bsseq_obj)
+    }
+
+    return(bsseq_obj)
 }
diff --git a/R/read.modkit.R b/R/read.modkit.R
@@ -1,78 +1,78 @@
-read.modkit = function(files,
-                       colData = NULL,
-                       rmZeroCov = FALSE,
-                       strandCollapse = TRUE){
-  gr_list = list()
-  sampleNames = sub("\\.bed$", "", basename(files))
-  if (!is.null(colData)){
-    rownames(colData) <- sampleNames
-  }
+read.modkit <- function(files,
+                        colData = NULL,
+                        rmZeroCov = FALSE,
+                        strandCollapse = TRUE){
+    gr_list <- list()
+    sampleNames <- sub("\\.bed$", "", basename(files))
+    if (!is.null(colData)){
+        rownames(colData) <- sampleNames
+        }
 
-  for (i in seq_along(files)){
-    data = read.table(files[i], header = FALSE, sep="\t",
-                      stringsAsFactors=FALSE, quote="")
+    for (i in seq_along(files)){
+        data <- read.table(files[i], header = FALSE, sep="\t",
+                    stringsAsFactors=FALSE, quote="")
+        gr <- GRanges(seqnames = data$V1,
+                  ranges = IRanges(start = data$V2+1, end = data$V3))
+        mcols(gr)$m <- data$V12
+        mcols(gr)$cov <- data$V12 + data$V13
+        mcols(gr)$filter <- data$V16
+        if (any(data$V14 != 0)){
+            mcols(gr)$other_mod <- data$V14
+            mcols(gr)$cov_other_mod <- data$V14 + data$V13
+            }
+        names(gr) <- sampleNames[i]
+        gr_list[[sampleNames[i]]] <- gr
+        }
 
-    gr = GRanges(seqnames = data$V1,
-                 ranges = IRanges(start = data$V2+1, end = data$V3))
-    mcols(gr)$m = data$V12
-    mcols(gr)$cov = data$V12 + data$V13
-    mcols(gr)$filter = data$V16
-    if (any(data$V14 != 0)){
-        mcols(gr)$other_mod = data$V14
-        mcols(gr)$cov_other_mod = data$V14 + data$V13}
-    names(gr) = sampleNames[i]
-    gr_list[[sampleNames[i]]] = gr
-  }
+    overlap_gr <- Reduce(subsetByOverlaps, gr_list)
 
-  overlap_gr = Reduce(subsetByOverlaps, gr_list)
+    m_cov_list <- lapply(gr_list, function(gr){
+        overlap_data <- gr[gr %over% overlap_gr]
+        if (!is.null(gr$other_mod)) {
+            data.frame(m = overlap_data$m, cov = overlap_data$cov,
+                filter = overlap_data$filter,
+                other_mod = overlap_data$other_mod,
+                cov_other_mod = overlap_data$cov_other_mod)
+            } else {
+                data.frame(m = overlap_data$m, cov = overlap_data$cov,
+                    filter = overlap_data$filter)
+                }
+        })
 
-  m_cov_list = lapply(gr_list, function(gr){
-    overlap_data = gr[gr %over% overlap_gr]
-    if (!is.null(gr$other_mod)) {
-      data.frame(m = overlap_data$m, cov = overlap_data$cov,
-                 filter = overlap_data$filter,
-                 other_mod = overlap_data$other_mod,
-                 cov_other_mod = overlap_data$cov_other_mod)
-      } else {
-      data.frame(m = overlap_data$m, cov = overlap_data$cov,
-                 filter = overlap_data$filter)
-      }
-    })
+    m <- do.call(cbind, lapply(m_cov_list, `[[`, "m"))
+    cov <- do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
+    filter <- do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
+    other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "other_mod"))
+    cov_other_mod <- do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod"))
 
-  m = do.call(cbind, lapply(m_cov_list, `[[`, "m"))
-  cov = do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
-  filter = do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
-  other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "other_mod"))
-  cov_other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod"))
+    bsseq_objs <- list()
+    bsseq_obj_m <- BSseq(M = as.matrix(m), Cov = as.matrix(cov),
+                      Filtered = as.matrix(filter),
+                      coef = NULL, se.coef = NULL,
+                      pos = start(overlap_gr), trans = NULL,
+                      parameters = NULL, pData = colData, gr = NULL,
+                      chr = as.vector(seqnames(overlap_gr)),
+                      sampleNames = sampleNames, rmZeroCov = rmZeroCov)
 
-  bsseq_objs = list()
-  bsseq_obj_m = BSseq(M = as.matrix(m), Cov = as.matrix(cov),
-                    Filtered = as.matrix(filter),
-                    coef = NULL, se.coef = NULL,
-                    pos = start(overlap_gr), trans = NULL,
-                    parameters = NULL, pData = colData, gr = NULL,
-                    chr = as.vector(seqnames(overlap_gr)),
-                    sampleNames = sampleNames, rmZeroCov = rmZeroCov)
+    bsseq_objs[[1]] <- bsseq_obj_m
 
-  bsseq_objs[[1]] = bsseq_obj_m
+    if (!is.null(other_mod)){
+        bsseq_obj_other_mod <- BSseq(M = as.matrix(other_mod),
+                                   Cov = as.matrix(cov_other_mod),
+                                   Filtered = as.matrix(filter),
+                                   coef = NULL, se.coef = NULL,
+                                   pos = start(overlap_gr), trans = NULL,
+                                   parameters = NULL, pData = colData, gr = NULL,
+                                   chr = as.vector(seqnames(overlap_gr)),
+                                   sampleNames = sampleNames, rmZeroCov = rmZeroCov)
 
-  if (!is.null(other_mod)){
-      bsseq_obj_other_mod = BSseq(M = as.matrix(other_mod),
-                                  Cov = as.matrix(cov_other_mod),
-                                  Filtered = as.matrix(filter),
-                                  coef = NULL, se.coef = NULL,
-                                  pos = start(overlap_gr), trans = NULL,
-                                  parameters = NULL, pData = colData, gr = NULL,
-                                  chr = as.vector(seqnames(overlap_gr)),
-                                  sampleNames = sampleNames, rmZeroCov = rmZeroCov)
+        bsseq_objs[[2]] <- bsseq_obj_other_mod
+        }
 
-      bsseq_objs[[2]] = bsseq_obj_other_mod
-    }
+    if (strandCollapse){
+        for (i in seq_along(bsseq_objs)) {
+            bsseq:::strandCollapse(bsseq_objs[[i]])}
+        }
 
-  if (strandCollapse){
-      for (i in 1:length(bsseq_objs)) {
-      bsseq:::strandCollapse(bsseq_objs[[i]])}
-  }
-
-  return(bsseq_objs)
+    return(bsseq_objs)
 }
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -8,9 +8,16 @@
       years. }
     \item{Minor code and documentation chnages. Fixed usage of
       \code{class(x) == "y"} and added examples to some man pages.}
+    \item{\code{Bsseq()} is modified to store filtered matrix (ambigious modification         status) for nanopore data.
+         }
+    \item{New function \code{read.modkit()} to read in methyl bed files generated             from modkit and construct Bsseq objects. When there is no other                         modifications, one Bsseq object is constructed, and when there is other                 modification present, its methylation and corresponding coverage are also stored
+      and two Bsseq objects are constructed.
+          }
+    \item{New function \code{read.modbam2bed()} to read in, extract methylation,              coverage and filtered data and create Bsseq object from methyl bed files                obtained by modbam2bed. Some instructions for using modbam2bed is also provided.
+         }
   }
 }
-      
+
 
 \section{Version 1.17.x}{
   \itemize{

diff --git a/inst/extdata/modbam2bed/Ctr1.bed.gz b/inst/extdata/modbam2bed/Ctr1.bed.gz
diff --git a/inst/extdata/modbam2bed/Ctr2.bed.gz b/inst/extdata/modbam2bed/Ctr2.bed.gz
diff --git a/inst/extdata/modbam2bed/Ctr3.bed.gz b/inst/extdata/modbam2bed/Ctr3.bed.gz
diff --git a/inst/extdata/modbam2bed/Tret1.bed.gz b/inst/extdata/modbam2bed/Tret1.bed.gz
diff --git a/inst/extdata/modbam2bed/Tret2.bed.gz b/inst/extdata/modbam2bed/Tret2.bed.gz
diff --git a/inst/extdata/modbam2bed/Tret3.bed.gz b/inst/extdata/modbam2bed/Tret3.bed.gz
diff --git a/inst/extdata/modbam2bed/ctr1.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/ctr1.chr10.chr11.bed.gz
diff --git a/inst/extdata/modbam2bed/ctr2.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/ctr2.chr10.chr11.bed.gz
diff --git a/inst/extdata/modbam2bed/ctr3.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/ctr3.chr10.chr11.bed.gz
diff --git a/inst/extdata/modbam2bed/tret1.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/tret1.chr10.chr11.bed.gz
diff --git a/inst/extdata/modbam2bed/tret2.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/tret2.chr10.chr11.bed.gz
diff --git a/inst/extdata/modbam2bed/tret3.chr10.chr11.bed.gz b/inst/extdata/modbam2bed/tret3.chr10.chr11.bed.gz
diff --git a/man/read.modbam2bed.Rd b/man/read.modbam2bed.Rd
@@ -6,17 +6,17 @@ Construct BSseq objects from nanopore BED files
 }
 \usage{
 read.modbam2bed(
-  files,
-  colData = NULL,
-  rmZeroCov = FALSE,
-  strandCollapse = TRUE
+    files,
+    colData = NULL,
+    rmZeroCov = FALSE,
+    strandCollapse = TRUE
 )
 }
 \arguments{
-  \item{files}{vector, BED files}
-  \item{colData}{data frame, phenotypic data with samples as rows and variables as columns}
-  \item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed}
-  \item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands}
+    \item{files}{vector, BED files}
+    \item{colData}{data frame, phenotypic data with samples as rows and variables as columns}
+    \item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed}
+    \item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands}
 }
 
 \value{
@@ -29,30 +29,30 @@ counts) is stored in modified-base BAM files. These modified-base BAM files
 are converted to bedMethyl (BED) files using \href{https://github.com/epi2me-labs/modbam2bed}{modbam2bed}.
 
 \subsection{Details for using modbam2bed}{
-  After installing modbam2bed, a conda environment is activated. Index files
+    After installing modbam2bed, a conda environment is activated. Index files
 for BAM files are created using \code{samtools index}. The code requires aligned reads with the Mm and Ml tags (MM and ML also supported), and the reference sequence used for alignment (<reference.fasta>).
 
 \itemize{
-  \item \code{-e, -- extended} to output canonical, modified, and filtered bases;
-  \item \code{-m, -- mod_base=BASE} to output modified base of interest, one of: 5mC, 5hmC, 5fC, 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, modC, modG, modT, modU, modN for generic modified base);
-  \item \code{-r, --region=chr:start-end} to output chromosome or genomic region of interest;
-  \item \code{-f, --threshold=THRESHOLD} to output filtered bases for
+    \item \code{-e, -- extended} to output canonical, modified, and filtered bases;
+    \item \code{-m, -- mod_base=BASE} to output modified base of interest, one of: 5mC, 5hmC, 5fC, 5caC, 5hmU, 5fU, 5caU, 6mA, 5oxoG, Xao. (Or modA, modC, modG, modT, modU, modN for generic modified base);
+    \item \code{-r, --region=chr:start-end} to output chromosome or genomic region of interest;
+    \item \code{-f, --threshold=THRESHOLD} to output filtered bases for
 probability lower than threshold (default = 0.66)
 }
 }
 
   \subsection{modbam2bed to Bsseq object}{
-  After creating BED files using modbam2bed, the BED files are read in and the Bsseq object is constructed via \code{read.modbam2bed()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package.}
+    After creating BED files using modbam2bed, the BED files are read in and the Bsseq object is constructed via \code{read.modbam2bed()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package.}
 }
 \examples{
-files <- c(system.file("extdata/modbam2bed/Ctr1.bed.gz", package = "bsseq"),
-           system.file("extdata/modbam2bed/Ctr2.bed.gz", package = "bsseq"),
-           system.file("extdata/modbam2bed/Ctr3.bed.gz", package = "bsseq"),
-           system.file("extdata/modbam2bed/Tret1.bed.gz", package = "bsseq"),
-           system.file("extdata/modbam2bed/Tret2.bed.gz", package = "bsseq"),
-           system.file("extdata/modbam2bed/Tret3.bed.gz", package = "bsseq"))
+files <- c(system.file("extdata/modbam2bed/ctr1.chr10.chr11.bed.gz", package = "bsseq"),
+    system.file("extdata/modbam2bed/ctr2.chr10.chr11.bed.gz", package = "bsseq"),
+    system.file("extdata/modbam2bed/ctr3.chr10.chr11.bed.gz", package = "bsseq"),
+    system.file("extdata/modbam2bed/tret1.chr10.chr11.bed.gz", package = "bsseq"),
+    system.file("extdata/modbam2bed/tret2.chr10.chr11.bed.gz", package = "bsseq"),
+    system.file("extdata/modbam2bed/tret3.chr10.chr11.bed.gz", package = "bsseq"))
 pd <- data.frame(condition = rep(c("control", "treatment"), each = 3),
-                 replicate = rep(c("rep1", "rep2", "rep3"), times = 2))
+    replicate = rep(c("rep1", "rep2", "rep3"), times = 2))
 bsseq_nano <- bsseq::read.modbam2bed(files,colData=pd,rmZeroCov = FALSE,
-                                     strandCollapse=TRUE)
+    strandCollapse=TRUE)
 }
diff --git a/man/read.modkit.Rd b/man/read.modkit.Rd
@@ -6,17 +6,17 @@ Construct BSseq objects from nanopore BED files
 }
 \usage{
 read.modkit(
-  files,
-  colData = NULL,
-  rmZeroCov = FALSE,
-  strandCollapse = TRUE
+    files,
+    colData = NULL,
+    rmZeroCov = FALSE,
+    strandCollapse = TRUE
 )
 }
 \arguments{
-  \item{files}{vector, BED files}
-  \item{colData}{data frame, phenotypic data with samples as rows and variables as columns}
-  \item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed}
-  \item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands}
+    \item{files}{vector, BED files}
+    \item{colData}{data frame, phenotypic data with samples as rows and variables as columns}
+    \item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed}
+    \item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands}
 }
 
 \value{
@@ -29,10 +29,10 @@ counts) is stored in modified-base BAM files. These modified-base BAM files
 are converted to bedMethyl (BED) files using \href{https://github.com/nanoporetech/modkit}{modkit}.
 
 \subsection{Details for modkit}{
-  Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.}
+    Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.}
 
 \subsection{modkit to Bsseq object}{
-  After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.}
+    After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.}
 }
 
 \examples{

diff --git a/tests/testthat/test_read.modbam2bed.R b/tests/testthat/test_read.modbam2bed.R
@@ -2,7 +2,7 @@ context("read.modbam2bed")
 
 # TODO: Re-factor read.modbam2bed() and update tests accordingly
 test_that("read.modbam2bed() works for BED files", {
-  infile <- system.file("extdata", "modbam2bed/Ctr1.bed.gz",package = "bsseq")
+  infile <- system.file("extdata", "modbam2bed/ctr1.chr10.chr11.bed.gz",package = "bsseq")
   bsseq <- read.modbam2bed(files = infile,
                         colData = NULL,
                         rmZeroCov = FALSE,