Skip to content

Commit

Permalink
read.modkit
Browse files Browse the repository at this point in the history
  • Loading branch information
yixuan-chen-elisa committed Nov 3, 2023
1 parent e10f4b9 commit 3c3e0f6
Show file tree
Hide file tree
Showing 9 changed files with 143 additions and 6 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: bsseq
Version: 1.39.0
Version: 1.37.1
Encoding: UTF-8
Title: Analyze, manage and store bisulfite sequencing data
Description: A collection of tools for analyzing and visualizing bisulfite
Expand Down Expand Up @@ -58,6 +58,7 @@ Collate:
read.bsmooth.R
read.bismark.R
read.modbam2bed.R
read.modkit.R
BSmooth.R
BSmooth.tstat.R
dmrFinder.R
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
"combineList", "strandCollapse",
"plotRegion", "plotManyRegions",
"read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
"read.modbam2bed",
"read.modbam2bed", "read.modkit",
"poissonGoodnessOfFit", "binomialGoodnessOfFit",
"data.frame2GRanges", "BSseqTstat",
"BSseqStat",
Expand Down
4 changes: 2 additions & 2 deletions R/BSseq-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ BSseq <- function(M = NULL, Cov = NULL, Filtered = NULL, coef = NULL, se.coef =

# Construct BSseq object ---------------------------------------------------

assays <- SimpleList(M = M, Cov = Cov, Filtered = Filtered, coef = coef,
se.coef = se.coef)
assays <- SimpleList(M = M, Cov = Cov, Filtered = Filtered,
coef = coef, se.coef = se.coef)
assays <- assays[!S4Vectors:::sapply_isNULL(assays)]
se <- SummarizedExperiment(
assays = assays,
Expand Down
78 changes: 78 additions & 0 deletions R/read.modkit.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
read.modkit = function(files,
colData = NULL,
rmZeroCov = FALSE,
strandCollapse = TRUE){
gr_list = list()
sampleNames = sub("\\.bed$", "", basename(files))
if (!is.null(colData)){
rownames(colData) <- sampleNames
}

for (i in seq_along(files)){
data = read.table(files[i], header = FALSE, sep="\t",
stringsAsFactors=FALSE, quote="")

gr = GRanges(seqnames = data$V1,
ranges = IRanges(start = data$V2+1, end = data$V3))
mcols(gr)$m = data$V12
mcols(gr)$cov = data$V12 + data$V13
mcols(gr)$filter = data$V16
if (any(data$V14 != 0)){
mcols(gr)$other_mod = data$V14
mcols(gr)$cov_other_mod = data$V14 + data$V13}
names(gr) = sampleNames[i]
gr_list[[sampleNames[i]]] = gr
}

overlap_gr = Reduce(subsetByOverlaps, gr_list)

m_cov_list = lapply(gr_list, function(gr){
overlap_data = gr[gr %over% overlap_gr]
if (!is.null(gr$other_mod)) {
data.frame(m = overlap_data$m, cov = overlap_data$cov,
filter = overlap_data$filter,
other_mod = overlap_data$other_mod,
cov_other_mod = overlap_data$cov_other_mod)
} else {
data.frame(m = overlap_data$m, cov = overlap_data$cov,
filter = overlap_data$filter)
}
})

m = do.call(cbind, lapply(m_cov_list, `[[`, "m"))
cov = do.call(cbind, lapply(m_cov_list, `[[`, "cov"))
filter = do.call(cbind, lapply(m_cov_list, `[[`, "filter"))
other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "other_mod"))
cov_other_mod = do.call(cbind, lapply(m_cov_list, `[[`, "cov_other_mod"))

bsseq_objs = list()
bsseq_obj_m = BSseq(M = as.matrix(m), Cov = as.matrix(cov),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)

bsseq_objs[[1]] = bsseq_obj_m

if (!is.null(other_mod)){
bsseq_obj_other_mod = BSseq(M = as.matrix(other_mod),
Cov = as.matrix(cov_other_mod),
Filtered = as.matrix(filter),
coef = NULL, se.coef = NULL,
pos = start(overlap_gr), trans = NULL,
parameters = NULL, pData = colData, gr = NULL,
chr = as.vector(seqnames(overlap_gr)),
sampleNames = sampleNames, rmZeroCov = rmZeroCov)

bsseq_objs[[2]] = bsseq_obj_other_mod
}

if (strandCollapse){
for (i in 1:length(bsseq_objs)) {
bsseq:::strandCollapse(bsseq_objs[[i]])}
}

return(bsseq_objs)
}
Binary file added inst/extdata/modkit/blood.cpg.chr22.bed.gz
Binary file not shown.
Binary file not shown.
2 changes: 0 additions & 2 deletions man/read.modbam2bed.Rd
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read.modbam2bed.R
\name{read.modbam2bed}
\alias{read.modbam2bed}
\title{Construct BSseq objects from nanopore BED files}
Expand Down
47 changes: 47 additions & 0 deletions man/read.modkit.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
\name{read.modkit}
\alias{read.modkit}
\title{Construct BSseq objects from nanopore BED files}
\description{
Construct BSseq objects from nanopore BED files
}
\usage{
read.modkit(
files,
colData = NULL,
rmZeroCov = FALSE,
strandCollapse = TRUE
)
}
\arguments{
\item{files}{vector, BED files}
\item{colData}{data frame, phenotypic data with samples as rows and variables as columns}
\item{rmZeroCov}{A logical (1) indicating whether methylation loci that have zero coverage in all samples be removed}
\item{strandCollapse}{A logical (1) indicating whether stand-symmetric methylation loci (i.e. CpGs) should be collapsed across strands}
}

\value{
BSseq objects
}
\details{
This function reads in nanopore sequencing modified BED files
to Bsseq objects. Nanopore sequencing data (i.e. aggregated modified base
counts) is stored in modified-base BAM files. These modified-base BAM files
are converted to bedMethyl (BED) files using \href{https://github.com/nanoporetech/modkit}{modkit}.

\subsection{Details for modkit}{
Modkit outputs modified reads, unmodified reads, ambiguous modification reads (reads where the probability was below the threshold and usually failing the lowest 10th percentile), and other modified reads.}

\subsection{modkit to Bsseq object}{
After creating BED files using modkit, the BED files are read in and the Bsseq object is constructed via \code{read.modkit()} function. The function reads in BED files, extract genomic regions, methylation, coverage, ambiguous modification status data and sample information and then construct Bsseq object using \code{BSseq} function within the package. If there are other modified reads, two Bsseq objects are constructed with the first one using major modified reads and their corresponding coverage and the second one using the other modified reads and their corresponding coverage.}
}

\examples{
# only one modification and one BSseq object is constructed.
files <- c(system.file("extdata/modkit/blood.cpg.chr22.bed.gz", package = "bsseq"))
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=FALSE)

# there is other modification, so two BSseq objects are constructed.
files <- c(system.file("extdata/modkit/blood.cpg.other_mod.chr22.bed.gz",
package = "bsseq"))
bsseq_nano <- bsseq::read.modkit(files, rmZeroCov = FALSE, strandCollapse=TRUE)
}
13 changes: 13 additions & 0 deletions tests/testthat/test_read.modkit.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
context("read.modkit")

# TODO: Re-factor read.modkit() and update tests accordingly
test_that("read.modkit() works for BED files", {
infile <- system.file("extdata", "modkit/blood.cpg.chr22.bed.gz",
package = "bsseq")
bsseq <- read.modkit(files = infile,
colData = NULL,
rmZeroCov = FALSE,
strandCollapse = TRUE)

lapply(bsseq, function(x) {expect_is(x, "BSseq")})
})

0 comments on commit 3c3e0f6

Please sign in to comment.