diff --git a/R/methylDBClasses.R b/R/methylDBClasses.R index 40873fa6..bec1f91f 100644 --- a/R/methylDBClasses.R +++ b/R/methylDBClasses.R @@ -276,61 +276,88 @@ makeMethylRawDB<-function(df,dbpath,dbtype, # creates a methylRawDB object from a flat file database # it is called from read function or whenever this functionality is needed # can load a database with or without header -readMethylRawDB<-function(dbpath,dbtype=NULL, - sample.id=NULL, assembly=NULL ,context=NULL, - resolution=NULL,skip=0){ - - if(!file.exists(paste0(dbpath,".tbi"))) - { - Rsamtools::indexTabix(dbpath,seq=1, start=2, end=3, - skip=skip, comment="#", zeroBased=FALSE) +readMethylRawDB <- function(dbpath, + dbtype = NULL, + sample.id = NULL, + assembly = NULL, + context = NULL, + resolution = NULL) { + + if(!is.character(dbpath)) { + stop("'dbpath' must be a character string") } - - # if the tabix file includes a header generated with obj2tabix, - # it can easily be parsed into the respective object - head <- checkTabixHeader(tbxFile = dbpath, - message = paste("No Tabix Header Found,", - "trying to create methylRawDB from supplied arguments.")) - if(!is.null(head) & !anyNA(head)) { + if (dbpath == "") { + stop("No tabix file specified.") + } + # check if tabix file has header + header <- readTabixHeader(dbpath) + # parse header fields + parsedHeader <- parseTabixHeader(header) - - if(! any(head$class == c("methylRaw", "methylRawDB") ) ) { + if (!is.null(parsedHeader)) { + if (!any(parsedHeader$class == c("methylRaw", "methylRawDB"))) { stop( - paste("Tabix file does not originate from methylRaw or methylRawDB.\n", - "Please provide the correct class of data.") + paste( + "Tabix file does not originate from methylRaw or methylRawDB.\n", + "Please provide the correct class of data." + ) ) } - - if(is.null(head$num.records)) { - num.records=Rsamtools::countTabix(dbpath)[[1]] - } else num.records = head$num.records - obj <- new("methylRawDB", dbpath=normalizePath(dbpath), - num.records=num.records, sample.id = head$sample.ids, - assembly = head$assembly,context=head$context, - resolution=head$resolution,dbtype=head$dbtype) + if(is.null(parsedHeader$num.records)) { + parsedHeader$num.records <- NA_integer_ + } + + # arguments are extracted from parsedHeader + argList <- list( + num.records = parsedHeader$num.records, + sample.id = parsedHeader$sample.ids, + assembly = parsedHeader$assembly, + context = parsedHeader$context, + resolution = parsedHeader$resolution, + dbtype = parsedHeader$dbtype + ) } else { - + message( + paste( + "No compatible Tabix Header Found,", + "Using provided arguments to import object." + ) + ) + # else we need to generate it from the supplied arguments - argList <- list(sample.id = sample.id, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype) - checkMissingArg <- sapply(argList,is.null) - if(any(checkMissingArg)) { - stop("Missing argument: ",paste(names(argList[checkMissingArg]),collapse = ", ")) - } - - num.records=Rsamtools::countTabix(dbpath)[[1]] ## - - obj <- new("methylRawDB",dbpath=normalizePath(dbpath),num.records=num.records, - sample.id = sample.id, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype) - + argList <- list( + num.records = NA_integer_, + sample.id = sample.id, + assembly = assembly, + context = context, + resolution = resolution, + dbtype = dbtype + ) + } + checkMissingArg <- sapply(argList, is.null) + if (any(checkMissingArg)) { + stop("Missing argument: ", + paste(names(argList[checkMissingArg]), collapse = ", ") + ) } - - if(valid.methylRawDB(obj)) {return(obj)} - + + if (is.null(argList$num.records) || is.na(argList$num.records)) { + argList$num.records <- Rsamtools::countTabix(dbpath)[[1]] + } + + obj <- do.call(new, + args = c( + "methylRawDB", + dbpath = normalizePath(dbpath), + argList + )) + + if (valid.methylRawDB(obj)) { + return(obj) + } } @@ -628,86 +655,105 @@ makeMethylBaseDB<-function(df,dbpath,dbtype, readMethylBaseDB <- function(dbpath, dbtype = NULL, sample.ids = NULL, - assembly = NULL , + assembly = NULL, context = NULL, resolution = NULL, treatment = NULL, - destranded = NULL, - skip = 0) { + destranded = NULL) { + if (!is.character(dbpath)) { + stop("'dbpath' must be a character string") + } - if(!file.exists(paste0(dbpath, ".tbi"))) - { - Rsamtools::indexTabix(dbpath, seq=1, start=2, end=3, - skip=skip, comment="#", zeroBased=FALSE) + if (dbpath == "") { + stop("No tabix file specified.") } + # check if tabix file has header + header <- readTabixHeader(dbpath) + # parse header fields + parsedHeader <- parseTabixHeader(header) - # if the tabix file includes a header generated with obj2tabix, - # it can easily be parsed into the respective object - head <- checkTabixHeader(tbxFile = dbpath, - message = paste("No Tabix Header Found,", - "\nCreating methylBaseDB using supplied arguments.")) - - # initiate variables that could be missing - num.records = NULL - coverage.ind = NULL - numCs.ind = NULL - numTs.ind = NULL - - if(!is.null(head) & !anyNA(head)) { - - if(! any(head$class == c("methylBase", "methylBaseDB") ) ) { - stop("Tabix file does not originate from methylBase or methylBaseDB.\nPlease provide the correct class of data.") + if (!is.null(parsedHeader)) { + if (!any(parsedHeader$class == c("methylBase", "methylBaseDB"))) { + stop( + "Tabix file does not originate from methylBase or methylBaseDB.\n", + "Please provide the correct class of data." + ) } - - - sample.ids = head$sample.ids - assembly = head$assembly - context = head$context - resolution = head$resolution - dbtype = head$dbtype - treatment = head$treatment - destranded = head$destranded - - num.records = head$num.records - - coverage.ind = head$coverage.index - numCs.ind = head$numCs.index - numTs.ind = head$numTs.index - + + if (is.null(parsedHeader$num.records)) { + parsedHeader$num.records <- NA_integer_ + } + + if (is.null(parsedHeader$coverage.index)) { + parsedHeader$coverage.index <- NA_integer_ + } + + if (is.null(parsedHeader$numCs.index)) { + parsedHeader$numCs.index <- NA_integer_ + } + + if (is.null(parsedHeader$numTs.index)) { + parsedHeader$numTs.index <- NA_integer_ + } + + argList <- list( + sample.ids = parsedHeader$sample.ids, + assembly = parsedHeader$assembly, + context = parsedHeader$context, + resolution = parsedHeader$resolution, + dbtype = parsedHeader$dbtype, + treatment = parsedHeader$treatment, + destranded = parsedHeader$destranded, + num.records = parsedHeader$num.records, + coverage.index = parsedHeader$coverage.index, + numCs.index = parsedHeader$numCs.index, + numTs.index = parsedHeader$numTs.index + ) } else { - + message( + paste( + "No compatible Tabix Header Found,", + "Using provided arguments to import object." + ) + ) + # else we need to generate it from the supplied arguments - argList <- list(sample.ids = sample.ids, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype,treatment=treatment, - destranded=destranded) - checkMissingArg <- sapply(argList,is.null) - if(any(checkMissingArg)) { - stop("Missing argument: ",paste(names(argList[checkMissingArg]),collapse = ", ")) - } - + argList <- list( + sample.ids = sample.ids, assembly = assembly, context = context, + resolution = resolution, dbtype = dbtype, treatment = treatment, + destranded = destranded + ) } - - if(is.null(num.records)) { - num.records=Rsamtools::countTabix(dbpath)[[1]] + + checkMissingArg <- sapply(argList, is.null) + if (any(checkMissingArg)) { + stop("Missing argument: ", paste(names(argList[checkMissingArg]), collapse = ", ")) } - - if(is.null(coverage.ind)) { + + if (is.null(argList$num.records) || is.na(argList$num.records)) { + argList$num.records <- Rsamtools::countTabix(dbpath)[[1]] + } + + if (is.null(argList$coverage.index)) { # determine postion of coverage/numCs/numTs indices df <- headTabix(dbpath) - numsamples = (length(df) - 4) / 3 - coverage.ind = seq(5, by = 3, length.out = numsamples) - numCs.ind = coverage.ind + 1 - numTs.ind = coverage.ind + 2 + numsamples <- (length(df) - 4) / 3 + argList$coverage.index <- seq(5, by = 3, length.out = numsamples) + argList$numCs.index <- argList$coverage.index + 1 + argList$numTs.index <- argList$coverage.ind + 2 + } + + + obj <- do.call(new, + args = c("methylBaseDB", + dbpath = normalizePath(dbpath), + argList + ) + ) + + if (valid.methylBaseDB(obj)) { + return(obj) } - - obj <- new("methylBaseDB",dbpath=normalizePath(dbpath),num.records=num.records, - sample.ids = sample.ids, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype,treatment=treatment, - coverage.index=coverage.ind,numCs.index=numCs.ind,numTs.index=numTs.ind, - destranded=destranded) - - if(valid.methylBaseDB(obj)) {return(obj)} - } @@ -854,60 +900,83 @@ makeMethylDiffDB<-function(df,dbpath,dbtype, # PRIVATE function: # reads a methylDiffDB object from flat file database # it is called from read function or whenever this functionality is needed -readMethylDiffDB<-function(dbpath,dbtype=NULL, - sample.ids=NULL, assembly=NULL ,context=NULL, - resolution=NULL,treatment=NULL,destranded=NULL,skip=0){ - - if(!file.exists(paste0(dbpath,".tbi"))) - { - Rsamtools::indexTabix(dbpath,seq=1, start=2, end=3, - skip=skip, comment="#", zeroBased=FALSE) +readMethylDiffDB <- function(dbpath, + dbtype = NULL, + sample.ids = NULL, + assembly = NULL, + context = NULL, + resolution = NULL, + treatment = NULL, + destranded = NULL) { + + if (!is.character(dbpath)) { + stop("'dbpath' must be a character string") } - # if the tabix file includes a header generated with obj2tabix, - # it can easily be parsed into the respective object - head <- checkTabixHeader(tbxFile = dbpath, - message = paste( - "No Tabix Header Found,", - "\nCreating methylDiffDB using supplied arguments.")) - - - - if(!is.null(head) & !anyNA(head)) { + if (dbpath == "") { + stop("No tabix file specified.") + } + # check if tabix file has header + header <- readTabixHeader(dbpath) + # parse header fields + parsedHeader <- parseTabixHeader(header) - if(! any(head$class == c("methylDiff", "methylDiffDB") ) ) { - stop("Tabix file does not originate from methylDiff or methylDiffDB.\nPlease provide the correct class of data.") + if (!is.null(parsedHeader)) { + if (!any(parsedHeader$class == c("methylDiff", "methylDiffDB"))) { + stop( + "Tabix file does not originate from methylDiff or methylDiffDB.\n", + "Please provide the correct class of data." + ) + } + + if (is.null(parsedHeader$num.records)) { + parsedHeader$num.records <- NA_integer_ } - - if(is.null(head$num.records)) { - num.records=Rsamtools::countTabix(dbpath)[[1]] - } else num.records = head$num.records - - obj <- new("methylDiffDB",dbpath=normalizePath(dbpath),num.records=num.records, - sample.ids = head$sample.ids, assembly = head$assembly,context=head$context, - resolution=head$resolution,dbtype=head$dbtype,treatment=head$treatment, - destranded=head$destranded) + # arguments are extracted from parsedHeader + argList <- list( + sample.ids = parsedHeader$sample.ids, + assembly = parsedHeader$assembly, + context = parsedHeader$context, + resolution = parsedHeader$resolution, + dbtype = parsedHeader$dbtype, + treatment = parsedHeader$treatment, + destranded = parsedHeader$destranded + ) } else { - + message( + paste( + "No compatible Tabix Header Found,", + "Using provided arguments to import object." + ) + ) + # else we need to generate it from the supplied arguments - argList <- list(sample.ids = sample.ids, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype,treatment=treatment, - destranded=destranded) - checkMissingArg <- sapply(argList,is.null) - if(any(checkMissingArg)) { - stop("Missing argument: ",paste(names(argList[checkMissingArg]),collapse = ", ")) - } - num.records=Rsamtools::countTabix(dbpath)[[1]] ## - - obj <- new("methylDiffDB",dbpath=normalizePath(dbpath),num.records=num.records, - sample.ids = sample.ids, assembly = assembly,context=context, - resolution=resolution,dbtype=dbtype,treatment=treatment, - destranded=destranded) + argList <- list( + sample.ids = sample.ids, assembly = assembly, context = context, + resolution = resolution, dbtype = dbtype, treatment = treatment, + destranded = destranded, num.records = NA_integer_ + ) + } + checkMissingArg <- sapply(argList, is.null) + if (any(checkMissingArg)) { + stop("Missing argument: ", paste(names(argList[checkMissingArg]), collapse = ", ")) + } + if (is.null(argList$num.records) || is.na(argList$num.records)) { + argList$num.records <- Rsamtools::countTabix(dbpath)[[1]] + } + + obj <- do.call(new, + args = c("methylDiffDB", + dbpath = normalizePath(dbpath), + argList + ) + ) + + if (valid.methylDiffDB(obj)) { + return(obj) } - - if(valid.methylDiffDB(obj)) {return(obj)} } # coercion functions ------------------------------------------------------ @@ -986,34 +1055,33 @@ setAs("methylDiffDB","methylDiff", function(from) #' @rdname readMethylDB-methods readMethylDB <- function(dbpath) { - if(!file.exists(dbpath)) - { - stop("Tabix File does not exist:", dbpath,"\nPlease provide a path to a valid tabix file.") - } - - if(!file.exists(paste0(dbpath,".tbi"))) - { - Rsamtools::indexTabix(dbpath,seq=1, start=2, end=3, - skip=0, comment="#", zeroBased=FALSE) - } - # if the tabix file includes a header generated with obj2tabix, # it can easily be parsed into the respective object - head <- checkTabixHeader(tbxFile = dbpath, - message = paste( - "No Tabix Header Found,", - "\nPlease provide tabix file with header created ", - "from methylKit version >=1.13.1.")) - - if(is.null(head)) stop("Stopping here.") - - if(! head$class %in% c("methylRaw", "methylRawDB", - "methylBase", "methylBaseDB", - "methylDiff", "methylDiffDB") ) { - stop("Unsupported class:", head$class,"\nPlease provide the correct class of data.") + header <- readTabixHeader(dbpath) + + if(is.null(header)) { + stop(paste( + "No Tabix Header Found,", + "\nPlease provide tabix file with header created ", + "from methylKit version >=1.13.1." + )) + } + + parsedHeader <- parseTabixHeader(header) + + supported.classes <- c("methylRaw", "methylRawDB", + "methylBase", "methylBaseDB", + "methylDiff", "methylDiffDB") + + if (!parsedHeader$class %in% supported.classes) { + stop( + "Unsupported class:", + parsedHeader$class, + "\nPlease provide the correct class of data." + ) } - switch (head$class, + switch (parsedHeader$class, methylRaw = readMethylRawDB(dbpath), methylRawDB = readMethylRawDB(dbpath), methylBase = readMethylBaseDB(dbpath), diff --git a/R/tabix.functions.R b/R/tabix.functions.R index b94a4a28..81adf49f 100644 --- a/R/tabix.functions.R +++ b/R/tabix.functions.R @@ -152,24 +152,44 @@ catsub2tabix<-function(dir,pattern,filename,tabixHead=NULL,sort=FALSE){ #' make tabix from a flat file where first 3 columns #' are chr,start,end in that order -#' +#' #' @param filepath path to the uncompressed file #' @param skip number of lines to skip #' @param rm.file remove the uncompressed text file (default: yes) -#' +#' #' @usage makeMethTabix(filepath,skip=0) #' @noRd -makeMethTabix<-function(filepath,skip=0,rm.file=TRUE){ +makeMethTabix <- function(filepath, skip = 0, rm.file = TRUE) { message("compressing the file with bgzip...") - zipped <- Rsamtools::bgzip(filepath,overwrite=TRUE) - - if(rm.file){file.remove(filepath)} - + zipped <- Rsamtools::bgzip(filepath, overwrite = TRUE) + + if (rm.file) { + file.remove(filepath) + } + message("making tabix index...") - Rsamtools::indexTabix(zipped, - seq=1, start=2, end=3, - skip=skip, comment="#", zeroBased=FALSE) - + indexMethTabix(zipped) +} + + +#' index a tabix file created with makeMethTabix +#' +#' @param tabixfile path to the tabix file +#' +#' @usage indexMethTabix(tabixfile) +#' @noRd +indexMethTabix <- function(tabixfile) { + # Use Rsamtools to create the index + Rsamtools::indexTabix( + tabixfile, + seq = 1, + start = 2, + end = 3, + skip = 0, + # tabixHeader is ignored for indexing + comment = "#", + zeroBased = FALSE + ) } @@ -273,25 +293,66 @@ obj2tabix <- function(obj,filename,rm.txt=TRUE){ append = TRUE) } -#' function to check wether tabix header exists -#' and exit with message instead of error +#' Read the header of a tabix file if it exists #' #' @param tbxFile tabix file -#' @param message text to print instead of default #' @noRd -checkTabixHeader <- function(tbxFile,message=NULL) { - if(is.null(message)) message <- paste("Could not read header of file",tbxFile) - tryCatch(expr = readTabixHeader(tbxFile), - error = function(cond){ - # message(cond) - message(message) - return(NULL) - }) - - +readTabixHeader <- function(tbxFile){ + if (is.null(checkTabixFile(tbxFile))) { + return(invisible(NULL)) + } + + header <- tryCatch( + expr = Rsamtools::headerTabix(tbxFile)$header, + error = function(cond) { + message(cond) + return(invisible(NULL)) + } + ) + if (length(header) == 0) { + message(paste("The Tabix File:", tbxFile, "does not include a header.\n")) + return(invisible(NULL)) + } else { + return(header) + } +} + + +#' This function checks a tabix. It will verify the file and check if the +#' associated index exist. If the index cannot be found, the file will be reindexed. +#' +#' @return the path to the tabix file or NULL if the file does not exist +#' +#' @param tbxFile tabix file +#' @param createIndex should the index be created if it does not exist +#' @noRd +checkTabixFile <- function(tbxFile, createIndex = TRUE) { + + if(!is.character(tbxFile)) { + stop("tbxFile must be a character string") + } + + if (tbxFile == "") { + stop("No tabix file specified.") + } + if (!file.exists(tbxFile)) { + message(paste0("Tabix file '", tbxFile, "' does not exist.")) + return(invisible(NULL)) + } + if (!file.exists(paste0(tbxFile, ".tbi"))) { + if (createIndex && (file.access(dirname(tbxFile)) == 0)) { + message("creating tabix index") + indexMethTabix(tbxFile) + } else { + message(paste("Tabix index file", paste0(tbxFile, ".tbi"), "does not exist.")) + return(invisible(NULL)) + } + } + return(invisible(tbxFile)) } + #' function to create a tabix header from methylKit object's slots #' #' @param slotList list of slot items with respective values @@ -366,36 +427,33 @@ writeTabixHeader <- function(obj,tabixHead,filename) { #' function to parse the information stored in the tabix header #' -#' @param tbxFile tabix file +#' @param header tabix header as read by readTabixHeader #' @noRd -readTabixHeader <- function(tbxFile) { - - # save header values - h <- Rsamtools::headerTabix(tbxFile)$header - - # check if header is empty - if(length(h) == 0 ) stop(paste("The Tabix File:",tbxFile,"does not include a header.\n")) - +parseTabixHeader <- function(header) { + + if(is.null(header)) return(NULL) # parse the slots and format the values - headerVals <- sapply(h, - USE.NAMES = FALSE, - FUN = function(j) { - tmp <- unlist(strsplit(gsub("#", "", j),split = ":")) - val <- .formatShortSlotValues(tmp) - names(val) <- .decodeShortSlotNames(names(val)) - return(val) - } + headerVals <- sapply(header, + USE.NAMES = FALSE, + FUN = function(h) { + tmp <- unlist(strsplit(gsub("#", "", h), split = ":")) + val <- .formatShortSlotValues(tmp) + names(val) <- .decodeShortSlotNames(names(val)) + # parse the object class + if (tmp[1] == "Class") { + val <- tmp[2] + names(val) <- c("class") + } + return(val) + } ) - - headerVals <- headerVals[!is.null(headerVals)] - - # parse the object class - class <- unlist(sapply((strsplit(gsub("#", "", h),split = ":")), function(x) if(x[1]=="Class") return(x[2]))) - - headerVals$class <- class - + + # remove empty slots and duplicates + headerVals <- headerVals[ + (!sapply(headerVals, is.null)) & + (!duplicated(names(headerVals)))] + return(headerVals) - } diff --git a/tests/testthat/test-16-loadTabix.r b/tests/testthat/test-16-loadTabix.r index b84890fd..4d120978 100644 --- a/tests/testthat/test-16-loadTabix.r +++ b/tests/testthat/test-16-loadTabix.r @@ -69,7 +69,6 @@ test_that("reading of tabix without dbtype leads to error", { }) test_that("reading of tabix can be done with one wrapper readMethylDB", { - expect_error(readMethylDB(dbpath = no_header)) expect_is(readMethylDB(dbpath = "methylDB/my_raw.txt.bgz"),'methylRawDB') expect_is(readMethylDB(dbpath = mydblist[[1]]@dbpath),'methylRawDB') expect_is(readMethylDB(dbpath = "methylDB/my_base.txt.bgz"),'methylBaseDB') @@ -79,4 +78,29 @@ test_that("reading of tabix can be done with one wrapper readMethylDB", { }) +test_that("reading of tabix fails for tabix file without header", { + expect_error(readMethylRawDB(dbpath = no_header)) +}) + + +test_that("passing empty string fails", { + expect_error(readMethylRawDB(dbpath = "")) +}) + +test_that("passing object fails", { + expect_error(readMethylDB(dbpath = base)) +}) + +test_that("passing wrong object class fails", { + expect_error(readMethylRawDB(dbpath = base@dbpath), + "Tabix file does not originate from methylRaw or methylRawDB") +}) + +# remove index file +unlink(paste0(base@dbpath,".tbi")) +test_that("reading of tabix works for tabix file without index", { + expect_is(readMethylDB(dbpath = base@dbpath), "methylBaseDB") +}) + + unlink("tests/testthat/methylDB",recursive = TRUE) diff --git a/vignettes/methylKit.Rmd b/vignettes/methylKit.Rmd index bf29093f..2aea91a8 100644 --- a/vignettes/methylKit.Rmd +++ b/vignettes/methylKit.Rmd @@ -763,7 +763,7 @@ baseDB.obj <- makeMethylDB(methylBase.obj,"my/path") mydbpath <- getDBPath(baseDB.obj) rm(baseDB.obj) -methylKit:::checkTabixHeader(mydbpath) +methylKit:::readTabixHeader(mydbpath) readMethylDB(mydbpath) ```