diff --git a/DESCRIPTION b/DESCRIPTION index 951d255..6040304 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Authors@R: c( person("Toby", "Hocking", email="toby.hocking@r-project.org", role=c("aut", "cre"))) -Version: 2024.9.20 +Version: 2024.12.17 License: GPL-3 Title: Named Capture to Data Tables Description: User-friendly functions for extracting a data diff --git a/NAMESPACE b/NAMESPACE index fadf14d..6697c4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,3 +15,4 @@ export(capture_first_df) export(capture_longer_spec) import(data.table) importFrom("stats","na.omit") +importFrom("utils","type.convert") diff --git a/NEWS b/NEWS index e11d70d..5fa6703 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Changes in version 2024.12.17 + +- capture_first_vec, capture_all_str, capture_first_df, capture_first_glob, measure, capture_melt_single, capture_melt_multiple now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. + Changes in version 2024.9.20 - list(given_first=list(given, family)) examples + v5 + docs update. diff --git a/R/capture_all_str.R b/R/capture_all_str.R index 90f8547..ca4b658 100644 --- a/R/capture_all_str.R +++ b/R/capture_all_str.R @@ -12,12 +12,19 @@ capture_all_str <- structure(function # Capture all matches in a single subject ### must be string/function/list, as documented in capture_first_vec. engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2 - collapse="\n" + collapse="\n", ### separator string for combining elements of subject into a single ### string, used as collapse argument of base::paste. + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ stop_for_engine(engine) - L <- subject_var_args(...) + L <- subject_var_args(..., type.convert=type.convert) ## instead of explicitly using file.exists here (which may error for ## some strange/emoji subjects) we can just use readLines, which ## will error if arg has more than one element (invalid diff --git a/R/capture_first_df.R b/R/capture_first_df.R index 9f3fae4..4466fcb 100644 --- a/R/capture_first_df.R +++ b/R/capture_first_df.R @@ -24,10 +24,17 @@ capture_first_df <- structure(function # Capture first match in columns of a dat ### if TRUE (default to avoid data loss), stop with an error if any ### capture groups have the same name as an existing column of ### subject. - engine=getOption("nc.engine", "PCRE") + engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2. This engine will be used ### for each column, unless another engine is specified for that ### column in ... + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ all.arg.list <- list(...) subject <- all.arg.list[[1]] @@ -57,7 +64,7 @@ capture_first_df <- structure(function # Capture first match in columns of a dat for(col.name in names(col.pattern.list)){ subject.vec <- subject[[col.name]] col.arg.list <- c(list(subject.vec), col.pattern.list[[col.name]]) - maybe.rep <- c("engine", "nomatch.error") + maybe.rep <- c("engine", "nomatch.error", "type.convert") to.rep <- maybe.rep[!maybe.rep %in% names(col.arg.list)] col.arg.list[to.rep] <- lapply(to.rep, get, environment()) tryCatch({ diff --git a/R/capture_first_glob.R b/R/capture_first_glob.R index f6d2e6f..13f4aae 100644 --- a/R/capture_first_glob.R +++ b/R/capture_first_glob.R @@ -4,8 +4,8 @@ capture_first_glob <- structure(function (glob, ### string: glob specifying files to read. ..., -### pattern passed to capture_first_vec, used to get meta-data from -### file names. +### passed to capture_first_vec, should include pattern used on vector +### of file names to get meta-data. READ=fread ### function of one argument (file name) which returns a data table, ### default data.table::fread. diff --git a/R/capture_first_vec.R b/R/capture_first_vec.R index 89bed2f..1c7bd0c 100644 --- a/R/capture_first_vec.R +++ b/R/capture_first_vec.R @@ -26,10 +26,17 @@ capture_first_vec <- structure(function # Capture first match in each character ### if TRUE (default), stop with an error if any subject does not ### match; otherwise subjects that do not match are reported as ### missing/NA rows of the result. - engine=getOption("nc.engine", "PCRE") + engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2 + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ - L <- subject_var_args(...) + L <- subject_var_args(..., type.convert=type.convert) subject.vec <- L[["subject"]] stop_for_engine(engine) ##alias<< nc @@ -111,5 +118,36 @@ capture_first_vec <- structure(function # Capture first match in each character chr.pos.vec) nc::capture_first_vec(na.vec, range.pattern, nomatch.error=FALSE) + ## another subject from https://adventofcode.com/2024/day/14 + pvxy.subject <- c("p=0,4 v=3,-3","p=6,3 v=-1,-3") + nc::capture_first_vec( + pvxy.subject, + "p=", + px="[0-9]", + ",", + py="[0-9]", + " v=", + vx="[-0-9]+", + ",", + vy="[-0-9]+", + type.convert=TRUE) + + ## to do the same as above but with less repetition: + g <- function(prefix,suffix)nc::group( + name=paste0(prefix,suffix), + "[-0-9]+") + xy <- function(prefix)list( + prefix, + "=", + g(prefix,"x"), + ",", + g(prefix,"y")) + nc::capture_first_vec( + pvxy.subject, + xy("p"), + " ", + xy("v"), + type.convert=TRUE) + }) diff --git a/R/var_args_list.R b/R/var_args_list.R index a0052d3..aaeae5a 100644 --- a/R/var_args_list.R +++ b/R/var_args_list.R @@ -1,7 +1,9 @@ subject_var_args <- function ### Parse the complete argument list including subject. -(... +(..., ### subject, regex/conversion. + type.convert=getOption("nc.type.convert", FALSE) +### passed to var_args_list. ){ all.arg.list <- list(...) first.name <- names(all.arg.list[1]) @@ -11,7 +13,7 @@ subject_var_args <- function } subject <- all.arg.list[[1]] stop_for_subject(subject) - out.list <- var_args_list(all.arg.list[-1]) + out.list <- var_args_list(all.arg.list[-1], type.convert=type.convert) out.list$subject <- subject out.list ### Result of var_args_list plus subject. @@ -22,7 +24,7 @@ var_args_list <- structure(function ### capture_first_df, and capture_all_str. This function is mostly ### intended for internal use, but is useful if you want to see the ### regex pattern generated by the variable argument syntax. -(... +(..., ### character vectors (for regex patterns) or functions (which specify ### how to convert extracted character vectors to other types). All ### patterns must be character vectors of length 1. If the pattern is @@ -32,6 +34,13 @@ var_args_list <- structure(function ### at most one function which is used to convert the previous named ### pattern. Patterns may also be lists, which are parsed recursively ### for convenience. + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ var.arg.list <- list(...) fun.list <- list() @@ -50,7 +59,13 @@ var_args_list <- structure(function stop(domain=NA, gettextf("functions must not be named, problem: %s", pattern.name)) } group.i <- length(fun.list)+1L - fun.list[[group.i]] <- identity + fun.list[[group.i]] <- if(isTRUE(type.convert)){ + function(x)utils::type.convert(x,as.is=TRUE) + }else if(isFALSE(type.convert)){ + identity + }else if(is.function(type.convert)){ + type.convert + }else stop("type.convert should be either TRUE or FALSE or a function") names(fun.list)[[group.i]] <- pattern.name has.name <- TRUE "(" diff --git a/man/capture_all_str.Rd b/man/capture_all_str.Rd index 97fdc0a..a18d511 100644 --- a/man/capture_all_str.Rd +++ b/man/capture_all_str.Rd @@ -6,7 +6,9 @@ string or text file. It can be used to convert any regular text file (web page, log, etc) to a data table, see examples.} \usage{capture_all_str(..., engine = getOption("nc.engine", - "PCRE"), collapse = "\\n")} + "PCRE"), collapse = "\\n", + type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, name1=pattern1, fun1, etc. The first argument must be a subject character vector (or file name which is read via @@ -18,6 +20,12 @@ must be string/function/list, as documented in \code{\link{capture_first_vec}}.} \item{engine}{character string, one of PCRE, ICU, RE2} \item{collapse}{separator string for combining elements of subject into a single string, used as \code{collapse} argument of \code{\link[base]{paste}}.} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with one row for each match, and one column for each diff --git a/man/capture_first_df.Rd b/man/capture_first_df.Rd index 066c76a..ab02409 100644 --- a/man/capture_first_df.Rd +++ b/man/capture_first_df.Rd @@ -8,7 +8,8 @@ column.} nomatch.error = getOption("nc.nomatch.error", TRUE), existing.error = getOption("nc.existing.error", TRUE), engine = getOption("nc.engine", - "PCRE"))} + "PCRE"), type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject data frame, colName1=list(groupName1=pattern1, fun1, etc), colName2=list(etc), etc. First argument must be a data frame with @@ -31,6 +32,12 @@ subject.} \item{engine}{character string, one of PCRE, ICU, RE2. This \code{engine} will be used for each column, unless another \code{engine} is specified for that column in \code{...}} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with same number of rows as subject, with an additional diff --git a/man/capture_first_glob.Rd b/man/capture_first_glob.Rd index 07124be..e09a79a 100644 --- a/man/capture_first_glob.Rd +++ b/man/capture_first_glob.Rd @@ -7,8 +7,8 @@ file name, and combine with contents of each file.} ..., READ = fread)} \arguments{ \item{glob}{string: \code{glob} specifying files to read.} - \item{\dots}{pattern passed to \code{\link{capture_first_vec}}, used to get meta-data from -file names.} + \item{\dots}{passed to \code{\link{capture_first_vec}}, should include pattern used on vector +of file names to get meta-data.} \item{READ}{function of one argument (file name) which returns a data table, default \code{\link[data.table]{fread}}.} } diff --git a/man/capture_first_vec.Rd b/man/capture_first_vec.Rd index 522d12d..1923d1b 100644 --- a/man/capture_first_vec.Rd +++ b/man/capture_first_vec.Rd @@ -16,7 +16,8 @@ definition of the regex you can use \code{\link{field}}, \code{\link{quantifier} \usage{capture_first_vec(..., nomatch.error = getOption("nc.nomatch.error", TRUE), engine = getOption("nc.engine", - "PCRE"))} + "PCRE"), type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, name1=pattern1, fun1, etc. The first argument must be a character vector of length>0 (subject strings to parse with a @@ -33,6 +34,12 @@ recursively using these rules.} match; otherwise subjects that do not match are reported as missing/NA rows of the result.} \item{engine}{character string, one of PCRE, ICU, RE2} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with one row for each subject, and one column for each @@ -88,4 +95,35 @@ na.vec <- c( chr.pos.vec) nc::capture_first_vec(na.vec, range.pattern, nomatch.error=FALSE) +## another subject from https://adventofcode.com/2024/day/14 +pvxy.subject <- c("p=0,4 v=3,-3","p=6,3 v=-1,-3") +nc::capture_first_vec( + pvxy.subject, + "p=", + px="[0-9]", + ",", + py="[0-9]", + " v=", + vx="[-0-9]+", + ",", + vy="[-0-9]+", + type.convert=TRUE) + +## to do the same as above but with less repetition: +g <- function(prefix,suffix)nc::group( + name=paste0(prefix,suffix), + "[-0-9]+") +xy <- function(prefix)list( + prefix, + "=", + g(prefix,"x"), + ",", + g(prefix,"y")) +nc::capture_first_vec( + pvxy.subject, + xy("p"), + " ", + xy("v"), + type.convert=TRUE) + } diff --git a/man/subject_var_args.Rd b/man/subject_var_args.Rd index e22a511..10a8eb6 100644 --- a/man/subject_var_args.Rd +++ b/man/subject_var_args.Rd @@ -2,9 +2,12 @@ \alias{subject_var_args} \title{subject var args} \description{Parse the complete argument list including subject.} -\usage{subject_var_args(...)} +\usage{subject_var_args(..., + type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, regex/conversion.} + \item{type.convert}{passed to \code{\link{var_args_list}}.} } \value{Result of \code{\link{var_args_list}} plus subject.} diff --git a/man/var_args_list.Rd b/man/var_args_list.Rd index feaf937..ec833e4 100644 --- a/man/var_args_list.Rd +++ b/man/var_args_list.Rd @@ -5,7 +5,8 @@ \code{\link{capture_first_df}}, and \code{\link{capture_all_str}}. This function is mostly intended for internal use, but is useful if you want to see the regex pattern generated by the variable argument syntax.} -\usage{var_args_list(...)} +\usage{var_args_list(..., type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{character vectors (for regex patterns) or functions (which specify how to convert extracted character vectors to other types). All @@ -16,6 +17,12 @@ pattern used for matching. Each named pattern may be followed by at most one function which is used to convert the previous named pattern. Patterns may also be lists, which are parsed recursively for convenience.} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{a list with two named elements diff --git a/tests/testthat.R b/tests/testthat.R index 8698d91..5268305 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,2 +1,3 @@ data.table::setDTthreads(1) +Sys.setlocale("LC_MESSAGES",locale="C") if(require(testthat))test_check("nc") diff --git a/tests/testthat/test-CRAN-all.R b/tests/testthat/test-CRAN-all.R index 70d07c6..1a20f07 100644 --- a/tests/testthat/test-CRAN-all.R +++ b/tests/testthat/test-CRAN-all.R @@ -22,6 +22,7 @@ test_engines("capture_all_str returns data.frame with 0 rows, 1 int col", { expect_identical(computed$baz, integer()) }) +keep.digits <- function(x)as.integer(gsub("[^0-9]", "", x)) test_engines("capture_all_str returns data.table", { chr.pos.vec <- c( "chr10:213,054,000-213,055,000", @@ -29,7 +30,6 @@ test_engines("capture_all_str returns data.table", { "this will not match", NA, # neither will this. "chr1:110-111 chr2:220-222") # two possible matches. - keep.digits <- function(x)as.integer(gsub("[^0-9]", "", x)) computed <- capture_all_str( chr.pos.vec, chrom="chr.*?", @@ -49,6 +49,38 @@ test_engines("capture_all_str returns data.table", { expect_identical(computed, expected) }) +test_engines("capture_all_str(type.convert=TRUE) returns one int column", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c("3,000","6,000")) + expect_identical(computed, expected) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c(3000L,6000L)) + expect_identical(computed, expected) +}) + test_engines("capture_all_str errors for one argument", { expect_error({ capture_all_str("foo") @@ -121,4 +153,18 @@ test_engines("nested capture groups works", { expect_is(match.dt$sampleID, "integer") }) +test_engines("error for capture all regex with literal groups, match", { + expect_error({ + capture_all_str( + c("chr1:100-200", "chr2:5-6"), + chrom="chr.", + ":", + "([0-9]+)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) +test_engines("error for capture all regex with literal groups, no match", { + expect_error({ + nc::capture_all_str("alias(es)", foo="alias(es)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) diff --git a/tests/testthat/test-CRAN-df.R b/tests/testthat/test-CRAN-df.R index 4c38153..37a26ae 100644 --- a/tests/testthat/test-CRAN-df.R +++ b/tests/testthat/test-CRAN-df.R @@ -209,6 +209,78 @@ test_engines("two name groups not OK with named subject", { }, "must not conflict with existing column names") }) +test_engines("type.convert OK inside capture_first_df list", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?", + type.convert=as.numeric), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits)) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "numeric", + task = "numeric", + task1 = "integer", + taskN = "integer", + type = "character", + name = "character", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + +test_engines("type.convert OK as capture_first_df arg", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?"), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits), + type.convert=as.factor) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "factor", + task = "factor", + task1 = "integer", + taskN = "integer", + type = "character", + name = "factor", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + test_engines("error for no pattern", { expect_error({ capture_first_df(named.uniq.chr) diff --git a/tests/testthat/test-CRAN-glob.R b/tests/testthat/test-CRAN-glob.R index 4b19fee..6c310f6 100644 --- a/tests/testthat/test-CRAN-glob.R +++ b/tests/testthat/test-CRAN-glob.R @@ -7,11 +7,44 @@ db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE) glob <- paste0(db, "/*/*/counts/*") read.bedGraph <- function(f)data.table::fread( f, skip=1, col.names = c("chrom","start", "end", "count")) -data.chunk.pattern <- list( - data="H.*?", - "/", - chunk="[0-9]+", as.integer) -if(requireNamespace("R.utils"))test_engines("capture_first_glob returns expected columns", { - count.dt <- nc::capture_first_glob(glob, data.chunk.pattern, READ=read.bedGraph) - expect_identical(names(count.dt), c("data", "chunk", "chrom", "start", "end", "count")) -}) + +if(requireNamespace("R.utils")){ + + test_engines("capture_first_glob returns expected columns", { + count.dt <- nc::capture_first_glob( + glob, + data="H.*?", + "/", + chunk="[0-9]+", as.integer, + READ=read.bedGraph) + computed.cls <- sapply(count.dt, class) + expected.cls <- c( + data = "character", + chunk = "integer", + chrom = "character", + start = "integer", + end = "integer", + count = "integer") + expect_identical(computed.cls, expected.cls) + }) + + test_engines("capture_first_glob(type.convert) works", { + count.dt <- nc::capture_first_glob( + glob, + data="H.*?", + "/", + chunk="[0-9]+", + READ=read.bedGraph, + type.convert=TRUE) + computed.cls <- sapply(count.dt, class) + expected.cls <- c( + data = "character", + chunk = "integer", + chrom = "character", + start = "integer", + end = "integer", + count = "integer") + expect_identical(computed.cls, expected.cls) + }) + +} diff --git a/tests/testthat/test-CRAN-measure.R b/tests/testthat/test-CRAN-measure.R index af11dac..c6d4ac7 100644 --- a/tests/testthat/test-CRAN-measure.R +++ b/tests/testthat/test-CRAN-measure.R @@ -15,6 +15,16 @@ test_engines("measure single value", { expect_identical(out, expected) }) +test_engines("measure single value type.convert=TRUE", { + out <- melt(DT, measure.vars=nc::measure( + letter="[ab]", "_", number="[12]", type.convert=TRUE)) + expected <- data.table( + letter=c("a","b","a"), + number=as.integer(c(1,2,2)), + value=c(10,21,20)) + expect_identical(out, expected) +}) + test_engines("measure multiple values", { out <- melt(DT, measure.vars=nc::measure( column="[ab]", "_", number="[12]", as.integer)) @@ -25,3 +35,12 @@ test_engines("measure multiple values", { expect_identical(out, expected) }) +test_engines("measure multiple values type.convert=TRUE", { + out <- melt(DT, measure.vars=nc::measure( + column="[ab]", "_", number="[12]", type.convert=TRUE)) + expected <- data.table( + number=as.integer(c(1,2)), + a=c(10,20), + b=c(NA,21)) + expect_identical(out, expected) +}) diff --git a/tests/testthat/test-CRAN-melt.R b/tests/testthat/test-CRAN-melt.R index 7cf84e0..85d7a99 100644 --- a/tests/testthat/test-CRAN-melt.R +++ b/tests/testthat/test-CRAN-melt.R @@ -119,6 +119,12 @@ test_engines("melting lots of columns is OK", { expect_identical(out$value, i.vec) }) +test_engines("melting lots of columns type.convert=TRUE is OK", { + out <- capture_melt_single(one.row, "X", col="[0-9]+", type.convert=TRUE) + expect_identical(out$col, i.vec) + expect_identical(out$value, i.vec) +}) + DT.wide <- data.table(id=0, num_ref=1, name_ref="foo", num=2, name="bar") test_engines("converting NA to non-NA is an error", { expect_error({ diff --git a/tests/testthat/test-CRAN-multiple.R b/tests/testthat/test-CRAN-multiple.R index a25f581..31e02fe 100644 --- a/tests/testthat/test-CRAN-multiple.R +++ b/tests/testthat/test-CRAN-multiple.R @@ -276,6 +276,14 @@ test_engines("multiple melting lots of columns is OK", { expect_identical(out$C, c.vec) }) +test_engines("multiple melting lots of columns type.convert=TRUE is OK", { + out <- capture_melt_multiple( + one.row, column=".", "[.]", int="[0-9]+", type.convert=TRUE) + expect_identical(out$int, i.vec) + expect_identical(out$I, i.vec) + expect_identical(out$C, c.vec) +}) + wide.metrics <- data.table( FP.possible=8202, FN.possible=1835, FP.count=0, FN.count=1835) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index 0002fc2..c29915e 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -46,22 +46,6 @@ test_engines("error for capture first regex with literal groups", { }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) }) -test_engines("error for capture all regex with literal groups, match", { - expect_error({ - capture_all_str( - c("chr1:100-200", "chr2:5-6"), - chrom="chr.", - ":", - "([0-9]+)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - -test_engines("error for capture all regex with literal groups, no match", { - expect_error({ - nc::capture_all_str("alias(es)", foo="alias(es)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - subject <- c( ten="chr10:213,054,000-213,055,000", chrNA="chrNA:111,000-222,000", @@ -101,6 +85,96 @@ test_engines("capture_first_vec returns data.table with int columns", { expect_identical(computed, expected) }) +test_engines("capture_first_vec(type.convert=TRUE) returns one int column", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom="chr1", + chromStart=2L, + chromEnd="3,000") + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=TRUE) returns two int columns", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom="chr1", + chromStart=2L, + chromEnd=3000L) + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=constant fun) returns all num columns", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=function(...)5) + expected <- data.table( + chrom=5, + chromStart=5, + chromEnd=5) + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=as.integer) returns all int columns", { + computed <- suppressWarnings(capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=as.integer)) + expected <- data.table( + chrom=NA_integer_, + chromStart=2L, + chromEnd=NA_integer_) + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=invalid) errors", { + expect_error({ + capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=function()5) + }, "type conversion functions should take one argument (character vector of captured text) and return an atomic vector of the same size; function for group 1(chrom) raised an error: unused argument (match.vec)", fixed=TRUE) +}) + +test_engines("capture_first_vec(type.convert='foo') errors", { + expect_error({ + capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert='foo') + }, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE) +}) + test_engines("named function is an error", { expect_error({ capture_first_vec(