diff --git a/NEWS b/NEWS index 64ac40e..e770418 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,6 @@ Changes in version 2024.12.17 -- type.convert=TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. +- capture_first_vec, capture_all_str, capture_first_df now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. Changes in version 2024.9.20 diff --git a/R/capture_first_df.R b/R/capture_first_df.R index 9f3fae4..4466fcb 100644 --- a/R/capture_first_df.R +++ b/R/capture_first_df.R @@ -24,10 +24,17 @@ capture_first_df <- structure(function # Capture first match in columns of a dat ### if TRUE (default to avoid data loss), stop with an error if any ### capture groups have the same name as an existing column of ### subject. - engine=getOption("nc.engine", "PCRE") + engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2. This engine will be used ### for each column, unless another engine is specified for that ### column in ... + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ all.arg.list <- list(...) subject <- all.arg.list[[1]] @@ -57,7 +64,7 @@ capture_first_df <- structure(function # Capture first match in columns of a dat for(col.name in names(col.pattern.list)){ subject.vec <- subject[[col.name]] col.arg.list <- c(list(subject.vec), col.pattern.list[[col.name]]) - maybe.rep <- c("engine", "nomatch.error") + maybe.rep <- c("engine", "nomatch.error", "type.convert") to.rep <- maybe.rep[!maybe.rep %in% names(col.arg.list)] col.arg.list[to.rep] <- lapply(to.rep, get, environment()) tryCatch({ diff --git a/man/capture_first_df.Rd b/man/capture_first_df.Rd index 066c76a..ab02409 100644 --- a/man/capture_first_df.Rd +++ b/man/capture_first_df.Rd @@ -8,7 +8,8 @@ column.} nomatch.error = getOption("nc.nomatch.error", TRUE), existing.error = getOption("nc.existing.error", TRUE), engine = getOption("nc.engine", - "PCRE"))} + "PCRE"), type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject data frame, colName1=list(groupName1=pattern1, fun1, etc), colName2=list(etc), etc. First argument must be a data frame with @@ -31,6 +32,12 @@ subject.} \item{engine}{character string, one of PCRE, ICU, RE2. This \code{engine} will be used for each column, unless another \code{engine} is specified for that column in \code{...}} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with same number of rows as subject, with an additional diff --git a/tests/testthat/test-CRAN-all.R b/tests/testthat/test-CRAN-all.R index 70d07c6..6853e02 100644 --- a/tests/testthat/test-CRAN-all.R +++ b/tests/testthat/test-CRAN-all.R @@ -121,4 +121,51 @@ test_engines("nested capture groups works", { expect_is(match.dt$sampleID, "integer") }) +test_engines("error for capture all regex with literal groups, match", { + expect_error({ + capture_all_str( + c("chr1:100-200", "chr2:5-6"), + chrom="chr.", + ":", + "([0-9]+)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) + +test_engines("error for capture all regex with literal groups, no match", { + expect_error({ + nc::capture_all_str("alias(es)", foo="alias(es)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns one int column", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c("3,000","6,000")) + expect_identical(computed, expected) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c(3000L,6000L)) + expect_identical(computed, expected) +}) diff --git a/tests/testthat/test-CRAN-df.R b/tests/testthat/test-CRAN-df.R index 4c38153..37a26ae 100644 --- a/tests/testthat/test-CRAN-df.R +++ b/tests/testthat/test-CRAN-df.R @@ -209,6 +209,78 @@ test_engines("two name groups not OK with named subject", { }, "must not conflict with existing column names") }) +test_engines("type.convert OK inside capture_first_df list", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?", + type.convert=as.numeric), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits)) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "numeric", + task = "numeric", + task1 = "integer", + taskN = "integer", + type = "character", + name = "character", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + +test_engines("type.convert OK as capture_first_df arg", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?"), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits), + type.convert=as.factor) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "factor", + task = "factor", + task1 = "integer", + taskN = "integer", + type = "character", + name = "factor", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + test_engines("error for no pattern", { expect_error({ capture_first_df(named.uniq.chr) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index f7b5679..c29915e 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -46,22 +46,6 @@ test_engines("error for capture first regex with literal groups", { }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) }) -test_engines("error for capture all regex with literal groups, match", { - expect_error({ - capture_all_str( - c("chr1:100-200", "chr2:5-6"), - chrom="chr.", - ":", - "([0-9]+)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - -test_engines("error for capture all regex with literal groups, no match", { - expect_error({ - nc::capture_all_str("alias(es)", foo="alias(es)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - subject <- c( ten="chr10:213,054,000-213,055,000", chrNA="chrNA:111,000-222,000", @@ -191,38 +175,6 @@ test_engines("capture_first_vec(type.convert='foo') errors", { }, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE) }) -test_engines("capture_all_str(type.convert=TRUE) returns one int column", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c("3,000","6,000")) - expect_identical(computed, expected) -}) - -test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", keep.digits, - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c(3000L,6000L)) - expect_identical(computed, expected) -}) - test_engines("named function is an error", { expect_error({ capture_first_vec(