capture_first_df

tdhock · Dec 18, 2024 · f57aa54 · f57aa54
1 parent ef9d396
commit f57aa54
Show file tree

Hide file tree

Showing 6 changed files with 137 additions and 52 deletions.
diff --git a/NEWS b/NEWS
@@ -1,6 +1,6 @@
 Changes in version 2024.12.17
 
-- type.convert=TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion.
+- capture_first_vec, capture_all_str, capture_first_df now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion.
 
 Changes in version 2024.9.20
 

diff --git a/R/capture_first_df.R b/R/capture_first_df.R
@@ -24,10 +24,17 @@ capture_first_df <- structure(function # Capture first match in columns of a dat
 ### if TRUE (default to avoid data loss), stop with an error if any
 ### capture groups have the same name as an existing column of
 ### subject.
-  engine=getOption("nc.engine", "PCRE")
+  engine=getOption("nc.engine", "PCRE"),
 ### character string, one of PCRE, ICU, RE2. This engine will be used
 ### for each column, unless another engine is specified for that
 ### column in ...
+  type.convert=getOption("nc.type.convert", FALSE)
+### Default conversion function, which will be used on each capture
+### group, unless a specific conversion is specified for that
+### group. If TRUE, use utils::type.convert; if FALSE, use
+### base::identity; otherwise must be a function of at least one
+### argument (character), returning an atomic vector of the same
+### length.
 ){
   all.arg.list <- list(...)
   subject <- all.arg.list[[1]]
@@ -57,7 +64,7 @@ capture_first_df <- structure(function # Capture first match in columns of a dat
   for(col.name in names(col.pattern.list)){
     subject.vec <- subject[[col.name]]
     col.arg.list <- c(list(subject.vec), col.pattern.list[[col.name]])
-    maybe.rep <- c("engine", "nomatch.error")
+    maybe.rep <- c("engine", "nomatch.error", "type.convert")
     to.rep <- maybe.rep[!maybe.rep %in% names(col.arg.list)]
     col.arg.list[to.rep] <- lapply(to.rep, get, environment())
     tryCatch({

diff --git a/man/capture_first_df.Rd b/man/capture_first_df.Rd
@@ -8,7 +8,8 @@ column.}
     nomatch.error = getOption("nc.nomatch.error", 
         TRUE), existing.error = getOption("nc.existing.error", 
         TRUE), engine = getOption("nc.engine", 
-        "PCRE"))}
+        "PCRE"), type.convert = getOption("nc.type.convert", 
+        FALSE))}
 \arguments{
   \item{\dots}{subject data frame, colName1=list(groupName1=pattern1, fun1, etc),
 colName2=list(etc), etc. First argument must be a data frame with
@@ -31,6 +32,12 @@ subject.}
   \item{engine}{character string, one of PCRE, ICU, RE2. This \code{engine} will be used
 for each column, unless another \code{engine} is specified for that
 column in \code{...}}
+  \item{type.convert}{Default conversion function, which will be used on each capture
+\code{\link{group}}, unless a specific conversion is specified for that
+\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use
+\code{\link[base]{identity}}; otherwise must be a function of at least one
+argument (character), returning an atomic vector of the same
+length.}
 }
 
 \value{data.table with same number of rows as subject, with an additional

diff --git a/tests/testthat/test-CRAN-all.R b/tests/testthat/test-CRAN-all.R
@@ -121,4 +121,51 @@ test_engines("nested capture groups works", {
   expect_is(match.dt$sampleID, "integer")
 })
 
+test_engines("error for capture all regex with literal groups, match", {
+  expect_error({
+    capture_all_str(
+      c("chr1:100-200", "chr2:5-6"),
+      chrom="chr.",
+      ":",
+      "([0-9]+)")
+  }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
+})
+
+test_engines("error for capture all regex with literal groups, no match", {
+  expect_error({
+    nc::capture_all_str("alias(es)", foo="alias(es)")
+  }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
+})
+
+test_engines("capture_all_str(type.convert=TRUE) returns one int column", {
+  computed <- capture_all_str(
+    "chr1:2-3,000 chr4:5-6,000",
+    chrom="chr.*?",
+    ":",
+    chromStart=".*?",
+    "-",
+    chromEnd="[0-9,]*",
+    type.convert=TRUE)
+  expected <- data.table(
+    chrom=c("chr1","chr4"),
+    chromStart=c(2L,5L),
+    chromEnd=c("3,000","6,000"))
+  expect_identical(computed, expected)
+})
+
+test_engines("capture_all_str(type.convert=TRUE) returns two int columns", {
+  computed <- capture_all_str(
+    "chr1:2-3,000 chr4:5-6,000",
+    chrom="chr.*?",
+    ":",
+    chromStart=".*?",
+    "-",
+    chromEnd="[0-9,]*", keep.digits,
+    type.convert=TRUE)
+  expected <- data.table(
+    chrom=c("chr1","chr4"),
+    chromStart=c(2L,5L),
+    chromEnd=c(3000L,6000L))
+  expect_identical(computed, expected)
+})
 
diff --git a/tests/testthat/test-CRAN-df.R b/tests/testthat/test-CRAN-df.R
@@ -209,6 +209,78 @@ test_engines("two name groups not OK with named subject", {
   }, "must not conflict with existing column names")
 })
 
+test_engines("type.convert OK inside capture_first_df list", {
+  type.conv.result <- capture_first_df(
+    named.uniq.chr,
+    JobID=list(
+      job="[0-9]+", 
+      "_",
+      "(?:",#begin alternate
+      task="[0-9]+", 
+      "|",#either one task(above) or range(below)
+      range.pattern,
+      ")",#end alternate
+      "(?:[.]",
+      type=".*", identity,
+      ")?",
+      type.convert=as.numeric),
+    position=list(
+      name="chr.*?",
+      ":",
+      chromStart=".*?", keep.digits,
+      "-",
+      chromEnd="[0-9,]*", keep.digits))
+  computed.cls <- sapply(type.conv.result, class)
+  expected.cls <- c(
+    JobID = "character",
+    position = "character",
+    job = "numeric", 
+    task = "numeric",
+    task1 = "integer",
+    taskN = "integer",
+    type = "character", 
+    name = "character",
+    chromStart = "integer",
+    chromEnd = "integer")
+  expect_identical(computed.cls, expected.cls)
+})
+
+test_engines("type.convert OK as capture_first_df arg", {
+  type.conv.result <- capture_first_df(
+    named.uniq.chr,
+    JobID=list(
+      job="[0-9]+", 
+      "_",
+      "(?:",#begin alternate
+      task="[0-9]+", 
+      "|",#either one task(above) or range(below)
+      range.pattern,
+      ")",#end alternate
+      "(?:[.]",
+      type=".*", identity,
+      ")?"),
+    position=list(
+      name="chr.*?",
+      ":",
+      chromStart=".*?", keep.digits,
+      "-",
+      chromEnd="[0-9,]*", keep.digits),
+    type.convert=as.factor)
+  computed.cls <- sapply(type.conv.result, class)
+  expected.cls <- c(
+    JobID = "character",
+    position = "character",
+    job = "factor", 
+    task = "factor",
+    task1 = "integer",
+    taskN = "integer",
+    type = "character", 
+    name = "factor",
+    chromStart = "integer",
+    chromEnd = "integer")
+  expect_identical(computed.cls, expected.cls)
+})
+
 test_engines("error for no pattern", {
   expect_error({
     capture_first_df(named.uniq.chr)

diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R
@@ -46,22 +46,6 @@ test_engines("error for capture first regex with literal groups", {
   }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
 })
 
-test_engines("error for capture all regex with literal groups, match", {
-  expect_error({
-    capture_all_str(
-      c("chr1:100-200", "chr2:5-6"),
-      chrom="chr.",
-      ":",
-      "([0-9]+)")
-  }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
-})
-
-test_engines("error for capture all regex with literal groups, no match", {
-  expect_error({
-    nc::capture_all_str("alias(es)", foo="alias(es)")
-  }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
-})
-
 subject <- c(
   ten="chr10:213,054,000-213,055,000",
   chrNA="chrNA:111,000-222,000",
@@ -191,38 +175,6 @@ test_engines("capture_first_vec(type.convert='foo') errors", {
   }, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE)
 })
 
-test_engines("capture_all_str(type.convert=TRUE) returns one int column", {
-  computed <- capture_all_str(
-    "chr1:2-3,000 chr4:5-6,000",
-    chrom="chr.*?",
-    ":",
-    chromStart=".*?",
-    "-",
-    chromEnd="[0-9,]*",
-    type.convert=TRUE)
-  expected <- data.table(
-    chrom=c("chr1","chr4"),
-    chromStart=c(2L,5L),
-    chromEnd=c("3,000","6,000"))
-  expect_identical(computed, expected)
-})
-
-test_engines("capture_all_str(type.convert=TRUE) returns two int columns", {
-  computed <- capture_all_str(
-    "chr1:2-3,000 chr4:5-6,000",
-    chrom="chr.*?",
-    ":",
-    chromStart=".*?",
-    "-",
-    chromEnd="[0-9,]*", keep.digits,
-    type.convert=TRUE)
-  expected <- data.table(
-    chrom=c("chr1","chr4"),
-    chromStart=c(2L,5L),
-    chromEnd=c(3000L,6000L))
-  expect_identical(computed, expected)
-})
-
 test_engines("named function is an error", {
   expect_error({
     capture_first_vec(