Skip to content

Commit

Permalink
capture_first_df
Browse files Browse the repository at this point in the history
  • Loading branch information
Toby Dylan Hocking committed Dec 18, 2024
1 parent ef9d396 commit f57aa54
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 52 deletions.
2 changes: 1 addition & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Changes in version 2024.12.17

- type.convert=TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion.
- capture_first_vec, capture_all_str, capture_first_df now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion.

Changes in version 2024.9.20

Expand Down
11 changes: 9 additions & 2 deletions R/capture_first_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,17 @@ capture_first_df <- structure(function # Capture first match in columns of a dat
### if TRUE (default to avoid data loss), stop with an error if any
### capture groups have the same name as an existing column of
### subject.
engine=getOption("nc.engine", "PCRE")
engine=getOption("nc.engine", "PCRE"),
### character string, one of PCRE, ICU, RE2. This engine will be used
### for each column, unless another engine is specified for that
### column in ...
type.convert=getOption("nc.type.convert", FALSE)
### Default conversion function, which will be used on each capture
### group, unless a specific conversion is specified for that
### group. If TRUE, use utils::type.convert; if FALSE, use
### base::identity; otherwise must be a function of at least one
### argument (character), returning an atomic vector of the same
### length.
){
all.arg.list <- list(...)
subject <- all.arg.list[[1]]
Expand Down Expand Up @@ -57,7 +64,7 @@ capture_first_df <- structure(function # Capture first match in columns of a dat
for(col.name in names(col.pattern.list)){
subject.vec <- subject[[col.name]]
col.arg.list <- c(list(subject.vec), col.pattern.list[[col.name]])
maybe.rep <- c("engine", "nomatch.error")
maybe.rep <- c("engine", "nomatch.error", "type.convert")
to.rep <- maybe.rep[!maybe.rep %in% names(col.arg.list)]
col.arg.list[to.rep] <- lapply(to.rep, get, environment())
tryCatch({
Expand Down
9 changes: 8 additions & 1 deletion man/capture_first_df.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ column.}
nomatch.error = getOption("nc.nomatch.error",
TRUE), existing.error = getOption("nc.existing.error",
TRUE), engine = getOption("nc.engine",
"PCRE"))}
"PCRE"), type.convert = getOption("nc.type.convert",
FALSE))}
\arguments{
\item{\dots}{subject data frame, colName1=list(groupName1=pattern1, fun1, etc),
colName2=list(etc), etc. First argument must be a data frame with
Expand All @@ -31,6 +32,12 @@ subject.}
\item{engine}{character string, one of PCRE, ICU, RE2. This \code{engine} will be used
for each column, unless another \code{engine} is specified for that
column in \code{...}}
\item{type.convert}{Default conversion function, which will be used on each capture
\code{\link{group}}, unless a specific conversion is specified for that
\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use
\code{\link[base]{identity}}; otherwise must be a function of at least one
argument (character), returning an atomic vector of the same
length.}
}

\value{data.table with same number of rows as subject, with an additional
Expand Down
47 changes: 47 additions & 0 deletions tests/testthat/test-CRAN-all.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,51 @@ test_engines("nested capture groups works", {
expect_is(match.dt$sampleID, "integer")
})

test_engines("error for capture all regex with literal groups, match", {
expect_error({
capture_all_str(
c("chr1:100-200", "chr2:5-6"),
chrom="chr.",
":",
"([0-9]+)")
}, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
})

test_engines("error for capture all regex with literal groups, no match", {
expect_error({
nc::capture_all_str("alias(es)", foo="alias(es)")
}, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
})

test_engines("capture_all_str(type.convert=TRUE) returns one int column", {
computed <- capture_all_str(
"chr1:2-3,000 chr4:5-6,000",
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*",
type.convert=TRUE)
expected <- data.table(
chrom=c("chr1","chr4"),
chromStart=c(2L,5L),
chromEnd=c("3,000","6,000"))
expect_identical(computed, expected)
})

test_engines("capture_all_str(type.convert=TRUE) returns two int columns", {
computed <- capture_all_str(
"chr1:2-3,000 chr4:5-6,000",
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*", keep.digits,
type.convert=TRUE)
expected <- data.table(
chrom=c("chr1","chr4"),
chromStart=c(2L,5L),
chromEnd=c(3000L,6000L))
expect_identical(computed, expected)
})

72 changes: 72 additions & 0 deletions tests/testthat/test-CRAN-df.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,78 @@ test_engines("two name groups not OK with named subject", {
}, "must not conflict with existing column names")
})

test_engines("type.convert OK inside capture_first_df list", {
type.conv.result <- capture_first_df(
named.uniq.chr,
JobID=list(
job="[0-9]+",
"_",
"(?:",#begin alternate
task="[0-9]+",
"|",#either one task(above) or range(below)
range.pattern,
")",#end alternate
"(?:[.]",
type=".*", identity,
")?",
type.convert=as.numeric),
position=list(
name="chr.*?",
":",
chromStart=".*?", keep.digits,
"-",
chromEnd="[0-9,]*", keep.digits))
computed.cls <- sapply(type.conv.result, class)
expected.cls <- c(
JobID = "character",
position = "character",
job = "numeric",
task = "numeric",
task1 = "integer",
taskN = "integer",
type = "character",
name = "character",
chromStart = "integer",
chromEnd = "integer")
expect_identical(computed.cls, expected.cls)
})

test_engines("type.convert OK as capture_first_df arg", {
type.conv.result <- capture_first_df(
named.uniq.chr,
JobID=list(
job="[0-9]+",
"_",
"(?:",#begin alternate
task="[0-9]+",
"|",#either one task(above) or range(below)
range.pattern,
")",#end alternate
"(?:[.]",
type=".*", identity,
")?"),
position=list(
name="chr.*?",
":",
chromStart=".*?", keep.digits,
"-",
chromEnd="[0-9,]*", keep.digits),
type.convert=as.factor)
computed.cls <- sapply(type.conv.result, class)
expected.cls <- c(
JobID = "character",
position = "character",
job = "factor",
task = "factor",
task1 = "integer",
taskN = "integer",
type = "character",
name = "factor",
chromStart = "integer",
chromEnd = "integer")
expect_identical(computed.cls, expected.cls)
})

test_engines("error for no pattern", {
expect_error({
capture_first_df(named.uniq.chr)
Expand Down
48 changes: 0 additions & 48 deletions tests/testthat/test-CRAN-vec.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,6 @@ test_engines("error for capture first regex with literal groups", {
}, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
})

test_engines("error for capture all regex with literal groups, match", {
expect_error({
capture_all_str(
c("chr1:100-200", "chr2:5-6"),
chrom="chr.",
":",
"([0-9]+)")
}, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
})

test_engines("error for capture all regex with literal groups, no match", {
expect_error({
nc::capture_all_str("alias(es)", foo="alias(es)")
}, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE)
})

subject <- c(
ten="chr10:213,054,000-213,055,000",
chrNA="chrNA:111,000-222,000",
Expand Down Expand Up @@ -191,38 +175,6 @@ test_engines("capture_first_vec(type.convert='foo') errors", {
}, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE)
})

test_engines("capture_all_str(type.convert=TRUE) returns one int column", {
computed <- capture_all_str(
"chr1:2-3,000 chr4:5-6,000",
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*",
type.convert=TRUE)
expected <- data.table(
chrom=c("chr1","chr4"),
chromStart=c(2L,5L),
chromEnd=c("3,000","6,000"))
expect_identical(computed, expected)
})

test_engines("capture_all_str(type.convert=TRUE) returns two int columns", {
computed <- capture_all_str(
"chr1:2-3,000 chr4:5-6,000",
chrom="chr.*?",
":",
chromStart=".*?",
"-",
chromEnd="[0-9,]*", keep.digits,
type.convert=TRUE)
expected <- data.table(
chrom=c("chr1","chr4"),
chromStart=c(2L,5L),
chromEnd=c(3000L,6000L))
expect_identical(computed, expected)
})

test_engines("named function is an error", {
expect_error({
capture_first_vec(
Expand Down

0 comments on commit f57aa54

Please sign in to comment.