From 9fbf533732ba2f8c2be0d982c518b6ed808c5c70 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 16 Dec 2024 15:05:38 -0500 Subject: [PATCH 01/17] type.convert arg --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS | 4 ++++ R/capture_all_str.R | 7 +++++-- R/capture_first_vec.R | 7 +++++-- R/var_args_list.R | 14 ++++++++++---- 6 files changed, 26 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 951d255..d732d5e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Authors@R: c( person("Toby", "Hocking", email="toby.hocking@r-project.org", role=c("aut", "cre"))) -Version: 2024.9.20 +Version: 2024.12.16 License: GPL-3 Title: Named Capture to Data Tables Description: User-friendly functions for extracting a data diff --git a/NAMESPACE b/NAMESPACE index fadf14d..6697c4a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,3 +15,4 @@ export(capture_first_df) export(capture_longer_spec) import(data.table) importFrom("stats","na.omit") +importFrom("utils","type.convert") diff --git a/NEWS b/NEWS index e11d70d..34d4086 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Changes in version 2024.12.16 + +- type.convert=TRUE means to use type.convert() as default conversion function. + Changes in version 2024.9.20 - list(given_first=list(given, family)) examples + v5 + docs update. diff --git a/R/capture_all_str.R b/R/capture_all_str.R index 90f8547..c227a12 100644 --- a/R/capture_all_str.R +++ b/R/capture_all_str.R @@ -12,12 +12,15 @@ capture_all_str <- structure(function # Capture all matches in a single subject ### must be string/function/list, as documented in capture_first_vec. engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2 - collapse="\n" + collapse="\n", ### separator string for combining elements of subject into a single ### string, used as collapse argument of base::paste. + type.convert=getOption("nc.type.convert", FALSE) +### If TRUE, use utils::type.convert instead of base::identity for the +### default conversion. ){ stop_for_engine(engine) - L <- subject_var_args(...) + L <- subject_var_args(..., type.convert=type.convert) ## instead of explicitly using file.exists here (which may error for ## some strange/emoji subjects) we can just use readLines, which ## will error if arg has more than one element (invalid diff --git a/R/capture_first_vec.R b/R/capture_first_vec.R index 89bed2f..80d5dc3 100644 --- a/R/capture_first_vec.R +++ b/R/capture_first_vec.R @@ -26,10 +26,13 @@ capture_first_vec <- structure(function # Capture first match in each character ### if TRUE (default), stop with an error if any subject does not ### match; otherwise subjects that do not match are reported as ### missing/NA rows of the result. - engine=getOption("nc.engine", "PCRE") + engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2 + type.convert=getOption("nc.type.convert", FALSE) +### If TRUE, use utils::type.convert instead of base::identity for the +### default conversion. ){ - L <- subject_var_args(...) + L <- subject_var_args(..., type.convert=type.convert) subject.vec <- L[["subject"]] stop_for_engine(engine) ##alias<< nc diff --git a/R/var_args_list.R b/R/var_args_list.R index a0052d3..163b3b7 100644 --- a/R/var_args_list.R +++ b/R/var_args_list.R @@ -1,7 +1,10 @@ subject_var_args <- function ### Parse the complete argument list including subject. -(... +(..., ### subject, regex/conversion. + type.convert=getOption("nc.type.convert", FALSE) +### If TRUE, use utils::type.convert instead of base::identity for the +### default conversion. ){ all.arg.list <- list(...) first.name <- names(all.arg.list[1]) @@ -11,7 +14,7 @@ subject_var_args <- function } subject <- all.arg.list[[1]] stop_for_subject(subject) - out.list <- var_args_list(all.arg.list[-1]) + out.list <- var_args_list(all.arg.list[-1], type.convert=type.convert) out.list$subject <- subject out.list ### Result of var_args_list plus subject. @@ -22,7 +25,7 @@ var_args_list <- structure(function ### capture_first_df, and capture_all_str. This function is mostly ### intended for internal use, but is useful if you want to see the ### regex pattern generated by the variable argument syntax. -(... +(..., ### character vectors (for regex patterns) or functions (which specify ### how to convert extracted character vectors to other types). All ### patterns must be character vectors of length 1. If the pattern is @@ -32,6 +35,9 @@ var_args_list <- structure(function ### at most one function which is used to convert the previous named ### pattern. Patterns may also be lists, which are parsed recursively ### for convenience. + type.convert=getOption("nc.type.convert", FALSE) +### If TRUE, use utils::type.convert instead of base::identity for the +### default conversion. ){ var.arg.list <- list(...) fun.list <- list() @@ -50,7 +56,7 @@ var_args_list <- structure(function stop(domain=NA, gettextf("functions must not be named, problem: %s", pattern.name)) } group.i <- length(fun.list)+1L - fun.list[[group.i]] <- identity + fun.list[[group.i]] <- if(isTRUE(type.convert))type.convert else identity names(fun.list)[[group.i]] <- pattern.name has.name <- TRUE "(" From 9c73d4e30ca357fa53250134358bc0cd9d4478dd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 16 Dec 2024 15:17:10 -0500 Subject: [PATCH 02/17] docs --- man/capture_all_str.Rd | 6 +++++- man/capture_first_vec.Rd | 5 ++++- man/subject_var_args.Rd | 6 +++++- man/var_args_list.Rd | 5 ++++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/man/capture_all_str.Rd b/man/capture_all_str.Rd index 97fdc0a..7cf8e6c 100644 --- a/man/capture_all_str.Rd +++ b/man/capture_all_str.Rd @@ -6,7 +6,9 @@ string or text file. It can be used to convert any regular text file (web page, log, etc) to a data table, see examples.} \usage{capture_all_str(..., engine = getOption("nc.engine", - "PCRE"), collapse = "\\n")} + "PCRE"), collapse = "\\n", + type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, name1=pattern1, fun1, etc. The first argument must be a subject character vector (or file name which is read via @@ -18,6 +20,8 @@ must be string/function/list, as documented in \code{\link{capture_first_vec}}.} \item{engine}{character string, one of PCRE, ICU, RE2} \item{collapse}{separator string for combining elements of subject into a single string, used as \code{collapse} argument of \code{\link[base]{paste}}.} + \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the +default conversion.} } \value{data.table with one row for each match, and one column for each diff --git a/man/capture_first_vec.Rd b/man/capture_first_vec.Rd index 522d12d..94eaa75 100644 --- a/man/capture_first_vec.Rd +++ b/man/capture_first_vec.Rd @@ -16,7 +16,8 @@ definition of the regex you can use \code{\link{field}}, \code{\link{quantifier} \usage{capture_first_vec(..., nomatch.error = getOption("nc.nomatch.error", TRUE), engine = getOption("nc.engine", - "PCRE"))} + "PCRE"), type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, name1=pattern1, fun1, etc. The first argument must be a character vector of length>0 (subject strings to parse with a @@ -33,6 +34,8 @@ recursively using these rules.} match; otherwise subjects that do not match are reported as missing/NA rows of the result.} \item{engine}{character string, one of PCRE, ICU, RE2} + \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the +default conversion.} } \value{data.table with one row for each subject, and one column for each diff --git a/man/subject_var_args.Rd b/man/subject_var_args.Rd index e22a511..a78378d 100644 --- a/man/subject_var_args.Rd +++ b/man/subject_var_args.Rd @@ -2,9 +2,13 @@ \alias{subject_var_args} \title{subject var args} \description{Parse the complete argument list including subject.} -\usage{subject_var_args(...)} +\usage{subject_var_args(..., + type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject, regex/conversion.} + \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the +default conversion.} } \value{Result of \code{\link{var_args_list}} plus subject.} diff --git a/man/var_args_list.Rd b/man/var_args_list.Rd index feaf937..a98dabd 100644 --- a/man/var_args_list.Rd +++ b/man/var_args_list.Rd @@ -5,7 +5,8 @@ \code{\link{capture_first_df}}, and \code{\link{capture_all_str}}. This function is mostly intended for internal use, but is useful if you want to see the regex pattern generated by the variable argument syntax.} -\usage{var_args_list(...)} +\usage{var_args_list(..., type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{character vectors (for regex patterns) or functions (which specify how to convert extracted character vectors to other types). All @@ -16,6 +17,8 @@ pattern used for matching. Each named pattern may be followed by at most one function which is used to convert the previous named pattern. Patterns may also be lists, which are parsed recursively for convenience.} + \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the +default conversion.} } \value{a list with two named elements From 8804e83eb8a9d0eda398e8792d5b270df487cc86 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 16 Dec 2024 15:18:14 -0500 Subject: [PATCH 03/17] anonymous fun, as.is=TRUE --- R/var_args_list.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/var_args_list.R b/R/var_args_list.R index 163b3b7..4c01dca 100644 --- a/R/var_args_list.R +++ b/R/var_args_list.R @@ -56,7 +56,9 @@ var_args_list <- structure(function stop(domain=NA, gettextf("functions must not be named, problem: %s", pattern.name)) } group.i <- length(fun.list)+1L - fun.list[[group.i]] <- if(isTRUE(type.convert))type.convert else identity + fun.list[[group.i]] <- if(isTRUE(type.convert)){ + function(x)utils::type.convert(x,as.is=TRUE) + }else identity names(fun.list)[[group.i]] <- pattern.name has.name <- TRUE "(" From c8b96b26572f55fa38ec328c148578318a984afd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 16 Dec 2024 15:18:19 -0500 Subject: [PATCH 04/17] add tests --- tests/testthat/test-CRAN-vec.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index 0002fc2..8d58b28 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -101,6 +101,38 @@ test_engines("capture_first_vec returns data.table with int columns", { expect_identical(computed, expected) }) +test_engines("capture_first_vec(type.convert=TRUE) returns one int column", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom="chr1", + chromStart=2L, + chromEnd="3,000") + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=TRUE) returns two int columns", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom="chr1", + chromStart=2L, + chromEnd=3000L) + expect_identical(computed, expected) +}) + test_engines("named function is an error", { expect_error({ capture_first_vec( From 87f60391077f75255aef2fa74541f1bc1e823d7e Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 16 Dec 2024 15:21:18 -0500 Subject: [PATCH 05/17] test capture_all_str --- tests/testthat/test-CRAN-vec.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index 8d58b28..40fce60 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -133,6 +133,38 @@ test_engines("capture_first_vec(type.convert=TRUE) returns two int columns", { expect_identical(computed, expected) }) +test_engines("capture_all_str(type.convert=TRUE) returns one int column", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c("3,000","6,000")) + expect_identical(computed, expected) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c(3000L,6000L)) + expect_identical(computed, expected) +}) + test_engines("named function is an error", { expect_error({ capture_first_vec( From 1f364b19123dbf0d86a0b1c54fea71b17096178a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 08:54:35 -0500 Subject: [PATCH 06/17] update docs and code --- R/capture_first_vec.R | 8 ++++++-- R/var_args_list.R | 15 ++++++++++----- man/capture_first_vec.Rd | 8 ++++++-- man/subject_var_args.Rd | 3 +-- man/var_args_list.Rd | 8 ++++++-- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/R/capture_first_vec.R b/R/capture_first_vec.R index 80d5dc3..f19e3fe 100644 --- a/R/capture_first_vec.R +++ b/R/capture_first_vec.R @@ -29,8 +29,12 @@ capture_first_vec <- structure(function # Capture first match in each character engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2 type.convert=getOption("nc.type.convert", FALSE) -### If TRUE, use utils::type.convert instead of base::identity for the -### default conversion. +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ L <- subject_var_args(..., type.convert=type.convert) subject.vec <- L[["subject"]] diff --git a/R/var_args_list.R b/R/var_args_list.R index 4c01dca..989ac12 100644 --- a/R/var_args_list.R +++ b/R/var_args_list.R @@ -3,8 +3,7 @@ subject_var_args <- function (..., ### subject, regex/conversion. type.convert=getOption("nc.type.convert", FALSE) -### If TRUE, use utils::type.convert instead of base::identity for the -### default conversion. +### passed to var_args_list. ){ all.arg.list <- list(...) first.name <- names(all.arg.list[1]) @@ -36,8 +35,12 @@ var_args_list <- structure(function ### pattern. Patterns may also be lists, which are parsed recursively ### for convenience. type.convert=getOption("nc.type.convert", FALSE) -### If TRUE, use utils::type.convert instead of base::identity for the -### default conversion. +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ var.arg.list <- list(...) fun.list <- list() @@ -58,7 +61,9 @@ var_args_list <- structure(function group.i <- length(fun.list)+1L fun.list[[group.i]] <- if(isTRUE(type.convert)){ function(x)utils::type.convert(x,as.is=TRUE) - }else identity + }else if(isFALSE(type.convert)){ + identity + }else type.convert names(fun.list)[[group.i]] <- pattern.name has.name <- TRUE "(" diff --git a/man/capture_first_vec.Rd b/man/capture_first_vec.Rd index 94eaa75..49df928 100644 --- a/man/capture_first_vec.Rd +++ b/man/capture_first_vec.Rd @@ -34,8 +34,12 @@ recursively using these rules.} match; otherwise subjects that do not match are reported as missing/NA rows of the result.} \item{engine}{character string, one of PCRE, ICU, RE2} - \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the -default conversion.} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with one row for each subject, and one column for each diff --git a/man/subject_var_args.Rd b/man/subject_var_args.Rd index a78378d..10a8eb6 100644 --- a/man/subject_var_args.Rd +++ b/man/subject_var_args.Rd @@ -7,8 +7,7 @@ FALSE))} \arguments{ \item{\dots}{subject, regex/conversion.} - \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the -default conversion.} + \item{type.convert}{passed to \code{\link{var_args_list}}.} } \value{Result of \code{\link{var_args_list}} plus subject.} diff --git a/man/var_args_list.Rd b/man/var_args_list.Rd index a98dabd..ec833e4 100644 --- a/man/var_args_list.Rd +++ b/man/var_args_list.Rd @@ -17,8 +17,12 @@ pattern used for matching. Each named pattern may be followed by at most one function which is used to convert the previous named pattern. Patterns may also be lists, which are parsed recursively for convenience.} - \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the -default conversion.} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{a list with two named elements From 9716facb973352695405e65144400b7aa088f4e2 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 09:10:50 -0500 Subject: [PATCH 07/17] test type.convert=fun --- DESCRIPTION | 2 +- NEWS | 4 +-- R/capture_all_str.R | 8 ++++-- tests/testthat/test-CRAN-vec.R | 45 ++++++++++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d732d5e..6040304 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Authors@R: c( person("Toby", "Hocking", email="toby.hocking@r-project.org", role=c("aut", "cre"))) -Version: 2024.12.16 +Version: 2024.12.17 License: GPL-3 Title: Named Capture to Data Tables Description: User-friendly functions for extracting a data diff --git a/NEWS b/NEWS index 34d4086..64ac40e 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,6 @@ -Changes in version 2024.12.16 +Changes in version 2024.12.17 -- type.convert=TRUE means to use type.convert() as default conversion function. +- type.convert=TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. Changes in version 2024.9.20 diff --git a/R/capture_all_str.R b/R/capture_all_str.R index c227a12..ca4b658 100644 --- a/R/capture_all_str.R +++ b/R/capture_all_str.R @@ -16,8 +16,12 @@ capture_all_str <- structure(function # Capture all matches in a single subject ### separator string for combining elements of subject into a single ### string, used as collapse argument of base::paste. type.convert=getOption("nc.type.convert", FALSE) -### If TRUE, use utils::type.convert instead of base::identity for the -### default conversion. +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ stop_for_engine(engine) L <- subject_var_args(..., type.convert=type.convert) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index 40fce60..bcf0ac6 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -133,6 +133,51 @@ test_engines("capture_first_vec(type.convert=TRUE) returns two int columns", { expect_identical(computed, expected) }) +test_engines("capture_first_vec(type.convert=constant fun) returns all num columns", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=function(...)5) + expected <- data.table( + chrom=5, + chromStart=5, + chromEnd=5) + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=as.integer) returns all int columns", { + computed <- capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=as.integer) + expected <- data.table( + chrom=NA_integer_, + chromStart=2L, + chromEnd=NA_integer_) + expect_identical(computed, expected) +}) + +test_engines("capture_first_vec(type.convert=invalid) errors", { + expect_error({ + capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=function()5) + }, "type conversion functions should take one argument (character vector of captured text) and return an atomic vector of the same size; function for group 1(chrom) raised an error: unused argument (match.vec)", fixed=TRUE) +}) + test_engines("capture_all_str(type.convert=TRUE) returns one int column", { computed <- capture_all_str( "chr1:2-3,000 chr4:5-6,000", From b511fcb46c5edbf9a66bf70f85335bf42570dbd6 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 21:16:13 -0500 Subject: [PATCH 08/17] doc up --- man/capture_all_str.Rd | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/man/capture_all_str.Rd b/man/capture_all_str.Rd index 7cf8e6c..a18d511 100644 --- a/man/capture_all_str.Rd +++ b/man/capture_all_str.Rd @@ -20,8 +20,12 @@ must be string/function/list, as documented in \code{\link{capture_first_vec}}.} \item{engine}{character string, one of PCRE, ICU, RE2} \item{collapse}{separator string for combining elements of subject into a single string, used as \code{collapse} argument of \code{\link[base]{paste}}.} - \item{type.convert}{If TRUE, use \code{\link[utils]{type.convert}} instead of \code{\link[base]{identity}} for the -default conversion.} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with one row for each match, and one column for each From 96e30e6a3bd4245a3ae281dd1de3354c9d654191 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 21:16:37 -0500 Subject: [PATCH 09/17] LC_MESSAGES=C for testing in non-English locales --- tests/testthat/test-CRAN-vec.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index bcf0ac6..f7b5679 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -150,14 +150,14 @@ test_engines("capture_first_vec(type.convert=constant fun) returns all num colum }) test_engines("capture_first_vec(type.convert=as.integer) returns all int columns", { - computed <- capture_first_vec( + computed <- suppressWarnings(capture_first_vec( "chr1:2-3,000", chrom="chr.*?", ":", chromStart=".*?", "-", chromEnd="[0-9,]*", - type.convert=as.integer) + type.convert=as.integer)) expected <- data.table( chrom=NA_integer_, chromStart=2L, @@ -178,6 +178,19 @@ test_engines("capture_first_vec(type.convert=invalid) errors", { }, "type conversion functions should take one argument (character vector of captured text) and return an atomic vector of the same size; function for group 1(chrom) raised an error: unused argument (match.vec)", fixed=TRUE) }) +test_engines("capture_first_vec(type.convert='foo') errors", { + expect_error({ + capture_first_vec( + "chr1:2-3,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert='foo') + }, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE) +}) + test_engines("capture_all_str(type.convert=TRUE) returns one int column", { computed <- capture_all_str( "chr1:2-3,000 chr4:5-6,000", From be8b1d76a36bebbecfc7000a97fb6effba0aab29 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 21:16:56 -0500 Subject: [PATCH 10/17] LC_MESSAGES=C for testing in non-English locales --- tests/testthat.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat.R b/tests/testthat.R index 8698d91..5268305 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,2 +1,3 @@ data.table::setDTthreads(1) +Sys.setlocale("LC_MESSAGES",locale="C") if(require(testthat))test_check("nc") From ef9d396f62550b515b810f2504cc3ae0b98758e3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Tue, 17 Dec 2024 21:17:11 -0500 Subject: [PATCH 11/17] type convert error for non-function --- R/var_args_list.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/var_args_list.R b/R/var_args_list.R index 989ac12..aaeae5a 100644 --- a/R/var_args_list.R +++ b/R/var_args_list.R @@ -63,7 +63,9 @@ var_args_list <- structure(function function(x)utils::type.convert(x,as.is=TRUE) }else if(isFALSE(type.convert)){ identity - }else type.convert + }else if(is.function(type.convert)){ + type.convert + }else stop("type.convert should be either TRUE or FALSE or a function") names(fun.list)[[group.i]] <- pattern.name has.name <- TRUE "(" From f57aa54cac543029541fd003afb506e329da9fe3 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 18 Dec 2024 09:21:44 -0500 Subject: [PATCH 12/17] capture_first_df --- NEWS | 2 +- R/capture_first_df.R | 11 +++++- man/capture_first_df.Rd | 9 ++++- tests/testthat/test-CRAN-all.R | 47 ++++++++++++++++++++++ tests/testthat/test-CRAN-df.R | 72 ++++++++++++++++++++++++++++++++++ tests/testthat/test-CRAN-vec.R | 48 ----------------------- 6 files changed, 137 insertions(+), 52 deletions(-) diff --git a/NEWS b/NEWS index 64ac40e..e770418 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,6 @@ Changes in version 2024.12.17 -- type.convert=TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. +- capture_first_vec, capture_all_str, capture_first_df now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. Changes in version 2024.9.20 diff --git a/R/capture_first_df.R b/R/capture_first_df.R index 9f3fae4..4466fcb 100644 --- a/R/capture_first_df.R +++ b/R/capture_first_df.R @@ -24,10 +24,17 @@ capture_first_df <- structure(function # Capture first match in columns of a dat ### if TRUE (default to avoid data loss), stop with an error if any ### capture groups have the same name as an existing column of ### subject. - engine=getOption("nc.engine", "PCRE") + engine=getOption("nc.engine", "PCRE"), ### character string, one of PCRE, ICU, RE2. This engine will be used ### for each column, unless another engine is specified for that ### column in ... + type.convert=getOption("nc.type.convert", FALSE) +### Default conversion function, which will be used on each capture +### group, unless a specific conversion is specified for that +### group. If TRUE, use utils::type.convert; if FALSE, use +### base::identity; otherwise must be a function of at least one +### argument (character), returning an atomic vector of the same +### length. ){ all.arg.list <- list(...) subject <- all.arg.list[[1]] @@ -57,7 +64,7 @@ capture_first_df <- structure(function # Capture first match in columns of a dat for(col.name in names(col.pattern.list)){ subject.vec <- subject[[col.name]] col.arg.list <- c(list(subject.vec), col.pattern.list[[col.name]]) - maybe.rep <- c("engine", "nomatch.error") + maybe.rep <- c("engine", "nomatch.error", "type.convert") to.rep <- maybe.rep[!maybe.rep %in% names(col.arg.list)] col.arg.list[to.rep] <- lapply(to.rep, get, environment()) tryCatch({ diff --git a/man/capture_first_df.Rd b/man/capture_first_df.Rd index 066c76a..ab02409 100644 --- a/man/capture_first_df.Rd +++ b/man/capture_first_df.Rd @@ -8,7 +8,8 @@ column.} nomatch.error = getOption("nc.nomatch.error", TRUE), existing.error = getOption("nc.existing.error", TRUE), engine = getOption("nc.engine", - "PCRE"))} + "PCRE"), type.convert = getOption("nc.type.convert", + FALSE))} \arguments{ \item{\dots}{subject data frame, colName1=list(groupName1=pattern1, fun1, etc), colName2=list(etc), etc. First argument must be a data frame with @@ -31,6 +32,12 @@ subject.} \item{engine}{character string, one of PCRE, ICU, RE2. This \code{engine} will be used for each column, unless another \code{engine} is specified for that column in \code{...}} + \item{type.convert}{Default conversion function, which will be used on each capture +\code{\link{group}}, unless a specific conversion is specified for that +\code{\link{group}}. If TRUE, use \code{\link[utils]{type.convert}}; if FALSE, use +\code{\link[base]{identity}}; otherwise must be a function of at least one +argument (character), returning an atomic vector of the same +length.} } \value{data.table with same number of rows as subject, with an additional diff --git a/tests/testthat/test-CRAN-all.R b/tests/testthat/test-CRAN-all.R index 70d07c6..6853e02 100644 --- a/tests/testthat/test-CRAN-all.R +++ b/tests/testthat/test-CRAN-all.R @@ -121,4 +121,51 @@ test_engines("nested capture groups works", { expect_is(match.dt$sampleID, "integer") }) +test_engines("error for capture all regex with literal groups, match", { + expect_error({ + capture_all_str( + c("chr1:100-200", "chr2:5-6"), + chrom="chr.", + ":", + "([0-9]+)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) + +test_engines("error for capture all regex with literal groups, no match", { + expect_error({ + nc::capture_all_str("alias(es)", foo="alias(es)") + }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns one int column", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c("3,000","6,000")) + expect_identical(computed, expected) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c(3000L,6000L)) + expect_identical(computed, expected) +}) diff --git a/tests/testthat/test-CRAN-df.R b/tests/testthat/test-CRAN-df.R index 4c38153..37a26ae 100644 --- a/tests/testthat/test-CRAN-df.R +++ b/tests/testthat/test-CRAN-df.R @@ -209,6 +209,78 @@ test_engines("two name groups not OK with named subject", { }, "must not conflict with existing column names") }) +test_engines("type.convert OK inside capture_first_df list", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?", + type.convert=as.numeric), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits)) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "numeric", + task = "numeric", + task1 = "integer", + taskN = "integer", + type = "character", + name = "character", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + +test_engines("type.convert OK as capture_first_df arg", { + type.conv.result <- capture_first_df( + named.uniq.chr, + JobID=list( + job="[0-9]+", + "_", + "(?:",#begin alternate + task="[0-9]+", + "|",#either one task(above) or range(below) + range.pattern, + ")",#end alternate + "(?:[.]", + type=".*", identity, + ")?"), + position=list( + name="chr.*?", + ":", + chromStart=".*?", keep.digits, + "-", + chromEnd="[0-9,]*", keep.digits), + type.convert=as.factor) + computed.cls <- sapply(type.conv.result, class) + expected.cls <- c( + JobID = "character", + position = "character", + job = "factor", + task = "factor", + task1 = "integer", + taskN = "integer", + type = "character", + name = "factor", + chromStart = "integer", + chromEnd = "integer") + expect_identical(computed.cls, expected.cls) +}) + test_engines("error for no pattern", { expect_error({ capture_first_df(named.uniq.chr) diff --git a/tests/testthat/test-CRAN-vec.R b/tests/testthat/test-CRAN-vec.R index f7b5679..c29915e 100644 --- a/tests/testthat/test-CRAN-vec.R +++ b/tests/testthat/test-CRAN-vec.R @@ -46,22 +46,6 @@ test_engines("error for capture first regex with literal groups", { }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) }) -test_engines("error for capture all regex with literal groups, match", { - expect_error({ - capture_all_str( - c("chr1:100-200", "chr2:5-6"), - chrom="chr.", - ":", - "([0-9]+)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - -test_engines("error for capture all regex with literal groups, no match", { - expect_error({ - nc::capture_all_str("alias(es)", foo="alias(es)") - }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) -}) - subject <- c( ten="chr10:213,054,000-213,055,000", chrNA="chrNA:111,000-222,000", @@ -191,38 +175,6 @@ test_engines("capture_first_vec(type.convert='foo') errors", { }, "type.convert should be either TRUE or FALSE or a function", fixed=TRUE) }) -test_engines("capture_all_str(type.convert=TRUE) returns one int column", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c("3,000","6,000")) - expect_identical(computed, expected) -}) - -test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", keep.digits, - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c(3000L,6000L)) - expect_identical(computed, expected) -}) - test_engines("named function is an error", { expect_error({ capture_first_vec( From 130faa58ffc1e9b8f6c83de014004c06676261bd Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 18 Dec 2024 09:32:09 -0500 Subject: [PATCH 13/17] capture first glob test and doc --- R/capture_first_glob.R | 4 +-- man/capture_first_glob.Rd | 4 +-- tests/testthat/test-CRAN-glob.R | 49 +++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/R/capture_first_glob.R b/R/capture_first_glob.R index f6d2e6f..13f4aae 100644 --- a/R/capture_first_glob.R +++ b/R/capture_first_glob.R @@ -4,8 +4,8 @@ capture_first_glob <- structure(function (glob, ### string: glob specifying files to read. ..., -### pattern passed to capture_first_vec, used to get meta-data from -### file names. +### passed to capture_first_vec, should include pattern used on vector +### of file names to get meta-data. READ=fread ### function of one argument (file name) which returns a data table, ### default data.table::fread. diff --git a/man/capture_first_glob.Rd b/man/capture_first_glob.Rd index 07124be..e09a79a 100644 --- a/man/capture_first_glob.Rd +++ b/man/capture_first_glob.Rd @@ -7,8 +7,8 @@ file name, and combine with contents of each file.} ..., READ = fread)} \arguments{ \item{glob}{string: \code{glob} specifying files to read.} - \item{\dots}{pattern passed to \code{\link{capture_first_vec}}, used to get meta-data from -file names.} + \item{\dots}{passed to \code{\link{capture_first_vec}}, should include pattern used on vector +of file names to get meta-data.} \item{READ}{function of one argument (file name) which returns a data table, default \code{\link[data.table]{fread}}.} } diff --git a/tests/testthat/test-CRAN-glob.R b/tests/testthat/test-CRAN-glob.R index 4b19fee..6c310f6 100644 --- a/tests/testthat/test-CRAN-glob.R +++ b/tests/testthat/test-CRAN-glob.R @@ -7,11 +7,44 @@ db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE) glob <- paste0(db, "/*/*/counts/*") read.bedGraph <- function(f)data.table::fread( f, skip=1, col.names = c("chrom","start", "end", "count")) -data.chunk.pattern <- list( - data="H.*?", - "/", - chunk="[0-9]+", as.integer) -if(requireNamespace("R.utils"))test_engines("capture_first_glob returns expected columns", { - count.dt <- nc::capture_first_glob(glob, data.chunk.pattern, READ=read.bedGraph) - expect_identical(names(count.dt), c("data", "chunk", "chrom", "start", "end", "count")) -}) + +if(requireNamespace("R.utils")){ + + test_engines("capture_first_glob returns expected columns", { + count.dt <- nc::capture_first_glob( + glob, + data="H.*?", + "/", + chunk="[0-9]+", as.integer, + READ=read.bedGraph) + computed.cls <- sapply(count.dt, class) + expected.cls <- c( + data = "character", + chunk = "integer", + chrom = "character", + start = "integer", + end = "integer", + count = "integer") + expect_identical(computed.cls, expected.cls) + }) + + test_engines("capture_first_glob(type.convert) works", { + count.dt <- nc::capture_first_glob( + glob, + data="H.*?", + "/", + chunk="[0-9]+", + READ=read.bedGraph, + type.convert=TRUE) + computed.cls <- sapply(count.dt, class) + expected.cls <- c( + data = "character", + chunk = "integer", + chrom = "character", + start = "integer", + end = "integer", + count = "integer") + expect_identical(computed.cls, expected.cls) + }) + +} From a5a089da6f46757d2d62c91585f51033220bbc64 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 18 Dec 2024 09:34:04 -0500 Subject: [PATCH 14/17] keep.digits outside test that block --- tests/testthat/test-CRAN-all.R | 67 +++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/tests/testthat/test-CRAN-all.R b/tests/testthat/test-CRAN-all.R index 6853e02..1a20f07 100644 --- a/tests/testthat/test-CRAN-all.R +++ b/tests/testthat/test-CRAN-all.R @@ -22,6 +22,7 @@ test_engines("capture_all_str returns data.frame with 0 rows, 1 int col", { expect_identical(computed$baz, integer()) }) +keep.digits <- function(x)as.integer(gsub("[^0-9]", "", x)) test_engines("capture_all_str returns data.table", { chr.pos.vec <- c( "chr10:213,054,000-213,055,000", @@ -29,7 +30,6 @@ test_engines("capture_all_str returns data.table", { "this will not match", NA, # neither will this. "chr1:110-111 chr2:220-222") # two possible matches. - keep.digits <- function(x)as.integer(gsub("[^0-9]", "", x)) computed <- capture_all_str( chr.pos.vec, chrom="chr.*?", @@ -49,6 +49,38 @@ test_engines("capture_all_str returns data.table", { expect_identical(computed, expected) }) +test_engines("capture_all_str(type.convert=TRUE) returns one int column", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c("3,000","6,000")) + expect_identical(computed, expected) +}) + +test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { + computed <- capture_all_str( + "chr1:2-3,000 chr4:5-6,000", + chrom="chr.*?", + ":", + chromStart=".*?", + "-", + chromEnd="[0-9,]*", keep.digits, + type.convert=TRUE) + expected <- data.table( + chrom=c("chr1","chr4"), + chromStart=c(2L,5L), + chromEnd=c(3000L,6000L)) + expect_identical(computed, expected) +}) + test_engines("capture_all_str errors for one argument", { expect_error({ capture_all_str("foo") @@ -136,36 +168,3 @@ test_engines("error for capture all regex with literal groups, no match", { nc::capture_all_str("alias(es)", foo="alias(es)") }, "regex contains more groups than names; please remove literal groups (parentheses) from the regex pattern, and use named arguments in R code instead", fixed=TRUE) }) - -test_engines("capture_all_str(type.convert=TRUE) returns one int column", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c("3,000","6,000")) - expect_identical(computed, expected) -}) - -test_engines("capture_all_str(type.convert=TRUE) returns two int columns", { - computed <- capture_all_str( - "chr1:2-3,000 chr4:5-6,000", - chrom="chr.*?", - ":", - chromStart=".*?", - "-", - chromEnd="[0-9,]*", keep.digits, - type.convert=TRUE) - expected <- data.table( - chrom=c("chr1","chr4"), - chromStart=c(2L,5L), - chromEnd=c(3000L,6000L)) - expect_identical(computed, expected) -}) - From 4a29589dde55a55667822251aff6df112a1bc604 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 18 Dec 2024 09:36:53 -0500 Subject: [PATCH 15/17] measure type.convert=TRUE tests --- tests/testthat/test-CRAN-measure.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/testthat/test-CRAN-measure.R b/tests/testthat/test-CRAN-measure.R index af11dac..c6d4ac7 100644 --- a/tests/testthat/test-CRAN-measure.R +++ b/tests/testthat/test-CRAN-measure.R @@ -15,6 +15,16 @@ test_engines("measure single value", { expect_identical(out, expected) }) +test_engines("measure single value type.convert=TRUE", { + out <- melt(DT, measure.vars=nc::measure( + letter="[ab]", "_", number="[12]", type.convert=TRUE)) + expected <- data.table( + letter=c("a","b","a"), + number=as.integer(c(1,2,2)), + value=c(10,21,20)) + expect_identical(out, expected) +}) + test_engines("measure multiple values", { out <- melt(DT, measure.vars=nc::measure( column="[ab]", "_", number="[12]", as.integer)) @@ -25,3 +35,12 @@ test_engines("measure multiple values", { expect_identical(out, expected) }) +test_engines("measure multiple values type.convert=TRUE", { + out <- melt(DT, measure.vars=nc::measure( + column="[ab]", "_", number="[12]", type.convert=TRUE)) + expected <- data.table( + number=as.integer(c(1,2)), + a=c(10,20), + b=c(NA,21)) + expect_identical(out, expected) +}) From 781137cc821b26c2f4f258ffc27922f64751f85c Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Wed, 18 Dec 2024 09:40:14 -0500 Subject: [PATCH 16/17] capture_melt_single, capture_melt_multiple tests --- NEWS | 2 +- tests/testthat/test-CRAN-melt.R | 6 ++++++ tests/testthat/test-CRAN-multiple.R | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index e770418..5fa6703 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,6 @@ Changes in version 2024.12.17 -- capture_first_vec, capture_all_str, capture_first_df now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. +- capture_first_vec, capture_all_str, capture_first_df, capture_first_glob, measure, capture_melt_single, capture_melt_multiple now support type.convert argument. TRUE means to use utils::type.convert(x,as.is=TRUE) as default conversion function (as.is=TRUE means to return character instead of factor), FALSE means identity, and otherwise can be any function to use as default conversion. Changes in version 2024.9.20 diff --git a/tests/testthat/test-CRAN-melt.R b/tests/testthat/test-CRAN-melt.R index 7cf84e0..85d7a99 100644 --- a/tests/testthat/test-CRAN-melt.R +++ b/tests/testthat/test-CRAN-melt.R @@ -119,6 +119,12 @@ test_engines("melting lots of columns is OK", { expect_identical(out$value, i.vec) }) +test_engines("melting lots of columns type.convert=TRUE is OK", { + out <- capture_melt_single(one.row, "X", col="[0-9]+", type.convert=TRUE) + expect_identical(out$col, i.vec) + expect_identical(out$value, i.vec) +}) + DT.wide <- data.table(id=0, num_ref=1, name_ref="foo", num=2, name="bar") test_engines("converting NA to non-NA is an error", { expect_error({ diff --git a/tests/testthat/test-CRAN-multiple.R b/tests/testthat/test-CRAN-multiple.R index a25f581..31e02fe 100644 --- a/tests/testthat/test-CRAN-multiple.R +++ b/tests/testthat/test-CRAN-multiple.R @@ -276,6 +276,14 @@ test_engines("multiple melting lots of columns is OK", { expect_identical(out$C, c.vec) }) +test_engines("multiple melting lots of columns type.convert=TRUE is OK", { + out <- capture_melt_multiple( + one.row, column=".", "[.]", int="[0-9]+", type.convert=TRUE) + expect_identical(out$int, i.vec) + expect_identical(out$I, i.vec) + expect_identical(out$C, c.vec) +}) + wide.metrics <- data.table( FP.possible=8202, FN.possible=1835, FP.count=0, FN.count=1835) From 01b223590e70e8b55c9a1dc851b683da5186f03a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 20 Dec 2024 10:27:32 -0500 Subject: [PATCH 17/17] advent of code example --- R/capture_first_vec.R | 31 +++++++++++++++++++++++++++++++ man/capture_first_vec.Rd | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/R/capture_first_vec.R b/R/capture_first_vec.R index f19e3fe..1c7bd0c 100644 --- a/R/capture_first_vec.R +++ b/R/capture_first_vec.R @@ -118,5 +118,36 @@ capture_first_vec <- structure(function # Capture first match in each character chr.pos.vec) nc::capture_first_vec(na.vec, range.pattern, nomatch.error=FALSE) + ## another subject from https://adventofcode.com/2024/day/14 + pvxy.subject <- c("p=0,4 v=3,-3","p=6,3 v=-1,-3") + nc::capture_first_vec( + pvxy.subject, + "p=", + px="[0-9]", + ",", + py="[0-9]", + " v=", + vx="[-0-9]+", + ",", + vy="[-0-9]+", + type.convert=TRUE) + + ## to do the same as above but with less repetition: + g <- function(prefix,suffix)nc::group( + name=paste0(prefix,suffix), + "[-0-9]+") + xy <- function(prefix)list( + prefix, + "=", + g(prefix,"x"), + ",", + g(prefix,"y")) + nc::capture_first_vec( + pvxy.subject, + xy("p"), + " ", + xy("v"), + type.convert=TRUE) + }) diff --git a/man/capture_first_vec.Rd b/man/capture_first_vec.Rd index 49df928..1923d1b 100644 --- a/man/capture_first_vec.Rd +++ b/man/capture_first_vec.Rd @@ -95,4 +95,35 @@ na.vec <- c( chr.pos.vec) nc::capture_first_vec(na.vec, range.pattern, nomatch.error=FALSE) +## another subject from https://adventofcode.com/2024/day/14 +pvxy.subject <- c("p=0,4 v=3,-3","p=6,3 v=-1,-3") +nc::capture_first_vec( + pvxy.subject, + "p=", + px="[0-9]", + ",", + py="[0-9]", + " v=", + vx="[-0-9]+", + ",", + vy="[-0-9]+", + type.convert=TRUE) + +## to do the same as above but with less repetition: +g <- function(prefix,suffix)nc::group( + name=paste0(prefix,suffix), + "[-0-9]+") +xy <- function(prefix)list( + prefix, + "=", + g(prefix,"x"), + ",", + g(prefix,"y")) +nc::capture_first_vec( + pvxy.subject, + xy("p"), + " ", + xy("v"), + type.convert=TRUE) + }