Skip to content

Add grepv() awareness to the package #2856

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
* `boolean_arithmetic_linter()` finds many more cases like `sum(x | y) == 0` where the total of a known-logical vector is compared to 0 (#1580, @MichaelChirico).
* `expect_lint()` has a new argument `ignore_order` (default `FALSE`), which, if `TRUE`, allows the `checks=` to be provided in arbitary order vs. how `lint()` produces them (@MichaelChirico).
* `undesirable_function_linter()` accepts unnamed entries, treating them as undesirable functions, e.g. `undesirable_function_linter("sum")` (#2536, @MichaelChirico).
* `fixed_regex_linter()` recognizes usage of the new (R 4.5.0) `grepv()` wrapper of `grep()`; `regex_subset_linter()` also recommends `grepv()` alternatives (#2855, @MichaelChirico).

### New linters

Expand Down
2 changes: 1 addition & 1 deletion R/fixed_regex_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
fixed_regex_linter <- function(allow_unescaped = FALSE) {
# regular expression pattern is the first argument
pos_1_regex_funs <- c(
"grep", "gsub", "sub", "regexec", "grepl", "regexpr", "gregexpr"
"grep", "grepv", "gsub", "sub", "regexec", "grepl", "regexpr", "gregexpr"
)

# regular expression pattern is the second argument
Expand Down
22 changes: 11 additions & 11 deletions R/regex_subset_linter.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Require usage of direct methods for subsetting strings via regex
#'
#' Using `value = TRUE` in [grep()] returns the subset of the input that matches
#' the pattern, e.g. `grep("[a-m]", letters, value = TRUE)` will return the
#' first 13 elements (`a` through `m`).
# TODO(R>=4.5.0): Just use [grepv()] directly. Need this while ?grepv doesn't exist.
#' Using [`grepv()`][grep] returns the subset of the input that matches the pattern,
#' e.g. `grepv("[a-m]", letters)` will return the first 13 elements (`a` through `m`).
#'
#' `letters[grep("[a-m]", letters)]` and `letters[grepl("[a-m]", letters)]`
#' both return the same thing, but more circuitously and more verbosely.
Expand All @@ -12,12 +12,12 @@
#' `str_detect()` and `str_which()`.
#'
#' @section Exceptions:
#' Note that `x[grep(pattern, x)]` and `grep(pattern, x, value = TRUE)`
#' are not _completely_ interchangeable when `x` is not character
#' (most commonly, when `x` is a factor), because the output of the
#' latter will be a character vector while the former remains a factor.
#' It still may be preferable to refactor such code, as it may be faster
#' to match the pattern on `levels(x)` and use that to subset instead.
#' Note that `x[grep(pattern, x)]` and `grepv(pattern, x)` are not
#' _completely_ interchangeable when `x` is not character (most commonly,
#' when `x` is a factor), because the output of the latter will be a
#' character vector while the former remains a factor. It still may be
#' preferable to refactor such code, as it may be faster to match the
#' pattern on `levels(x)` and use that to subset instead.
#'
#' @evalRd rd_tags("regex_subset_linter")
#'
Expand All @@ -35,7 +35,7 @@
#'
#' # okay
#' lint(
#' text = "grep(pattern, x, value = TRUE)",
#' text = "grepv(pattern, x)",
#' linters = regex_subset_linter()
#' )
#'
Expand Down Expand Up @@ -72,7 +72,7 @@ regex_subset_linter <- function() {
grep_expr,
source_expression = source_expression,
lint_message =
"Prefer grep(pattern, x, ..., value = TRUE) over x[grep(pattern, x, ...)] and x[grepl(pattern, x, ...)].",
"Prefer grepv(pattern, x, ...) over x[grep(pattern, x, ...)] and x[grepl(pattern, x, ...)].",
type = "warning"
)

Expand Down
19 changes: 9 additions & 10 deletions man/regex_subset_linter.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

122 changes: 62 additions & 60 deletions tests/testthat/test-fixed_regex_linter.R
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
test_that("fixed_regex_linter skips allowed usages", {
linter <- fixed_regex_linter()

expect_lint("gsub('^x', '', y)", NULL, linter)
expect_lint("grep('x$', '', y)", NULL, linter)
expect_lint("sub('[a-zA-Z]', '', y)", NULL, linter)
expect_lint("grepl(fmt, y)", NULL, linter)
expect_lint(R"{regexec('\\s', '', y)}", NULL, linter)
expect_lint("grep('a(?=b)', x, perl = TRUE)", NULL, linter)
expect_lint("grep('0+1', x, perl = TRUE)", NULL, linter)
expect_lint("grep('1*2', x)", NULL, linter)
expect_lint("grep('a|b', x)", NULL, linter)
expect_lint(R"{grep('\\[|\\]', x)}", NULL, linter)
expect_no_lint("gsub('^x', '', y)", linter)
expect_no_lint("grep('x$', y)", linter)
expect_no_lint("grepv('x$', y)", linter)
expect_no_lint("sub('[a-zA-Z]', '', y)", linter)
expect_no_lint("grepl(fmt, y)", linter)
expect_no_lint(R"{regexec('\\s', '', y)}", linter)
expect_no_lint("grep('a(?=b)', x, perl = TRUE)", linter)
expect_no_lint("grep('0+1', x, perl = TRUE)", linter)
expect_no_lint("grep('1*2', x)", linter)
expect_no_lint("grep('a|b', x)", linter)
expect_no_lint(R"{grep('\\[|\\]', x)}", linter)

# if fixed=TRUE is already set, regex patterns don't matter
expect_lint(R"{gsub('\\.', '', y, fixed = TRUE)}", NULL, linter)
expect_no_lint(R"{gsub('\\.', '', y, fixed = TRUE)}", linter)

# ignore.case=TRUE implies regex interpretation
expect_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", NULL, linter)
expect_no_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", linter)

# char classes starting with [] might contain other characters -> not fixed
expect_lint("sub('[][]', '', y)", NULL, linter)
expect_lint("sub('[][ ]', '', y)", NULL, linter)
expect_lint("sub('[],[]', '', y)", NULL, linter)
expect_no_lint("sub('[][]', '', y)", linter)
expect_no_lint("sub('[][ ]', '', y)", linter)
expect_no_lint("sub('[],[]', '', y)", linter)

# wrapper functions don't throw
expect_lint("gregexpr(pattern = pattern, data, perl = TRUE, ...)", NULL, linter)
expect_no_lint("gregexpr(pattern = pattern, data, perl = TRUE, ...)", linter)
})

test_that("fixed_regex_linter blocks simple disallowed usages", {
Expand All @@ -36,6 +37,7 @@ test_that("fixed_regex_linter blocks simple disallowed usages", {
expect_lint("gregexpr('a-z', y)", lint_msg, linter)
expect_lint(R"{regexec('\\$', x)}", lint_msg, linter)
expect_lint("grep('\n', x)", lint_msg, linter)
expect_lint("grepv('\n', x)", lint_msg, linter)

# naming the argument doesn't matter (if it's still used positionally)
expect_lint("gregexpr(pattern = 'a-z', y)", lint_msg, linter)
Expand Down Expand Up @@ -77,19 +79,19 @@ test_that("fixed_regex_linter catches regex like [.] or [$]", {
test_that("fixed_regex_linter catches null calls to strsplit as well", {
linter <- fixed_regex_linter()

expect_lint("strsplit(x, '^x')", NULL, linter)
expect_lint(R"{strsplit(x, '\\s')}", NULL, linter)
expect_lint("strsplit(x, 'a(?=b)', perl = TRUE)", NULL, linter)
expect_lint("strsplit(x, '0+1', perl = TRUE)", NULL, linter)
expect_lint("strsplit(x, 'a|b')", NULL, linter)
expect_no_lint("strsplit(x, '^x')", linter)
expect_no_lint(R"{strsplit(x, '\\s')}", linter)
expect_no_lint("strsplit(x, 'a(?=b)', perl = TRUE)", linter)
expect_no_lint("strsplit(x, '0+1', perl = TRUE)", linter)
expect_no_lint("strsplit(x, 'a|b')", linter)

expect_lint("tstrsplit(x, '1*2')", NULL, linter)
expect_lint("tstrsplit(x, '[a-zA-Z]')", NULL, linter)
expect_lint("tstrsplit(x, fmt)", NULL, linter)
expect_no_lint("tstrsplit(x, '1*2')", linter)
expect_no_lint("tstrsplit(x, '[a-zA-Z]')", linter)
expect_no_lint("tstrsplit(x, fmt)", linter)

# if fixed=TRUE is already set, regex patterns don't matter
expect_lint(R"{strsplit(x, '\\.', fixed = TRUE)}", NULL, linter)
expect_lint(R"{strsplit(x, '\\.', fixed = T)}", NULL, linter)
expect_no_lint(R"{strsplit(x, '\\.', fixed = TRUE)}", linter)
expect_no_lint(R"{strsplit(x, '\\.', fixed = T)}", linter)
})

test_that("fixed_regex_linter catches calls to strsplit as well", {
Expand All @@ -106,26 +108,26 @@ test_that("fixed_regex_linter is more exact about distinguishing \\s from \\:",
linter <- fixed_regex_linter()
lint_msg <- rex::rex("This regular expression is static")

expect_lint(R"{grep('\\s', '', x)}", NULL, linter)
expect_no_lint(R"{grep('\\s', '', x)}", linter)
expect_lint(R"{grep('\\:', '', x)}", lint_msg, linter)
})

## tests for stringr functions
test_that("fixed_regex_linter skips allowed stringr usages", {
linter <- fixed_regex_linter()

expect_lint("str_replace(y, '[a-zA-Z]', '')", NULL, linter)
expect_lint("str_replace_all(y, '^x', '')", NULL, linter)
expect_lint("str_detect(y, fmt)", NULL, linter)
expect_lint(R"{str_extract(y, '\\s')}", NULL, linter)
expect_lint(R"{str_extract_all(y, '\\s')}", NULL, linter)
expect_lint("str_which(x, '1*2')", NULL, linter)
expect_no_lint("str_replace(y, '[a-zA-Z]', '')", linter)
expect_no_lint("str_replace_all(y, '^x', '')", linter)
expect_no_lint("str_detect(y, fmt)", linter)
expect_no_lint(R"{str_extract(y, '\\s')}", linter)
expect_no_lint(R"{str_extract_all(y, '\\s')}", linter)
expect_no_lint("str_which(x, '1*2')", linter)

# if fixed() is already set, regex patterns don't matter
expect_lint(R"{str_replace(y, fixed('\\.'), '')}", NULL, linter)
expect_no_lint(R"{str_replace(y, fixed('\\.'), '')}", linter)

# namespace qualification doesn't matter
expect_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", NULL, linter)
expect_no_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", linter)
})

test_that("fixed_regex_linter blocks simple disallowed usages of stringr functions", {
Expand All @@ -148,11 +150,11 @@ test_that("fixed_regex_linter catches calls to str_split as well", {
linter <- fixed_regex_linter()
lint_msg <- rex::rex("This regular expression is static")

expect_lint("str_split(x, '^x')", NULL, linter)
expect_lint("str_split(x, fmt)", NULL, linter)
expect_no_lint("str_split(x, '^x')", linter)
expect_no_lint("str_split(x, fmt)", linter)

# if fixed() is already set, regex patterns don't matter
expect_lint(R"{str_split(x, fixed('\\.'))}", NULL, linter)
expect_no_lint(R"{str_split(x, fixed('\\.'))}", linter)
expect_lint(R"{str_split(x, '\\.')}", lint_msg, linter)
expect_lint("str_split(x, '[.]')", lint_msg, linter)
})
Expand All @@ -163,8 +165,8 @@ test_that("str_replace_all's multi-replacement version is handled", {
# While each of the replacements is fixed, and this _could_ in principle be replaced by
# a pipeline where each step does one of the replacements and fixed() is used, this is overkill.
# Instead, ensure that no lint is returned for this case
expect_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', NULL, linter)
expect_lint('grepl(c("a" = "b"), x)', NULL, linter)
expect_no_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', linter)
expect_no_lint('grepl(c("a" = "b"), x)', linter)
})

test_that("1- or 2-width octal escape sequences are handled", {
Expand Down Expand Up @@ -209,20 +211,20 @@ test_that("bracketed unicode escapes are caught", {

test_that("escaped characters are handled correctly", {
linter <- fixed_regex_linter()
expect_lint(R"{gsub('\n+', '', sql)}", NULL, linter)
expect_lint('gsub("\\n{2,}", "\n", D)', NULL, linter)
expect_lint(R'{gsub("[\r\n]", "", x)}', NULL, linter)
expect_lint(R'{gsub("\n $", "", y)}', NULL, linter)
expect_lint(R'{gsub("```\n*```r*\n*", "", x)}', NULL, linter)
expect_lint('strsplit(x, "(;|\n)")', NULL, linter)
expect_lint(R'{strsplit(x, "(;|\n)")}', NULL, linter)
expect_lint(R'{grepl("[\\W]", x, perl = TRUE)}', NULL, linter)
expect_lint(R'{grepl("[\\W]", x)}', NULL, linter)
expect_no_lint(R"{gsub('\n+', '', sql)}", linter)
expect_no_lint('gsub("\\n{2,}", "\n", D)', linter)
expect_no_lint(R'{gsub("[\r\n]", "", x)}', linter)
expect_no_lint(R'{gsub("\n $", "", y)}', linter)
expect_no_lint(R'{gsub("```\n*```r*\n*", "", x)}', linter)
expect_no_lint('strsplit(x, "(;|\n)")', linter)
expect_no_lint(R'{strsplit(x, "(;|\n)")}', linter)
expect_no_lint(R'{grepl("[\\W]", x, perl = TRUE)}', linter)
expect_no_lint(R'{grepl("[\\W]", x)}', linter)
})

# make sure the logic is properly vectorized
test_that("single expression with multiple regexes is OK", {
expect_lint('c(grep("^a", x), grep("b$", x))', NULL, fixed_regex_linter())
expect_no_lint('c(grep("^a", x), grep("b$", x))', fixed_regex_linter())
})

test_that("fixed replacements vectorize and recognize str_detect", {
Expand Down Expand Up @@ -344,8 +346,8 @@ local({
test_that("'unescaped' regex can optionally be skipped", {
linter <- fixed_regex_linter(allow_unescaped = TRUE)

expect_lint("grepl('a', x)", NULL, linter)
expect_lint("str_detect(x, 'a')", NULL, linter)
expect_no_lint("grepl('a', x)", linter)
expect_no_lint("str_detect(x, 'a')", linter)
expect_lint("grepl('[$]', x)", rex::rex('Use "$" with fixed = TRUE'), linter)
})

Expand All @@ -358,18 +360,18 @@ local({
lint_msg <- "This regular expression is static"

expect_lint(paste("x", pipe, "grepl(pattern = 'a')"), lint_msg, linter)
expect_lint(paste("x", pipe, "grepl(pattern = '^a')"), NULL, linter)
expect_lint(paste("x", pipe, "grepl(pattern = 'a', fixed = TRUE)"), NULL, linter)
expect_no_lint(paste("x", pipe, "grepl(pattern = '^a')"), linter)
expect_no_lint(paste("x", pipe, "grepl(pattern = 'a', fixed = TRUE)"), linter)
expect_lint(paste("x", pipe, "str_detect('a')"), lint_msg, linter)
expect_lint(paste("x", pipe, "str_detect('^a')"), NULL, linter)
expect_lint(paste("x", pipe, "str_detect(fixed('a'))"), NULL, linter)
expect_no_lint(paste("x", pipe, "str_detect('^a')"), linter)
expect_no_lint(paste("x", pipe, "str_detect(fixed('a'))"), linter)

expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '')"), lint_msg, linter)
expect_lint(paste("x", pipe, "gsub(pattern = '^a', replacement = '')"), NULL, linter)
expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '', fixed = TRUE)"), NULL, linter)
expect_no_lint(paste("x", pipe, "gsub(pattern = '^a', replacement = '')"), linter)
expect_no_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '', fixed = TRUE)"), linter)
expect_lint(paste("x", pipe, "str_replace('a', '')"), lint_msg, linter)
expect_lint(paste("x", pipe, "str_replace('^a', '')"), NULL, linter)
expect_lint(paste("x", pipe, "str_replace(fixed('a'), '')"), NULL, linter)
expect_no_lint(paste("x", pipe, "str_replace('^a', '')"), linter)
expect_no_lint(paste("x", pipe, "str_replace(fixed('a'), '')"), linter)
},
pipe = pipes,
.test_name = names(pipes)
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-regex_subset_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ test_that("regex_subset_linter skips allowed usages", {

test_that("regex_subset_linter blocks simple disallowed usages", {
linter <- regex_subset_linter()
lint_msg <- rex::rex("Prefer grep(pattern, x, ..., value = TRUE)")
lint_msg <- rex::rex("Prefer grepv(pattern, x, ...)")

expect_lint("x[grep(ptn, x)]", lint_msg, linter)
expect_lint("names(y)[grepl(ptn, names(y), perl = TRUE)]", lint_msg, linter)
Expand Down
Loading