From f67a5254561ad8c7c59b8faacc09627a06ec550d Mon Sep 17 00:00:00 2001 From: Nathan Watson-Haigh Date: Fri, 10 Nov 2023 04:37:02 -0800 Subject: [PATCH 1/5] Better support for stringr in Snowflake * Added support for str_starts() and str_ends() by using Snowflake's REGEXP_INSTR() function * Refactored str_detect() to use Snowflake's CONTAINS() function instead of hacking with REGEXP() which anchors the start and end by default --- R/backend-snowflake.R | 41 +++++++++++++++---------- tests/testthat/test-backend-snowflake.R | 8 +++-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/R/backend-snowflake.R b/R/backend-snowflake.R index 5774be3cf..cb6fd61e7 100644 --- a/R/backend-snowflake.R +++ b/R/backend-snowflake.R @@ -38,23 +38,32 @@ sql_translation.Snowflake <- function(con) { str_locate = function(string, pattern) { sql_expr(POSITION(!!pattern, !!string)) }, - # REGEXP on Snowflaake "implicitly anchors a pattern at both ends", which - # str_detect does not. Left- and right-pad `pattern` with .* to get - # str_detect-like behavior str_detect = function(string, pattern, negate = FALSE) { - sql_str_pattern_switch( - string = string, - pattern = {{ pattern }}, - negate = negate, - f_fixed = sql_str_detect_fixed_instr("detect"), - f_regex = function(string, pattern, negate = FALSE) { - if (isTRUE(negate)) { - sql_expr(!(((!!string)) %REGEXP% (".*" || (!!pattern) || ".*"))) - } else { - sql_expr(((!!string)) %REGEXP% (".*" || (!!pattern) || ".*")) - } - } - ) + con <- sql_current_con() + + if (negate) { + translate_sql(!CONTAINS(!!string, !!pattern), con = con) + } else { + translate_sql(CONTAINS(!!string, !!pattern), con = con) + } + }, + str_starts = function(string, pattern, negate = FALSE) { + con <- sql_current_con() + + if (negate) { + translate_sql(REGEXP_INSTR(!!string, !!pattern) != 1L, con = con) + } else { + translate_sql(REGEXP_INSTR(!!string, !!pattern) == 1L, con = con) + } + }, + str_ends = function(string, pattern, negate = FALSE) { + con <- sql_current_con() + + if (negate) { + translate_sql(REGEXP_INSTR(!!string, !!pattern, 1L, 1L, 1L) != LENGTH(!!string) + 1L, con = con) + } else { + translate_sql(REGEXP_INSTR(!!string, !!pattern, 1L, 1L, 1L) == LENGTH(!!string) + 1L, con = con) + } }, # On Snowflake, REGEXP_REPLACE is used like this: # REGEXP_REPLACE( , [ , , diff --git a/tests/testthat/test-backend-snowflake.R b/tests/testthat/test-backend-snowflake.R index b1f37b14a..78f6f39dd 100644 --- a/tests/testthat/test-backend-snowflake.R +++ b/tests/testthat/test-backend-snowflake.R @@ -25,8 +25,8 @@ test_that("custom stringr functions translated correctly", { local_con(simulate_snowflake()) expect_equal(test_translate_sql(str_locate(x, y)), sql("POSITION(`y`, `x`)")) - expect_equal(test_translate_sql(str_detect(x, y)), sql("(`x`) REGEXP ('.*' || `y` || '.*')")) - expect_equal(test_translate_sql(str_detect(x, y, negate = TRUE)), sql("!((`x`) REGEXP ('.*' || `y` || '.*'))")) + expect_equal(test_translate_sql(str_detect(x, y)), sql("CONTAINS(`x`, `y`)")) + expect_equal(test_translate_sql(str_detect(x, y, negate = TRUE)), sql("NOT(CONTAINS(`x`, `y`))")) expect_equal(test_translate_sql(str_replace(x, y, z)), sql("REGEXP_REPLACE(`x`, `y`, `z`, 1.0, 1.0)")) expect_equal(test_translate_sql(str_replace(x, "\\d", z)), sql("REGEXP_REPLACE(`x`, '\\\\d', `z`, 1.0, 1.0)")) expect_equal(test_translate_sql(str_replace_all(x, y, z)), sql("REGEXP_REPLACE(`x`, `y`, `z`)")) @@ -34,6 +34,10 @@ test_that("custom stringr functions translated correctly", { expect_equal(test_translate_sql(str_remove(x, y)), sql("REGEXP_REPLACE(`x`, `y`, '', 1.0, 1.0)")) expect_equal(test_translate_sql(str_remove_all(x, y)), sql("REGEXP_REPLACE(`x`, `y`)")) expect_equal(test_translate_sql(str_trim(x)), sql("TRIM(`x`)")) + expect_equal(test_translate_sql(str_starts(x, y)), sql("REGEXP_INSTR(`x`, `y`) = 1")) + expect_equal(test_translate_sql(str_starts(x, y, negate = TRUE)), sql("REGEXP_INSTR(`x`, `y`) != 1")) + expect_equal(test_translate_sql(str_ends(x, y)), sql("REGEXP_INSTR(`x`, `y`, 1, 1, 1) = (LENGTH(`x`) + 1)")) + expect_equal(test_translate_sql(str_ends(x, y, negate = TRUE)), sql("REGEXP_INSTR(`x`, `y`, 1, 1, 1) != (LENGTH(`x`) + 1)")) }) test_that("aggregates are translated correctly", { From 70b600703880adb886e0320fb8235b0a842b9d78 Mon Sep 17 00:00:00 2001 From: Nathan Watson-Haigh Date: Fri, 10 Nov 2023 22:04:12 -0800 Subject: [PATCH 2/5] Moved str_detect() over to REGEXP_INSTR --- R/backend-snowflake.R | 4 ++-- tests/testthat/test-backend-snowflake.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/backend-snowflake.R b/R/backend-snowflake.R index cb6fd61e7..4b5d0dd23 100644 --- a/R/backend-snowflake.R +++ b/R/backend-snowflake.R @@ -42,9 +42,9 @@ sql_translation.Snowflake <- function(con) { con <- sql_current_con() if (negate) { - translate_sql(!CONTAINS(!!string, !!pattern), con = con) + translate_sql(REGEXP_INSTR(!!string, !!pattern) == 0L, con = con) } else { - translate_sql(CONTAINS(!!string, !!pattern), con = con) + translate_sql(REGEXP_INSTR(!!string, !!pattern) != 0L, con = con) } }, str_starts = function(string, pattern, negate = FALSE) { diff --git a/tests/testthat/test-backend-snowflake.R b/tests/testthat/test-backend-snowflake.R index 78f6f39dd..1fda83553 100644 --- a/tests/testthat/test-backend-snowflake.R +++ b/tests/testthat/test-backend-snowflake.R @@ -25,8 +25,8 @@ test_that("custom stringr functions translated correctly", { local_con(simulate_snowflake()) expect_equal(test_translate_sql(str_locate(x, y)), sql("POSITION(`y`, `x`)")) - expect_equal(test_translate_sql(str_detect(x, y)), sql("CONTAINS(`x`, `y`)")) - expect_equal(test_translate_sql(str_detect(x, y, negate = TRUE)), sql("NOT(CONTAINS(`x`, `y`))")) + expect_equal(test_translate_sql(str_detect(x, y)), sql("REGEXP_INSTR(`x`, `y`) != 0")) + expect_equal(test_translate_sql(str_detect(x, y, negate = TRUE)), sql("REGEXP_INSTR(`x`, `y`) = 0")) expect_equal(test_translate_sql(str_replace(x, y, z)), sql("REGEXP_REPLACE(`x`, `y`, `z`, 1.0, 1.0)")) expect_equal(test_translate_sql(str_replace(x, "\\d", z)), sql("REGEXP_REPLACE(`x`, '\\\\d', `z`, 1.0, 1.0)")) expect_equal(test_translate_sql(str_replace_all(x, y, z)), sql("REGEXP_REPLACE(`x`, `y`, `z`)")) From fa76df883f24f423e0514bf123657a98d0d0fcd7 Mon Sep 17 00:00:00 2001 From: Nathan Watson-Haigh Date: Mon, 13 Nov 2023 19:45:41 -0800 Subject: [PATCH 3/5] Better grepl support for Snowflake Move from using `REGEXP` to `REGEXP_INSTR` for cleaner syntax and support for case-insensitive search capabilities. --- R/backend-snowflake.R | 14 ++++++++------ tests/testthat/_snaps/backend-snowflake.md | 10 ---------- tests/testthat/test-backend-snowflake.R | 4 ++-- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/R/backend-snowflake.R b/R/backend-snowflake.R index 4b5d0dd23..bcd7b2342 100644 --- a/R/backend-snowflake.R +++ b/R/backend-snowflake.R @@ -270,15 +270,17 @@ snowflake_grepl <- function(pattern, perl = FALSE, fixed = FALSE, useBytes = FALSE) { - # https://docs.snowflake.com/en/sql-reference/functions/regexp.html - check_unsupported_arg(ignore.case, FALSE, backend = "Snowflake") + con <- sql_current_con() + check_unsupported_arg(perl, FALSE, backend = "Snowflake") check_unsupported_arg(fixed, FALSE, backend = "Snowflake") check_unsupported_arg(useBytes, FALSE, backend = "Snowflake") - # REGEXP on Snowflaake "implicitly anchors a pattern at both ends", which - # grepl does not. Left- and right-pad `pattern` with .* to get grepl-like - # behavior - sql_expr(((!!x)) %REGEXP% (".*" || !!paste0("(", pattern, ")") || ".*")) + + # https://docs.snowflake.com/en/sql-reference/functions/regexp_instr.html + # REGEXP_INSTR optional parameters: position, occurrance, option, regex_parameters + regexp_parameters <- "c" + if(ignore.case) { regexp_parameters <- "i" } + translate_sql(REGEXP_INSTR(!!x, !!pattern, 1L, 1L, 0L, !!regexp_parameters) != 0L, con = con) } snowflake_round <- function(x, digits = 0L) { diff --git a/tests/testthat/_snaps/backend-snowflake.md b/tests/testthat/_snaps/backend-snowflake.md index e43d3042e..2eef02c36 100644 --- a/tests/testthat/_snaps/backend-snowflake.md +++ b/tests/testthat/_snaps/backend-snowflake.md @@ -1,13 +1,3 @@ -# custom scalar translated correctly - - Code - (expect_error(test_translate_sql(grepl("exp", x, ignore.case = TRUE)))) - Output - - Error in `grepl()`: - ! `ignore.case = TRUE` isn't supported in Snowflake translation. - i It must be FALSE instead. - # pmin() and pmax() respect na.rm Code diff --git a/tests/testthat/test-backend-snowflake.R b/tests/testthat/test-backend-snowflake.R index 1fda83553..0a6e58157 100644 --- a/tests/testthat/test-backend-snowflake.R +++ b/tests/testthat/test-backend-snowflake.R @@ -2,8 +2,8 @@ test_that("custom scalar translated correctly", { local_con(simulate_snowflake()) expect_equal(test_translate_sql(log10(x)), sql("LOG(10.0, `x`)")) expect_equal(test_translate_sql(round(x, digits = 1.1)), sql("ROUND((`x`) :: FLOAT, 1)")) - expect_equal(test_translate_sql(grepl("exp", x)), sql("(`x`) REGEXP ('.*' || '(exp)' || '.*')")) - expect_snapshot((expect_error(test_translate_sql(grepl("exp", x, ignore.case = TRUE))))) + expect_equal(test_translate_sql(grepl("exp", x)), sql("REGEXP_INSTR(`x`, 'exp', 1, 1, 0, 'c') != 0")) + expect_equal(test_translate_sql(grepl("exp", x, ignore.case = TRUE)), sql("REGEXP_INSTR(`x`, 'exp', 1, 1, 0, 'i') != 0")) }) test_that("pasting translated correctly", { From 72bf315ec7eaf80a566b28d82b17ccc797bada44 Mon Sep 17 00:00:00 2001 From: Nathan Watson-Haigh Date: Wed, 22 Nov 2023 20:22:12 -0800 Subject: [PATCH 4/5] Ensure escape characters are escaped --- R/backend-snowflake.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/backend-snowflake.R b/R/backend-snowflake.R index bcd7b2342..f89998047 100644 --- a/R/backend-snowflake.R +++ b/R/backend-snowflake.R @@ -41,6 +41,8 @@ sql_translation.Snowflake <- function(con) { str_detect = function(string, pattern, negate = FALSE) { con <- sql_current_con() + # Snowflake needs backslashes escaped, so we must increase the level of escaping + pattern <- gsub("\\", "\\\\", pattern, fixed = TRUE) if (negate) { translate_sql(REGEXP_INSTR(!!string, !!pattern) == 0L, con = con) } else { @@ -50,6 +52,8 @@ sql_translation.Snowflake <- function(con) { str_starts = function(string, pattern, negate = FALSE) { con <- sql_current_con() + # Snowflake needs backslashes escaped, so we must increase the level of escaping + pattern <- gsub("\\", "\\\\", pattern, fixed = TRUE) if (negate) { translate_sql(REGEXP_INSTR(!!string, !!pattern) != 1L, con = con) } else { @@ -59,6 +63,8 @@ sql_translation.Snowflake <- function(con) { str_ends = function(string, pattern, negate = FALSE) { con <- sql_current_con() + # Snowflake needs backslashes escaped, so we must increase the level of escaping + pattern <- gsub("\\", "\\\\", pattern, fixed = TRUE) if (negate) { translate_sql(REGEXP_INSTR(!!string, !!pattern, 1L, 1L, 1L) != LENGTH(!!string) + 1L, con = con) } else { @@ -280,6 +286,8 @@ snowflake_grepl <- function(pattern, # REGEXP_INSTR optional parameters: position, occurrance, option, regex_parameters regexp_parameters <- "c" if(ignore.case) { regexp_parameters <- "i" } + # Snowflake needs backslashes escaped, so we must increase the level of escaping + pattern <- gsub("\\", "\\\\", pattern, fixed = TRUE) translate_sql(REGEXP_INSTR(!!x, !!pattern, 1L, 1L, 0L, !!regexp_parameters) != 0L, con = con) } From 13358729af4c307ad09ca936a9e6ebd6181111de Mon Sep 17 00:00:00 2001 From: Nathan Watson-Haigh Date: Tue, 19 Dec 2023 15:18:03 -0800 Subject: [PATCH 5/5] Better suport for string matching in Snowflake (#1406) --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 45c2e46f0..9abe7a689 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # dbplyr (development version) +* Snowflake (@nathanhaigh, #1406) + * Added support for `str_starts()` and `str_ends()` via `REGEXP_INSTR()` + * Refactored `str_detect()` to use `REGEXP_INSTR()` so now supports + regular expressions. + * Refactored `grepl()` to use `REGEXP_INSTR()` so now supports + case-insensitive matching through `grepl(..., ignore.case = TRUE)` + * Functions qualified with the base namespace are now also translated, e.g. `base::paste0(x, "_1")` is now translated (@mgirlich, #1022).