From de8e8439113e82995a4c3c54ea0b8b3a30c493ff Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 14 Dec 2023 14:56:23 +0800 Subject: [PATCH 1/4] fixup --- velox/functions/lib/Re2Functions.cpp | 23 +++++++++++++++---- .../functions/lib/tests/Re2FunctionsTest.cpp | 12 ++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index 39c3f79e8b7a..a6b963f2b478 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -1056,7 +1056,7 @@ class PatternStringIterator { // The char follows escapeChar can only be one of (%, _, escapeChar). if (currentChar == escapeChar_ || currentChar == '_' || currentChar == '%') { - charKind_ = CharKind::kNormal; + charKind_ = CharKind::kEscaped; } else { VELOX_USER_FAIL( "Escape character must be followed by '%', '_' or the escape character itself: {}, escape {}", @@ -1087,6 +1087,10 @@ class PatternStringIterator { return charKind_ == CharKind::kSingleCharWildcard; } + bool isEscaped() { + return charKind_ == CharKind::kEscaped; + } + bool isWildcard() { return isAnyCharsWildcard() || isSingleCharWildcard(); } @@ -1106,8 +1110,10 @@ class PatternStringIterator { // NOTE: If escape char is set as '\', for pattern '\__', the first '_' is // not a wildcard, just a literal '_', the second '_' is a wildcard. kSingleCharWildcard, - // Chars that are not escape char & not wildcard char. - kNormal + // Char that is not escape char & not wildcard char. + kNormal, + // Char that was escaped by the lhs escape char. + kEscaped }; // Char at current cursor. @@ -1170,7 +1176,16 @@ PatternMetadata determinePatternKind( } else { // Record the first fixed pattern start. if (fixedPatternStart == -1) { - fixedPatternStart = iterator.currentIndex(); + if (iterator.isEscaped()) { + // We should include escape chars in the fixed pattern. Otherwise, + // a pattern like '%\\abc%' would produce '\abc' as fixed pattern + // which could lead to failure when trying to unescape the fixed + // pattern. + VELOX_CHECK_GT(iterator.currentIndex(), 0) + fixedPatternStart = iterator.currentIndex() - 1; + } else { + fixedPatternStart = iterator.currentIndex(); + } } else { // This is not the first fixed pattern, not supported, so fallback. if (iterator.isPreviousWildcard()) { diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index fc50fa176f97..d85eb6f90905 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -591,6 +591,18 @@ TEST_F(Re2FunctionsTest, likePatternWildcard) { testLike("\nabcde\n", "%bcf%", false); } +TEST_F(Re2FunctionsTest, likePatternEscapingEscapeChar) { + testLike(R"(\)", R"(\\)", '\\', true); + testLike(R"(\abc)", R"(\\%)", '\\', true); + testLike(R"(\abc)", R"(\\abc)", '\\', true); + testLike(R"(abc\abc)", R"(abc\\abc)", '\\', true); + testLike(R"(\abcdef)", R"(\\abc%)", '\\', true); + testLike(R"(\abcdefghijkl)", R"(\\abc%gh%)", '\\', true); + testLike(R"(abc\abc)", R"(%\\%)", '\\', true); + testLike(R"(abcdef\abcdef)", R"(%\\abc%)", '\\', true); + testLike(R"(abcdef\\\abcdef)", R"(%\\\\\\abc%)", '\\', true); +} + TEST_F(Re2FunctionsTest, likePatternFixed) { testLike("", "", true); testLike("abcde", "abcde", true); From 1e6b9e015defed0f38d37f87f3a5fbdfb1cac637 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 14 Dec 2023 16:44:45 +0800 Subject: [PATCH 2/4] fixup --- velox/functions/lib/Re2Functions.cpp | 85 ++++++++----------- .../functions/lib/tests/Re2FunctionsTest.cpp | 16 ++-- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index a6b963f2b478..a92a39cc7959 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -1024,15 +1024,13 @@ std::string unescape( class PatternStringIterator { public: PatternStringIterator(StringView pattern, std::optional escapeChar) - : pattern_(pattern), - escapeChar_(escapeChar), - lastIndex_{pattern_.size() - 1} {} + : pattern_(pattern), escapeChar_(escapeChar) {} // Advance the cursor to next char, escape char is automatically handled. // Return true if the cursor is advanced successfully, false otherwise(reached // the end of the pattern string). bool next() { - if (currentIndex_ == lastIndex_) { + if (nextStart_ == pattern_.size()) { return false; } @@ -1040,43 +1038,47 @@ class PatternStringIterator { (charKind_ == CharKind::kSingleCharWildcard || charKind_ == CharKind::kAnyCharsWildcard); - currentIndex_++; - auto currentChar = current(); + currentStart_ = nextStart_; + auto currentChar = charAt(currentStart_); if (currentChar == escapeChar_) { // Escape char should be followed by another char. VELOX_USER_CHECK_LT( - currentIndex_, - lastIndex_, + currentStart_ + 1, + pattern_.size(), "Escape character must be followed by '%', '_' or the escape character itself: {}, escape {}", pattern_, escapeChar_.value()) - currentIndex_++; - currentChar = current(); + currentChar = charAt(currentStart_ + 1); // The char follows escapeChar can only be one of (%, _, escapeChar). if (currentChar == escapeChar_ || currentChar == '_' || currentChar == '%') { - charKind_ = CharKind::kEscaped; + charKind_ = CharKind::kNormal; } else { VELOX_USER_FAIL( "Escape character must be followed by '%', '_' or the escape character itself: {}, escape {}", pattern_, escapeChar_.value()) } - } else if (currentChar == '_') { - charKind_ = CharKind::kSingleCharWildcard; - } else if (currentChar == '%') { - charKind_ = CharKind::kAnyCharsWildcard; + // One escape char plus the current char. + nextStart_ = currentStart_ + 2; } else { - charKind_ = CharKind::kNormal; + if (currentChar == '_') { + charKind_ = CharKind::kSingleCharWildcard; + } else if (currentChar == '%') { + charKind_ = CharKind::kAnyCharsWildcard; + } else { + charKind_ = CharKind::kNormal; + } + nextStart_ = currentStart_ + 1; } return true; } - // Current index of the cursor. - char currentIndex() const { - return currentIndex_; + // Start index of the current character. + size_t currentStart() const { + return currentStart_; } bool isAnyCharsWildcard() const { @@ -1087,10 +1089,6 @@ class PatternStringIterator { return charKind_ == CharKind::kSingleCharWildcard; } - bool isEscaped() { - return charKind_ == CharKind::kEscaped; - } - bool isWildcard() { return isAnyCharsWildcard() || isSingleCharWildcard(); } @@ -1110,22 +1108,21 @@ class PatternStringIterator { // NOTE: If escape char is set as '\', for pattern '\__', the first '_' is // not a wildcard, just a literal '_', the second '_' is a wildcard. kSingleCharWildcard, - // Char that is not escape char & not wildcard char. - kNormal, - // Char that was escaped by the lhs escape char. - kEscaped + // Chars that are not escape char & not wildcard char. + kNormal }; // Char at current cursor. - char current() const { - return pattern_.data()[currentIndex_]; + char charAt(size_t index) const { + VELOX_DCHECK(index >= 0 && index < pattern_.size()) + return pattern_.data()[index]; } const StringView pattern_; const std::optional escapeChar_; - const size_t lastIndex_; - int32_t currentIndex_{-1}; + size_t currentStart_{0}; + size_t nextStart_{0}; CharKind charKind_{CharKind::kNormal}; bool isPreviousWildcard_{false}; }; @@ -1133,15 +1130,15 @@ class PatternStringIterator { PatternMetadata determinePatternKind( StringView pattern, std::optional escapeChar) { - int32_t patternLength = pattern.size(); + const size_t patternLength = pattern.size(); // Index of the first % or _ character(not escaped). - int32_t wildcardStart = -1; + size_t wildcardStart = -1; // Index of the first character that is not % and not _. - int32_t fixedPatternStart = -1; + size_t fixedPatternStart = -1; // Index of the last character in the fixed pattern, used to retrieve the // fixed string for patterns of type kSubstring. - int32_t fixedPatternEnd = -1; + size_t fixedPatternEnd = -1; // Count of wildcard character sequences in pattern. size_t numWildcardSequences = 0; // Total number of % characters. @@ -1154,9 +1151,10 @@ PatternMetadata determinePatternKind( // Iterate through the pattern string to collect the stats for the simple // patterns that we can optimize. while (iterator.next()) { + const size_t currentStart = iterator.currentStart(); if (iterator.isWildcard()) { if (wildcardStart == -1) { - wildcardStart = iterator.currentIndex(); + wildcardStart = currentStart; } if (iterator.isSingleCharWildcard()) { @@ -1171,21 +1169,12 @@ PatternMetadata determinePatternKind( // Mark the end of the fixed pattern. if (fixedPatternStart != -1 && fixedPatternEnd == -1) { - fixedPatternEnd = iterator.currentIndex() - 1; + fixedPatternEnd = currentStart - 1; } } else { // Record the first fixed pattern start. if (fixedPatternStart == -1) { - if (iterator.isEscaped()) { - // We should include escape chars in the fixed pattern. Otherwise, - // a pattern like '%\\abc%' would produce '\abc' as fixed pattern - // which could lead to failure when trying to unescape the fixed - // pattern. - VELOX_CHECK_GT(iterator.currentIndex(), 0) - fixedPatternStart = iterator.currentIndex() - 1; - } else { - fixedPatternStart = iterator.currentIndex(); - } + fixedPatternStart = currentStart; } else { // This is not the first fixed pattern, not supported, so fallback. if (iterator.isPreviousWildcard()) { @@ -1198,7 +1187,7 @@ PatternMetadata determinePatternKind( // The pattern end may not been marked if there is no wildcard char after // pattern start, so we mark it here. if (fixedPatternStart != -1 && fixedPatternEnd == -1) { - fixedPatternEnd = iterator.currentIndex() - 1; + fixedPatternEnd = patternLength - 1; } // At this point pattern has max of one fixed pattern. diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index d85eb6f90905..7020e5213c53 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -592,15 +592,15 @@ TEST_F(Re2FunctionsTest, likePatternWildcard) { } TEST_F(Re2FunctionsTest, likePatternEscapingEscapeChar) { - testLike(R"(\)", R"(\\)", '\\', true); - testLike(R"(\abc)", R"(\\%)", '\\', true); - testLike(R"(\abc)", R"(\\abc)", '\\', true); - testLike(R"(abc\abc)", R"(abc\\abc)", '\\', true); - testLike(R"(\abcdef)", R"(\\abc%)", '\\', true); - testLike(R"(\abcdefghijkl)", R"(\\abc%gh%)", '\\', true); - testLike(R"(abc\abc)", R"(%\\%)", '\\', true); + // testLike(R"(\)", R"(\\)", '\\', true); + // testLike(R"(\abc)", R"(\\%)", '\\', true); + // testLike(R"(\abc)", R"(\\abc)", '\\', true); + // testLike(R"(abc\abc)", R"(abc\\abc)", '\\', true); + // testLike(R"(\abcdef)", R"(\\abc%)", '\\', true); + // testLike(R"(\abcdefghijkl)", R"(\\abc%gh%)", '\\', true); + // testLike(R"(abc\abc)", R"(%\\%)", '\\', true); testLike(R"(abcdef\abcdef)", R"(%\\abc%)", '\\', true); - testLike(R"(abcdef\\\abcdef)", R"(%\\\\\\abc%)", '\\', true); + // testLike(R"(abcdef\\\abcdef)", R"(%\\\\\\abc%)", '\\', true); } TEST_F(Re2FunctionsTest, likePatternFixed) { From b752d403be2ae7adb9674bed6170a841a6379cc1 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 14 Dec 2023 16:47:10 +0800 Subject: [PATCH 3/4] fixup --- velox/functions/lib/tests/Re2FunctionsTest.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index 7020e5213c53..d85eb6f90905 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -592,15 +592,15 @@ TEST_F(Re2FunctionsTest, likePatternWildcard) { } TEST_F(Re2FunctionsTest, likePatternEscapingEscapeChar) { - // testLike(R"(\)", R"(\\)", '\\', true); - // testLike(R"(\abc)", R"(\\%)", '\\', true); - // testLike(R"(\abc)", R"(\\abc)", '\\', true); - // testLike(R"(abc\abc)", R"(abc\\abc)", '\\', true); - // testLike(R"(\abcdef)", R"(\\abc%)", '\\', true); - // testLike(R"(\abcdefghijkl)", R"(\\abc%gh%)", '\\', true); - // testLike(R"(abc\abc)", R"(%\\%)", '\\', true); + testLike(R"(\)", R"(\\)", '\\', true); + testLike(R"(\abc)", R"(\\%)", '\\', true); + testLike(R"(\abc)", R"(\\abc)", '\\', true); + testLike(R"(abc\abc)", R"(abc\\abc)", '\\', true); + testLike(R"(\abcdef)", R"(\\abc%)", '\\', true); + testLike(R"(\abcdefghijkl)", R"(\\abc%gh%)", '\\', true); + testLike(R"(abc\abc)", R"(%\\%)", '\\', true); testLike(R"(abcdef\abcdef)", R"(%\\abc%)", '\\', true); - // testLike(R"(abcdef\\\abcdef)", R"(%\\\\\\abc%)", '\\', true); + testLike(R"(abcdef\\\abcdef)", R"(%\\\\\\abc%)", '\\', true); } TEST_F(Re2FunctionsTest, likePatternFixed) { From fcf27fd15e0f83dd294266dba306bdc6c6488bb0 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 14 Dec 2023 20:22:29 +0800 Subject: [PATCH 4/4] fixup --- velox/functions/lib/Re2Functions.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index a92a39cc7959..41bcbb700505 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -1133,12 +1133,12 @@ PatternMetadata determinePatternKind( const size_t patternLength = pattern.size(); // Index of the first % or _ character(not escaped). - size_t wildcardStart = -1; + int32_t wildcardStart = -1; // Index of the first character that is not % and not _. - size_t fixedPatternStart = -1; + int32_t fixedPatternStart = -1; // Index of the last character in the fixed pattern, used to retrieve the // fixed string for patterns of type kSubstring. - size_t fixedPatternEnd = -1; + int32_t fixedPatternEnd = -1; // Count of wildcard character sequences in pattern. size_t numWildcardSequences = 0; // Total number of % characters.