From 682a97b70e28f36b801dddffa043622a8f7d0ff0 Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Wed, 6 Dec 2023 16:02:32 -0800 Subject: [PATCH] Ensure we only use subgroups if they are matched in url_extract_* presto functions (#7866) Summary: It is possible to trigger address sanitizer when passed garbage strings in matchAuthorityAndPath function. We need to ensure that subgroups are actually matched, since accessing an unmatched string can lead to asan errors. Differential Revision: D51822902 --- velox/functions/prestosql/URLFunctions.cpp | 21 ++-- velox/functions/prestosql/URLFunctions.h | 112 ++++++++++-------- .../prestosql/tests/URLFunctionsTest.cpp | 8 ++ 3 files changed, 82 insertions(+), 59 deletions(-) diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp index 5df470dd5db81..2495ea1073762 100644 --- a/velox/functions/prestosql/URLFunctions.cpp +++ b/velox/functions/prestosql/URLFunctions.cpp @@ -15,25 +15,24 @@ */ #include "URLFunctions.h" +#include #include "velox/type/Type.h" namespace facebook::velox::functions { -bool matchAuthorityAndPath( - const boost::cmatch& urlMatch, - boost::cmatch& authAndPathMatch, +std::optional matchAuthorityAndPath( + StringView authorityAndPath, boost::cmatch& authorityMatch, - bool& hasAuthority) { + int subGroup) { + boost::cmatch authAndPathMatch; static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); - auto authorityAndPath = submatch(urlMatch, 3); if (!boost::regex_match( authorityAndPath.begin(), authorityAndPath.end(), authAndPathMatch, kAuthorityAndPathRegex)) { // Does not start with //, doesn't have authority. - hasAuthority = false; - return true; + return std::nullopt; } static const boost::regex kAuthorityRegex( @@ -45,12 +44,14 @@ bool matchAuthorityAndPath( const auto authority = authAndPathMatch[1]; if (!boost::regex_match( authority.first, authority.second, authorityMatch, kAuthorityRegex)) { - return false; // Invalid URI Authority. + return std::nullopt; // Invalid URI Authority. } - hasAuthority = true; + if (authorityMatch[subGroup].matched) { + return authorityMatch[subGroup].str(); + } - return true; + return std::nullopt; } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 714490cb29856..0c38c860fd911 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -17,6 +17,7 @@ #include #include +#include #include "velox/functions/Macros.h" #include "velox/functions/lib/string/StringImpl.h" @@ -28,8 +29,7 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { return StringView(sub.first, sub.length()); } -template -bool parse(const TInString& rawUrl, boost::cmatch& match) { +bool parse(const char* rawUrlData, size_t rawUrlsize, boost::cmatch& match) { /// This regex is taken from RFC - 3986. /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B /// The basic groups are: @@ -59,7 +59,24 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) { "(#(.*))?"); // #fragment return boost::regex_match( - rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); + rawUrlData, rawUrlData + rawUrlsize, match, kUriRegex); +} + +/// Parses the url and returns the matching subgroup if the particular sub group +/// is matched by the call to parse call above. +std::optional parse(StringView rawUrl, int subGroup) { + boost::cmatch match; + if (!parse(rawUrl.data(), rawUrl.size(), match)) { + return std::nullopt; + } + + VELOX_CHECK_LT(subGroup, match.size()); + + if (match[subGroup].matched) { + return submatch(match, subGroup); + } + + return std::nullopt; } FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) { @@ -110,7 +127,7 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { /// Performs initial validation of the URI. /// Checks if the URI contains ascii whitespaces or /// unescaped '%' chars. -bool isValidURI(const StringView& input) { +bool isValidURI(StringView input) { const char* p = input.data(); const char* end = p + input.size(); char buf[3]; @@ -178,11 +195,13 @@ FOLLY_ALWAYS_INLINE void urlUnescape( } // namespace -bool matchAuthorityAndPath( - const boost::cmatch& urlMatch, - boost::cmatch& authAndPathMatch, +/// Matches the authority (i.e host[:port], ipaddress), and path from a string +/// representing the authority and path. Returns true if the regex matches, and +/// sets the appropriate groups matching authority in authorityMatch. +std::optional matchAuthorityAndPath( + StringView authorityAndPath, boost::cmatch& authorityMatch, - bool& hasAuthority); + int subGroup); template struct UrlExtractProtocolFunction { @@ -201,11 +220,11 @@ struct UrlExtractProtocolFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { - result.setEmpty(); + + if (auto protocol = parse(url, 2)) { + result.setNoCopy(protocol.value()); } else { - result.setNoCopy(submatch(match, 2)); + result.setEmpty(); } return true; } @@ -228,11 +247,11 @@ struct UrlExtractFragmentFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { - result.setEmpty(); + + if (auto fragment = parse(url, 9)) { + result.setNoCopy(fragment.value()); } else { - result.setNoCopy(submatch(match, 9)); + result.setEmpty(); } return true; } @@ -255,19 +274,17 @@ struct UrlExtractHostFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto authAndPath = parse(url, 3); + if (!authAndPath) { result.setEmpty(); return true; } - boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { - result.setNoCopy(submatch(authorityMatch, 3)); + if (auto host = + matchAuthorityAndPath(authAndPath.value(), authorityMatch)) { + result.setNoCopy(host.value()); } else { result.setEmpty(); } @@ -283,21 +300,18 @@ struct UrlExtractPortFunction { if (!isValidURI(url)) { return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto authAndPath = parse(url, 3); + if (!authAndPath) { return false; } - boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { - auto port = submatch(authorityMatch, 4); - if (!port.empty()) { + if (auto port = + matchAuthorityAndPath(authAndPath.value(), authorityMatch, 4)) { + if (!port.value().empty()) { try { - result = to(port); + result = to(port.value()); return true; } catch (folly::ConversionError const&) { } @@ -321,14 +335,13 @@ struct UrlExtractPathFunction { return false; } - boost::cmatch match; - if (!parse(url, match)) { + if (auto path = parse(url, 5)) { + urlUnescape(result, path.value()); + } else { result.setEmpty(); return false; } - urlUnescape(result, submatch(match, 5)); - return true; } }; @@ -350,14 +363,13 @@ struct UrlExtractQueryFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + if (auto query = parse(url, 7)) { + result.setNoCopy(query.value()); + } else { result.setEmpty(); - return true; } - auto query = submatch(match, 7); - result.setNoCopy(query); return true; } }; @@ -380,14 +392,14 @@ struct UrlExtractParameterFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto query = parse(url, 7); + if (!query) { result.setEmpty(); return false; } - auto query = submatch(match, 7); - if (!query.empty()) { + if (!query.value().empty()) { // Parse query string. static const boost::regex kQueryParamRegex( "(^|&)" // start of query or start of parameter "&" @@ -398,11 +410,13 @@ struct UrlExtractParameterFunction { ); const boost::cregex_iterator begin( - query.data(), query.data() + query.size(), kQueryParamRegex); + query.value().data(), + query.value().data() + query.value().size(), + kQueryParamRegex); boost::cregex_iterator end; for (auto it = begin; it != end; ++it) { - if (it->length(2) != 0) { // key shouldnt be empty. + if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty. auto key = submatch((*it), 2); if (param.compare(key) == 0) { auto value = submatch((*it), 3); diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index fd07766cc1361..89b7250b92043 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) { std::nullopt, std::nullopt, std::nullopt); + validate( + "IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr", + "", + "", + "IC6S!8hGVRpo !,yTaJEy/$RUZpqcr", + "", + "", + std::nullopt); } TEST_F(URLFunctionsTest, extractPath) {