diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp index 5df470dd5db81..2495ea1073762 100644 --- a/velox/functions/prestosql/URLFunctions.cpp +++ b/velox/functions/prestosql/URLFunctions.cpp @@ -15,25 +15,24 @@ */ #include "URLFunctions.h" +#include #include "velox/type/Type.h" namespace facebook::velox::functions { -bool matchAuthorityAndPath( - const boost::cmatch& urlMatch, - boost::cmatch& authAndPathMatch, +std::optional matchAuthorityAndPath( + StringView authorityAndPath, boost::cmatch& authorityMatch, - bool& hasAuthority) { + int subGroup) { + boost::cmatch authAndPathMatch; static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); - auto authorityAndPath = submatch(urlMatch, 3); if (!boost::regex_match( authorityAndPath.begin(), authorityAndPath.end(), authAndPathMatch, kAuthorityAndPathRegex)) { // Does not start with //, doesn't have authority. - hasAuthority = false; - return true; + return std::nullopt; } static const boost::regex kAuthorityRegex( @@ -45,12 +44,14 @@ bool matchAuthorityAndPath( const auto authority = authAndPathMatch[1]; if (!boost::regex_match( authority.first, authority.second, authorityMatch, kAuthorityRegex)) { - return false; // Invalid URI Authority. + return std::nullopt; // Invalid URI Authority. } - hasAuthority = true; + if (authorityMatch[subGroup].matched) { + return authorityMatch[subGroup].str(); + } - return true; + return std::nullopt; } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 714490cb29856..0c38c860fd911 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -17,6 +17,7 @@ #include #include +#include #include "velox/functions/Macros.h" #include "velox/functions/lib/string/StringImpl.h" @@ -28,8 +29,7 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { return StringView(sub.first, sub.length()); } -template -bool parse(const TInString& rawUrl, boost::cmatch& match) { +bool parse(const char* rawUrlData, size_t rawUrlsize, boost::cmatch& match) { /// This regex is taken from RFC - 3986. /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B /// The basic groups are: @@ -59,7 +59,24 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) { "(#(.*))?"); // #fragment return boost::regex_match( - rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); + rawUrlData, rawUrlData + rawUrlsize, match, kUriRegex); +} + +/// Parses the url and returns the matching subgroup if the particular sub group +/// is matched by the call to parse call above. +std::optional parse(StringView rawUrl, int subGroup) { + boost::cmatch match; + if (!parse(rawUrl.data(), rawUrl.size(), match)) { + return std::nullopt; + } + + VELOX_CHECK_LT(subGroup, match.size()); + + if (match[subGroup].matched) { + return submatch(match, subGroup); + } + + return std::nullopt; } FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) { @@ -110,7 +127,7 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { /// Performs initial validation of the URI. /// Checks if the URI contains ascii whitespaces or /// unescaped '%' chars. -bool isValidURI(const StringView& input) { +bool isValidURI(StringView input) { const char* p = input.data(); const char* end = p + input.size(); char buf[3]; @@ -178,11 +195,13 @@ FOLLY_ALWAYS_INLINE void urlUnescape( } // namespace -bool matchAuthorityAndPath( - const boost::cmatch& urlMatch, - boost::cmatch& authAndPathMatch, +/// Matches the authority (i.e host[:port], ipaddress), and path from a string +/// representing the authority and path. Returns true if the regex matches, and +/// sets the appropriate groups matching authority in authorityMatch. +std::optional matchAuthorityAndPath( + StringView authorityAndPath, boost::cmatch& authorityMatch, - bool& hasAuthority); + int subGroup); template struct UrlExtractProtocolFunction { @@ -201,11 +220,11 @@ struct UrlExtractProtocolFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { - result.setEmpty(); + + if (auto protocol = parse(url, 2)) { + result.setNoCopy(protocol.value()); } else { - result.setNoCopy(submatch(match, 2)); + result.setEmpty(); } return true; } @@ -228,11 +247,11 @@ struct UrlExtractFragmentFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { - result.setEmpty(); + + if (auto fragment = parse(url, 9)) { + result.setNoCopy(fragment.value()); } else { - result.setNoCopy(submatch(match, 9)); + result.setEmpty(); } return true; } @@ -255,19 +274,17 @@ struct UrlExtractHostFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto authAndPath = parse(url, 3); + if (!authAndPath) { result.setEmpty(); return true; } - boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { - result.setNoCopy(submatch(authorityMatch, 3)); + if (auto host = + matchAuthorityAndPath(authAndPath.value(), authorityMatch)) { + result.setNoCopy(host.value()); } else { result.setEmpty(); } @@ -283,21 +300,18 @@ struct UrlExtractPortFunction { if (!isValidURI(url)) { return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto authAndPath = parse(url, 3); + if (!authAndPath) { return false; } - boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { - auto port = submatch(authorityMatch, 4); - if (!port.empty()) { + if (auto port = + matchAuthorityAndPath(authAndPath.value(), authorityMatch, 4)) { + if (!port.value().empty()) { try { - result = to(port); + result = to(port.value()); return true; } catch (folly::ConversionError const&) { } @@ -321,14 +335,13 @@ struct UrlExtractPathFunction { return false; } - boost::cmatch match; - if (!parse(url, match)) { + if (auto path = parse(url, 5)) { + urlUnescape(result, path.value()); + } else { result.setEmpty(); return false; } - urlUnescape(result, submatch(match, 5)); - return true; } }; @@ -350,14 +363,13 @@ struct UrlExtractQueryFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + if (auto query = parse(url, 7)) { + result.setNoCopy(query.value()); + } else { result.setEmpty(); - return true; } - auto query = submatch(match, 7); - result.setNoCopy(query); return true; } }; @@ -380,14 +392,14 @@ struct UrlExtractParameterFunction { result.setEmpty(); return false; } - boost::cmatch match; - if (!parse(url, match)) { + + auto query = parse(url, 7); + if (!query) { result.setEmpty(); return false; } - auto query = submatch(match, 7); - if (!query.empty()) { + if (!query.value().empty()) { // Parse query string. static const boost::regex kQueryParamRegex( "(^|&)" // start of query or start of parameter "&" @@ -398,11 +410,13 @@ struct UrlExtractParameterFunction { ); const boost::cregex_iterator begin( - query.data(), query.data() + query.size(), kQueryParamRegex); + query.value().data(), + query.value().data() + query.value().size(), + kQueryParamRegex); boost::cregex_iterator end; for (auto it = begin; it != end; ++it) { - if (it->length(2) != 0) { // key shouldnt be empty. + if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty. auto key = submatch((*it), 2); if (param.compare(key) == 0) { auto value = submatch((*it), 3); diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index fd07766cc1361..89b7250b92043 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) { std::nullopt, std::nullopt, std::nullopt); + validate( + "IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr", + "", + "", + "IC6S!8hGVRpo !,yTaJEy/$RUZpqcr", + "", + "", + std::nullopt); } TEST_F(URLFunctionsTest, extractPath) {