diff --git a/velox/functions/prestosql/URLFunctions.cpp b/velox/functions/prestosql/URLFunctions.cpp index 5df470dd5db81..674808bd69005 100644 --- a/velox/functions/prestosql/URLFunctions.cpp +++ b/velox/functions/prestosql/URLFunctions.cpp @@ -22,8 +22,7 @@ namespace facebook::velox::functions { bool matchAuthorityAndPath( const boost::cmatch& urlMatch, boost::cmatch& authAndPathMatch, - boost::cmatch& authorityMatch, - bool& hasAuthority) { + boost::cmatch& authorityMatch) { static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); auto authorityAndPath = submatch(urlMatch, 3); if (!boost::regex_match( @@ -32,8 +31,7 @@ bool matchAuthorityAndPath( authAndPathMatch, kAuthorityAndPathRegex)) { // Does not start with //, doesn't have authority. - hasAuthority = false; - return true; + return false; } static const boost::regex kAuthorityRegex( @@ -48,9 +46,11 @@ bool matchAuthorityAndPath( return false; // Invalid URI Authority. } - hasAuthority = true; + if (authorityMatch[3].matched) { + return true; + } - return true; + return false; } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 714490cb29856..d68bab9ce500e 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -28,8 +28,7 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { return StringView(sub.first, sub.length()); } -template -bool parse(const TInString& rawUrl, boost::cmatch& match) { +bool parse(const StringView& rawUrl, boost::cmatch& match) { /// This regex is taken from RFC - 3986. /// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B /// The basic groups are: @@ -62,6 +61,20 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) { rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); } +/// Parses the url and returns true if the particular sub group +/// is matched by the call to parse(url, match). +bool parse(const StringView& rawUrl, boost::cmatch& match, int subGroup) { + if (!parse(rawUrl, match)) { + return false; + } + + if (subGroup >= match.size() || match[subGroup].matched == false) { + return false; + } + + return true; +} + FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) { return c < 10 ? (c + '0') : (c + 'A' - 10); } @@ -181,8 +194,7 @@ FOLLY_ALWAYS_INLINE void urlUnescape( bool matchAuthorityAndPath( const boost::cmatch& urlMatch, boost::cmatch& authAndPathMatch, - boost::cmatch& authorityMatch, - bool& hasAuthority); + boost::cmatch& authorityMatch); template struct UrlExtractProtocolFunction { @@ -229,7 +241,7 @@ struct UrlExtractFragmentFunction { return false; } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 9)) { result.setEmpty(); } else { result.setNoCopy(submatch(match, 9)); @@ -256,17 +268,14 @@ struct UrlExtractHostFunction { return false; } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 3)) { result.setEmpty(); return true; } boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { + if (matchAuthorityAndPath(match, authAndPathMatch, authorityMatch)) { result.setNoCopy(submatch(authorityMatch, 3)); } else { result.setEmpty(); @@ -284,16 +293,13 @@ struct UrlExtractPortFunction { return false; } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 4)) { return false; } boost::cmatch authAndPathMatch; boost::cmatch authorityMatch; - bool hasAuthority; - if (matchAuthorityAndPath( - match, authAndPathMatch, authorityMatch, hasAuthority) && - hasAuthority) { + if (matchAuthorityAndPath(match, authAndPathMatch, authorityMatch)) { auto port = submatch(authorityMatch, 4); if (!port.empty()) { try { @@ -322,7 +328,7 @@ struct UrlExtractPathFunction { } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 5)) { result.setEmpty(); return false; } @@ -351,7 +357,7 @@ struct UrlExtractQueryFunction { return false; } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 7)) { result.setEmpty(); return true; } @@ -381,7 +387,7 @@ struct UrlExtractParameterFunction { return false; } boost::cmatch match; - if (!parse(url, match)) { + if (!parse(url, match, 7)) { result.setEmpty(); return false; } @@ -402,7 +408,7 @@ struct UrlExtractParameterFunction { boost::cregex_iterator end; for (auto it = begin; it != end; ++it) { - if (it->length(2) != 0) { // key shouldnt be empty. + if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty. auto key = submatch((*it), 2); if (param.compare(key) == 0) { auto value = submatch((*it), 3); diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index fd07766cc1361..89b7250b92043 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) { std::nullopt, std::nullopt, std::nullopt); + validate( + "IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr", + "", + "", + "IC6S!8hGVRpo !,yTaJEy/$RUZpqcr", + "", + "", + std::nullopt); } TEST_F(URLFunctionsTest, extractPath) {