diff --git a/velox/docs/functions/spark/url.rst b/velox/docs/functions/spark/url.rst index 0f828f0855bec..273d83c9b88c0 100644 --- a/velox/docs/functions/spark/url.rst +++ b/velox/docs/functions/spark/url.rst @@ -5,24 +5,27 @@ URL Functions Introduction ------------ -The URL extraction function extracts components from HTTP URLs (or any valid URIs conforming to `RFC 3986 `_). The following syntax is supported: +The URL extraction functions extract components from HTTP URLs (or any valid URIs conforming to `RFC 3986 `_). The following syntax is supported: .. code-block:: bash - [protocol:][//host[:port]][path][?query][#fragment] + [protocol]://[[userinfo@]host[:port]][[path][?query]][#ref] Consider for example the below URI: .. code-block:: - http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related + http://user:pass@example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1 - scheme = http - authority = www.ics.uci.edu - path = /pub/ietf/uri/ - query = k1=v1 - fragment = Related + protocol = http + host = example.com + path = /path1/p.php + userinfo = user:pass + authority = user:pass@example.com:8080 + file = /path1/p.php?k1=v1&k2=v2 + query = k1=v1&k2=v2 + ref = Ref1 Invalid URI's @@ -36,25 +39,29 @@ digits after the percent character "%". All the url extract functions will retur # Examples of url functions with Invalid URI's. 
# Invalid URI due to whitespace - SELECT url_extract_path('foo '); -- NULL (1 row) - SELECT url_extract_host('http://www.foo.com '); -- NULL (1 row) + SELECT parse_url('foo ', 'FILE'); -- NULL (1 row) + SELECT parse_url('http://www.foo.com ', 'FILE'); -- NULL (1 row) # Invalid URI due to improper escaping of '%' - SELECT url_extract_path('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row) - SELECT url_extract_host('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%'); -- NULL (1 row) + SELECT parse_url('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%', 'FILE'); -- NULL (1 row) + SELECT parse_url('https://www.ucu.edu.uy/agenda/evento/%%UCUrlCompartir%%', 'HOST'); -- NULL (1 row) .. spark:function:: parse_url(string, partToExtract) -> varchar Extracts a part from a URL. The part to extract can be one of the following: + * `PROTOCOL`: The protocol. * `HOST`: The host name. * `PATH`: The path. + * `USERINFO`: The username and/or password. + * `AUTHORITY`: The host and optionally userinfo and/or port. + * `FILE`: The file (the path followed by the query, if any). * `QUERY`: The query. - * `FRAGMENT`: The fragment. - * `PROTOCOL`: The protocol. + * `REF`: The reference. + :param string: The URL to extract the part from. - :param partToExtract: The part to extract from the URL. + :param partToExtract: The part to extract from the URL. Must be uppercase; lowercase values return NULL. :return: The extracted part of the URL. .. code-block:: sql @@ -68,7 +75,7 @@ digits after the percent character "%".
All the url extract functions will retur SELECT parse_url('http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related', 'QUERY'); -- k1=v1 - SELECT parse_url('http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related', 'FRAGMENT'); + SELECT parse_url('http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related', 'REF'); -- Related SELECT parse_url('http://www.ics.uci.edu/pub/ietf/uri/?k1=v1#Related', 'PROTOCOL'); diff --git a/velox/functions/sparksql/URLFunctions.h b/velox/functions/sparksql/URLFunctions.h index e4e73b95e12c9..f3d526d862187 100644 --- a/velox/functions/sparksql/URLFunctions.h +++ b/velox/functions/sparksql/URLFunctions.h @@ -21,11 +21,10 @@ #include "velox/functions/Macros.h" namespace facebook::velox::functions::sparksql { -namespace { +namespace detail { -/// Performs initial validation of the URI. -/// Checks if the URI contains ascii whitespaces or -/// unescaped '%' chars. +// Checks if the URI contains ascii whitespaces or +// unescaped '%' chars. bool isValidURI(StringView input) { const char* p = input.data(); const char* end = p + input.size(); @@ -33,7 +32,7 @@ bool isValidURI(StringView input) { buf[2] = '\0'; char* endptr; for (; p < end; ++p) { - if (facebook::velox::functions::stringImpl::isAsciiWhiteSpace(*p)) { + if (stringImpl::isAsciiWhiteSpace(*p)) { return false; } @@ -84,8 +83,10 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) { return boost::regex_match( rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex); } -// PARSE_URL(url, partToExtract) → string -// PARSE_URL(url, partToExtract, key) → string +} // namespace detail + +// parse_url(url, partToExtract) → string +// parse_url(url, partToExtract, key) → string // // Extracts a part of a URL. The partToExtract argument can be one of // 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', 'USERINFO', @@ -102,24 +103,12 @@ struct ParseUrlFunction { // ASCII input always produces ASCII result. 
static constexpr bool is_default_ascii_behavior = true; - static constexpr int kAuthPath = 3; - static constexpr int kQuery = 4; - static constexpr int kHost = 3; - static constexpr int kProto = 2; - static constexpr int kRef = 5; - // submatch indexes for authorityMatch - static constexpr int kPathHasAuth = 2; - static constexpr int kPathNoAuth = 3; - static constexpr int kUser = 1; - static constexpr int kPass = 2; - static constexpr int kPort = 4; - - FOLLY_ALWAYS_INLINE bool call( + bool call( out_type& result, const arg_type& url, const arg_type& partToExtract) { boost::cmatch match; - if (!parse(url, match)) { + if (!detail::parse(url, match)) { return false; } if (partToExtract == "PROTOCOL") { @@ -142,72 +131,19 @@ struct ParseUrlFunction { } return simpleMatch(authorityMatch, kHost, result); } else if (partToExtract == "PATH") { - std::string_view path = submatch(match, kAuthPath); - if (hasAuthority) { - path = submatch(authAndPathMatch, 2); - } - result.setNoCopy(StringView(path.data(), path.size())); - return true; - } - // Path[?Query]. - else if (partToExtract == "FILE") { - std::string_view path = submatch(match, kAuthPath); - if (hasAuthority) { - path = submatch(authAndPathMatch, 2); - } - std::string_view query = submatch(match, kQuery); - if (!query.empty()) { - result.setNoCopy(StringView( - path.data(), (query.data() + query.size()) - path.data())); - } else { - result.setNoCopy(StringView(path.data(), path.size())); - } - return true; - } - - // Username[:Password]. 
- else if (partToExtract == "USERINFO") { - if (!hasAuthority) { - return false; - } - std::string_view username = submatch(authorityMatch, 1); - std::string_view password = submatch(authorityMatch, 2); - if (!password.empty()) { - result.setNoCopy( - StringView(username.data(), password.end() - username.begin())); - return true; - } else if (!username.empty()) { - result.setNoCopy(StringView(username.data(), username.size())); - return true; - } else { - return false; - } - } - // [Userinfo@]Host[:Port]. - else if (partToExtract == "AUTHORITY") { - if (!hasAuthority) { - return false; - } - std::string_view host = submatch(authorityMatch, kHost); - std::string_view first = host; - std::string_view last = host; - - std::string_view username = submatch(authorityMatch, 1); - std::string_view port = submatch(authorityMatch, 4); - if (!username.empty()) { - first = username; - } - if (!port.empty()) { - last = port; - } - result.setNoCopy(StringView(first.data(), last.end() - first.begin())); - return true; + return matchPath(match, authAndPathMatch, hasAuthority, result); + } else if (partToExtract == "FILE") { + return matchFile(match, authAndPathMatch, hasAuthority, result); + } else if (partToExtract == "USERINFO") { + return matchUserinfo(match, authorityMatch, hasAuthority, result); + } else if (partToExtract == "AUTHORITY") { + return matchAuthority(authorityMatch, hasAuthority, result); } return false; } - FOLLY_ALWAYS_INLINE bool call( + bool call( out_type& result, const arg_type& url, const arg_type& partToExtract, @@ -221,7 +157,7 @@ struct ParseUrlFunction { } boost::cmatch match; - if (!parse(url, match)) { + if (!detail::parse(url, match)) { return false; } @@ -234,16 +170,16 @@ struct ParseUrlFunction { // start of next parameter ); - auto query = submatch(match, kQuery); + auto query = detail::submatch(match, kQuery); const boost::cregex_iterator begin( query.data(), query.data() + query.size(), kQueryParamRegex); boost::cregex_iterator end; for (auto 
it = begin; it != end; ++it) { if (it->length(2) != 0) { // key shouldn't be empty. - auto k = submatch((*it), 2); + auto k = detail::submatch((*it), 2); if (key.compare(k) == 0) { - auto value = submatch((*it), 3); + auto value = detail::submatch((*it), 3); if (value != "") { result.setNoCopy(value); return true; @@ -258,13 +194,13 @@ struct ParseUrlFunction { } private: - FOLLY_ALWAYS_INLINE bool matchAuthorityAndPath( + bool matchAuthorityAndPath( const boost::cmatch& urlMatch, boost::cmatch& authAndPathMatch, boost::cmatch& authorityMatch, bool& hasAuthority) { static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?"); - auto authorityAndPath = submatch(urlMatch, kAuthPath); + auto authorityAndPath = detail::submatch(urlMatch, kAuthPath); if (!boost::regex_match( authorityAndPath.begin(), authorityAndPath.end(), @@ -293,18 +229,116 @@ struct ParseUrlFunction { hasAuthority = true; return true; } + FOLLY_ALWAYS_INLINE bool simpleMatch( const boost::cmatch& urlMatch, const int index, out_type& result) { - std::string_view sub = submatch(urlMatch, index); + StringView sub = detail::submatch(urlMatch, index); if (sub.empty()) { return false; } - result.setNoCopy(StringView(sub.data(), sub.size())); + result.setNoCopy(sub); return true; } + + bool matchPath( + const boost::cmatch& match, + const boost::cmatch& authAndPathMatch, + const bool hasAuthority, + out_type& result) { + StringView path = detail::submatch(match, kAuthPath); + if (hasAuthority) { + path = detail::submatch(authAndPathMatch, 2); + } + result.setNoCopy(path); + return true; + } + + bool matchFile( + const boost::cmatch& match, + const boost::cmatch& authAndPathMatch, + const bool hasAuthority, + out_type& result) { + // Path[?Query]. 
+ std::string_view path = + detail::submatch(match, kAuthPath); + if (hasAuthority) { + path = detail::submatch(authAndPathMatch, 2); + } + std::string_view query = detail::submatch(match, kQuery); + if (!query.empty()) { + result.setNoCopy( + StringView(path.data(), (query.data() + query.size()) - path.data())); + } else { + result.setNoCopy(StringView(path.data(), path.size())); + } + return true; + } + + bool matchUserinfo( + const boost::cmatch& match, + const boost::cmatch& authorityMatch, + const bool hasAuthority, + out_type& result) { + // Username[:Password]. + if (!hasAuthority) { + return false; + } + std::string_view username = + detail::submatch(authorityMatch, kUser); + std::string_view password = + detail::submatch(authorityMatch, kPass); + if (!password.empty()) { + result.setNoCopy( + StringView(username.data(), password.end() - username.begin())); + return true; + } else if (!username.empty()) { + result.setNoCopy(StringView(username.data(), username.size())); + return true; + } else { + return false; + } + } + + bool matchAuthority( + const boost::cmatch& authorityMatch, + const bool hasAuthority, + out_type& result) { + // [Userinfo@]Host[:Port]. 
+ if (!hasAuthority) { + return false; + } + std::string_view host = + detail::submatch(authorityMatch, kHost); + std::string_view first = host; + std::string_view last = host; + + std::string_view username = + detail::submatch(authorityMatch, kUser); + std::string_view port = + detail::submatch(authorityMatch, kPort); + if (!username.empty()) { + first = username; + } + if (!port.empty()) { + last = port; + } + result.setNoCopy(StringView(first.data(), last.end() - first.begin())); + return true; + } + + static constexpr int kAuthPath = 3; + static constexpr int kQuery = 4; + static constexpr int kHost = 3; + static constexpr int kProto = 2; + static constexpr int kRef = 5; + // submatch indexes for authorityMatch + static constexpr int kPathHasAuth = 2; + static constexpr int kPathNoAuth = 3; + static constexpr int kUser = 1; + static constexpr int kPass = 2; + static constexpr int kPort = 4; }; -} // namespace -} // namespace facebook::velox::functions::sparksql \ No newline at end of file +} // namespace facebook::velox::functions::sparksql