From fd84dce2c3d9584eb1d505fbee7038a170263175 Mon Sep 17 00:00:00 2001
From: "Schierbeck, Cody"
Date: Mon, 15 Jan 2024 13:18:17 -0800
Subject: [PATCH] Initial implementation

---
 velox/functions/sparksql/Register.cpp         |   6 +
 velox/functions/sparksql/URLFunctions.h       | 322 ++++++++++++++++++
 velox/functions/sparksql/tests/CMakeLists.txt |   1 +
 .../sparksql/tests/URLFunctionsTest.cpp       | 222 ++++++++++++++
 4 files changed, 551 insertions(+)
 create mode 100644 velox/functions/sparksql/URLFunctions.h
 create mode 100644 velox/functions/sparksql/tests/URLFunctionsTest.cpp

diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp
index 74d9c81e7f49d..65bd251a9c376 100644
--- a/velox/functions/sparksql/Register.cpp
+++ b/velox/functions/sparksql/Register.cpp
@@ -39,6 +39,7 @@
 #include "velox/functions/sparksql/Size.h"
 #include "velox/functions/sparksql/String.h"
 #include "velox/functions/sparksql/StringToMap.h"
+#include "velox/functions/sparksql/URLFunctions.h"
 #include "velox/functions/sparksql/UnscaledValueFunction.h"
 #include "velox/functions/sparksql/specialforms/DecimalRound.h"
 #include "velox/functions/sparksql/specialforms/MakeDecimal.h"
@@ -329,6 +330,11 @@ void registerFunctions(const std::string& prefix) {
   registerFunction<BloomFilterMightContainFunction, bool, Varbinary, int64_t>(
       {prefix + "might_contain"});
 
+  registerFunction<ParseUrlFunction, Varchar, Varchar, Varchar>(
+      {prefix + "parse_url"});
+  registerFunction<ParseUrlFunction, Varchar, Varchar, Varchar, Varchar>(
+      {prefix + "parse_url"});
+
   registerArrayMinMaxFunctions(prefix);
 
   // Register decimal vector functions.
diff --git a/velox/functions/sparksql/URLFunctions.h b/velox/functions/sparksql/URLFunctions.h
new file mode 100644
index 0000000000000..fad027a6cfa01
--- /dev/null
+++ b/velox/functions/sparksql/URLFunctions.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <boost/regex.hpp>
+#include <map>
+#include <string>
+#include <unordered_set>
+#include "velox/functions/Macros.h"
+
+namespace facebook::velox::functions::sparksql {
+namespace {
+
+// Splits a URI reference into its top-level components. Capture groups:
+// 2 = scheme, 3 = authority and path, 4 = query, 5 = fragment.
+static const boost::regex kUriRegex(
+    "(([a-zA-Z][a-zA-Z0-9+.-]*):)?" // scheme:
+    "([^?#]*)" // authority and path
+    "(?:\\?([^#]*))?" // ?query
+    "(?:#(.*))?"); // #fragment
+
+// Returns a StringView over capture group 'idx' of 'match'.
+FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
+  const auto& sub = match[idx];
+  return StringView(sub.first, sub.length());
+}
+
+// Matches all of 'rawUrl' against kUriRegex, storing the groups in 'match'.
+template <typename TInString>
+bool parse(const TInString& rawUrl, boost::cmatch& match) {
+  return boost::regex_match(
+      rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
+}
+
+// Parts that parse_url can extract. PROTOCOL, QUERY and REF are also used as
+// capture group indexes into kUriRegex; the remaining values are plain tags.
+enum submatchEnum {
+  PROTOCOL = 2,
+  QUERY = 4,
+  REF = 5,
+  HOST = 0,
+  PATH = 1,
+  FILE = 3,
+  USERINFO = 7,
+  AUTHORITY = 8,
+  UNKNOWN = 9
+};
+
+// Lookup table from the part name given to parse_url to its submatchEnum.
+struct submatchMap : std::map<std::string, submatchEnum> {
+  submatchMap() {
+    (*this)["PROTOCOL"] = PROTOCOL;
+    (*this)["QUERY"] = QUERY;
+    (*this)["REF"] = REF;
+    (*this)["HOST"] = HOST;
+    (*this)["PATH"] = PATH;
+    (*this)["FILE"] = FILE;
+    (*this)["USERINFO"] = USERINFO;
+    (*this)["AUTHORITY"] = AUTHORITY;
+  }
+};
+
+// Parts that only exist when the URL has a "//authority" component.
+static const std::unordered_set<submatchEnum> requiresAuthority = {
+    submatchEnum::HOST,
+    submatchEnum::USERINFO,
+    submatchEnum::AUTHORITY};
+
+} // namespace
+
+// parse_url(url, partToExtract) and parse_url(url, partToExtract, key).
+// Extracts PROTOCOL, HOST, PATH, QUERY, REF, FILE, USERINFO or AUTHORITY from
+// a URL, or the value of one query parameter. Returning false from call()
+// makes the result NULL.
+template <typename T>
+struct ParseUrlFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  // Results refer to strings in the first argument.
+  static constexpr int32_t reuse_strings_from_arg = 0;
+
+  // ASCII input always produces ASCII result.
+  static constexpr bool is_default_ascii_behavior = true;
+
+  // Capture group indexes into kUriRegex.
+  static constexpr int kAuthPath = 3;
+  static constexpr int kQuery = 4;
+  static constexpr int kPathNoAuth = 3;
+  // Capture group index of the path in kAuthorityAndPathRegex.
+  static constexpr int kPathHasAuth = 2;
+  // Capture group indexes into kAuthorityRegex.
+  static constexpr int kUser = 1;
+  static constexpr int kPass = 2;
+  static constexpr int kHost = 3;
+  static constexpr int kPort = 4;
+
+  // Writes the requested part of 'url' into 'result' as a view over 'url'.
+  // Returns false when 'url' does not parse, 'partToExtract' is not a known
+  // part name, or the part is absent. A missing path is not an error: PATH
+  // and FILE then produce an empty path.
+  FOLLY_ALWAYS_INLINE bool call(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>& url,
+      const arg_type<Varchar>& partToExtract) {
+    boost::cmatch match;
+    if (!parse(url, match)) {
+      return false;
+    }
+
+    submatchEnum partToExtractEnum = getSubmatchEnum(partToExtract);
+    switch (partToExtractEnum) {
+      case UNKNOWN:
+        return false;
+      case PROTOCOL:
+      case QUERY:
+      case REF:
+        return singleMatch(result, int(partToExtractEnum), match);
+      default:
+        break;
+    }
+    // Cases above do not require authority matching and handling.
+    boost::cmatch authAndPathMatch;
+    boost::cmatch authorityMatch;
+    bool hasAuthority = false;
+    if (!matchAuthorityAndPath(
+            match, authAndPathMatch, authorityMatch, hasAuthority)) {
+      return false;
+    }
+    if (!hasAuthority &&
+        requiresAuthority.find(partToExtractEnum) != requiresAuthority.end()) {
+      return false;
+    }
+    boost::cmatch* matchToUse = hasAuthority ? &authAndPathMatch : &match;
+    int pathMatch = hasAuthority ? kPathHasAuth : kPathNoAuth;
+    switch (partToExtractEnum) {
+      case HOST:
+        return singleMatch(result, kHost, authorityMatch);
+      case PATH:
+        // An absent path yields an empty string rather than NULL.
+        singleMatch(result, pathMatch, *matchToUse);
+        return true;
+      // Path[?Query].
+      case FILE: {
+        if (match[kQuery].matched) {
+          return doubleMatch(result, pathMatch, *matchToUse, kQuery, match);
+        }
+        singleMatch(result, pathMatch, *matchToUse);
+        return true;
+      }
+      // Username[:Password].
+      case USERINFO: {
+        if (authorityMatch[kPass].matched) {
+          return doubleMatch(
+              result, kUser, authorityMatch, kPass, authorityMatch);
+        }
+        return singleMatch(result, kUser, authorityMatch);
+      }
+      // [Userinfo@]Host[:Port].
+      case AUTHORITY: {
+        const char* start = authorityMatch[kUser].matched
+            ? authorityMatch[kUser].first
+            : authorityMatch[kHost].first;
+        auto* endMatch = authorityMatch[kPort].matched ? &authorityMatch[kPort]
+                                                       : &authorityMatch[kHost];
+        result.setNoCopy(StringView(
+            start, (*endMatch).first - start + (*endMatch).length()));
+        return true;
+      }
+      default:
+        return false;
+    }
+
+    return false;
+  }
+
+  // Writes the value of query parameter 'key' of 'url' into 'result'; the
+  // first occurrence of 'key' wins. Returns false unless 'partToExtract' is
+  // "QUERY", 'key' is non-empty, 'url' parses and its query contains 'key'.
+  FOLLY_ALWAYS_INLINE bool call(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>& url,
+      const arg_type<Varchar>& partToExtract,
+      const arg_type<Varchar>& key) {
+    // Only "QUERY" supports the third parameter.
+    if (partToExtract != "QUERY") {
+      return false;
+    }
+    if (key.empty()) {
+      return false;
+    }
+
+    boost::cmatch match;
+    if (!parse(url, match)) {
+      return false;
+    }
+
+    // Parse query string.
+    static const boost::regex kQueryParamRegex(
+        "(^|&)" // start of query or start of parameter "&"
+        "([^=&]*)=?" // parameter name and "=" if value is expected
+        "([^=&]*)" // parameter value
+        "(?=(&|$))" // forward reference, next should be end of query or
+                    // start of next parameter
+    );
+
+    auto query = submatch(match, kQuery);
+    const boost::cregex_iterator begin(
+        query.data(), query.data() + query.size(), kQueryParamRegex);
+    boost::cregex_iterator end;
+
+    for (auto it = begin; it != end; ++it) {
+      if (it->length(2) != 0) { // key shouldn't be empty.
+        auto k = submatch((*it), 2);
+        if (key.compare(k) == 0) {
+          auto value = submatch((*it), 3);
+          result.setNoCopy(value);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+ private:
+  // Points 'result' at capture group 'submatchIndex' of 'match'. Returns false
+  // and leaves 'result' unchanged when that group did not participate.
+  bool singleMatch(
+      out_type<Varchar>& result,
+      int submatchIndex,
+      boost::cmatch& match) {
+    if (!match[submatchIndex].matched) {
+      return false;
+    }
+    result.setNoCopy(submatch(match, submatchIndex));
+    return true;
+  }
+
+  // Points 'result' at the span from the start of group 'submatchIndex1' of
+  // 'match1' to the end of group 'submatchIndex2' of 'match2'. Both matches
+  // must have been made over the same buffer. Always returns true.
+  bool doubleMatch(
+      out_type<Varchar>& result,
+      int submatchIndex1,
+      boost::cmatch& match1,
+      int submatchIndex2,
+      boost::cmatch& match2) {
+    result.setNoCopy(StringView(
+        match1[submatchIndex1].first,
+        match2[submatchIndex2].first - match1[submatchIndex1].first +
+            match2[submatchIndex2].length()));
+    return true;
+  }
+
+  // Returns the submatchEnum named by 'partToExtract', or UNKNOWN.
+  submatchEnum getSubmatchEnum(const arg_type<Varchar>& partToExtract) {
+    static const submatchMap submatchMap;
+    auto it = submatchMap.find(partToExtract);
+    if (it != submatchMap.end()) {
+      return it->second;
+    }
+    return submatchEnum::UNKNOWN;
+  }
+
+  // Splits group kAuthPath of 'urlMatch' into authority and path
+  // ('authAndPathMatch'), then the authority into user, password, host and
+  // port ('authorityMatch'). Sets 'hasAuthority' to whether an authority was
+  // found. Returns false only when an authority is present but malformed.
+  FOLLY_ALWAYS_INLINE bool matchAuthorityAndPath(
+      const boost::cmatch& urlMatch,
+      boost::cmatch& authAndPathMatch,
+      boost::cmatch& authorityMatch,
+      bool& hasAuthority) {
+    static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
+    // Match on the URL's own buffer. Going through a temporary StringView is
+    // unsafe: short strings are stored inline, so the submatches would point
+    // into a local that is gone once this function returns.
+    const auto& authorityAndPath = urlMatch[kAuthPath];
+    if (!boost::regex_match(
+            authorityAndPath.first,
+            authorityAndPath.second,
+            authAndPathMatch,
+            kAuthorityAndPathRegex)) {
+      // Does not start with //, doesn't have authority.
+      hasAuthority = false;
+      return true;
+    }
+
+    static const boost::regex kAuthorityRegex(
+        "(?:([^@:]*)(?::([^@]*))?@)?" // username, password.
+        "(\\[[^\\]]*\\]|[^\\[:]*)" // host (IP-literal (e.g. '['+IPv6+']',
+                                   // dotted-IPv4, or named host).
+        "(?::(\\d*))?"); // port.
+
+    const auto authority = authAndPathMatch[1];
+    if (!boost::regex_match(
+            authority.first,
+            authority.second,
+            authorityMatch,
+            kAuthorityRegex)) {
+      return false; // Invalid URI Authority.
+    }
+
+    hasAuthority = true;
+    return true;
+  }
+};
+
+} // namespace facebook::velox::functions::sparksql
diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt
index 86d4d1fa5d52e..1af7fe56ce324 100644
--- a/velox/functions/sparksql/tests/CMakeLists.txt
+++ b/velox/functions/sparksql/tests/CMakeLists.txt
@@ -41,6 +41,7 @@ add_executable(
   StringTest.cpp
   StringToMapTest.cpp
   UnscaledValueFunctionTest.cpp
+  URLFunctionsTest.cpp
   XxHash64Test.cpp)
 
 add_test(velox_functions_spark_test velox_functions_spark_test)
diff --git a/velox/functions/sparksql/tests/URLFunctionsTest.cpp b/velox/functions/sparksql/tests/URLFunctionsTest.cpp
new file mode 100644
index 0000000000000..7ae6b1b79653c
--- /dev/null
+++ b/velox/functions/sparksql/tests/URLFunctionsTest.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <gtest/gtest.h>
+#include <optional>
+
+#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h"
+
+namespace facebook::velox::functions::sparksql::test {
+namespace {
+// Concatenates two optional strings; a missing operand yields the other one.
+std::optional<std::string> operator+(
+    const std::optional<std::string>& opt1,
+    const std::optional<std::string>& opt2) {
+  if (!opt1.has_value()) {
+    return opt2;
+  }
+  if (!opt2.has_value()) {
+    return opt1;
+  }
+  return opt1.value() + opt2.value();
+}
+// Concatenates two optional strings; missing when either operand is missing.
+std::optional<std::string> operator&(
+    const std::optional<std::string>& opt1,
+    const std::optional<std::string>& opt2) {
+  if (!opt1.has_value() || !opt2.has_value()) {
+    return std::nullopt;
+  }
+  return opt1.value() + opt2.value();
+}
+
+class URLFunctionsTest
+    : public functions::sparksql::test::SparkFunctionBaseTest {
+ protected:
+  void validate(
+      const std::optional<std::string>& url,
+      const std::optional<std::string>& expectedProtocol,
+      const std::optional<std::string>& expectedUserinfo,
+      const std::optional<std::string>& expectedHost,
+      const std::optional<std::string>& expectedPort,
+      const std::optional<std::string>& expectedPath,
+      const std::optional<std::string>& expectedQuery,
+      const std::optional<std::string>& expectedRef) {
+    const auto parseUrl = [&](const std::optional<std::string>& partToExtract)
+        -> std::optional<std::string> {
+      return evaluateOnce<std::string>("parse_url(c0, c1)", url, partToExtract);
+    };
+
+    EXPECT_EQ(parseUrl("PROTOCOL"), expectedProtocol);
+    // USERINFO, HOST and AUTHORITY, which is [userinfo@]host[:port].
+    EXPECT_EQ(parseUrl("USERINFO"), expectedUserinfo);
+    EXPECT_EQ(parseUrl("HOST"), expectedHost);
+    auto expectedAuthority =
+        (expectedUserinfo & "@") + expectedHost + (":" & expectedPort);
+    EXPECT_EQ(parseUrl("AUTHORITY"), expectedAuthority);
+
+    EXPECT_EQ(parseUrl("PATH"), expectedPath);
+    EXPECT_EQ(parseUrl("QUERY"), expectedQuery);
+    // FILE is path[?query].
+    auto expectedFile = expectedPath + ("?" & expectedQuery);
+    EXPECT_EQ(parseUrl("FILE"), expectedFile);
+
+    EXPECT_EQ(parseUrl("REF"), expectedRef);
+  }
+};
+
+TEST_F(URLFunctionsTest, validateURL) {
+  validate(
+      "http://user:pass@example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1",
+      "http",
+      "user:pass",
+      "example.com",
+      "8080",
+      "/path1/p.php",
+      "k1=v1&k2=v2",
+      "Ref1");
+  validate(
+      "HTTP://example.com/path1/p.php",
+      "HTTP",
+      std::nullopt,
+      "example.com",
+      std::nullopt,
+      "/path1/p.php",
+      std::nullopt,
+      std::nullopt);
+  validate(
+      "http://example.com:8080/path1/p.php?k1=v1&k2=v2#Ref1",
+      "http",
+      std::nullopt,
+      "example.com",
+      "8080",
+      "/path1/p.php",
+      "k1=v1&k2=v2",
+      "Ref1");
+  validate(
+      "https://username@example.com",
+      "https",
+      "username",
+      "example.com",
+      std::nullopt,
+      "",
+      std::nullopt,
+      std::nullopt);
+  validate(
+      "https:/auth/login.html",
+      "https",
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      "/auth/login.html",
+      std::nullopt,
+      std::nullopt);
+  validate(
+      "foo",
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      std::nullopt,
+      "foo",
+      std::nullopt,
+      std::nullopt);
+}
+
+TEST_F(URLFunctionsTest, validateParameter) {
+  const auto checkParseUrlWithKey =
+      [&](const std::optional<std::string>& expected,
+          const std::optional<std::string>& url,
+          const std::optional<std::string>& key) {
+        const std::optional<std::string>& partToExtract = "QUERY";
+        EXPECT_EQ(
+            evaluateOnce<std::string>(
+                "parse_url(c0, c1, c2)", url, partToExtract, key),
+            expected);
+      };
+
+  checkParseUrlWithKey(
+      "v2", "http://example.com/path1/p.php?k1=v1&k2=v2#Ref1", "k2");
+  checkParseUrlWithKey(
+      "v1", "http://example.com/path1/p.php?k1=v1&k2=v2&k3&k4#Ref1", "k1");
+  checkParseUrlWithKey(
+      "", "http://example.com/path1/p.php?k1=v1&k2=v2&k3&k4#Ref1", "k3");
+  checkParseUrlWithKey(
+      std::nullopt,
+      "http://example.com/path1/p.php?k1=v1&k2=v2&k3&k4#Ref1",
+      "k6");
+  checkParseUrlWithKey(std::nullopt, "foo", "");
+}
+
+TEST_F(URLFunctionsTest, sparkUT) {
+  const auto checkParseUrl =
+      [&](const std::optional<std::string>& expected,
+          const std::optional<std::string>& url,
+          const std::optional<std::string>& partToExtract) {
+        EXPECT_EQ(
+            evaluateOnce<std::string>("parse_url(c0, c1)", url, partToExtract),
+            expected);
+      };
+  const auto checkParseUrlWithKey =
+      [&](const std::optional<std::string>& expected,
+          const std::optional<std::string>& url,
+          const std::optional<std::string>& partToExtract,
+          const std::optional<std::string>& key) {
+        EXPECT_EQ(
+            evaluateOnce<std::string>(
+                "parse_url(c0, c1, c2)", url, partToExtract, key),
+            expected);
+      };
+
+  checkParseUrl(
+      "spark.apache.org", "http://spark.apache.org/path?query=1", "HOST");
+  checkParseUrl("/path", "http://spark.apache.org/path?query=1", "PATH");
+  checkParseUrl("query=1", "http://spark.apache.org/path?query=1", "QUERY");
+  checkParseUrl("Ref", "http://spark.apache.org/path?query=1#Ref", "REF");
+  checkParseUrl("http", "http://spark.apache.org/path?query=1", "PROTOCOL");
+  checkParseUrl(
+      "/path?query=1", "http://spark.apache.org/path?query=1", "FILE");
+  checkParseUrl(
+      "spark.apache.org:8080",
+      "http://spark.apache.org:8080/path?query=1",
+      "AUTHORITY");
+  checkParseUrl(
+      "userinfo", "http://userinfo@spark.apache.org/path?query=1", "USERINFO");
+  checkParseUrlWithKey(
+      "1", "http://spark.apache.org/path?query=1", "QUERY", "query");
+
+  // NULL inputs, unknown parts and absent components all yield NULL.
+  checkParseUrl(std::nullopt, std::nullopt, "HOST");
+  checkParseUrl(
+      std::nullopt, "http://spark.apache.org/path?query=1", std::nullopt);
+  checkParseUrl(std::nullopt, std::nullopt, std::nullopt);
+  checkParseUrl(std::nullopt, "test", "HOST");
+  checkParseUrl(std::nullopt, "http://spark.apache.org/path?query=1", "NO");
+  checkParseUrl(
+      std::nullopt, "http://spark.apache.org/path?query=1", "USERINFO");
+  checkParseUrlWithKey(
+      std::nullopt, "http://spark.apache.org/path?query=1", "HOST", "query");
+  checkParseUrlWithKey(
+      std::nullopt, "http://spark.apache.org/path?query=1", "QUERY", "quer");
+  checkParseUrlWithKey(
+      std::nullopt,
+      "http://spark.apache.org/path?query=1",
+      "QUERY",
+      std::nullopt);
+  checkParseUrlWithKey(
+      std::nullopt, "http://spark.apache.org/path?query=1", "QUERY", "");
+}
+
+} // namespace
+} // namespace facebook::velox::functions::sparksql::test