Skip to content

Commit

Permalink
Ensure we only use subgroups if they are matched in url_extract_* pre…
Browse files Browse the repository at this point in the history
…sto functions (#7866)

Summary:

It is possible to trigger address sanitizer when passed garbage strings in matchAuthorityAndPath function. We need to ensure that subgroups are actually matched, since accessing an unmatched string can lead to asan errors.

Differential Revision: D51822902
  • Loading branch information
kgpai authored and facebook-github-bot committed Dec 7, 2023
1 parent e963545 commit 682a97b
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 59 deletions.
21 changes: 11 additions & 10 deletions velox/functions/prestosql/URLFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,24 @@
*/

#include "URLFunctions.h"
#include <optional>
#include "velox/type/Type.h"

namespace facebook::velox::functions {

bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
std::optional<std::string_view> matchAuthorityAndPath(
StringView authorityAndPath,
boost::cmatch& authorityMatch,
bool& hasAuthority) {
int subGroup) {
boost::cmatch authAndPathMatch;
static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
auto authorityAndPath = submatch(urlMatch, 3);
if (!boost::regex_match(
authorityAndPath.begin(),
authorityAndPath.end(),
authAndPathMatch,
kAuthorityAndPathRegex)) {
// Does not start with //, doesn't have authority.
hasAuthority = false;
return true;
return std::nullopt;
}

static const boost::regex kAuthorityRegex(
Expand All @@ -45,12 +44,14 @@ bool matchAuthorityAndPath(
const auto authority = authAndPathMatch[1];
if (!boost::regex_match(
authority.first, authority.second, authorityMatch, kAuthorityRegex)) {
return false; // Invalid URI Authority.
return std::nullopt; // Invalid URI Authority.
}

hasAuthority = true;
if (authorityMatch[subGroup].matched) {
return authorityMatch[subGroup].str();
}

return true;
return std::nullopt;
}

} // namespace facebook::velox::functions
112 changes: 63 additions & 49 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <boost/regex.hpp>
#include <cctype>
#include <optional>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"

Expand All @@ -28,8 +29,7 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
return StringView(sub.first, sub.length());
}

template <typename TInString>
bool parse(const TInString& rawUrl, boost::cmatch& match) {
bool parse(const char* rawUrlData, size_t rawUrlsize, boost::cmatch& match) {
/// This regex is taken from RFC - 3986.
/// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
/// The basic groups are:
Expand Down Expand Up @@ -59,7 +59,24 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) {
"(#(.*))?"); // #fragment

return boost::regex_match(
rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
rawUrlData, rawUrlData + rawUrlsize, match, kUriRegex);
}

/// Parses the url and returns the matching subgroup if the particular sub group
/// is matched by the call to parse call above.
std::optional<StringView> parse(StringView rawUrl, int subGroup) {
boost::cmatch match;
if (!parse(rawUrl.data(), rawUrl.size(), match)) {
return std::nullopt;
}

VELOX_CHECK_LT(subGroup, match.size());

if (match[subGroup].matched) {
return submatch(match, subGroup);
}

return std::nullopt;
}

FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
Expand Down Expand Up @@ -110,7 +127,7 @@ FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
/// Performs initial validation of the URI.
/// Checks if the URI contains ascii whitespaces or
/// unescaped '%' chars.
bool isValidURI(const StringView& input) {
bool isValidURI(StringView input) {
const char* p = input.data();
const char* end = p + input.size();
char buf[3];
Expand Down Expand Up @@ -178,11 +195,13 @@ FOLLY_ALWAYS_INLINE void urlUnescape(

} // namespace

bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
/// Matches the authority (i.e host[:port], ipaddress), and path from a string
/// representing the authority and path. Returns true if the regex matches, and
/// sets the appropriate groups matching authority in authorityMatch.
std::optional<std::string_view> matchAuthorityAndPath(
StringView authorityAndPath,
boost::cmatch& authorityMatch,
bool& hasAuthority);
int subGroup);

template <typename T>
struct UrlExtractProtocolFunction {
Expand All @@ -201,11 +220,11 @@ struct UrlExtractProtocolFunction {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();

if (auto protocol = parse(url, 2)) {
result.setNoCopy(protocol.value());
} else {
result.setNoCopy(submatch(match, 2));
result.setEmpty();
}
return true;
}
Expand All @@ -228,11 +247,11 @@ struct UrlExtractFragmentFunction {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();

if (auto fragment = parse(url, 9)) {
result.setNoCopy(fragment.value());
} else {
result.setNoCopy(submatch(match, 9));
result.setEmpty();
}
return true;
}
Expand All @@ -255,19 +274,17 @@ struct UrlExtractHostFunction {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

auto authAndPath = parse(url, 3);
if (!authAndPath) {
result.setEmpty();
return true;
}
boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;

if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
result.setNoCopy(submatch(authorityMatch, 3));
if (auto host =
matchAuthorityAndPath(authAndPath.value(), authorityMatch)) {
result.setNoCopy(host.value());
} else {
result.setEmpty();
}
Expand All @@ -283,21 +300,18 @@ struct UrlExtractPortFunction {
if (!isValidURI(url)) {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

auto authAndPath = parse(url, 3);
if (!authAndPath) {
return false;
}

boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;
if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
auto port = submatch(authorityMatch, 4);
if (!port.empty()) {
if (auto port =
matchAuthorityAndPath(authAndPath.value(), authorityMatch, 4)) {
if (!port.value().empty()) {
try {
result = to<int64_t>(port);
result = to<int64_t>(port.value());
return true;
} catch (folly::ConversionError const&) {
}
Expand All @@ -321,14 +335,13 @@ struct UrlExtractPathFunction {
return false;
}

boost::cmatch match;
if (!parse(url, match)) {
if (auto path = parse(url, 5)) {
urlUnescape(result, path.value());
} else {
result.setEmpty();
return false;
}

urlUnescape(result, submatch(match, 5));

return true;
}
};
Expand All @@ -350,14 +363,13 @@ struct UrlExtractQueryFunction {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

if (auto query = parse(url, 7)) {
result.setNoCopy(query.value());
} else {
result.setEmpty();
return true;
}

auto query = submatch(match, 7);
result.setNoCopy(query);
return true;
}
};
Expand All @@ -380,14 +392,14 @@ struct UrlExtractParameterFunction {
result.setEmpty();
return false;
}
boost::cmatch match;
if (!parse(url, match)) {

auto query = parse(url, 7);
if (!query) {
result.setEmpty();
return false;
}

auto query = submatch(match, 7);
if (!query.empty()) {
if (!query.value().empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
Expand All @@ -398,11 +410,13 @@ struct UrlExtractParameterFunction {
);

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
query.value().data(),
query.value().data() + query.value().size(),
kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0) { // key shouldnt be empty.
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = submatch((*it), 2);
if (param.compare(key) == 0) {
auto value = submatch((*it), 3);
Expand Down
8 changes: 8 additions & 0 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) {
std::nullopt,
std::nullopt,
std::nullopt);
validate(
"IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr",
"",
"",
"IC6S!8hGVRpo !,yTaJEy/$RUZpqcr",
"",
"",
std::nullopt);
}

TEST_F(URLFunctionsTest, extractPath) {
Expand Down

0 comments on commit 682a97b

Please sign in to comment.