Skip to content

Commit

Permalink
Ensure regex authority is matched to fix sanitizer errors (#7866)
Browse files Browse the repository at this point in the history
Summary:

It is possible to trigger address sanitizer when passed garbage strings in matchAuthorityAndPath function. We need to ensure that subgroups are actually matched, since accessing an unmatched string can lead to asan errors.

Differential Revision: D51822902
  • Loading branch information
kgpai authored and facebook-github-bot committed Dec 6, 2023
1 parent e7bebbd commit f3a088e
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 45 deletions.
17 changes: 8 additions & 9 deletions velox/functions/prestosql/URLFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,17 @@
namespace facebook::velox::functions {

bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
boost::cmatch& authorityMatch,
bool& hasAuthority) {
const StringView& authorityAndPath,
boost::cmatch& authorityMatch) {
boost::cmatch authAndPathMatch;
static const boost::regex kAuthorityAndPathRegex("//([^/]*)(/.*)?");
auto authorityAndPath = submatch(urlMatch, 3);
if (!boost::regex_match(
authorityAndPath.begin(),
authorityAndPath.end(),
authAndPathMatch,
kAuthorityAndPathRegex)) {
// Does not start with //, doesn't have authority.
hasAuthority = false;
return true;
return false;
}

static const boost::regex kAuthorityRegex(
Expand All @@ -48,9 +45,11 @@ bool matchAuthorityAndPath(
return false; // Invalid URI Authority.
}

hasAuthority = true;
if (authorityMatch[3].matched) {
return true;
}

return true;
return false;
}

} // namespace facebook::velox::functions
85 changes: 49 additions & 36 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <boost/regex.hpp>
#include <cctype>
#include <optional>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"

Expand All @@ -28,8 +29,7 @@ FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
return StringView(sub.first, sub.length());
}

template <typename TInString>
bool parse(const TInString& rawUrl, boost::cmatch& match) {
bool parse(const StringView& rawUrl, boost::cmatch& match) {
/// This regex is taken from RFC - 3986.
/// See: https://www.rfc-editor.org/rfc/rfc3986#appendix-B
/// The basic groups are:
Expand Down Expand Up @@ -62,6 +62,23 @@ bool parse(const TInString& rawUrl, boost::cmatch& match) {
rawUrl.data(), rawUrl.data() + rawUrl.size(), match, kUriRegex);
}

/// Parses the url and returns the matching subgroup if the particular sub group
/// is matched by the call to parse(url, match).
std::optional<StringView>
parse(const StringView& rawUrl, boost::cmatch& match, int subGroup) {
if (!parse(rawUrl, match)) {
return std::nullopt;
}

VELOX_CHECK_LT(subGroup, match.size());

if (match[subGroup].matched) {
return submatch(match, subGroup);
}

return std::nullopt;
}

FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
return c < 10 ? (c + '0') : (c + 'A' - 10);
}
Expand Down Expand Up @@ -178,11 +195,12 @@ FOLLY_ALWAYS_INLINE void urlUnescape(

} // namespace

/// Matches the authority (i.e host[:port], ipaddress), and path from a string
/// representing the authority and path. Returns true if the regex matches, and
/// sets the appropriate groups matching authority in authorityMatch.
bool matchAuthorityAndPath(
const boost::cmatch& urlMatch,
boost::cmatch& authAndPathMatch,
boost::cmatch& authorityMatch,
bool& hasAuthority);
const StringView& authorityAndPath,
boost::cmatch& authorityMatch);

template <typename T>
struct UrlExtractProtocolFunction {
Expand All @@ -202,10 +220,10 @@ struct UrlExtractProtocolFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();
if (auto protocol = parse(url, match, 2)) {
result.setNoCopy(protocol.value());
} else {
result.setNoCopy(submatch(match, 2));
result.setEmpty();
}
return true;
}
Expand All @@ -229,10 +247,10 @@ struct UrlExtractFragmentFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
result.setEmpty();
if (auto fragment = parse(url, match, 9)) {
result.setNoCopy(fragment.value());
} else {
result.setNoCopy(submatch(match, 9));
result.setEmpty();
}
return true;
}
Expand All @@ -256,17 +274,14 @@ struct UrlExtractHostFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
auto authAndPath = parse(url, match, 3);
if (!authAndPath) {
result.setEmpty();
return true;
}
boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;

if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
if (matchAuthorityAndPath(authAndPath.value(), authorityMatch)) {
result.setNoCopy(submatch(authorityMatch, 3));
} else {
result.setEmpty();
Expand All @@ -284,16 +299,13 @@ struct UrlExtractPortFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
if (!parse(url, match, 4)) {
return false;
}

boost::cmatch authAndPathMatch;
boost::cmatch authorityMatch;
bool hasAuthority;
if (matchAuthorityAndPath(
match, authAndPathMatch, authorityMatch, hasAuthority) &&
hasAuthority) {
auto authAndPath = submatch(match, 3);
if (matchAuthorityAndPath(authAndPath, authorityMatch)) {
auto port = submatch(authorityMatch, 4);
if (!port.empty()) {
try {
Expand Down Expand Up @@ -322,13 +334,13 @@ struct UrlExtractPathFunction {
}

boost::cmatch match;
if (!parse(url, match)) {
if (auto path = parse(url, match, 5)) {
urlUnescape(result, path.value());
} else {
result.setEmpty();
return false;
}

urlUnescape(result, submatch(match, 5));

return true;
}
};
Expand All @@ -351,13 +363,12 @@ struct UrlExtractQueryFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
if (auto query = parse(url, match, 7)) {
result.setNoCopy(query.value());
} else {
result.setEmpty();
return true;
}

auto query = submatch(match, 7);
result.setNoCopy(query);
return true;
}
};
Expand All @@ -381,13 +392,13 @@ struct UrlExtractParameterFunction {
return false;
}
boost::cmatch match;
if (!parse(url, match)) {
auto query = parse(url, match, 7);
if (!query) {
result.setEmpty();
return false;
}

auto query = submatch(match, 7);
if (!query.empty()) {
if (!query.value().empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
Expand All @@ -398,11 +409,13 @@ struct UrlExtractParameterFunction {
);

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
query.value().data(),
query.value().data() + query.value().size(),
kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0) { // key shouldnt be empty.
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = submatch((*it), 2);
if (param.compare(key) == 0) {
auto value = submatch((*it), 3);
Expand Down
8 changes: 8 additions & 0 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ TEST_F(URLFunctionsTest, validateURL) {
std::nullopt,
std::nullopt,
std::nullopt);
validate(
"IC6S!8hGVRpo+!,yTaJEy/$RUZpqcr",
"",
"",
"IC6S!8hGVRpo !,yTaJEy/$RUZpqcr",
"",
"",
std::nullopt);
}

TEST_F(URLFunctionsTest, extractPath) {
Expand Down

0 comments on commit f3a088e

Please sign in to comment.