Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Move some logic from URLFunctions.h to URIParser #11761

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 99 additions & 93 deletions velox/functions/prestosql/URIParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

namespace facebook::velox::functions {

namespace detail {
namespace {
using Mask = std::bitset<128>;

Mask createMask(size_t low, size_t high) {
Expand Down Expand Up @@ -321,95 +321,6 @@ bool isAtCompression(const char* str, const size_t len, const int32_t pos) {
return pos < len - 1 && str[pos] == ':' && str[pos + 1] == ':';
}

// IPv6address = 6( h16 ":" ) ls32
// / "::" 5( h16 ":" ) ls32
// / [ h16 ] "::" 4( h16 ":" ) ls32
// / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
// / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
// / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
// / [ *4( h16 ":" ) h16 ] "::" ls32
// / [ *5( h16 ":" ) h16 ] "::" h16
// / [ *6( h16 ":" ) h16 ] "::"
// h16 = 1*4HEXDIG
// ls32 = ( h16 ":" h16 ) / IPv4address
// Attempts to consume an IPv6 address (see the grammar above) starting at
// str[pos].
//
// @param str Buffer being parsed.
// @param len Number of characters in `str`.
// @param pos In/out cursor. On success it is advanced to the first character
// after the address; on failure it is left unchanged.
// @return true if a valid IPv6 address was consumed.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
  bool hasCompression = false;
  // Address bytes accounted for so far: 2 per h16 group, 4 for an IPv4 tail.
  uint8_t numBytes = 0;
  // Work on a local cursor so that `pos` is only committed on success.
  int32_t posInAddress = pos;

  if (isAtCompression(str, len, posInAddress)) {
    hasCompression = true;
    // Consume the leading compression '::'.
    posInAddress += 2;
  }

  while (posInAddress < len && numBytes < 16) {
    // Scan one h16 group: between one and four hex digits.
    int32_t posInHex = posInAddress;
    for (int i = 0; i < 4; i++) {
      if (posInHex == len || !test(kHex, str[posInHex])) {
        break;
      }

      posInHex++;
    }

    if (posInHex == posInAddress) {
      // We need to be able to consume at least one hex digit.
      break;
    }

    if (posInHex < len) {
      if (str[posInHex] == '.') {
        // The digits we scanned may be the first octet of a trailing IPV4
        // address, so retry from the start of the group.
        if (tryConsumeIPV4Address(str, len, posInAddress)) {
          numBytes += 4;
          break;
        }

        // A '.' can't appear anywhere except in a valid IPV4 address.
        return false;
      }

      if (str[posInHex] == ':') {
        if (isAtCompression(str, len, posInHex)) {
          if (hasCompression) {
            // We can't have two compressions.
            return false;
          }

          // We found a 2 byte hex value followed by a compression.
          numBytes += 2;
          hasCompression = true;
          // Consume the hex block and the compression '::'.
          posInAddress = posInHex + 2;
          continue;
        }

        // Peek ahead, we can't end on a single ':'. The bounds check must come
        // first so that we never read past the end of the input.
        if (static_cast<size_t>(posInHex) + 1 >= len ||
            !test(kHex, str[posInHex + 1])) {
          return false;
        }

        // We found a 2 byte hex value followed by a single ':'.
        numBytes += 2;
        // Consume the hex block and the ':'.
        posInAddress = posInHex + 1;
        continue;
      }
    }

    // We found a 2 byte hex value that ends the address, either because the
    // input ended or because the next character can't be part of an address.
    // Handling the end-of-input case here guarantees the loop always makes
    // progress.
    numBytes += 2;
    posInAddress = posInHex;
    break;
  }

  // A valid IPv6 address must have exactly 16 bytes, or a compression.
  const bool isFullAddress = numBytes == 16 && !hasCompression;
  const bool isCompressedAddress =
      hasCompression && numBytes <= 14 && numBytes % 2 == 0;
  if (!isFullAddress && !isCompressedAddress) {
    return false;
  }

  pos = posInAddress;
  return true;
}

// IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
bool tryConsumeIPVFuture(const char* str, const size_t len, int32_t& pos) {
int32_t posInAddress = pos;
Expand Down Expand Up @@ -706,18 +617,113 @@ bool tryConsumeUri(const char* str, const size_t len, int32_t& pos, URI& uri) {
return true;
}

} // namespace detail
} // namespace

// IPv6address = 6( h16 ":" ) ls32
// / "::" 5( h16 ":" ) ls32
// / [ h16 ] "::" 4( h16 ":" ) ls32
// / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
// / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
// / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
// / [ *4( h16 ":" ) h16 ] "::" ls32
// / [ *5( h16 ":" ) h16 ] "::" h16
// / [ *6( h16 ":" ) h16 ] "::"
// h16 = 1*4HEXDIG
// ls32 = ( h16 ":" h16 ) / IPv4address
// Attempts to consume an IPv6 address (see the grammar above) starting at
// str[pos].
//
// @param str Buffer being parsed.
// @param len Number of characters in `str`.
// @param pos In/out cursor. On success it is advanced to the first character
// after the address; on failure it is left unchanged.
// @return true if a valid IPv6 address was consumed.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
  bool hasCompression = false;
  // Address bytes accounted for so far: 2 per h16 group, 4 for an IPv4 tail.
  uint8_t numBytes = 0;
  // Work on a local cursor so that `pos` is only committed on success.
  int32_t posInAddress = pos;

  if (isAtCompression(str, len, posInAddress)) {
    hasCompression = true;
    // Consume the leading compression '::'.
    posInAddress += 2;
  }

  while (posInAddress < len && numBytes < 16) {
    // Scan one h16 group: between one and four hex digits.
    int32_t posInHex = posInAddress;
    for (int i = 0; i < 4; i++) {
      if (posInHex == len || !test(kHex, str[posInHex])) {
        break;
      }

      posInHex++;
    }

    if (posInHex == posInAddress) {
      // We need to be able to consume at least one hex digit.
      break;
    }

    if (posInHex < len) {
      if (str[posInHex] == '.') {
        // The digits we scanned may be the first octet of a trailing IPV4
        // address, so retry from the start of the group.
        if (tryConsumeIPV4Address(str, len, posInAddress)) {
          numBytes += 4;
          break;
        }

        // A '.' can't appear anywhere except in a valid IPV4 address.
        return false;
      }

      if (str[posInHex] == ':') {
        if (isAtCompression(str, len, posInHex)) {
          if (hasCompression) {
            // We can't have two compressions.
            return false;
          }

          // We found a 2 byte hex value followed by a compression.
          numBytes += 2;
          hasCompression = true;
          // Consume the hex block and the compression '::'.
          posInAddress = posInHex + 2;
          continue;
        }

        // Peek ahead, we can't end on a single ':'. The bounds check must come
        // first so that we never read past the end of the input.
        if (static_cast<size_t>(posInHex) + 1 >= len ||
            !test(kHex, str[posInHex + 1])) {
          return false;
        }

        // We found a 2 byte hex value followed by a single ':'.
        numBytes += 2;
        // Consume the hex block and the ':'.
        posInAddress = posInHex + 1;
        continue;
      }
    }

    // We found a 2 byte hex value that ends the address, either because the
    // input ended or because the next character can't be part of an address.
    // The end-of-input case must be counted and consumed too, otherwise an
    // address that runs to the end of the buffer loses its last group.
    numBytes += 2;
    posInAddress = posInHex;
    break;
  }

  // A valid IPv6 address must have exactly 16 bytes, or a compression.
  const bool isFullAddress = numBytes == 16 && !hasCompression;
  const bool isCompressedAddress =
      hasCompression && numBytes <= 14 && numBytes % 2 == 0;
  if (!isFullAddress && !isCompressedAddress) {
    return false;
  }

  pos = posInAddress;
  return true;
}

// URI-reference = URI / relative-ref
// Parses `uriStr` as a URI-reference. Returns true only if one of the two
// attempts below consumes the entire input. `uri` is handed to the consume
// helpers, which presumably populate it (their bodies are not visible here).
// NOTE(review): this span is rendered from a diff and shows both the old
// `detail::`-qualified call lines and their unqualified replacements.
bool parseUri(const StringView& uriStr, URI& uri) {
int32_t pos = 0;
// First attempt: a full URI, accepted only if it covers the whole string.
if (detail::tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
if (tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
pos == uriStr.size()) {
return true;
}

// Second attempt: a relative reference. Rewind first, since a URI parse that
// succeeded on only a prefix of the input leaves `pos` advanced.
pos = 0;
detail::consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);

// The relative reference is only valid if nothing is left unconsumed.
return pos == uriStr.size();
}
Expand Down
67 changes: 67 additions & 0 deletions velox/functions/prestosql/URIParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,16 @@
*/
#pragma once

#include <boost/regex.hpp>
#include "velox/type/StringView.h"

namespace facebook::velox::functions {
namespace detail {
/// Builds a StringView over capture group `idx` of `match`, from the group's
/// start pointer and its length.
FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
  return StringView(match[idx].first, match[idx].length());
}
} // namespace detail
/// A struct containing the parts of the URI that were extracted during parsing.
/// If the field was not found, it is empty.
///
Expand All @@ -38,4 +45,64 @@ struct URI {

/// Parse a URI string into a URI struct according to RFC 3986.
bool parseUri(const StringView& uriStr, URI& uri);

/// If the characters of str (of length len) starting at index pos form a valid
/// IPv6 address, returns true and pos is updated to the first character after
/// the IP address. Otherwise returns false and pos is unchanged.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);

/// Classifies the byte at `inputBuffer[inputIndex]` (and, for a few lead
/// bytes, the byte after it) as the start of a UTF-8 sequence that can never
/// be valid. The following byte is only read when the lead byte is 0xe0, 0xf0
/// or 0xf4; the caller must ensure that index is readable in those cases.
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
    const T& inputBuffer,
    size_t inputIndex) {
  const auto lead = inputBuffer[inputIndex];

  // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of multi-byte
  // code points to tryGetUtf8CharLength, but are not part of any valid code
  // point.
  if (static_cast<unsigned char>(lead) > 0xf4 || lead == '\xc0' ||
      lead == '\xc1') {
    return true;
  }

  // 0xe0 followed by a byte in 0x80-0x9f (i.e. less than 0xa0) is an overlong
  // encoding.
  if (lead == '\xe0') {
    return (inputBuffer[inputIndex + 1] & 0xe0) == 0x80;
  }

  // 0xf0 followed by a byte in 0x80-0x8f (i.e. less than 0x90) is an overlong
  // encoding.
  if (lead == '\xf0') {
    return (inputBuffer[inputIndex + 1] & 0xf0) == 0x80;
  }

  // 0xf4 followed by a byte outside 0x80-0x8f (e.g. >= 0x90) looks valid to
  // tryGetUtf8CharLength, but is actually outside the range of valid code
  // points.
  if (lead == '\xf4') {
    return (inputBuffer[inputIndex + 1] & 0xf0) != 0x80;
  }

  return false;
}

/// Find an extract the value for the parameter with key `param` from the query
/// portion of a URI `query`. `query` should already be decoded if necessary.
template <typename TString>
std::optional<StringView> extractParameter(
const StringView& query,
const TString& param) {
if (!query.empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
"([^=&]*)=?" // parameter name and "=" if value is expected
"([^&]*)" // parameter value (allows "=" to appear)
"(?=(&|$))" // forward reference, next should be end of query or
// start of next parameter
);

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = detail::submatch((*it), 2);
if (param.compare(key) == 0) {
return detail::submatch((*it), 3);
}
}
}
}
return std::nullopt;
}
} // namespace facebook::velox::functions
54 changes: 3 additions & 51 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
#pragma once

#include <boost/regex.hpp>
#include "velox/external/utf8proc/utf8procImpl.h"
#include "velox/functions/Macros.h"
#include "velox/functions/lib/Utf8Utils.h"
Expand All @@ -39,11 +38,6 @@ constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};

/// Builds a StringView over capture group `idx` of `match`, from the group's
/// start pointer and its length.
FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
  return StringView(match[idx].first, match[idx].length());
}

/// Converts a value to its uppercase ASCII hex digit: '0' + c for values below
/// 10, otherwise 'A' + (c - 10). Presumably only called with values in
/// [0, 15]; larger values yield characters past 'F'.
FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
  if (c < 10) {
    return c + '0';
  }
  return c + 'A' - 10;
}
Expand All @@ -54,29 +48,6 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
output[2] = toHex(c % 16);
}

/// Classifies the byte at `inputBuffer[inputIndex]` (and, for a few lead
/// bytes, the byte after it) as the start of a UTF-8 sequence that can never
/// be valid. The following byte is only read when the lead byte is 0xe0, 0xf0
/// or 0xf4; the caller must ensure that index is readable in those cases.
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
    const T& inputBuffer,
    size_t inputIndex) {
  const auto lead = inputBuffer[inputIndex];

  // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of multi-byte
  // code points to tryGetUtf8CharLength, but are not part of any valid code
  // point.
  if (static_cast<unsigned char>(lead) > 0xf4 || lead == '\xc0' ||
      lead == '\xc1') {
    return true;
  }

  // 0xe0 followed by a byte in 0x80-0x9f (i.e. less than 0xa0) is an overlong
  // encoding.
  if (lead == '\xe0') {
    return (inputBuffer[inputIndex + 1] & 0xe0) == 0x80;
  }

  // 0xf0 followed by a byte in 0x80-0x8f (i.e. less than 0x90) is an overlong
  // encoding.
  if (lead == '\xf0') {
    return (inputBuffer[inputIndex + 1] & 0xf0) == 0x80;
  }

  // 0xf4 followed by a byte outside 0x80-0x8f (e.g. >= 0x90) looks valid to
  // tryGetUtf8CharLength, but is actually outside the range of valid code
  // points.
  if (lead == '\xf4') {
    return (inputBuffer[inputIndex + 1] & 0xf0) != 0x80;
  }

  return false;
}

/// Escapes ``input`` by encoding it so that it can be safely included in
/// URL query parameter names and values:
///
Expand Down Expand Up @@ -441,35 +412,16 @@ struct UrlExtractParameterFunction {
}

if (!uri.query.empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
"([^=&]*)=?" // parameter name and "=" if value is expected
"([^&]*)" // parameter value (allows "=" to appear)
"(?=(&|$))" // forward reference, next should be end of query or
// start of next parameter
);

StringView query = uri.query;
std::string unescapedQuery;
if (uri.queryHasEncoded) {
detail::urlUnescape(unescapedQuery, uri.query);
query = StringView(unescapedQuery);
}

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = detail::submatch((*it), 2);
if (param.compare(key) == 0) {
auto value = detail::submatch((*it), 3);
result.copy_from(value);
return true;
}
}
if (const auto value = extractParameter(query, param)) {
result.copy_from(value.value());
return true;
}
}

Expand Down
Loading
Loading