Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: Move some logic from URLFunctions.h to URIParser
Browse files Browse the repository at this point in the history
Summary:
There are some UDFs outside of prestosql's URL functions where I'd like to reuse some of the logic I
wrote for handling URIs.

Specifically:
* tryConsumeIPV6Address: this is generally useful for parsing IPv6 addresses  
* isMultipleInvalidSequences: this is generally useful for determining how many valid subsequences
  make up an invalid code point from tryGetUtf8CharLength.
* extractParameter: this is generally useful for extracting the parameter from a URI's query string

This change moves those functions into URIParser where they can be reused.

Differential Revision: D66832201
Kevin Wilfong authored and facebook-github-bot committed Dec 5, 2024
1 parent 37a5ffb commit 9044c56
Showing 3 changed files with 165 additions and 144 deletions.
186 changes: 93 additions & 93 deletions velox/functions/prestosql/URIParser.cpp
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@

namespace facebook::velox::functions {

namespace detail {
namespace {
using Mask = std::bitset<128>;

Mask createMask(size_t low, size_t high) {
@@ -321,95 +321,6 @@ bool isAtCompression(const char* str, const size_t len, const int32_t pos) {
return pos < len - 1 && str[pos] == ':' && str[pos + 1] == ':';
}

// IPv6address =                            6( h16 ":" ) ls32
//             /                       "::" 5( h16 ":" ) ls32
//             / [               h16 ] "::" 4( h16 ":" ) ls32
//             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
//             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
//             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
//             / [ *4( h16 ":" ) h16 ] "::"              ls32
//             / [ *5( h16 ":" ) h16 ] "::"              h16
//             / [ *6( h16 ":" ) h16 ] "::"
// h16         = 1*4HEXDIG
// ls32        = ( h16 ":" h16 ) / IPv4address
//
// Tries to consume an IPv6 address from `str` (of length `len`) starting at
// offset `pos`. Returns true and moves `pos` to the first character after the
// address on success; returns false and leaves `pos` untouched otherwise.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
  bool hasCompression = false;
  // Number of address bytes accounted for so far (2 per h16, 4 per IPv4).
  uint8_t numBytes = 0;
  // Work on a copy so `pos` is only updated once the address is validated.
  int32_t posInAddress = pos;

  if (isAtCompression(str, len, posInAddress)) {
    hasCompression = true;
    // Consume the compression '::'.
    posInAddress += 2;
  }

  while (posInAddress < len && numBytes < 16) {
    // Scan one h16 block: between one and four hex digits.
    int32_t posInHex = posInAddress;
    for (int i = 0; i < 4; i++) {
      if (posInHex == len || !test(kHex, str[posInHex])) {
        break;
      }

      posInHex++;
    }

    if (posInHex == posInAddress) {
      // We need to be able to consume at least one hex digit.
      break;
    }

    if (posInHex == len) {
      // The hex block runs up to the end of the input. Count it and stop;
      // without this the loop would never advance posInAddress and would
      // spin forever.
      numBytes += 2;
      posInAddress = posInHex;
      break;
    }

    if (str[posInHex] == '.') {
      // We may be in the IPV4 Address.
      if (tryConsumeIPV4Address(str, len, posInAddress)) {
        numBytes += 4;
        break;
      }
      // A '.' can't appear anywhere except in a valid IPV4 address.
      return false;
    }

    if (str[posInHex] != ':') {
      // We found a 2 byte hex value followed by a character that cannot be
      // part of the address; the address ends here.
      numBytes += 2;
      posInAddress = posInHex;
      break;
    }

    if (isAtCompression(str, len, posInHex)) {
      if (hasCompression) {
        // We can't have two compressions.
        return false;
      }
      // We found a 2 byte hex value followed by a compression.
      numBytes += 2;
      hasCompression = true;
      // Consume the hex block and the compression '::'.
      posInAddress = posInHex + 2;
    } else {
      // Peek ahead, we can't end on a single ':'. The bounds check must come
      // first so that we never read past the end of the input when the ':'
      // is its last character.
      if (posInHex + 1 >= len || !test(kHex, str[posInHex + 1])) {
        return false;
      }
      // We found a 2 byte hex value followed by a single ':'.
      numBytes += 2;
      // Consume the hex block and the ':'.
      posInAddress = posInHex + 1;
    }
  }

  // A valid IPv6 address must have exactly 16 bytes, or a compression standing
  // in for the missing ones.
  const bool isFullAddress = numBytes == 16 && !hasCompression;
  const bool isCompressedAddress =
      hasCompression && numBytes <= 14 && numBytes % 2 == 0;
  if (!isFullAddress && !isCompressedAddress) {
    return false;
  }

  pos = posInAddress;
  return true;
}

// IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
bool tryConsumeIPVFuture(const char* str, const size_t len, int32_t& pos) {
int32_t posInAddress = pos;
@@ -706,18 +617,107 @@ bool tryConsumeUri(const char* str, const size_t len, int32_t& pos, URI& uri) {
return true;
}

} // namespace detail
} // namespace

// IPv6address =                            6( h16 ":" ) ls32
//             /                       "::" 5( h16 ":" ) ls32
//             / [               h16 ] "::" 4( h16 ":" ) ls32
//             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
//             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
//             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
//             / [ *4( h16 ":" ) h16 ] "::"              ls32
//             / [ *5( h16 ":" ) h16 ] "::"              h16
//             / [ *6( h16 ":" ) h16 ] "::"
// h16         = 1*4HEXDIG
// ls32        = ( h16 ":" h16 ) / IPv4address
//
// Tries to consume an IPv6 address from `str` (of length `len`) starting at
// offset `pos`. Returns true and moves `pos` to the first character after the
// address on success; returns false and leaves `pos` untouched otherwise.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos) {
  bool hasCompression = false;
  // Number of address bytes accounted for so far (2 per h16, 4 per IPv4).
  uint8_t numBytes = 0;
  // Work on a copy so `pos` is only updated once the address is validated.
  int32_t posInAddress = pos;

  if (isAtCompression(str, len, posInAddress)) {
    hasCompression = true;
    // Consume the compression '::'.
    posInAddress += 2;
  }

  while (posInAddress < len && numBytes < 16) {
    // Scan one h16 block: between one and four hex digits.
    int32_t posInHex = posInAddress;
    for (int i = 0; i < 4; i++) {
      if (posInHex == len || !test(kHex, str[posInHex])) {
        break;
      }

      posInHex++;
    }

    if (posInHex == posInAddress) {
      // We need to be able to consume at least one hex digit.
      break;
    }

    if (posInHex == len) {
      // The hex block runs up to the end of the input. Count it and stop;
      // without this the loop would never advance posInAddress and would
      // spin forever.
      numBytes += 2;
      posInAddress = posInHex;
      break;
    }

    if (str[posInHex] == '.') {
      // We may be in the IPV4 Address.
      if (tryConsumeIPV4Address(str, len, posInAddress)) {
        numBytes += 4;
        break;
      }
      // A '.' can't appear anywhere except in a valid IPV4 address.
      return false;
    }

    if (str[posInHex] != ':') {
      // We found a 2 byte hex value followed by a character that cannot be
      // part of the address; the address ends here.
      numBytes += 2;
      posInAddress = posInHex;
      break;
    }

    if (isAtCompression(str, len, posInHex)) {
      if (hasCompression) {
        // We can't have two compressions.
        return false;
      }
      // We found a 2 byte hex value followed by a compression.
      numBytes += 2;
      hasCompression = true;
      // Consume the hex block and the compression '::'.
      posInAddress = posInHex + 2;
    } else {
      // Peek ahead, we can't end on a single ':'. The bounds check must come
      // first so that we never read past the end of the input when the ':'
      // is its last character.
      if (posInHex + 1 >= len || !test(kHex, str[posInHex + 1])) {
        return false;
      }
      // We found a 2 byte hex value followed by a single ':'.
      numBytes += 2;
      // Consume the hex block and the ':'.
      posInAddress = posInHex + 1;
    }
  }

  // A valid IPv6 address must have exactly 16 bytes, or a compression standing
  // in for the missing ones.
  const bool isFullAddress = numBytes == 16 && !hasCompression;
  const bool isCompressedAddress =
      hasCompression && numBytes <= 14 && numBytes % 2 == 0;
  if (!isFullAddress && !isCompressedAddress) {
    return false;
  }

  pos = posInAddress;
  return true;
}

// URI-reference = URI / relative-ref
//
// Parses `uriStr` into `uri`. The input is first tried as a full URI; if that
// attempt fails or does not consume the whole input, parsing restarts from
// offset 0 as a relative reference. Returns true only when the accepted parse
// consumed every character of `uriStr`.
//
// NOTE(review): this listing is a rendered diff, so each call below appears
// twice: once with the old `detail::` qualification and once without it.
bool parseUri(const StringView& uriStr, URI& uri) {
int32_t pos = 0;
if (detail::tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
if (tryConsumeUri(uriStr.data(), uriStr.size(), pos, uri) &&
pos == uriStr.size()) {
return true;
}

// Not a complete URI: rewind and retry as a relative reference.
pos = 0;
detail::consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);
consumeRelativeRef(uriStr.data(), uriStr.size(), pos, uri);

// Trailing unconsumed characters mean the input is not a valid reference.
return pos == uriStr.size();
}
67 changes: 67 additions & 0 deletions velox/functions/prestosql/URIParser.h
Original file line number Diff line number Diff line change
@@ -15,9 +15,16 @@
*/
#pragma once

#include <boost/regex.hpp>
#include "velox/type/StringView.h"

namespace facebook::velox::functions {
namespace detail {
/// Builds a StringView over capture group `idx` of `match`, using the group's
/// start pointer and length.
/// NOTE(review): presumably the view does not own its bytes, so the buffer
/// that was matched must outlive the result -- confirm against StringView.
FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
  const auto& group = match[idx];
  const char* const start = group.first;
  return StringView(start, group.length());
}
} // namespace detail
/// A struct containing the parts of the URI that were extracted during parsing.
/// If the field was not found, it is empty.
///
@@ -38,4 +45,64 @@ struct URI {

/// Parse a URI string into a URI struct according to RFC 3986.
bool parseUri(const StringView& uriStr, URI& uri);

/// If the string starting at str is a valid IPv6 address, returns true and pos
/// is updated to the first character after the IP address. Otherwise returns
/// false and pos is unchanged.
bool tryConsumeIPV6Address(const char* str, const size_t len, int32_t& pos);

/// Inspects the byte at `inputIndex` (and, for the lead bytes 0xe0, 0xf0 and
/// 0xf4, the byte after it) and returns true when they match one of the
/// patterns below that look like the start of a multi-byte code point but can
/// never be part of a valid one.
///
/// `inputBuffer` only needs to support operator[] yielding a byte-sized
/// value. Bytes are normalized to unsigned char before comparison, so the
/// result does not depend on whether the element type is signed.
///
/// No bounds checking is performed here: when the lead byte is 0xe0, 0xf0 or
/// 0xf4, `inputIndex + 1` is read, so the caller must guarantee it is valid.
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
    const T& inputBuffer,
    size_t inputIndex) {
  const auto lead = static_cast<unsigned char>(inputBuffer[inputIndex]);

  // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of multi-byte
  // code points to tryGetUtf8CharLength, but are not part of any valid code
  // point.
  if (lead > 0xf4 || lead == 0xc0 || lead == 0xc1) {
    return true;
  }

  // Every remaining pattern depends on the second byte; only read it for the
  // lead bytes that need it.
  if (lead != 0xe0 && lead != 0xf0 && lead != 0xf4) {
    return false;
  }

  const auto second = static_cast<unsigned char>(inputBuffer[inputIndex + 1]);

  if (lead == 0xe0) {
    // 0xe0 followed by a continuation byte less than 0xa0 is considered an
    // overlong encoding.
    return (second & 0xe0) == 0x80;
  }

  if (lead == 0xf0) {
    // 0xf0 followed by a continuation byte less than 0x90 is considered an
    // overlong encoding.
    return (second & 0xf0) == 0x80;
  }

  // 0xf4 followed by a byte >= 0x90 looks valid to tryGetUtf8CharLength, but
  // is actually outside the range of valid code points.
  return (second & 0xf0) != 0x80;
}

/// Find an extract the value for the parameter with key `param` from the query
/// portion of a URI `query`. `query` should already be decoded if necessary.
template <typename TString>
std::optional<StringView> extractParameter(
const StringView& query,
const TString& param) {
if (!query.empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
"([^=&]*)=?" // parameter name and "=" if value is expected
"([^&]*)" // parameter value (allows "=" to appear)
"(?=(&|$))" // forward reference, next should be end of query or
// start of next parameter
);

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = detail::submatch((*it), 2);
if (param.compare(key) == 0) {
return detail::submatch((*it), 3);
}
}
}
}
return std::nullopt;
}
} // namespace facebook::velox::functions
56 changes: 5 additions & 51 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
@@ -15,7 +15,6 @@
*/
#pragma once

#include <boost/regex.hpp>
#include "velox/external/utf8proc/utf8procImpl.h"
#include "velox/functions/Macros.h"
#include "velox/functions/lib/Utf8Utils.h"
@@ -39,11 +38,6 @@ constexpr std::array<std::string_view, 6> kDecodedReplacementCharacterStrings{
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd",
"\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"};

/// Builds a StringView over capture group `idx` of `match`, using the group's
/// start pointer and length.
/// NOTE(review): presumably the view does not own its bytes, so the buffer
/// that was matched must outlive the result -- confirm against StringView.
FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
  const auto& group = match[idx];
  const char* const start = group.first;
  return StringView(start, group.length());
}

/// Converts a numeric value to its hex digit character: values below 10 map
/// onto '0'..'9', larger values onto the letters starting at 'A'. The result
/// is only a hex digit when `c` is in [0, 15]; other inputs are not checked.
FOLLY_ALWAYS_INLINE unsigned char toHex(unsigned char c) {
  if (c < 10) {
    return c + '0';
  }
  return c + 'A' - 10;
}
@@ -54,29 +48,6 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
output[2] = toHex(c % 16);
}

/// Inspects the byte at `inputIndex` (and, for the lead bytes 0xe0, 0xf0 and
/// 0xf4, the byte after it) and returns true when they match one of the
/// patterns below that look like the start of a multi-byte code point but can
/// never be part of a valid one.
///
/// `inputBuffer` only needs to support operator[] yielding a byte-sized
/// value. Bytes are normalized to unsigned char before comparison, so the
/// result does not depend on whether the element type is signed.
///
/// No bounds checking is performed here: when the lead byte is 0xe0, 0xf0 or
/// 0xf4, `inputIndex + 1` is read, so the caller must guarantee it is valid.
template <typename T>
FOLLY_ALWAYS_INLINE bool isMultipleInvalidSequences(
    const T& inputBuffer,
    size_t inputIndex) {
  const auto lead = static_cast<unsigned char>(inputBuffer[inputIndex]);

  // The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of multi-byte
  // code points to tryGetUtf8CharLength, but are not part of any valid code
  // point.
  if (lead > 0xf4 || lead == 0xc0 || lead == 0xc1) {
    return true;
  }

  // Every remaining pattern depends on the second byte; only read it for the
  // lead bytes that need it.
  if (lead != 0xe0 && lead != 0xf0 && lead != 0xf4) {
    return false;
  }

  const auto second = static_cast<unsigned char>(inputBuffer[inputIndex + 1]);

  if (lead == 0xe0) {
    // 0xe0 followed by a continuation byte less than 0xa0 is considered an
    // overlong encoding.
    return (second & 0xe0) == 0x80;
  }

  if (lead == 0xf0) {
    // 0xf0 followed by a continuation byte less than 0x90 is considered an
    // overlong encoding.
    return (second & 0xf0) == 0x80;
  }

  // 0xf4 followed by a byte >= 0x90 looks valid to tryGetUtf8CharLength, but
  // is actually outside the range of valid code points.
  return (second & 0xf0) != 0x80;
}

/// Escapes ``input`` by encoding it so that it can be safely included in
/// URL query parameter names and values:
///
@@ -440,35 +411,18 @@ struct UrlExtractParameterFunction {
}

if (!uri.query.empty()) {
// Parse query string.
static const boost::regex kQueryParamRegex(
"(^|&)" // start of query or start of parameter "&"
"([^=&]*)=?" // parameter name and "=" if value is expected
"([^&]*)" // parameter value (allows "=" to appear)
"(?=(&|$))" // forward reference, next should be end of query or
// start of next parameter
);

StringView query = uri.query;
std::string unescapedQuery;
if (uri.queryHasEncoded) {
detail::urlUnescape(unescapedQuery, uri.query);
query = StringView(unescapedQuery);
}

const boost::cregex_iterator begin(
query.data(), query.data() + query.size(), kQueryParamRegex);
boost::cregex_iterator end;

for (auto it = begin; it != end; ++it) {
if (it->length(2) != 0 && (*it)[2].matched) { // key shouldnt be empty.
auto key = detail::submatch((*it), 2);
if (param.compare(key) == 0) {
auto value = detail::submatch((*it), 3);
result.copy_from(value);
return true;
}
}
const auto value = extractParameter(query, param);

if (value.has_value()) {
result.copy_from(value.value());
return true;
}
}

0 comments on commit 9044c56

Please sign in to comment.