Skip to content

Commit

Permalink
Match Presto's behavior for invalid UTF-8 in url_encode (facebookincu…
Browse files Browse the repository at this point in the history
…bator#11518)

Summary:

Presto Java converts the URL to a Java String before encoding it in url_encode. Java replaces bytes
in an invalid UTF-8 character with 0xEF 0xBF 0xBD.

Velox encodes invalid UTF-8 characters as is, which leads to differences in results from Java and 
C++.

This diff adds a check when encoding URLs for invalid UTF-8 characters and does the same
replacement as Java.

Differential Revision: D65856104
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Nov 16, 2024
1 parent 0e9c3b9 commit ba7d5fe
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 8 deletions.
82 changes: 74 additions & 8 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
#pragma once

#include <boost/regex.hpp>
#include <cctype>
#include <optional>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"
#include "velox/functions/lib/Utf8Utils.h"
#include "velox/functions/prestosql/URIParser.h"

namespace facebook::velox::functions {

namespace detail {
constexpr std::string_view kEncodedReplacementCharacter{"%EF%BF%BD"};

FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
const auto& sub = match[idx];
return StringView(sub.first, sub.length());
Expand All @@ -49,27 +49,93 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
/// * All other characters are converted to UTF-8 and the bytes are encoded
/// as the string ``%XX`` where ``XX`` is the uppercase hexadecimal
/// value of the UTF-8 byte.
/// * If the character is invalid UTF-8 all bytes of the character are
/// converted to %EF%BF%BD.
template <typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
auto inputSize = input.size();
output.reserve(inputSize * 3);
// In the worst case every byte is an invalid UTF-8 character.
output.reserve(inputSize * kEncodedReplacementCharacter.size());

auto inputBuffer = input.data();
auto outputBuffer = output.data();

size_t inputIndex = 0;
size_t outIndex = 0;
for (auto i = 0; i < inputSize; ++i) {
unsigned char p = inputBuffer[i];
while (inputIndex < inputSize) {
unsigned char p = inputBuffer[inputIndex];

if ((p >= 'a' && p <= 'z') || (p >= 'A' && p <= 'Z') ||
(p >= '0' && p <= '9') || p == '-' || p == '_' || p == '.' ||
p == '*') {
outputBuffer[outIndex++] = p;
inputIndex++;
} else if (p == ' ') {
outputBuffer[outIndex++] = '+';
inputIndex++;
} else {
charEscape(p, outputBuffer + outIndex);
outIndex += 3;
const auto charLength =
tryGetCharLength(inputBuffer + inputIndex, inputSize - inputIndex);
if (charLength > 0) {
for (int i = 0; i < charLength; ++i) {
charEscape(inputBuffer[inputIndex + i], outputBuffer + outIndex);
outIndex += 3;
}

inputIndex += charLength;
} else {
// According to the Unicode standard the "maximal subpart of an
// ill-formed subsequence" is the longest code unit subsequenece that is
// either well-formed or of length 1. A replacement character should be
// written for each of these. In practice tryGetCharLength breaks most
// cases into maximal subparts, the exceptions are overlong encodings or
// subsequences outside the range of valid 4 byte sequences. In both
// these cases we should just write out a replacement character for
// every byte in the sequence.
bool isMultipleInvalidSequences = false;
if (inputIndex < inputSize - 1) {
isMultipleInvalidSequences =
// 0xe0 followed by a value less than 0xe0 or 0xf0 followed by a
// value less than 0x90 is considered an overlong encoding.
(inputBuffer[inputIndex] == '\xe0' &&
(inputBuffer[inputIndex + 1] & 0xe0) == 0x80) ||
(inputBuffer[inputIndex] == '\xf0' &&
(inputBuffer[inputIndex + 1] & 0xf0) == 0x80) ||
// 0xf4 followed by a byte >= 0x90 looks valid to
// tryGetCharLength, but is actually outside the range of valid
// code points.
(inputBuffer[inputIndex] == '\xf4' &&
(inputBuffer[inputIndex + 1] & 0xf0) != 0x80) ||
// The bytes 0xf5-0xff, 0xc0, and 0xc1 look like the start of
// multi-byte code points to tryGetCharLength, but are not part of
// any valid code point.
(unsigned char)inputBuffer[inputIndex] > 0xf4 ||
inputBuffer[inputIndex] == '\xc0' ||
inputBuffer[inputIndex] == '\xc1';
}

if (isMultipleInvalidSequences) {
// For overlong encodings we write a replacement character for each
// byte.
for (int i = charLength; i < 0; ++i) {
std::memcpy(
outputBuffer + outIndex,
kEncodedReplacementCharacter.data(),
kEncodedReplacementCharacter.size());
outIndex += kEncodedReplacementCharacter.size();
}
} else {
// For other invalid encodings we write a single replacement character
// regardless of the number of bytes in the invalid character.
std::memcpy(
outputBuffer + outIndex,
kEncodedReplacementCharacter.data(),
kEncodedReplacementCharacter.size());
outIndex += kEncodedReplacementCharacter.size();
}

inputIndex += -charLength;
}
}
}
output.resize(outIndex);
Expand Down
29 changes: 29 additions & 0 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,35 @@ TEST_F(URLFunctionsTest, urlEncode) {
urlEncode("http://\u30c6\u30b9\u30c8"));
EXPECT_EQ("%7E%40%3A.-*_%2B+%E2%98%83", urlEncode("~@:.-*_+ \u2603"));
EXPECT_EQ("test", urlEncode("test"));
// Test a single byte invalid UTF-8 character.
EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\x88st"));
// Test a multi-byte invalid UTF-8 character. (If the first byte is between
// 0xe0 and 0xef, it should be a 3 byte character, but we only have 2 bytes
// here.)
EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\xe0\xb8st"));
// Test an overlong 3 byte UTF-8 character
EXPECT_EQ("%EF%BF%BD%EF%BF%BD", urlEncode("\xe0\x94"));
// Test an overlong 3 byte UTF-8 character with a continuation byte.
EXPECT_EQ("%EF%BF%BD%EF%BF%BD%EF%BF%BD", urlEncode("\xe0\x94\x83"));
// Test an overlong 4 byte UTF-8 character
EXPECT_EQ("%EF%BF%BD%EF%BF%BD", urlEncode("\xf0\x84"));
// Test an overlong 4 byte UTF-8 character with continuation bytes.
EXPECT_EQ(
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD", urlEncode("\xf0\x84\x90\x90"));
// Test a 4 byte UTF-8 character outside the range of valid values.
EXPECT_EQ(
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD", urlEncode("\xfa\x80\x80\x80"));
// Test the beginning of a 4 byte UTF-8 character followed by a
// non-continuation byte.
EXPECT_EQ("%EF%BF%BD%EF%BF%BD", urlEncode("\xf0\xe0"));
// Test the invalid byte 0xc0.
EXPECT_EQ("%EF%BF%BD%EF%BF%BD", urlEncode("\xc0\x83"));
// Test the invalid byte 0xc1.
EXPECT_EQ("%EF%BF%BD%EF%BF%BD", urlEncode("\xc1\x83"));
// Test a 4 byte UTF-8 character that looks valid, but is actually outside the
// range of valid values.
EXPECT_EQ(
"%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD", urlEncode("\xf4\x92\x83\x83"));
}

TEST_F(URLFunctionsTest, urlDecode) {
Expand Down

0 comments on commit ba7d5fe

Please sign in to comment.