Skip to content

Commit

Permalink
Introduce cappedByteLength to help with indexing UTF-8 strings (faceb…
Browse files Browse the repository at this point in the history
…ookincubator#8637)

Summary:
UTF-8 strings may contain multi-byte characters, which makes character-based indexing inaccurate when positions are interpreted as byte offsets. This PR introduces the functions stringImpl::cappedByteLength and stringCore::cappedByteLengthUnicode to help with indexing UTF-8 strings that may contain multi-byte characters.

Pull Request resolved: facebookincubator#8637

Reviewed By: pedroerp

Differential Revision: D53627624

Pulled By: kgpai

fbshipit-source-id: 2f28a7d1bb81c1a5e875e7b8a6f300f1fc9fbb16
  • Loading branch information
codyschierbeck authored and facebook-github-bot committed Feb 13, 2024
1 parent aba702c commit b0eeef9
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 1 deletion.
27 changes: 27 additions & 0 deletions velox/functions/lib/string/StringCore.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,33 @@ cappedLengthUnicode(const char* input, size_t size, size_t maxChars) {
return numChars;
}

///
/// Returns the capped length in bytes (controlled by maxChars) of a UTF-8
/// string, i.e. the number of bytes occupied by its first maxChars characters.
/// The returned length may be greater than maxChars if there are multi-byte
/// characters present in the input string, but it is never greater than size.
///
/// This method is used to help with indexing unicode strings by byte position.
/// It is used to find the byte position of the Nth character in a string.
///
/// A byte for which utf8proc_char_length reports a negative length is counted
/// as one single-byte character.
///
/// @param input input buffer that holds the string
/// @param size size of the input buffer in bytes
/// @param maxChars stop counting once this many characters have been
/// consumed. The limit is compared as an unsigned 64-bit value, so a negative
/// value behaves as "no cap"; this lets callers forward a size_t limit
/// without it being misread as negative.
/// @return the number of bytes represented by the input utf8 string up to
/// maxChars, clamped to size
///
FOLLY_ALWAYS_INLINE int64_t
cappedByteLengthUnicode(const char* input, size_t size, int64_t maxChars) {
  // Make the signed/unsigned comparison explicit instead of relying on the
  // implicit conversion of the mixed-sign comparison.
  const auto charLimit = static_cast<uint64_t>(maxChars);

  size_t utf8Position = 0;
  uint64_t numCharacters = 0;
  while (utf8Position < size && numCharacters < charLimit) {
    const auto charSize = utf8proc_char_length(input + utf8Position);
    utf8Position += UNLIKELY(charSize < 0) ? 1 : charSize;
    ++numCharacters;
  }

  // The last lead byte may announce more bytes than remain in the buffer
  // (truncated sequence). Never report a length past the end of the input,
  // otherwise callers would index out of bounds.
  if (utf8Position > size) {
    utf8Position = size;
  }
  return static_cast<int64_t>(utf8Position);
}

/// Returns the start byte index of the Nth instance of subString in
/// string. Search starts from startPosition. Positions start with 0. If not
/// found, -1 is returned. To facilitate finding overlapping strings, the
Expand Down
15 changes: 14 additions & 1 deletion velox/functions/lib/string/StringImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ FOLLY_ALWAYS_INLINE int64_t length(const T& input) {
}
}

/// Return a capped length(controlled by maxLength) of a string.
/// Return a capped length in characters(controlled by maxLength) of a string.
/// The returned length is not greater than maxLength.
template <bool isAscii, typename T>
FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
Expand All @@ -122,6 +122,19 @@ FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
}
}

/// Returns the number of bytes covered by the first maxCharacters characters
/// of a string. For ASCII input this is simply the smaller of the string size
/// and maxCharacters. For non-ASCII input the result may be greater than
/// maxCharacters, because a single character can span several bytes.
template <bool isAscii, typename TString>
FOLLY_ALWAYS_INLINE int64_t
cappedByteLength(const TString& input, size_t maxCharacters) {
  if constexpr (!isAscii) {
    // Multi-byte aware path: walk the characters of the UTF-8 input.
    return cappedByteLengthUnicode(input.data(), input.size(), maxCharacters);
  } else {
    // One byte per character: cap the byte count at maxCharacters.
    const size_t byteCount = input.size();
    return byteCount <= maxCharacters ? byteCount : maxCharacters;
  }
}

/// Write the Unicode codePoint as string to the output string. The function
/// behavior is undefined when code point it invalid. Implements the logic of
/// presto chr function.
Expand Down
94 changes: 94 additions & 0 deletions velox/functions/lib/string/tests/StringImplTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,100 @@ TEST_F(StringImplTest, cappedLength) {
ASSERT_EQ(cappedLength</*isAscii*/ false>(input, 7), 5);
}

// Exercises cappedByteLength on ASCII and non-ASCII inputs, for both
// std::string and std::string_view, including inputs whose bytes do not start
// a well-formed character.
TEST_F(StringImplTest, cappedUnicodeBytes) {
  // Test functions use case for indexing
  // UTF strings.
  // The input starts with a 4-byte sequence followed by "Hello", so the byte
  // length of the first N characters, minus one, indexes the Nth character.
  std::string stringInput = "\xF4\x90\x80\x80Hello";
  ASSERT_EQ('H', stringInput[cappedByteLength<false>(stringInput, 2) - 1]);
  ASSERT_EQ('e', stringInput[cappedByteLength<false>(stringInput, 3) - 1]);
  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 4) - 1]);
  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 5) - 1]);
  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 6) - 1]);
  // Asking for more characters than the string has is capped at its end.
  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 7) - 1]);

  // Multi-byte chars
  // Byte offsets of the 2nd and 17th character boundaries are used to slice
  // the string without splitting a multi-byte character.
  stringInput = "♫¡Singing is fun!♫";
  auto sPos = cappedByteLength<false>(stringInput, 2);
  auto exPos = cappedByteLength<false>(stringInput, 17);
  ASSERT_EQ("Singing is fun!♫", stringInput.substr(sPos));
  ASSERT_EQ("♫¡Singing is fun!", stringInput.substr(0, exPos));
  ASSERT_EQ("Singing is fun!", stringInput.substr(sPos, exPos - sPos));

  // ASCII path: byte length equals character count, capped at the size.
  stringInput = std::string("abcd");
  auto stringViewInput = std::string_view(stringInput);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 1), 1);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 2), 2);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 3), 3);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 4), 4);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 5), 4);
  ASSERT_EQ(cappedByteLength<true>(stringInput, 6), 4);

  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 1), 1);
  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 2), 2);
  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 3), 3);
  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 4), 4);
  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 5), 4);
  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 6), 4);

  // Mixed widths: two 3-byte characters, one 1-byte character, then two more
  // 3-byte characters (13 bytes in total).
  stringInput = std::string("你好a世界");
  stringViewInput = std::string_view(stringInput);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 3);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 6);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 7);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 10);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 13);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 13);

  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 3);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 6);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 7);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 10);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 13);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 13);

  // A lone 0x80 byte is expected to be counted as one single-byte character.
  stringInput = std::string("\x80");
  stringViewInput = std::string_view(stringInput);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 1);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 1);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 1);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 1);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 1);

  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 1);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 1);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 1);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 1);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 1);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 1);

  stringInput.resize(2);
  // Create corrupt data below.
  // Narrowing the UTF-16 code unit keeps only its low byte, so both bytes of
  // the string become 0xFF.
  char16_t c = u'\u04FF';
  stringInput[0] = static_cast<char>(c);
  stringInput[1] = static_cast<char>(c);

  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);

  // Build the byte sequence F4 8F BF BF from the low bytes of the code units
  // below; it is expected to be counted as a single 4-byte character.
  stringInput.resize(4);
  c = u'\u04F4';
  char16_t c2 = u'\u048F';
  char16_t c3 = u'\u04BF';
  stringInput[0] = static_cast<char>(c);
  stringInput[1] = static_cast<char>(c2);
  stringInput[2] = static_cast<char>(c3);
  stringInput[3] = static_cast<char>(c3);

  stringViewInput = std::string_view(stringInput);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 4);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 4);
  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 4);

  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 4);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 4);
  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 4);
}

TEST_F(StringImplTest, badUnicodeLength) {
ASSERT_EQ(0, length</*isAscii*/ false>(std::string("")));
ASSERT_EQ(2, length</*isAscii*/ false>(std::string("ab")));
Expand Down

0 comments on commit b0eeef9

Please sign in to comment.