From b0eeef9177f3b79c010b243f9b2e4bd4d11120f3 Mon Sep 17 00:00:00 2001
From: "Schierbeck, Cody"
Date: Tue, 13 Feb 2024 11:17:59 -0800
Subject: [PATCH] Introduce cappedByteLength to help with indexing UTF-8 strings (#8637)

Summary:
UTF-8 strings may contain multi-byte characters that make character-based
indexing inaccurate. This PR introduces functions
stringImpl::cappedByteLength and stringCore::cappedByteLengthUnicode to help
with indexing UTF-8 strings that may contain multi-byte characters.

Pull Request resolved: https://github.com/facebookincubator/velox/pull/8637

Reviewed By: pedroerp

Differential Revision: D53627624

Pulled By: kgpai

fbshipit-source-id: 2f28a7d1bb81c1a5e875e7b8a6f300f1fc9fbb16
---
 velox/functions/lib/string/StringCore.h       | 31 ++++++
 velox/functions/lib/string/StringImpl.h       | 19 +++-
 .../lib/string/tests/StringImplTest.cpp       | 94 +++++++++++++++++++
 3 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/velox/functions/lib/string/StringCore.h b/velox/functions/lib/string/StringCore.h
index ba988b06c4af..8fdcc4c61892 100644
--- a/velox/functions/lib/string/StringCore.h
+++ b/velox/functions/lib/string/StringCore.h
@@ -264,6 +264,37 @@ cappedLengthUnicode(const char* input, size_t size, size_t maxChars) {
   return numChars;
 }
 
+///
+/// Return a capped length in bytes (controlled by maxChars) of a unicode
+/// string. The returned length may be greater than maxChars if there are
+/// multi-byte characters present in the input string, but it is never greater
+/// than size.
+///
+/// This method is used to help with indexing unicode strings by byte position.
+/// It is used to find the byte position of the Nth character in a string.
+///
+/// @param input input buffer that holds the string
+/// @param size size of input buffer in bytes
+/// @param maxChars stop counting characters if the string is longer
+/// than this value; zero or negative values yield 0
+/// @return the number of bytes represented by the input utf8 string up to
+/// maxChars
+///
+FOLLY_ALWAYS_INLINE int64_t
+cappedByteLengthUnicode(const char* input, size_t size, int64_t maxChars) {
+  size_t utf8Position = 0;
+  int64_t numCharacters = 0;
+  while (utf8Position < size && numCharacters < maxChars) {
+    auto charSize = utf8proc_char_length(input + utf8Position);
+    // Bytes with no determinable length count as one-byte characters.
+    utf8Position += UNLIKELY(charSize < 0) ? 1 : charSize;
+    numCharacters++;
+  }
+  // The last lead byte may announce more bytes than the buffer still holds;
+  // never report a length past the end of the input.
+  return utf8Position < size ? utf8Position : size;
+}
+
 /// Returns the start byte index of the Nth instance of subString in
 /// string. Search starts from startPosition. Positions start with 0. If not
 /// found, -1 is returned. To facilitate finding overlapping strings, the
diff --git a/velox/functions/lib/string/StringImpl.h b/velox/functions/lib/string/StringImpl.h
index 871f3bffd194..73b6a4366162 100644
--- a/velox/functions/lib/string/StringImpl.h
+++ b/velox/functions/lib/string/StringImpl.h
@@ -111,7 +111,7 @@ FOLLY_ALWAYS_INLINE int64_t length(const T& input) {
   }
 }
 
-/// Return a capped length(controlled by maxLength) of a string.
+/// Return a capped length in characters(controlled by maxLength) of a string.
 /// The returned length is not greater than maxLength.
 template <bool isAscii, typename T>
 FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
@@ -122,6 +122,23 @@ FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
   }
 }
 
+/// Return a capped length in bytes (controlled by maxCharacters) of a string.
+/// The returned length may be greater than maxCharacters if there are
+/// multi-byte characters present in the input string.
+template <bool isAscii, typename TString>
+FOLLY_ALWAYS_INLINE int64_t
+cappedByteLength(const TString& input, size_t maxCharacters) {
+  // Every character occupies at least one byte, so capping the limit at
+  // size() cannot change the result and keeps it representable as int64_t.
+  const size_t charLimit =
+      input.size() > maxCharacters ? maxCharacters : input.size();
+  if constexpr (isAscii) {
+    return charLimit;
+  } else {
+    return cappedByteLengthUnicode(input.data(), input.size(), charLimit);
+  }
+}
+
 /// Write the Unicode codePoint as string to the output string. The function
 /// behavior is undefined when code point it invalid. Implements the logic of
 /// presto chr function.
diff --git a/velox/functions/lib/string/tests/StringImplTest.cpp b/velox/functions/lib/string/tests/StringImplTest.cpp
index 258eb6f37053..883949e33c3a 100644
--- a/velox/functions/lib/string/tests/StringImplTest.cpp
+++ b/velox/functions/lib/string/tests/StringImplTest.cpp
@@ -196,6 +196,100 @@ TEST_F(StringImplTest, cappedLength) {
   ASSERT_EQ(cappedLength</*isAscii*/ false>(input, 7), 5);
 }
 
+TEST_F(StringImplTest, cappedUnicodeBytes) {
+  // Test functions use case for indexing
+  // UTF strings.
+  std::string stringInput = "\xF4\x90\x80\x80Hello";
+  ASSERT_EQ('H', stringInput[cappedByteLength<false>(stringInput, 2) - 1]);
+  ASSERT_EQ('e', stringInput[cappedByteLength<false>(stringInput, 3) - 1]);
+  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 4) - 1]);
+  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 5) - 1]);
+  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 6) - 1]);
+  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 7) - 1]);
+
+  // Multi-byte chars
+  stringInput = "♫¡Singing is fun!♫";
+  auto sPos = cappedByteLength<false>(stringInput, 2);
+  auto exPos = cappedByteLength<false>(stringInput, 17);
+  ASSERT_EQ("Singing is fun!♫", stringInput.substr(sPos));
+  ASSERT_EQ("♫¡Singing is fun!", stringInput.substr(0, exPos));
+  ASSERT_EQ("Singing is fun!", stringInput.substr(sPos, exPos - sPos));
+
+  stringInput = std::string("abcd");
+  auto stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 2), 2);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 3), 3);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 4), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 5), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 6), 4);
+
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 2), 2);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 3), 3);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 4), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 5), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 6), 4);
+
+  stringInput = std::string("你好a世界");
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 3);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 6);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 7);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 10);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 13);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 13);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 3);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 6);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 7);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 10);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 13);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 13);
+
+  stringInput = std::string("\x80");
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 1);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 1);
+
+  // Corrupt data: two 0xFF bytes, each expected to count as one character.
+  stringInput = "\xFF\xFF";
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);
+
+  // A single four-byte sequence (F4 8F BF BF).
+  stringInput = "\xF4\x8F\xBF\xBF";
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 4);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 4);
+
+  // A zero cap selects no bytes.
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 0), 0);
+  // A cap beyond the int64_t range still selects the whole string.
+  ASSERT_EQ(cappedByteLength<false>(stringInput, static_cast<size_t>(-1)), 4);
+
+  // A lead byte that announces more bytes than remain must not push the
+  // result past the end of the input.
+  stringInput = "a\xF4";
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 2);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 2);
+}
+
 TEST_F(StringImplTest, badUnicodeLength) {
   ASSERT_EQ(0, length</*isAscii*/ false>(std::string("")));
   ASSERT_EQ(2, length</*isAscii*/ false>(std::string("ab")));