From b0eeef9177f3b79c010b243f9b2e4bd4d11120f3 Mon Sep 17 00:00:00 2001
From: "Schierbeck, Cody" <cody.schierbeck@intel.com>
Date: Tue, 13 Feb 2024 11:17:59 -0800
Subject: [PATCH] Introduce cappedByteLength to help with indexing UTF-8
 strings (#8637)

Summary:
UTF-8 strings may contain multi-byte characters, so a character index is not a valid byte index. This PR introduces stringImpl::cappedByteLength and stringCore::cappedByteLengthUnicode, which return the byte length of the first N characters so that such strings can be indexed by byte position.

Pull Request resolved: https://github.com/facebookincubator/velox/pull/8637

Reviewed By: pedroerp

Differential Revision: D53627624

Pulled By: kgpai

fbshipit-source-id: 2f28a7d1bb81c1a5e875e7b8a6f300f1fc9fbb16
---
 velox/functions/lib/string/StringCore.h       | 27 ++++++
 velox/functions/lib/string/StringImpl.h       | 15 ++-
 .../lib/string/tests/StringImplTest.cpp       | 94 +++++++++++++++++++
 3 files changed, 135 insertions(+), 1 deletion(-)
diff --git a/velox/functions/lib/string/StringCore.h b/velox/functions/lib/string/StringCore.h
index ba988b06c4af..8fdcc4c61892 100644
--- a/velox/functions/lib/string/StringCore.h
+++ b/velox/functions/lib/string/StringCore.h
@@ -264,6 +264,33 @@ cappedLengthUnicode(const char* input, size_t size, size_t maxChars) {
   return numChars;
 }
 
+/// Returns the length in bytes of the first maxChars characters of a UTF-8
+/// string, or of the whole string if it has fewer characters. The result may
+/// exceed maxChars when multi-byte characters are present, but it is never
+/// negative and never greater than size.
+///
+/// Use this to turn a character position into a byte position: the result
+/// is the byte offset just past the Nth character. When utf8proc_char_length
+/// reports a negative length the byte counts as one single-byte character,
+/// and a character that would run past the end of the buffer ends at size.
+///
+/// @param input Buffer holding the UTF-8 string.
+/// @param size Size of the input buffer in bytes.
+/// @param maxChars Maximum number of characters to measure; zero or a
+/// negative value yields 0.
+/// @return Number of bytes occupied by the first maxChars characters.
+FOLLY_ALWAYS_INLINE int64_t
+cappedByteLengthUnicode(const char* input, size_t size, int64_t maxChars) {
+  size_t utf8Position = 0;
+  int64_t numCharacters = 0; // Signed: a negative maxChars yields 0.
+  while (utf8Position < size && numCharacters < maxChars) {
+    const auto charSize = utf8proc_char_length(input + utf8Position);
+    utf8Position += UNLIKELY(charSize < 0) ? 1 : charSize;
+    ++numCharacters;
+  }
+  return utf8Position < size ? utf8Position : size; // Clamp truncated tail.
+}
+
 /// Returns the start byte index of the Nth instance of subString in
 /// string. Search starts from startPosition. Positions start with 0. If not
 /// found, -1 is returned. To facilitate finding overlapping strings, the
diff --git a/velox/functions/lib/string/StringImpl.h b/velox/functions/lib/string/StringImpl.h
index 871f3bffd194..73b6a4366162 100644
--- a/velox/functions/lib/string/StringImpl.h
+++ b/velox/functions/lib/string/StringImpl.h
@@ -111,7 +111,7 @@ FOLLY_ALWAYS_INLINE int64_t length(const T& input) {
   }
 }
 
-/// Return a capped length(controlled by maxLength) of a string.
+/// Return a capped length in characters(controlled by maxLength) of a string.
 /// The returned length is not greater than maxLength.
 template <bool isAscii, typename T>
 FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
@@ -122,6 +122,19 @@ FOLLY_ALWAYS_INLINE int64_t cappedLength(const T& input, size_t maxLength) {
   }
 }
 
+/// Returns the length in bytes of the first maxCharacters characters of the
+/// input, or of the whole input if it is shorter. The result may be greater
+/// than maxCharacters when isAscii is false and multi-byte characters occur.
+template <bool isAscii, typename TString>
+FOLLY_ALWAYS_INLINE int64_t
+cappedByteLength(const TString& input, size_t maxCharacters) {
+  if constexpr (!isAscii) {
+    return cappedByteLengthUnicode(input.data(), input.size(), maxCharacters);
+  } else { // ASCII: one byte per character, so cap the size directly.
+    return maxCharacters < input.size() ? maxCharacters : input.size();
+  }
+}
+
 /// Write the Unicode codePoint as string to the output string. The function
 /// behavior is undefined when code point it invalid. Implements the logic of
 /// presto chr function.
diff --git a/velox/functions/lib/string/tests/StringImplTest.cpp b/velox/functions/lib/string/tests/StringImplTest.cpp
index 258eb6f37053..883949e33c3a 100644
--- a/velox/functions/lib/string/tests/StringImplTest.cpp
+++ b/velox/functions/lib/string/tests/StringImplTest.cpp
@@ -196,6 +196,100 @@ TEST_F(StringImplTest, cappedLength) {
   ASSERT_EQ(cappedLength</*isAscii*/ false>(input, 7), 5);
 }
 
+TEST_F(StringImplTest, cappedUnicodeBytes) {
+  // Typical use: cappedByteLength(s, n) - 1 indexes the last byte of the
+  // nth character. The input starts with a 4-byte sequence (lead byte 0xF4).
+  std::string stringInput = "\xF4\x90\x80\x80Hello";
+  ASSERT_EQ('H', stringInput[cappedByteLength<false>(stringInput, 2) - 1]);
+  ASSERT_EQ('e', stringInput[cappedByteLength<false>(stringInput, 3) - 1]);
+  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 4) - 1]);
+  ASSERT_EQ('l', stringInput[cappedByteLength<false>(stringInput, 5) - 1]);
+  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 6) - 1]);
+  ASSERT_EQ('o', stringInput[cappedByteLength<false>(stringInput, 7) - 1]);
+
+  // Multi-byte chars: byte lengths used as substr offsets and counts.
+  stringInput = "♫¡Singing is fun!♫";
+  auto sPos = cappedByteLength<false>(stringInput, 2);
+  auto exPos = cappedByteLength<false>(stringInput, 17);
+  ASSERT_EQ("Singing is fun!♫", stringInput.substr(sPos));
+  ASSERT_EQ("♫¡Singing is fun!", stringInput.substr(0, exPos));
+  ASSERT_EQ("Singing is fun!", stringInput.substr(sPos, exPos - sPos));
+
+  stringInput = std::string("abcd"); // ASCII path: bytes == characters.
+  auto stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 2), 2);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 3), 3);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 4), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 5), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringInput, 6), 4);
+
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 2), 2);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 3), 3);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 4), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 5), 4);
+  ASSERT_EQ(cappedByteLength<true>(stringViewInput, 6), 4);
+
+  stringInput = std::string("你好a世界"); // 3 + 3 + 1 + 3 + 3 bytes.
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 3);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 6);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 7);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 10);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 13);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 13);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 3);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 6);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 7);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 10);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 13);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 13);
+
+  stringInput = std::string("\x80"); // Not a lead byte; expected length 1.
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 4), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 5), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 6), 1);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 4), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 5), 1);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 6), 1);
+
+  stringInput.resize(2);
+  // Corrupt data: narrowing U+04FF to char keeps only the low byte, 0xFF.
+  char16_t c = u'\u04FF';
+  stringInput[0] = static_cast<char>(c);
+  stringInput[1] = static_cast<char>(c);
+
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 1);
+
+  stringInput.resize(4); // Low bytes F4 8F BF BF form one 4-byte character.
+  c = u'\u04F4';
+  char16_t c2 = u'\u048F';
+  char16_t c3 = u'\u04BF';
+  stringInput[0] = static_cast<char>(c);
+  stringInput[1] = static_cast<char>(c2);
+  stringInput[2] = static_cast<char>(c3);
+  stringInput[3] = static_cast<char>(c3);
+
+  stringViewInput = std::string_view(stringInput);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 1), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 2), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringInput, 3), 4);
+
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 1), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 2), 4);
+  ASSERT_EQ(cappedByteLength<false>(stringViewInput, 3), 4);
+}
+
 TEST_F(StringImplTest, badUnicodeLength) {
   ASSERT_EQ(0, length</*isAscii*/ false>(std::string("")));
   ASSERT_EQ(2, length</*isAscii*/ false>(std::string("ab")));