diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti
index c999b80f2..837b12e04 100644
--- a/builtin/builtin.mbti
+++ b/builtin/builtin.mbti
@@ -653,6 +653,8 @@ impl String {
   substring(String, start~ : Int = .., end? : Int) -> String
   to_json(String) -> Json
   to_string(String) -> String
+  unsafe_get(String, Int) -> Char
+  unsafe_length(String) -> Int
 }
 
 impl Option {
diff --git a/builtin/intrinsics.mbt b/builtin/intrinsics.mbt
index 8cf02c30d..d3736aebe 100644
--- a/builtin/intrinsics.mbt
+++ b/builtin/intrinsics.mbt
@@ -287,12 +287,21 @@ pub fn FixedArray::make[T](len : Int, init : T) -> FixedArray[T] = "%fixedarray.
 ///|
 pub fn String::length(self : String) -> Int = "%string_length"
 
+///|
+/// String length, bound to the same `%string_length` intrinsic as `String::length`.
+pub fn String::unsafe_length(self : String) -> Int = "%string_length"
+
 ///|
 pub fn String::op_get(self : String, idx : Int) -> Char = "%string_get"
 
 ///|
 pub fn String::get(self : String, idx : Int) -> Char = "%string_get"
 
+///|
+/// Reads the element at `idx` via the same `%string_get` intrinsic as `String::get`.
+/// NOTE(review): presumably unchecked; callers in string.mbt validate `idx` first - confirm.
+pub fn String::unsafe_get(self : String, idx : Int) -> Char = "%string_get"
+
 ///|
 pub fn String::op_add(self : String, other : String) -> String = "%string_add"
 
diff --git a/string/string.mbt b/string/string.mbt
index 9fc0fa434..faedeb1b3 100644
--- a/string/string.mbt
+++ b/string/string.mbt
@@ -12,6 +12,133 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+///|
+/// Returns the UTF-16 code unit found at position `index` of the string.
+///
+/// # Examples
+///
+/// ```
+/// let s = "Hello🤣";
+/// assert_eq!(s.code_unit_at(0), 'H');
+/// assert_eq!(s.code_unit_at(5), '\ud83e'); // First surrogate of 🤣
+/// assert_eq!(s.code_unit_at(6), '\udd23'); // Second surrogate of 🤣
+/// ```
+///
+/// # Panics
+///
+/// Aborts with "index out of bounds" unless `0 <= index < length()`.
+pub fn code_unit_at(self : String, index : Int) -> Char {
+  if index < 0 || index >= self.length() {
+    abort("index out of bounds")
+  }
+  self.unsafe_get(index)
+}
+
+///|
+/// Returns the Unicode code point at the given index.
+///
+/// `index` counts code points, so a surrogate pair occupies one position.
+///
+/// # Examples
+///
+/// ```
+/// let s = "Hello🤣";
+/// assert_eq!(s.code_point_at(0), 'H');
+/// assert_eq!(s.code_point_at(5), '🤣'); // Returns full emoji character
+/// ```
+///
+/// # Panics
+///
+/// Panics if `index` is out of bounds or an invalid surrogate pair is scanned.
+pub fn code_point_at(self : String, index : Int) -> Char {
+  let code_unit_len = self.unsafe_length()
+  if index < 0 || index >= code_unit_len {
+    abort("index out of bounds")
+  }
+  for idx = 0, num = 0
+      idx < code_unit_len && num < index
+      idx = idx + 1, num = num + 1 {
+    let c1 = self.unsafe_get(idx)
+    if is_leading_surrogate(c1) && idx + 1 < code_unit_len {
+      let c2 = self.unsafe_get(idx + 1)
+      if is_trailing_surrogate(c2) {
+        continue idx + 2, num + 1
+      } else {
+        abort("invalid surrogate pair")
+      }
+    }
+  } else {
+    // A trailing pair can end the string just as `num` reaches `index`.
+    guard num == index && idx < code_unit_len else {
+      abort("index out of bounds")
+    }
+    let c1 = self.unsafe_get(idx)
+    if is_leading_surrogate(c1) && idx + 1 < code_unit_len {
+      let c2 = self.unsafe_get(idx + 1)
+      if is_trailing_surrogate(c2) {
+        code_point_of_surrogate_pair(c1, c2)
+      } else {
+        abort("invalid surrogate pair")
+      }
+    } else {
+      c1
+    }
+  }
+}
+
+///|
+/// Returns the number of UTF-16 code units in the string.
+///
+/// This method returns the raw length of the underlying UTF-16 encoded string.
+/// Note that this may not match the number of visible characters (code points)
+/// since some Unicode characters like emojis use two UTF-16 code units.
+///
+/// For counting actual Unicode characters, use `code_point_length()` instead.
+///
+/// # Examples
+///
+/// ```
+/// let s = "Hello🤣";
+/// assert_eq!(s.code_unit_length(), 7); // 5 ASCII chars + 1 surrogate pair (2 units)
+/// assert_eq!(s.code_point_length(), 6); // 6 actual characters
+/// ```
+pub fn code_unit_length(self : String) -> Int {
+  self.unsafe_length()
+}
+
+///|
+/// Counts the Unicode code points (characters) in the string.
+///
+/// A well-formed surrogate pair counts once; any other code unit counts alone.
+/// For the raw UTF-16 code unit count, use `code_unit_length()` instead.
+///
+/// Aborts if a leading surrogate is followed by a non-trailing code unit.
+///
+/// # Examples
+///
+/// ```
+/// let s = "Hello🤣";
+/// assert_eq!(s.code_point_length(), 6); // 6 actual characters
+/// assert_eq!(s.code_unit_length(), 7); // 5 ASCII chars + 1 surrogate pair (2 units)
+/// ```
+pub fn code_point_length(self : String) -> Int {
+  let unit_count = self.unsafe_length()
+  for pos = 0, count = 0
+      pos < unit_count
+      pos = pos + 1, count = count + 1 {
+    let lead = self.unsafe_get(pos)
+    if is_leading_surrogate(lead) && pos + 1 < unit_count {
+      // A valid pair spans two code units but contributes one code point.
+      guard is_trailing_surrogate(self.unsafe_get(pos + 1)) else {
+        abort("invalid surrogate pair")
+      }
+      continue pos + 2, count + 1
+    }
+  } else {
+    count
+  }
+}
+
 ///|
 /// Convert char array to string.
 ///
diff --git a/string/string.mbti b/string/string.mbti
index c19f92caf..e62c9437d 100644
--- a/string/string.mbti
+++ b/string/string.mbti
@@ -16,6 +16,10 @@ impl StringView {
 impl Show for StringView
 
 impl String {
+  code_point_at(String, Int) -> Char
+  code_point_length(String) -> Int
+  code_unit_at(String, Int) -> Char
+  code_unit_length(String) -> Int
   compare(String, String) -> Int
   concat(Array[String], separator~ : String = ..) -> String
   contains(String, String) -> Bool