Skip to content

Commit

Permalink
implement code_point and code_unit api for String
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-zh committed Jan 13, 2025
1 parent 59a57ff commit 38e6e46
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 0 deletions.
2 changes: 2 additions & 0 deletions builtin/builtin.mbti
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,8 @@ impl String {
substring(String, start~ : Int = .., end? : Int) -> String
to_json(String) -> Json
to_string(String) -> String
unsafe_get(String, Int) -> Char
unsafe_length(String) -> Int
}

impl Option {
Expand Down
6 changes: 6 additions & 0 deletions builtin/intrinsics.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -287,12 +287,18 @@ pub fn FixedArray::make[T](len : Int, init : T) -> FixedArray[T] = "%fixedarray.
///|
/// Returns the length of the string, as reported by the `%string_length`
/// compiler intrinsic. `String::unsafe_length` is bound to the same intrinsic.
/// NOTE(review): the unit is presumably UTF-16 code units — confirm against
/// the intrinsic's definition.
pub fn String::length(self : String) -> Int = "%string_length"

///|
/// Returns the length of the string via the `%string_length` compiler
/// intrinsic — the same intrinsic that `String::length` is bound to, so the
/// two declarations currently name the same operation.
/// NOTE(review): the reason for the `unsafe_` prefix is not visible in this
/// declaration — confirm the intended contract.
pub fn String::unsafe_length(self : String) -> Int = "%string_length"

///|
/// Returns the `Char` at position `idx`, implemented by the `%string_get`
/// compiler intrinsic (also used by `String::get` and `String::unsafe_get`).
/// NOTE(review): presumably this backs `s[idx]` indexing, and the behavior for
/// an out-of-range `idx` is decided by the intrinsic — neither is visible
/// here; confirm.
pub fn String::op_get(self : String, idx : Int) -> Char = "%string_get"

///|
/// Returns the `Char` at position `idx`, implemented by the `%string_get`
/// compiler intrinsic (the same binding as `String::op_get` and
/// `String::unsafe_get`).
/// NOTE(review): the behavior for an out-of-range `idx` is decided by the
/// intrinsic and is not visible here — confirm.
pub fn String::get(self : String, idx : Int) -> Char = "%string_get"

///|
/// Returns the `Char` at position `idx` via the `%string_get` compiler
/// intrinsic — the same intrinsic that `String::get` and `String::op_get` are
/// bound to.
/// NOTE(review): the name suggests callers are responsible for keeping `idx`
/// within range; whether the intrinsic itself checks bounds is not visible
/// here — confirm.
pub fn String::unsafe_get(self : String, idx : Int) -> Char = "%string_get"

///|
/// Combines two strings into a new `String`, implemented by the `%string_add`
/// compiler intrinsic.
/// NOTE(review): presumably this is concatenation (`self` followed by `other`)
/// backing the `+` operator — the semantics live in the intrinsic; confirm.
pub fn String::op_add(self : String, other : String) -> String = "%string_add"

Expand Down
127 changes: 127 additions & 0 deletions string/string.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,133 @@
// See the License for the specific language governing permissions and
// limitations under the License.

///|
/// Looks up a single UTF-16 code unit by its position in the string.
///
/// `index` is a code unit offset, so a character encoded as a surrogate pair
/// occupies two consecutive positions and each half is returned separately.
///
/// # Examples
///
/// ```
/// let s = "Hello🤣";
/// assert_eq!(s.code_unit_at(0), 'H');
/// assert_eq!(s.code_unit_at(5), '\ud83e'); // leading surrogate of 🤣
/// assert_eq!(s.code_unit_at(6), '\udd23'); // trailing surrogate of 🤣
/// ```
///
/// # Panics
///
/// Aborts with "index out of bounds" when `index` is negative or not smaller
/// than the string's length.
pub fn code_unit_at(self : String, index : Int) -> Char {
  // Validate the offset up front so the raw accessor below only ever sees
  // an in-range position.
  if index < 0 || index >= self.length() {
    abort("index out of bounds")
  }
  self.unsafe_get(index)
}

///|
/// Returns the Unicode code point at the given code point index.
///
/// `index` counts Unicode code points (characters), not UTF-16 code units: a
/// well-formed surrogate pair occupies a single index position and is decoded
/// into the character it represents. Because the string has to be scanned
/// from the start to find that position, the cost grows with `index`.
///
/// A leading surrogate that is the very last code unit of the string, and a
/// trailing surrogate that is not preceded by a leading one, are each treated
/// as a single position and returned as-is.
///
/// # Examples
///
/// ```
/// let s = "Hello🤣";
/// assert_eq!(s.code_point_at(0), 'H');
/// assert_eq!(s.code_point_at(5), '🤣'); // Returns full emoji character
/// ```
///
/// # Panics
///
/// Panics if:
/// - The index is out of bounds, i.e. negative or not smaller than the number
///   of code points in the string
/// - A leading surrogate encountered during the scan is followed by a code
///   unit that is not a trailing surrogate ("invalid surrogate pair")
pub fn code_point_at(self : String, index : Int) -> Char {
  let code_unit_len = self.unsafe_length()
  // Cheap early rejection: a string never has more code points than code
  // units, so anything at or beyond the code unit length cannot be valid.
  if index < 0 || index >= code_unit_len {
    abort("index out of bounds")
  }
  // Walk forward until `num` code points have been skipped; `idx` is the
  // code unit offset of the next code point to look at.
  for idx = 0, num = 0
      idx < code_unit_len && num < index
      idx = idx + 1, num = num + 1 {
    let c1 = self.unsafe_get(idx)
    if is_leading_surrogate(c1) && idx + 1 < code_unit_len {
      let c2 = self.unsafe_get(idx + 1)
      if is_trailing_surrogate(c2) {
        // A full pair is one code point spread over two code units.
        continue idx + 2, num + 1
      } else {
        abort("invalid surrogate pair")
      }
    }
  } else {
    // When the string contains surrogate pairs the scan can consume every
    // code unit exactly as `num` reaches `index` (e.g. index 1 of "🤣").
    // In that case `idx` equals the code unit length and there is nothing
    // left to read, so the offset has to be checked as well as the count.
    guard num == index && idx < code_unit_len else {
      abort("index out of bounds")
    }
    let c1 = self.unsafe_get(idx)
    if is_leading_surrogate(c1) && idx + 1 < code_unit_len {
      let c2 = self.unsafe_get(idx + 1)
      if is_trailing_surrogate(c2) {
        code_point_of_surrogate_pair(c1, c2)
      } else {
        abort("invalid surrogate pair")
      }
    } else {
      c1
    }
  }
}

///|
/// Reports how many UTF-16 code units the string is made of.
///
/// This is the raw length of the underlying UTF-16 encoding. It can be larger
/// than the number of characters (code points), because a character outside
/// the Basic Multilingual Plane — many emojis, for example — is stored as a
/// surrogate pair, i.e. two code units.
///
/// To count Unicode characters instead, use `code_point_length()`.
///
/// # Examples
///
/// ```
/// let s = "Hello🤣";
/// assert_eq!(s.code_unit_length(), 7); // 5 ASCII code units + 1 surrogate pair (2 code units)
/// assert_eq!(s.code_point_length(), 6); // 6 actual characters
/// ```
pub fn code_unit_length(self : String) -> Int {
  String::unsafe_length(self)
}

///|
/// Counts the Unicode code points (characters) contained in the string.
///
/// A well-formed surrogate pair is counted once, so characters such as emojis
/// contribute a single unit to the result. A leading surrogate that is the
/// last code unit of the string, and a trailing surrogate without a preceding
/// leading one, each count as one as well. For the raw UTF-16 code unit count
/// use `code_unit_length()` instead.
///
/// # Examples
///
/// ```
/// let s = "Hello🤣";
/// assert_eq!(s.code_point_length(), 6); // 6 actual characters
/// assert_eq!(s.code_unit_length(), 7); // 5 ASCII code units + 1 surrogate pair (2 code units)
/// ```
///
/// # Panics
///
/// Aborts with "invalid surrogate pair" when a leading surrogate is followed
/// by a code unit that is not a trailing surrogate.
pub fn code_point_length(self : String) -> Int {
  let total_units = self.unsafe_length()
  let mut offset = 0
  let mut count = 0
  while offset < total_units {
    let unit = self.unsafe_get(offset)
    if is_leading_surrogate(unit) && offset + 1 < total_units {
      // A leading surrogate with a successor must be completed by a
      // trailing surrogate; together they make up one code point.
      let next_unit = self.unsafe_get(offset + 1)
      guard is_trailing_surrogate(next_unit) else {
        abort("invalid surrogate pair")
      }
      offset += 2
    } else {
      offset += 1
    }
    count += 1
  }
  count
}

///|
/// Convert char array to string.
///
Expand Down
4 changes: 4 additions & 0 deletions string/string.mbti
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ impl StringView {
impl Show for StringView

impl String {
code_point_at(String, Int) -> Char
code_point_length(String) -> Int
code_unit_at(String, Int) -> Char
code_unit_length(String) -> Int
compare(String, String) -> Int
concat(Array[String], separator~ : String = ..) -> String
contains(String, String) -> Bool
Expand Down

0 comments on commit 38e6e46

Please sign in to comment.