Skip to content

Commit

Permalink
add length api
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-zh authored and bobzhang committed Jan 20, 2025
1 parent 2d4771e commit 1363a09
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 111 deletions.
1 change: 0 additions & 1 deletion builtin/builtin.mbti
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,6 @@ impl String {
op_add(String, String) -> String
op_equal(String, String) -> Bool
op_get(String, Int) -> Char
rev_get(String, Int) -> Char
substring(String, start~ : Int = .., end? : Int) -> String
to_json(String) -> Json
to_string(String) -> String
Expand Down
46 changes: 0 additions & 46 deletions builtin/string.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -158,52 +158,6 @@ pub fn codepoint_length(self : String) -> Int {
// codepoint_length(self)
//}

///|
/// Returns the character at the given index from the end of the string.
///
/// The index counts characters, not UTF-16 code units: index 0 is the last
/// character, and a leading/trailing surrogate pair counts as one character.
///
/// # Examples
///
/// ```moonbit
/// let s = "Hello🤣🤣🤣"
/// inspect!(s.rev_get(0), content="'🤣'")
/// inspect!(s.rev_get(4), content="'l'")
/// ```
///
/// # Panics
///
/// Panics if the index is out of bounds.
///
/// Also aborts with "invalid surrogate pair" when a trailing surrogate met
/// during the backward scan is preceded by a code unit that is not a leading
/// surrogate.
pub fn rev_get(self : String, index : Int) -> Char {
// Negative indices are rejected before any scanning happens.
guard index >= 0 else { abort("index out of bounds") }
// Scan backwards over UTF-16 code units, skipping `index` characters.
// `utf16_offset` is the code unit under inspection; `char_count` is the
// number of characters skipped so far.
for utf16_offset = self.charcode_length() - 1, char_count = 0
utf16_offset >= 0 && char_count < index
utf16_offset = utf16_offset - 1, char_count = char_count + 1 {
let c1 = self.unsafe_charcode_at(utf16_offset)
if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 {
let c2 = self.unsafe_charcode_at(utf16_offset - 1)
if is_leading_surrogate(c2) {
// A surrogate pair spans two code units but is a single character.
continue utf16_offset - 2, char_count + 1
} else {
abort("invalid surrogate pair")
}
}
} else {
// The scan stopped either because `index` characters were skipped or
// because it ran past the start of the string; only the former is valid.
guard char_count == index && utf16_offset >= 0 else {
abort("index out of bounds")
}
// Decode the character that ends at `utf16_offset`.
let c1 = self.unsafe_charcode_at(utf16_offset)
if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 {
let c2 = self.unsafe_charcode_at(utf16_offset - 1)
if is_leading_surrogate(c2) {
code_point_of_surrogate_pair(c2, c1)
} else {
abort("invalid surrogate pair")
}
} else {
// Not the second half of a pair (or no preceding code unit exists):
// the code unit is returned as a character on its own.
Char::from_int(c1)
}
}
}

///|
/// @intrinsic %string.substring
fn unsafe_substring(str : String, start : Int, end : Int) -> String {
Expand Down
12 changes: 0 additions & 12 deletions builtin/string_test.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,6 @@ test "substring" {
assert_eq!("abc".substring(start=1, end=2), "b")
}

// A negative index must be rejected by `rev_get`'s `index >= 0` guard.
// NOTE(review): the "panic" name prefix presumably marks this test as
// expected to abort — confirm against the MoonBit test runner documentation.
test "panic rev_get1" {
let str = "Hello🤣🤣🤣"
let _ = str.rev_get(-1)

}

// The string holds 8 characters (5 ASCII letters and 3 emoji), so the valid
// reverse indices are 0..=7 and index 8 must abort as out of bounds.
// NOTE(review): the "panic" name prefix presumably marks this test as
// expected to abort — confirm against the MoonBit test runner documentation.
test "panic rev_get2" {
let str = "Hello🤣🤣🤣"
let _ = str.rev_get(8)

}

test "panic codepoint_at1" {
let str = "Hello🤣🤣🤣"
let _ = str.codepoint_at(8)
Expand Down
120 changes: 107 additions & 13 deletions string/string.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -154,17 +154,17 @@ pub fn iter(self : String) -> Iter[Char] {
Iter::new(fn(yield_) {
let len = self.length()
for index = 0; index < len; index = index + 1 {
let c1 = self[index]
let c1 = self.unsafe_charcode_at(index)
if is_leading_surrogate(c1) && index + 1 < len {
let c2 = self[index + 1]
let c2 = self.unsafe_charcode_at(index + 1)
if is_trailing_surrogate(c2) {
let c = code_point_of_surrogate_pair(c1, c2)
guard let IterContinue = yield_(c) else { x => break x }
continue index + 2
}
}
//TODO: handle garbage input
guard let IterContinue = yield_(c1) else { x => break x }
guard let IterContinue = yield_(Char::from_int(c1)) else { x => break x }

} else {
IterContinue
Expand All @@ -177,17 +177,19 @@ pub fn iter2(self : String) -> Iter2[Int, Char] {
Iter2::new(fn(yield_) {
let len = self.length()
for index = 0, n = 0; index < len; index = index + 1, n = n + 1 {
let c1 = self[index]
let c1 = self.unsafe_charcode_at(index)
if is_leading_surrogate(c1) && index + 1 < len {
let c2 = self[index + 1]
let c2 = self.unsafe_charcode_at(index + 1)
if is_trailing_surrogate(c2) {
let c = code_point_of_surrogate_pair(c1, c2)
guard let IterContinue = yield_(n, c) else { x => break x }
continue index + 2, n + 1
}
}
//TODO: handle garbage input
guard let IterContinue = yield_(n, c1) else { x => break x }
guard let IterContinue = yield_(n, Char::from_int(c1)) else {
x => break x
}

} else {
IterContinue
Expand Down Expand Up @@ -245,17 +247,17 @@ pub fn rev_iter(self : String) -> Iter[Char] {
Iter::new(fn(yield_) {
let len = self.length()
for index = len - 1; index >= 0; index = index - 1 {
let c1 = self[index]
let c1 = self.unsafe_charcode_at(index)
if is_trailing_surrogate(c1) && index - 1 >= 0 {
let c2 = self[index - 1]
let c2 = self.unsafe_charcode_at(index - 1)
if is_leading_surrogate(c2) {
let c = code_point_of_surrogate_pair(c2, c1)
guard let IterContinue = yield_(c) else { x => break x }
continue index - 2
}
}
// TODO: handle garbage input
guard let IterContinue = yield_(c1) else { x => break x }
guard let IterContinue = yield_(Char::from_int(c1)) else { x => break x }

} else {
IterContinue
Expand Down Expand Up @@ -289,10 +291,10 @@ pub fn contains_char(self : String, c : Char) -> Bool {
pub fn trim_start(self : String, trim_set : String) -> String {
let len = self.length()
for i = 0; i < len; i = i + 1 {
let c1 = self[i]
let c1 = self.unsafe_charcode_at(i)
// check surrogate pair
if is_leading_surrogate(c1) && i + 1 < len {
let c2 = self[i + 1]
let c2 = self.unsafe_charcode_at(i + 1)
if is_trailing_surrogate(c2) {
let ch = code_point_of_surrogate_pair(c1, c2)
if trim_set.contains_char(ch) {
Expand All @@ -315,10 +317,10 @@ pub fn trim_start(self : String, trim_set : String) -> String {
pub fn trim_end(self : String, trim_set : String) -> String {
let len = self.length()
for i = len - 1; i >= 0; i = i - 1 {
let c2 = self[i]
let c2 = self.unsafe_charcode_at(i)
// check surrogate pair
if is_trailing_surrogate(c2) && i - 1 >= 0 {
let c1 = self[i - 1]
let c1 = self.unsafe_charcode_at(i - 1)
if is_leading_surrogate(c1) {
let ch = code_point_of_surrogate_pair(c1, c2)
if trim_set.contains_char(ch) {
Expand Down Expand Up @@ -676,3 +678,95 @@ pub fn pad_end(self : String, total_width : Int, padding_char : Char) -> String
buf.to_string()
}
}

///|
/// Returns the character at the given index from the end of the string.
///
/// The index counts characters, not UTF-16 code units: index 0 is the last
/// character, and a leading/trailing surrogate pair counts as one character.
///
/// # Examples
///
/// ```moonbit
/// let s = "Hello🤣🤣🤣"
/// inspect!(s.rev_get(0), content="'🤣'")
/// inspect!(s.rev_get(4), content="'l'")
/// ```
///
/// # Panics
///
/// Panics if the index is out of bounds.
///
/// Also aborts with "invalid surrogate pair" when a trailing surrogate met
/// during the backward scan is preceded by a code unit that is not a leading
/// surrogate.
pub fn String::rev_get(self : String, index : Int) -> Char {
// Negative indices are rejected before any scanning happens.
guard index >= 0 else { abort("index out of bounds") }
// Scan backwards over UTF-16 code units, skipping `index` characters.
// `utf16_offset` is the code unit under inspection; `char_count` is the
// number of characters skipped so far.
for utf16_offset = self.charcode_length() - 1, char_count = 0
utf16_offset >= 0 && char_count < index
utf16_offset = utf16_offset - 1, char_count = char_count + 1 {
let c1 = self.unsafe_charcode_at(utf16_offset)
if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 {
let c2 = self.unsafe_charcode_at(utf16_offset - 1)
if is_leading_surrogate(c2) {
// A surrogate pair spans two code units but is a single character.
continue utf16_offset - 2, char_count + 1
} else {
abort("invalid surrogate pair")
}
}
} else {
// The scan stopped either because `index` characters were skipped or
// because it ran past the start of the string; only the former is valid.
guard char_count == index && utf16_offset >= 0 else {
abort("index out of bounds")
}
// Decode the character that ends at `utf16_offset`.
let c1 = self.unsafe_charcode_at(utf16_offset)
if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 {
let c2 = self.unsafe_charcode_at(utf16_offset - 1)
if is_leading_surrogate(c2) {
code_point_of_surrogate_pair(c2, c1)
} else {
abort("invalid surrogate pair")
}
} else {
// Not the second half of a pair (or no preceding code unit exists):
// the code unit is returned as a character on its own.
Char::from_int(c1)
}
}
}

///|
/// Test if the length of the string is equal to the given length.
///
/// The length is counted in characters, not UTF-16 code units: a leading
/// surrogate immediately followed by a trailing surrogate counts as one
/// character, and every other code unit counts as one character.
///
/// This has O(n) complexity where n is the length in the parameter: the scan
/// stops as soon as `len` characters have been counted instead of walking the
/// whole string.
///
/// # Panics
///
/// Aborts with "invalid surrogate pair" if a leading surrogate reached by the
/// scan is followed by a code unit that is not a trailing surrogate.
pub fn String::length_eq(self : String, len : Int) -> Bool {
let codeunit_len = self.charcode_length()
// `index` walks UTF-16 code units; `count` is the number of characters seen.
for index = 0, count = 0
index < codeunit_len && count < len
index = index + 1, count = count + 1 {
let c1 = self.unsafe_charcode_at(index)
if is_leading_surrogate(c1) && index + 1 < codeunit_len {
let c2 = self.unsafe_charcode_at(index + 1)
if is_trailing_surrogate(c2) {
// A surrogate pair consumes two code units but adds one character.
continue index + 2, count + 1
} else {
abort("invalid surrogate pair")
}
}
} else {
// Equal only if exactly `len` characters were counted AND the whole string
// was consumed; a longer string stops early with `index < codeunit_len`.
count == len && index == codeunit_len
}
}

///|
/// Test if the length of the string is greater than or equal to the given length.
///
/// The length is counted in characters, not UTF-16 code units: a leading
/// surrogate immediately followed by a trailing surrogate counts as one
/// character, and every other code unit counts as one character.
///
/// This has O(n) complexity where n is the length in the parameter: the scan
/// stops as soon as `len` characters have been counted instead of walking the
/// whole string.
///
/// # Panics
///
/// Aborts with "invalid surrogate pair" if a leading surrogate reached by the
/// scan is followed by a code unit that is not a trailing surrogate.
pub fn String::length_ge(self : String, len : Int) -> Bool {
let codeunit_len = self.charcode_length()
// `index` walks UTF-16 code units; `count` is the number of characters seen.
for index = 0, count = 0
index < codeunit_len && count < len
index = index + 1, count = count + 1 {
let c1 = self.unsafe_charcode_at(index)
if is_leading_surrogate(c1) && index + 1 < codeunit_len {
let c2 = self.unsafe_charcode_at(index + 1)
if is_trailing_surrogate(c2) {
// A surrogate pair consumes two code units but adds one character.
continue index + 2, count + 1
} else {
abort("invalid surrogate pair")
}
}
} else {
// The loop ends either once `len` characters were counted (true) or when
// the string ran out first with `count < len` (false).
count >= len
}
}
3 changes: 3 additions & 0 deletions string/string.mbti
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ impl String {
iter(String) -> Iter[Char]
iter2(String) -> Iter2[Int, Char]
last_index_of(String, String, from~ : Int = ..) -> Int
length_eq(String, Int) -> Bool
length_ge(String, Int) -> Bool
op_as_view(String, start~ : StringIndex = .., end? : StringIndex) -> StringView
pad_end(String, Int, Char) -> String
pad_start(String, Int, Char) -> String
Expand All @@ -52,6 +54,7 @@ impl String {
replace_all(String, old~ : String, new~ : String) -> String
rev(String) -> String
rev_fold[A](String, init~ : A, (A, Char) -> A) -> A
rev_get(String, Int) -> Char
rev_iter(String) -> Iter[Char]
split(String, String) -> Iter[String]
starts_with(String, String) -> Bool
Expand Down
25 changes: 25 additions & 0 deletions string/string_test.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,28 @@ test "pad_right" {
inspect!("22".pad_end(2, '0'), content="22")
inspect!("5".pad_end(4, 'x'), content="5xxx")
}

// A negative index must be rejected by `rev_get`'s `index >= 0` guard.
// NOTE(review): the "panic" name prefix presumably marks this test as
// expected to abort — confirm against the MoonBit test runner documentation.
test "panic rev_get1" {
let str = "Hello🤣🤣🤣"
let _ = str.rev_get(-1)

}

// The string holds 8 characters (5 ASCII letters and 3 emoji), so the valid
// reverse indices are 0..=7 and index 8 must abort as out of bounds.
// NOTE(review): the "panic" name prefix presumably marks this test as
// expected to abort — confirm against the MoonBit test runner documentation.
test "panic rev_get2" {
let str = "Hello🤣🤣🤣"
let _ = str.rev_get(8)

}

// Exercises `String::length_ge`, which counts characters (a surrogate pair is
// one character). The sample string has 5 ASCII letters and 3 emoji, i.e. 8
// characters spread over 11 UTF-16 code units.
test "length_ge" {
  let str = "Hello🤣🤣🤣"
  assert_true!(str.length_ge(0))
  // Threshold that lands exactly on the ASCII / surrogate-pair boundary.
  assert_true!(str.length_ge(5))
  // Threshold reached in the middle of the emoji run.
  assert_true!(str.length_ge(7))
  assert_true!(str.length_ge(8))
  assert_false!(str.length_ge(9))
  // 11 is the code-unit count; it must not be mistaken for the length.
  assert_false!(str.length_ge(11))
  // Empty string: the loop body never runs.
  assert_true!("".length_ge(0))
  assert_false!("".length_ge(1))
}

// Exercises `String::length_eq`, which counts characters (a surrogate pair is
// one character). The sample string has 5 ASCII letters and 3 emoji, i.e. 8
// characters spread over 11 UTF-16 code units.
test "length_eq" {
  let str = "Hello🤣🤣🤣"
  assert_true!(str.length_eq(8))
  assert_false!(str.length_eq(9))
  // String longer than `len`: the scan stops early with code units left over,
  // which is the case the `index == codeunit_len` condition exists for.
  assert_false!(str.length_eq(7))
  assert_false!(str.length_eq(0))
  // 11 is the code-unit count; it must not be mistaken for the length.
  assert_false!(str.length_eq(11))
  // Empty string: the loop body never runs.
  assert_true!("".length_eq(0))
  assert_false!("".length_eq(1))
}
28 changes: 12 additions & 16 deletions string/utils.mbt
Original file line number Diff line number Diff line change
Expand Up @@ -25,37 +25,33 @@ let min_trailing_surrogate = 0xDC00
let max_trailing_surrogate = 0xDFFF

///|
fn is_leading_surrogate(c : Char) -> Bool {
let code = c.to_int()
min_leading_surrogate <= code && code <= max_leading_surrogate
fn is_leading_surrogate(c : Int) -> Bool {
min_leading_surrogate <= c && c <= max_leading_surrogate
}

test "is_leading_surrogate" {
inspect!(is_leading_surrogate("🤣"[0]), content="true")
inspect!(is_leading_surrogate("🤣"[1]), content="false")
inspect!(is_leading_surrogate("🤣".charcode_at(0)), content="true")
inspect!(is_leading_surrogate("🤣".charcode_at(1)), content="false")
}

///|
fn is_trailing_surrogate(c : Char) -> Bool {
let code = c.to_int()
min_trailing_surrogate <= code && code <= max_trailing_surrogate
fn is_trailing_surrogate(c : Int) -> Bool {
min_trailing_surrogate <= c && c <= max_trailing_surrogate
}

test "is_trailing_surrogate" {
inspect!(is_trailing_surrogate("🤣"[0]), content="false")
inspect!(is_trailing_surrogate("🤣"[1]), content="true")
inspect!(is_trailing_surrogate("🤣".charcode_at(0)), content="false")
inspect!(is_trailing_surrogate("🤣".charcode_at(1)), content="true")
}

///|
fn code_point_of_surrogate_pair(leading : Char, trailing : Char) -> Char {
Char::from_int(
(leading.to_int() - 0xD800) * 0x400 + trailing.to_int() - 0xDC00 + 0x10000,
)
fn code_point_of_surrogate_pair(leading : Int, trailing : Int) -> Char {
Char::from_int((leading - 0xD800) * 0x400 + trailing - 0xDC00 + 0x10000)
}

test "code_point_of_surrogate_pair" {
let s = "😀"
let leading = s[0]
let trailing = s[1]
let leading = s.charcode_at(0)
let trailing = s.charcode_at(1)
inspect!(code_point_of_surrogate_pair(leading, trailing), content="'😀'")
}
Loading

0 comments on commit 1363a09

Please sign in to comment.