diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti index d1c0a73a4..8010054e4 100644 --- a/builtin/builtin.mbti +++ b/builtin/builtin.mbti @@ -674,7 +674,6 @@ impl String { op_add(String, String) -> String op_equal(String, String) -> Bool op_get(String, Int) -> Char - rev_get(String, Int) -> Char substring(String, start~ : Int = .., end? : Int) -> String to_json(String) -> Json to_string(String) -> String diff --git a/builtin/string.mbt b/builtin/string.mbt index ad372dfd3..5153a4937 100644 --- a/builtin/string.mbt +++ b/builtin/string.mbt @@ -158,52 +158,6 @@ pub fn codepoint_length(self : String) -> Int { // codepoint_length(self) //} -///| -/// Returns the character at the given index from the end of the string. -/// -/// # Examples -/// -/// ```moonbit -/// let s = "Hello🤣🤣🤣" -/// inspect!(s.rev_get(0), content="'🤣'") -/// inspect!(s.rev_get(4), content="'l'") -/// ``` -/// -/// # Panics -/// -/// Panics if the index is out of bounds. -pub fn rev_get(self : String, index : Int) -> Char { - guard index >= 0 else { abort("index out of bounds") } - for utf16_offset = self.charcode_length() - 1, char_count = 0 - utf16_offset >= 0 && char_count < index - utf16_offset = utf16_offset - 1, char_count = char_count + 1 { - let c1 = self.unsafe_charcode_at(utf16_offset) - if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 { - let c2 = self.unsafe_charcode_at(utf16_offset - 1) - if is_leading_surrogate(c2) { - continue utf16_offset - 2, char_count + 1 - } else { - abort("invalid surrogate pair") - } - } - } else { - guard char_count == index && utf16_offset >= 0 else { - abort("index out of bounds") - } - let c1 = self.unsafe_charcode_at(utf16_offset) - if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 { - let c2 = self.unsafe_charcode_at(utf16_offset - 1) - if is_leading_surrogate(c2) { - code_point_of_surrogate_pair(c2, c1) - } else { - abort("invalid surrogate pair") - } - } else { - Char::from_int(c1) - } - } -} - ///| /// @intrinsic %string.substring fn 
unsafe_substring(str : String, start : Int, end : Int) -> String { diff --git a/builtin/string_test.mbt b/builtin/string_test.mbt index 360fc453e..16bbe6d25 100644 --- a/builtin/string_test.mbt +++ b/builtin/string_test.mbt @@ -46,18 +46,6 @@ test "substring" { assert_eq!("abc".substring(start=1, end=2), "b") } -test "panic rev_get1" { - let str = "Hello🤣🤣🤣" - let _ = str.rev_get(-1) - -} - -test "panic rev_get2" { - let str = "Hello🤣🤣🤣" - let _ = str.rev_get(8) - -} - test "panic codepoint_at1" { let str = "Hello🤣🤣🤣" let _ = str.codepoint_at(8) diff --git a/string/string.mbt b/string/string.mbt index 2db7d4e48..c37eee940 100644 --- a/string/string.mbt +++ b/string/string.mbt @@ -154,9 +154,9 @@ pub fn iter(self : String) -> Iter[Char] { Iter::new(fn(yield_) { let len = self.length() for index = 0; index < len; index = index + 1 { - let c1 = self[index] + let c1 = self.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < len { - let c2 = self[index + 1] + let c2 = self.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { let c = code_point_of_surrogate_pair(c1, c2) guard let IterContinue = yield_(c) else { x => break x } @@ -164,7 +164,7 @@ pub fn iter(self : String) -> Iter[Char] { } } //TODO: handle garbage input - guard let IterContinue = yield_(c1) else { x => break x } + guard let IterContinue = yield_(Char::from_int(c1)) else { x => break x } } else { IterContinue @@ -177,9 +177,9 @@ pub fn iter2(self : String) -> Iter2[Int, Char] { Iter2::new(fn(yield_) { let len = self.length() for index = 0, n = 0; index < len; index = index + 1, n = n + 1 { - let c1 = self[index] + let c1 = self.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < len { - let c2 = self[index + 1] + let c2 = self.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { let c = code_point_of_surrogate_pair(c1, c2) guard let IterContinue = yield_(n, c) else { x => break x } @@ -187,7 +187,9 @@ pub fn iter2(self : String) -> Iter2[Int, Char] { } } 
//TODO: handle garbage input - guard let IterContinue = yield_(n, c1) else { x => break x } + guard let IterContinue = yield_(n, Char::from_int(c1)) else { + x => break x + } } else { IterContinue @@ -245,9 +247,9 @@ pub fn rev_iter(self : String) -> Iter[Char] { Iter::new(fn(yield_) { let len = self.length() for index = len - 1; index >= 0; index = index - 1 { - let c1 = self[index] + let c1 = self.unsafe_charcode_at(index) if is_trailing_surrogate(c1) && index - 1 >= 0 { - let c2 = self[index - 1] + let c2 = self.unsafe_charcode_at(index - 1) if is_leading_surrogate(c2) { let c = code_point_of_surrogate_pair(c2, c1) guard let IterContinue = yield_(c) else { x => break x } @@ -255,7 +257,7 @@ pub fn rev_iter(self : String) -> Iter[Char] { } } // TODO: handle garbage input - guard let IterContinue = yield_(c1) else { x => break x } + guard let IterContinue = yield_(Char::from_int(c1)) else { x => break x } } else { IterContinue @@ -289,10 +291,10 @@ pub fn contains_char(self : String, c : Char) -> Bool { pub fn trim_start(self : String, trim_set : String) -> String { let len = self.length() for i = 0; i < len; i = i + 1 { - let c1 = self[i] + let c1 = self.unsafe_charcode_at(i) // check surrogate pair if is_leading_surrogate(c1) && i + 1 < len { - let c2 = self[i + 1] + let c2 = self.unsafe_charcode_at(i + 1) if is_trailing_surrogate(c2) { let ch = code_point_of_surrogate_pair(c1, c2) if trim_set.contains_char(ch) { @@ -315,10 +317,10 @@ pub fn trim_start(self : String, trim_set : String) -> String { pub fn trim_end(self : String, trim_set : String) -> String { let len = self.length() for i = len - 1; i >= 0; i = i - 1 { - let c2 = self[i] + let c2 = self.unsafe_charcode_at(i) // check surrogate pair if is_trailing_surrogate(c2) && i - 1 >= 0 { - let c1 = self[i - 1] + let c1 = self.unsafe_charcode_at(i - 1) if is_leading_surrogate(c1) { let ch = code_point_of_surrogate_pair(c1, c2) if trim_set.contains_char(ch) { @@ -676,3 +678,95 @@ pub fn pad_end(self : String, 
total_width : Int, padding_char : Char) -> String buf.to_string() } } + +///| +/// Returns the character at the zero-based `index` counted from the end of the string, treating a valid surrogate pair as one character. +/// +/// # Examples +/// +/// ```moonbit +/// let s = "Hello🤣🤣🤣" +/// inspect!(s.rev_get(0), content="'🤣'") +/// inspect!(s.rev_get(4), content="'l'") +/// ``` +/// +/// # Panics +/// +/// Panics if `index` is negative or not smaller than the number of characters, or if a trailing surrogate follows a code unit that is not a leading surrogate. +pub fn String::rev_get(self : String, index : Int) -> Char { + guard index >= 0 else { abort("index out of bounds") } + for utf16_offset = self.charcode_length() - 1, char_count = 0 + utf16_offset >= 0 && char_count < index + utf16_offset = utf16_offset - 1, char_count = char_count + 1 { + let c1 = self.unsafe_charcode_at(utf16_offset) + if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 { + let c2 = self.unsafe_charcode_at(utf16_offset - 1) + if is_leading_surrogate(c2) { + continue utf16_offset - 2, char_count + 1 + } else { + abort("invalid surrogate pair") + } + } + } else { + guard char_count == index && utf16_offset >= 0 else { + abort("index out of bounds") + } + let c1 = self.unsafe_charcode_at(utf16_offset) + if is_trailing_surrogate(c1) && utf16_offset - 1 >= 0 { + let c2 = self.unsafe_charcode_at(utf16_offset - 1) + if is_leading_surrogate(c2) { + code_point_of_surrogate_pair(c2, c1) + } else { + abort("invalid surrogate pair") + } + } else { + Char::from_int(c1) + } + } +} + +///| +/// Tests whether the string has exactly `len` characters, counting a valid surrogate pair as one character. +/// +/// Scans at most `len` characters, so the cost is O(n) in `len` rather than in the size of the string; aborts if a leading surrogate met during the scan is followed by a non-trailing code unit.
+pub fn String::length_eq(self : String, len : Int) -> Bool { + let codeunit_len = self.charcode_length() + for index = 0, count = 0 + index < codeunit_len && count < len + index = index + 1, count = count + 1 { + let c1 = self.unsafe_charcode_at(index) + if is_leading_surrogate(c1) && index + 1 < codeunit_len { + let c2 = self.unsafe_charcode_at(index + 1) + if is_trailing_surrogate(c2) { + continue index + 2, count + 1 + } else { + abort("invalid surrogate pair") + } + } + } else { + count == len && index == codeunit_len + } +} + +///| +/// Tests whether the string has at least `len` characters, counting a valid surrogate pair as one character. +/// +/// Scans at most `len` characters, so the cost is O(n) in `len` rather than in the size of the string; aborts if a leading surrogate met during the scan is followed by a non-trailing code unit. +pub fn String::length_ge(self : String, len : Int) -> Bool { + let codeunit_len = self.charcode_length() + for index = 0, count = 0 + index < codeunit_len && count < len + index = index + 1, count = count + 1 { + let c1 = self.unsafe_charcode_at(index) + if is_leading_surrogate(c1) && index + 1 < codeunit_len { + let c2 = self.unsafe_charcode_at(index + 1) + if is_trailing_surrogate(c2) { + continue index + 2, count + 1 + } else { + abort("invalid surrogate pair") + } + } + } else { + count >= len + } +} diff --git a/string/string.mbti b/string/string.mbti index dcc0ea81e..ca7f9fff8 100644 --- a/string/string.mbti +++ b/string/string.mbti @@ -44,6 +44,8 @@ impl String { iter(String) -> Iter[Char] iter2(String) -> Iter2[Int, Char] last_index_of(String, String, from~ : Int = ..) -> Int + length_eq(String, Int) -> Bool + length_ge(String, Int) -> Bool op_as_view(String, start~ : StringIndex = .., end?
: StringIndex) -> StringView pad_end(String, Int, Char) -> String pad_start(String, Int, Char) -> String @@ -52,6 +54,7 @@ impl String { replace_all(String, old~ : String, new~ : String) -> String rev(String) -> String rev_fold[A](String, init~ : A, (A, Char) -> A) -> A + rev_get(String, Int) -> Char rev_iter(String) -> Iter[Char] split(String, String) -> Iter[String] starts_with(String, String) -> Bool diff --git a/string/string_test.mbt b/string/string_test.mbt index d9aa06d9b..01c11fcad 100644 --- a/string/string_test.mbt +++ b/string/string_test.mbt @@ -516,3 +516,28 @@ test "pad_right" { inspect!("22".pad_end(2, '0'), content="22") inspect!("5".pad_end(4, 'x'), content="5xxx") } + +test "panic rev_get1" { + let str = "Hello🤣🤣🤣" + let _ = str.rev_get(-1) + +} + +test "panic rev_get2" { + let str = "Hello🤣🤣🤣" + let _ = str.rev_get(8) + +} + +test "length_ge" { + let str = "Hello🤣🤣🤣" + assert_true!(str.length_ge(0)) + assert_true!(str.length_ge(8)) + assert_false!(str.length_ge(9)) +} + +test "length_eq" { + let str = "Hello🤣🤣🤣" + assert_true!(str.length_eq(8)) + assert_false!(str.length_eq(9)) +} diff --git a/string/utils.mbt b/string/utils.mbt index d1ada06cb..41f31402c 100644 --- a/string/utils.mbt +++ b/string/utils.mbt @@ -25,37 +25,33 @@ let min_trailing_surrogate = 0xDC00 let max_trailing_surrogate = 0xDFFF ///| -fn is_leading_surrogate(c : Char) -> Bool { - let code = c.to_int() - min_leading_surrogate <= code && code <= max_leading_surrogate +fn is_leading_surrogate(c : Int) -> Bool { + min_leading_surrogate <= c && c <= max_leading_surrogate } test "is_leading_surrogate" { - inspect!(is_leading_surrogate("🤣"[0]), content="true") - inspect!(is_leading_surrogate("🤣"[1]), content="false") + inspect!(is_leading_surrogate("🤣".charcode_at(0)), content="true") + inspect!(is_leading_surrogate("🤣".charcode_at(1)), content="false") } ///| -fn is_trailing_surrogate(c : Char) -> Bool { - let code = c.to_int() - min_trailing_surrogate <= code && code <= 
max_trailing_surrogate +fn is_trailing_surrogate(c : Int) -> Bool { + min_trailing_surrogate <= c && c <= max_trailing_surrogate } test "is_trailing_surrogate" { - inspect!(is_trailing_surrogate("🤣"[0]), content="false") - inspect!(is_trailing_surrogate("🤣"[1]), content="true") + inspect!(is_trailing_surrogate("🤣".charcode_at(0)), content="false") + inspect!(is_trailing_surrogate("🤣".charcode_at(1)), content="true") } ///| -fn code_point_of_surrogate_pair(leading : Char, trailing : Char) -> Char { - Char::from_int( - (leading.to_int() - 0xD800) * 0x400 + trailing.to_int() - 0xDC00 + 0x10000, - ) +fn code_point_of_surrogate_pair(leading : Int, trailing : Int) -> Char { + Char::from_int((leading - 0xD800) * 0x400 + trailing - 0xDC00 + 0x10000) } test "code_point_of_surrogate_pair" { let s = "😀" - let leading = s[0] - let trailing = s[1] + let leading = s.charcode_at(0) + let trailing = s.charcode_at(1) inspect!(code_point_of_surrogate_pair(leading, trailing), content="'😀'") } diff --git a/string/view.mbt b/string/view.mbt index 7c4e3ddc3..cb6f00b85 100644 --- a/string/view.mbt +++ b/string/view.mbt @@ -66,10 +66,10 @@ pub fn index_at( let mut utf16_offset = start._ let mut char_count = 0 while utf16_offset < str_len && char_count < offset_by { - let c1 = self[utf16_offset] + let c1 = self.unsafe_charcode_at(utf16_offset) // check if this is a surrogate pair if is_leading_surrogate(c1) && utf16_offset + 1 < str_len { - let c2 = self[utf16_offset + 1] + let c2 = self.unsafe_charcode_at(utf16_offset + 1) if is_trailing_surrogate(c2) { utf16_offset = utf16_offset + 2 char_count = char_count + 1 @@ -127,9 +127,9 @@ pub fn index_at_rev( // Iterating backwards from the end of the string. [utf16_offset] always // points to the last skipped character. 
while utf16_offset > 0 && char_count < offset_by { - let c1 = self[utf16_offset - 1] + let c1 = self.unsafe_charcode_at(utf16_offset - 1) if is_trailing_surrogate(c1) && utf16_offset - 2 >= 0 { - let c2 = self[utf16_offset - 2] + let c2 = self.unsafe_charcode_at(utf16_offset - 2) if is_leading_surrogate(c2) { utf16_offset = utf16_offset - 2 char_count = char_count + 1 @@ -154,9 +154,9 @@ pub fn index_at_rev( pub fn StringView::length(self : StringView) -> Int { let mut len = 0 for index = self.start; index < self.end; index = index + 1 { - let c1 = self.str[index] + let c1 = self.str.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < self.end { - let c2 = self.str[index + 1] + let c2 = self.str.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { len = len + 1 continue index + 2 @@ -177,9 +177,9 @@ pub fn length_eq(self : StringView, len : Int) -> Bool { for index = self.start, self_len = 0 index < self.end && self_len < len index = index + 1, self_len = self_len + 1 { - let c1 = self.str[index] + let c1 = self.str.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < self.end { - let c2 = self.str[index + 1] + let c2 = self.str.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { continue index + 2, self_len + 1 } else { @@ -199,9 +199,9 @@ pub fn length_ge(self : StringView, len : Int) -> Bool { for index = self.start, self_len = 0 index < self.end && self_len < len index = index + 1, self_len = self_len + 1 { - let c1 = self.str[index] + let c1 = self.str.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < self.end { - let c2 = self.str[index + 1] + let c2 = self.str.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { continue index + 2, self_len + 1 } else { @@ -323,9 +323,9 @@ pub fn StringView::op_get(self : StringView, index : Int) -> Char { let mut utf16_offset = self.start let mut char_count = 0 while char_count < index && utf16_offset < self.end { - let c1 = self.str[utf16_offset] 
+ let c1 = self.str.unsafe_charcode_at(utf16_offset) if is_leading_surrogate(c1) && utf16_offset + 1 < self.str.length() { - let c2 = self.str[utf16_offset + 1] + let c2 = self.str.unsafe_charcode_at(utf16_offset + 1) if is_trailing_surrogate(c2) { utf16_offset = utf16_offset + 2 char_count = char_count + 1 @@ -340,16 +340,16 @@ pub fn StringView::op_get(self : StringView, index : Int) -> Char { guard char_count == index && utf16_offset < self.end else { abort("Index out of bounds: cannot access index \{index}") } - let c1 = self.str[utf16_offset] + let c1 = self.str.unsafe_charcode_at(utf16_offset) if is_leading_surrogate(c1) { - let c2 = self.str[utf16_offset + 1] + let c2 = self.str.unsafe_charcode_at(utf16_offset + 1) if is_trailing_surrogate(c2) { code_point_of_surrogate_pair(c1, c2) } else { abort("invalid surrogate pair") } } else { - c1 + Char::from_int(c1) } } @@ -376,9 +376,9 @@ pub fn rev_get(self : StringView, index : Int) -> Char { let mut utf16_offset = self.end - 1 let mut char_count = 0 while char_count < index && utf16_offset >= self.start { - let c1 = self.str[utf16_offset] + let c1 = self.str.unsafe_charcode_at(utf16_offset) if is_trailing_surrogate(c1) && utf16_offset - 1 >= self.start { - let c2 = self.str[utf16_offset - 1] + let c2 = self.str.unsafe_charcode_at(utf16_offset - 1) if is_leading_surrogate(c2) { utf16_offset = utf16_offset - 2 char_count = char_count + 1 @@ -393,16 +393,16 @@ pub fn rev_get(self : StringView, index : Int) -> Char { guard char_count == index && utf16_offset >= self.start else { abort("Index out of bounds: cannot access index \{index} in reverse") } - let c1 = self.str[utf16_offset] + let c1 = self.str.unsafe_charcode_at(utf16_offset) if is_trailing_surrogate(c1) { - let c2 = self.str[utf16_offset - 1] + let c2 = self.str.unsafe_charcode_at(utf16_offset - 1) if is_leading_surrogate(c2) { code_point_of_surrogate_pair(c2, c1) } else { abort("invalid surrogate pair") } } else { - c1 + Char::from_int(c1) } } @@ -417,16 
+417,16 @@ pub impl Show for StringView with output(self, logger) { pub fn StringView::iter(self : StringView) -> Iter[Char] { Iter::new(fn(yield_) { for index = self.start; index < self.end; index = index + 1 { - let c1 = self.str[index] + let c1 = self.str.unsafe_charcode_at(index) if is_leading_surrogate(c1) && index + 1 < self.end { - let c2 = self.str[index + 1] + let c2 = self.str.unsafe_charcode_at(index + 1) if is_trailing_surrogate(c2) { let c = code_point_of_surrogate_pair(c1, c2) guard let IterContinue = yield_(c) else { x => break x } continue index + 2 } } - guard let IterContinue = yield_(c1) else { x => break x } + guard let IterContinue = yield_(Char::from_int(c1)) else { x => break x } } else { IterContinue