From c890b7d8327e31a7d9544ca46e1a5d51d8ae8265 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:30:53 -0400 Subject: [PATCH 01/12] optimize _StringSliceIter Signed-off-by: martinvuyk --- .gitignore | 2 ++ stdlib/src/utils/string_slice.mojo | 35 ++++++++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 7beaae8a60..1917d2257b 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ venv/ ENV/ env.bak/ venv.bak/ +.magic/ +magic.lock # MacOS .DS_Store diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 1904633013..7a832ccb9d 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -134,7 +134,6 @@ struct _StringSliceIter[ """ var index: Int - var continuation_bytes: Int var ptr: UnsafePointer[UInt8] var length: Int @@ -144,10 +143,6 @@ struct _StringSliceIter[ self.index = 0 if forward else length self.ptr = unsafe_pointer self.length = length - self.continuation_bytes = 0 - for i in range(length): - if _utf8_byte_type(unsafe_pointer[i]) == 1: - self.continuation_bytes += 1 fn __iter__(self) -> Self: return self @@ -155,12 +150,9 @@ struct _StringSliceIter[ fn __next__(inout self) -> StringSlice[lifetime]: @parameter if forward: - var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index]) - if byte_type != 0: - byte_len = int(byte_type) - self.continuation_bytes -= byte_len - 1 + var byte_len = _utf8_first_byte_sequence_length( + self.ptr[self.index] + ) self.index += byte_len return StringSlice[lifetime]( unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), @@ -168,25 +160,26 @@ struct _StringSliceIter[ ) else: var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index - 1]) - if byte_type != 0: - while byte_type == 1: - byte_len += 1 - var b = self.ptr[self.index - byte_len] - byte_type = _utf8_byte_type(b) - self.continuation_bytes -= byte_len - 1 + while _utf8_byte_type(self.ptr[self.index - 1]) == 1: + byte_len += 1 + var b = self.ptr[self.index - byte_len] self.index -= byte_len return StringSlice[lifetime]( unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len ) fn __len__(self) -> Int: + var cont_bytes = _count_utf8_continuation_bytes( + Span[UInt8, ImmutableAnyLifetime]( + unsafe_ptr=self.ptr, len=self.length + ) + ) + @parameter if forward: - return self.length - self.index - self.continuation_bytes + return self.length - self.index - cont_bytes else: - return self.index - self.continuation_bytes + return self.index - cont_bytes struct StringSlice[ From 97543061dda714b8c4bc28482f27564c7ba2b89c Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:35:59 -0400 Subject: [PATCH 02/12] fix __len__ Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 7a832ccb9d..2b8210af96 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -169,17 +169,20 @@ struct _StringSliceIter[ ) fn __len__(self) -> Int: - var cont_bytes = _count_utf8_continuation_bytes( - Span[UInt8, ImmutableAnyLifetime]( - unsafe_ptr=self.ptr, len=self.length - ) - ) @parameter if forward: - return self.length - self.index - cont_bytes + return self.length - self.index - _count_utf8_continuation_bytes( + Span[UInt8, ImmutableAnyLifetime]( + unsafe_ptr=self.ptr, len=self.index + ) + ) else: - return self.index - cont_bytes + return self.index - _count_utf8_continuation_bytes( + Span[UInt8, ImmutableAnyLifetime]( + unsafe_ptr=self.ptr, len=self.index + ) + ) struct StringSlice[ From fd15159c586f4b769b4a593fb8d5e6fd921d539e Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:36:23 -0400 Subject: [PATCH 03/12] mojo format Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 2b8210af96..43a7c50f55 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -169,12 +169,15 @@ struct _StringSliceIter[ ) fn __len__(self) -> Int: - @parameter if forward: - return self.length - self.index - _count_utf8_continuation_bytes( - Span[UInt8, ImmutableAnyLifetime]( - unsafe_ptr=self.ptr, len=self.index + return ( + self.length + - self.index + - _count_utf8_continuation_bytes( + Span[UInt8, ImmutableAnyLifetime]( + unsafe_ptr=self.ptr, len=self.index + ) ) ) else: From 73b83e8fbdb86bd9ef2ca895f24a70b78accaa4e Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:42:42 -0400 Subject: [PATCH 04/12] fix detail Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 43a7c50f55..47f44eeee5 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -170,22 +170,18 @@ struct _StringSliceIter[ fn __len__(self) -> Int: @parameter - if forward: - return ( - self.length - - self.index - - _count_utf8_continuation_bytes( - Span[UInt8, ImmutableAnyLifetime]( - unsafe_ptr=self.ptr, len=self.index - ) - ) + @always_inline + fn count() -> Int: + alias S = Span[UInt8, ImmutableAnyLifetime] + return _count_utf8_continuation_bytes( + S(unsafe_ptr=self.ptr, len=self.index) ) + + @parameter + if forward: + return self.length - self.index - count() else: - return self.index - _count_utf8_continuation_bytes( - Span[UInt8, ImmutableAnyLifetime]( - unsafe_ptr=self.ptr, len=self.index - ) - ) + return self.index - count() struct StringSlice[ From 4e758d3b7139825fb7a7f81cdfd50826e91e28e6 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:44:48 -0400 Subject: [PATCH 05/12] fix detail Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 47f44eeee5..2a24b666a4 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -169,19 +169,15 @@ struct _StringSliceIter[ ) fn __len__(self) -> Int: - @parameter - @always_inline - fn count() -> Int: - alias S = Span[UInt8, ImmutableAnyLifetime] - return _count_utf8_continuation_bytes( - S(unsafe_ptr=self.ptr, len=self.index) - ) + alias S = Span[UInt8, ImmutableAnyLifetime] + alias count = _count_utf8_continuation_bytes @parameter if forward: - return self.length - self.index - count() + var amnt = count(S(unsafe_ptr=self.ptr, len=self.index)) + return self.length - self.index - amnt else: - return self.index - count() + return self.index - count(S(unsafe_ptr=self.ptr, len=self.index)) struct StringSlice[ From 64f53185a21539fdfda0bc7171d77a3d9bf7d14d Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 24 Sep 2024 14:50:16 -0400 Subject: [PATCH 06/12] fix detail Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 2a24b666a4..e8ebc1bbbb 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -160,9 +160,8 @@ struct _StringSliceIter[ ) else: var byte_len = 1 - while _utf8_byte_type(self.ptr[self.index - 1]) == 1: + while _utf8_byte_type(self.ptr[self.index - byte_len]) == 1: byte_len += 1 - var b = self.ptr[self.index - byte_len] self.index -= byte_len return StringSlice[lifetime]( unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len From 7b927e719d1e562a33eabd4c5cfb1877bf33feea Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Sun, 20 Oct 2024 19:32:15 -0300 Subject: [PATCH 07/12] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 37 +++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index c5a8706ef2..5820148407 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -66,6 +66,22 @@ fn _unicode_codepoint_utf8_byte_length(c: Int) -> Int: return int((sizes < c).cast[DType.uint8]().reduce_add()) +@always_inline +fn _utf8_first_byte_sequence_length(b: Byte) -> Int: + """Get the length of the sequence starting with given byte. Do note that + this does not work correctly if given a continuation byte.""" + + debug_assert( + (b & 0b1100_0000) != 0b1000_0000, + ( + "Function `_utf8_first_byte_sequence_length()` does not work" + " correctly if given a continuation byte." + ), + ) + var flipped = ~b + return int(count_leading_zeros(flipped) + (flipped >> 7)) + + fn _shift_unicode_to_utf8(ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int): """Shift unicode to utf8 representation. @@ -201,11 +217,11 @@ struct _StringSliceIter[ """ var index: Int - var ptr: UnsafePointer[UInt8] + var ptr: UnsafePointer[Byte] var length: Int fn __init__( - inout self, *, unsafe_pointer: UnsafePointer[UInt8], length: Int + inout self, *, unsafe_pointer: UnsafePointer[Byte], length: Int ): self.index = 0 if forward else length self.ptr = unsafe_pointer @@ -217,16 +233,14 @@ struct _StringSliceIter[ fn __next__(inout self) -> StringSlice[origin]: @parameter if forward: - var byte_len = _utf8_first_byte_sequence_length( - self.ptr[self.index] - ) + byte_len = _utf8_first_byte_sequence_length(self.ptr[self.index]) self.index += byte_len return StringSlice[origin]( unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), len=byte_len, ) else: - var byte_len = 1 + byte_len = 1 while _utf8_byte_type(self.ptr[self.index - byte_len]) == 1: byte_len += 1 self.index -= byte_len @@ -239,15 +253,16 @@ struct _StringSliceIter[ return self.__len__() > 0 fn __len__(self) -> Int: - alias S = Span[UInt8, ImmutableAnyLifetime] - alias count = _count_utf8_continuation_bytes + alias S = Span[UInt8, ImmutableAnyOrigin] + alias _count = _count_utf8_continuation_bytes @parameter if forward: - var amnt = count(S(unsafe_ptr=self.ptr, len=self.index)) - return self.length - self.index - amnt + remaining = self.length - self.index + cont = _count(S(unsafe_ptr=self.ptr + self.index, len=remaining)) + return remaining - cont else: - return self.index - count(S(unsafe_ptr=self.ptr, len=self.index)) + return self.index - _count(S(unsafe_ptr=self.ptr, len=self.index)) @value From a3cc27d960407fdfad27ce4861f4463cc5429f39 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Sun, 20 Oct 2024 23:57:50 -0300 Subject: [PATCH 08/12] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 5820148407..40701bcacd 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -208,7 +208,7 @@ struct _StringSliceIter[ origin: Origin[is_mutable].type, forward: Bool = True, ]: - """Iterator for StringSlice + """Iterator for StringSlice over unicode characters. Parameters: is_mutable: Whether the slice is mutable. @@ -250,7 +250,7 @@ struct _StringSliceIter[ @always_inline fn __hasmore__(self) -> Bool: - return self.__len__() > 0 + return self.index < self.length fn __len__(self) -> Int: alias S = Span[UInt8, ImmutableAnyOrigin] From 3b3b786828b56b6e3e58a6ea47581030a77391ff Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 21 Oct 2024 08:24:18 -0300 Subject: [PATCH 09/12] fix detail Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 40701bcacd..20348313c3 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -250,7 +250,11 @@ struct _StringSliceIter[ @always_inline fn __hasmore__(self) -> Bool: - return self.index < self.length + @parameter + if forward: + return self.index < self.length + else: + return self.index > 0 fn __len__(self) -> Int: alias S = Span[UInt8, ImmutableAnyOrigin] From a3fb1ad70953f515609d8a2969b8143000a9f57e Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 28 Oct 2024 15:54:05 -0300 Subject: [PATCH 10/12] fix details Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 31 ++++++++++++------------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 58a838f5aa..5249050a3c 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -75,13 +75,9 @@ fn _utf8_first_byte_sequence_length(b: Byte) -> Int: debug_assert( (b & 0b1100_0000) != 0b1000_0000, - ( - "Function `_utf8_first_byte_sequence_length()` does not work" - " correctly if given a continuation byte." - ), + "Function does not work correctly if given a continuation byte.", ) - var flipped = ~b - return int(count_leading_zeros(flipped) + (flipped >> 7)) + return int(count_leading_zeros(~b)) + int(b < 0b1000_0000) fn _shift_unicode_to_utf8(ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int): @@ -218,13 +214,13 @@ struct _StringSliceIter[ forward: The iteration direction. `False` is backwards. """ + alias _S = StringSlice[origin] + alias _U = UnsafePointer[Byte] var index: Int - var ptr: UnsafePointer[Byte] + var ptr: Self._U var length: Int - fn __init__( - inout self, *, unsafe_pointer: UnsafePointer[Byte], length: Int - ): + fn __init__(inout self, *, unsafe_pointer: Self._U, length: Int): self.index = 0 if forward else length self.ptr = unsafe_pointer self.length = length @@ -232,23 +228,20 @@ struct _StringSliceIter[ fn __iter__(self) -> Self: return self - fn __next__(inout self) -> StringSlice[origin]: + fn __next__(inout self) -> Self._S: @parameter if forward: byte_len = _utf8_first_byte_sequence_length(self.ptr[self.index]) + i = self.index self.index += byte_len - return StringSlice[origin]( - unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), - len=byte_len, - ) + return Self._S(unsafe_from_utf8_ptr=self.ptr + i, len=byte_len) else: byte_len = 1 while _utf8_byte_type(self.ptr[self.index - byte_len]) == 1: byte_len += 1 self.index -= byte_len - return StringSlice[origin]( - unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len - ) + p = self.ptr + self.index + return Self._S(unsafe_from_utf8_ptr=p, len=byte_len) @always_inline fn __hasmore__(self) -> Bool: @@ -259,7 +252,7 @@ struct _StringSliceIter[ return self.index > 0 fn __len__(self) -> Int: - alias S = Span[UInt8, ImmutableAnyOrigin] + alias S = Span[Byte, ImmutableAnyOrigin] alias _count = _count_utf8_continuation_bytes @parameter From e9b79c95d4dc220a43a4313a33e7735e221bf755 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 4 Nov 2024 17:20:49 -0300 Subject: [PATCH 11/12] add iterator len test Signed-off-by: martinvuyk --- stdlib/test/collections/test_string.mojo | 30 ++++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/stdlib/test/collections/test_string.mojo b/stdlib/test/collections/test_string.mojo index b5a1d3007c..a6d911ea2c 100644 --- a/stdlib/test/collections/test_string.mojo +++ b/stdlib/test/collections/test_string.mojo @@ -1264,19 +1264,23 @@ def test_string_iter(): var idx = -1 vs = String("mojo🔥") - for item in vs: - idx += 1 - if idx == 0: - assert_equal("m", item) - elif idx == 1: - assert_equal("o", item) - elif idx == 2: - assert_equal("j", item) - elif idx == 3: - assert_equal("o", item) - elif idx == 4: - assert_equal("🔥", item) - assert_equal(4, idx) + var iterator = vs.__iter__() + assert_equal(5, len(iterator)) + var item = iterator.__next__() + assert_equal("m", item) + assert_equal(4, len(iterator)) + item = iterator.__next__() + assert_equal("o", item) + assert_equal(3, len(iterator)) + item = iterator.__next__() + assert_equal("j", item) + assert_equal(2, len(iterator)) + item = iterator.__next__() + assert_equal("o", item) + assert_equal(1, len(iterator)) + item = iterator.__next__() + assert_equal("🔥", item) + assert_equal(0, len(iterator)) var items = List[String]( "mojo🔥", From f86eb9a8f53de697b6f0aaaf761f0c9ca86a25a0 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Thu, 21 Nov 2024 10:29:25 -0300 Subject: [PATCH 12/12] fix unsafe ptr constructors Signed-off-by: martinvuyk --- stdlib/src/utils/string_slice.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index fed9b69ed0..b943504163 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -221,10 +221,10 @@ struct _StringSliceIter[ @parameter if forward: remaining = self.length - self.index - cont = _count(S(unsafe_ptr=self.ptr + self.index, len=remaining)) + cont = _count(S(ptr=self.ptr + self.index, length=remaining)) return remaining - cont else: - return self.index - _count(S(unsafe_ptr=self.ptr, len=self.index)) + return self.index - _count(S(ptr=self.ptr, length=self.index)) @value