From ab7c2f5dd427b527c37ec544ecee685b2e4e4083 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 06:02:12 +0800 Subject: [PATCH 01/25] feat: buffer api (placeholder) --- buffer/buffer.mbt | 37 +++++++++++++++++++++++++++++++------ buffer/buffer.mbti | 38 +++++++++++++++++++++++--------------- builtin/bytes.mbt | 29 +++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 21 deletions(-) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index b22db3ace..89130c939 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -84,11 +84,12 @@ pub fn to_string(self : T) -> String { /// Return a new unchecked string contains the data in buffer. /// Note this function does not validate the encoding of the byte sequence, /// it simply copy the bytes into a new String. -pub fn to_unchecked_string(self : T) -> String { - Bytes::from_fixedarray(self.data).to_unchecked_string( - offset=0, - length=self.len, - ) +pub fn to_unchecked_string( + self : T, + offset~ : Int = 0, + length~ : Int = self.len +) -> String { + Bytes::from_fixedarray(self.data).to_unchecked_string(offset~, length~) } ///| @@ -154,6 +155,12 @@ pub fn write_char(self : T, value : Char) -> Unit { self.len += inc } +pub fn write_utf8_char(self : T, value : Char) -> Unit { + self.grow_if_necessary(self.len + 4) + let inc = self.data.set_utf8_char(self.len, value) + self.len += inc +} + ///| /// Write a byte into buffer. 
pub fn write_byte(self : T, value : Byte) -> Unit { @@ -162,6 +169,10 @@ pub fn write_byte(self : T, value : Byte) -> Unit { self.len += 1 } +pub fn blit(self : T, srcoff : Int, dst : T, dstoff : Int, len : Int) -> Unit { + Bytes::blit(self.to_bytes(), srcoff, dst.to_bytes(), dstoff, len) +} + ///| pub fn reset(self : T) -> Unit { self.data = self.initial_data @@ -173,7 +184,21 @@ pub fn to_bytes(self : T) -> Bytes { Bytes::from_fixedarray(self.data, len=self.len) } +///| +pub fn to_array(self : T) -> Array[Byte] { + self.to_bytes().to_array() +} + +///| +pub fn op_set(self : T, index : Int, value : Byte) -> Unit { + self.data[index] = value +} + +pub fn op_get(self : T, index : Int) -> Byte { + self.data[index] +} + ///| pub impl Show for T with output(self, logger) { - logger.write_string(self.to_unchecked_string()) + logger.write_string(self.to_unchecked_string(offset=0, length=self.len)) } diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index 41570ecb8..91bd69d78 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -5,21 +5,29 @@ package moonbitlang/core/buffer // Types and methods type T impl T { - is_empty(Self) -> Bool - length(Self) -> Int - new(size_hint~ : Int = ..) -> Self - reset(Self) -> Unit - to_bytes(Self) -> Bytes - to_string(Self) -> String //deprecated - to_unchecked_string(Self) -> String - write_byte(Self, Byte) -> Unit - write_bytes(Self, Bytes) -> Unit - write_char(Self, Char) -> Unit - write_object(Self, Show) -> Unit - write_string(Self, String) -> Unit - write_sub_string(Self, String, Int, Int) -> Unit //deprecated - write_substring(Self, String, Int, Int) -> Unit -} + blit(Self, Int, Self, Int, Int) -> Unit + from_array(Array[Byte]) -> Self + from_bytes(Bytes) -> Self + is_empty(Self) -> Bool + length(Self) -> Int + new(size_hint~ : Int = ..) -> Self + new(~size_hint : Int = ..) 
-> Self + op_get(Self, Int) -> Byte + op_set(Self, Int, Byte) -> Unit + reset(Self) -> Unit + to_array(Self) -> Array[Byte] + to_bytes(Self) -> Bytes + to_string(Self) -> String //deprecated + to_unchecked_string(Self, ~offset : Int, ~length : Int) -> String + write_byte(Self, Byte) -> Unit + write_bytes(Self, Bytes) -> Unit + write_char(Self, Char) -> Unit + write_object(Self, Show) -> Unit + write_string(Self, String) -> Unit + write_sub_string(Self, String, Int, Int) -> Unit //deprecated + write_substring(Self, String, Int, Int) -> Unit + write_utf8_char(Self, Char) -> Unit} + impl Show for T // Type aliases diff --git a/builtin/bytes.mbt b/builtin/bytes.mbt index 4a54f8cde..a77baefe4 100644 --- a/builtin/bytes.mbt +++ b/builtin/bytes.mbt @@ -168,6 +168,35 @@ pub fn set_utf8_char(self : Bytes, offset : Int, value : Char) -> Int { } } +pub fn set_utf8_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x80 { + self[offset] = ((code & 0x7F) | 0x00).to_byte() + 1 + } else if code < 0x0800 { + self[offset] = (((code >> 6) & 0x1F) | 0xC0).to_byte() + self[offset + 1] = ((code & 0x3F) | 0x80).to_byte() + 2 + } else if code < 0x010000 { + self[offset] = (((code >> 12) & 0x0F) | 0xE0).to_byte() + self[offset + 1] = (((code >> 6) & 0x3F) | 0x80).to_byte() + self[offset + 2] = ((code & 0x3F) | 0x80).to_byte() + 3 + } else if code < 0x110000 { + self[offset] = (((code >> 18) & 0x07) | 0xF0).to_byte() + self[offset + 1] = (((code >> 12) & 0x3F) | 0x80).to_byte() + self[offset + 2] = (((code >> 6) & 0x3F) | 0x80).to_byte() + self[offset + 3] = ((code & 0x3F) | 0x80).to_byte() + 4 + } else { + abort("Char out of range") + } +} + ///| /// Fill utf16 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. 
From 673e1bdc197f27986b55661ab2b0ad0de3e60095 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 06:02:23 +0800 Subject: [PATCH 02/25] feat: int api (placeholder) --- int/int.mbt | 8 ++++++++ int/int.mbti | 1 + 2 files changed, 9 insertions(+) diff --git a/int/int.mbt b/int/int.mbt index 54d320a8f..5aa82f510 100644 --- a/int/int.mbt +++ b/int/int.mbt @@ -41,3 +41,11 @@ pub fn abs(self : Int) -> Int { self } } + +pub fn minimum(self : Int, x : Int) -> Int { + if self > x { + x + } else { + self + } +} diff --git a/int/int.mbti b/int/int.mbti index e7694923f..cb1c6f6d4 100644 --- a/int/int.mbti +++ b/int/int.mbti @@ -10,6 +10,7 @@ let min_value : Int impl Int { abs(Int) -> Int + minimum(Int, Int) -> Int } // Type aliases From 6c633db5d4a38ef975753ef8bf7dafa0899446d6 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 06:02:34 +0800 Subject: [PATCH 03/25] feat: string api (placeholder) --- buffer/buffer.mbti | 43 ++++++++++++++++++++----------------------- builtin/builtin.mbti | 1 + string/string.mbt | 5 +++++ string/string.mbti | 1 + 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index 91bd69d78..c53097c8f 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -5,29 +5,26 @@ package moonbitlang/core/buffer // Types and methods type T impl T { - blit(Self, Int, Self, Int, Int) -> Unit - from_array(Array[Byte]) -> Self - from_bytes(Bytes) -> Self - is_empty(Self) -> Bool - length(Self) -> Int - new(size_hint~ : Int = ..) -> Self - new(~size_hint : Int = ..) 
-> Self - op_get(Self, Int) -> Byte - op_set(Self, Int, Byte) -> Unit - reset(Self) -> Unit - to_array(Self) -> Array[Byte] - to_bytes(Self) -> Bytes - to_string(Self) -> String //deprecated - to_unchecked_string(Self, ~offset : Int, ~length : Int) -> String - write_byte(Self, Byte) -> Unit - write_bytes(Self, Bytes) -> Unit - write_char(Self, Char) -> Unit - write_object(Self, Show) -> Unit - write_string(Self, String) -> Unit - write_sub_string(Self, String, Int, Int) -> Unit //deprecated - write_substring(Self, String, Int, Int) -> Unit - write_utf8_char(Self, Char) -> Unit} - + blit(Self, Int, Self, Int, Int) -> Unit + is_empty(Self) -> Bool + length(Self) -> Int + new(size_hint~ : Int = ..) -> Self + op_get(Self, Int) -> Byte + op_set(Self, Int, Byte) -> Unit + reset(Self) -> Unit + to_array(Self) -> Array[Byte] + to_bytes(Self) -> Bytes + to_string(Self) -> String //deprecated + to_unchecked_string(Self, offset~ : Int = .., length~ : Int = ..) -> String + write_byte(Self, Byte) -> Unit + write_bytes(Self, Bytes) -> Unit + write_char(Self, Char) -> Unit + write_object(Self, Show) -> Unit + write_string(Self, String) -> Unit + write_sub_string(Self, String, Int, Int) -> Unit //deprecated + write_substring(Self, String, Int, Int) -> Unit + write_utf8_char(Self, Char) -> Unit +} impl Show for T // Type aliases diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti index 0935940a1..d1c4ed4fd 100644 --- a/builtin/builtin.mbti +++ b/builtin/builtin.mbti @@ -668,6 +668,7 @@ impl FixedArray { op_set[T](Self[T], Int, T) -> Unit set[T](Self[T], Int, T) -> Unit set_utf16_char(Self[Byte], Int, Char) -> Int + set_utf8_char(Self[Byte], Int, Char) -> Int to_json[X : ToJson](Self[X]) -> Json to_string[X : Show](Self[X]) -> String unsafe_blit[A](Self[A], Int, Self[A], Int, Int) -> Unit diff --git a/string/string.mbt b/string/string.mbt index 2328d864c..a320c9761 100644 --- a/string/string.mbt +++ b/string/string.mbt @@ -31,6 +31,11 @@ pub fn String::from_array(chars : 
Array[Char]) -> String { buf.to_string() } +pub fn String::from_iter(iter : Iter[Char]) -> String { + let chars = iter.collect() + String::from_array(chars) +} + ///| /// Concatenate strings. /// diff --git a/string/string.mbti b/string/string.mbti index f21058e69..e7b8b35f0 100644 --- a/string/string.mbti +++ b/string/string.mbti @@ -14,6 +14,7 @@ impl String { ends_with(String, String) -> Bool fold[A](String, init~ : A, (A, Char) -> A) -> A from_array(Array[Char]) -> String + from_iter(Iter[Char]) -> String index_of(String, String, from~ : Int = ..) -> Int is_blank(String) -> Bool is_empty(String) -> Bool From a947d8293cb140868101a790433a08028574f2a5 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 06:02:46 +0800 Subject: [PATCH 04/25] feat: encoding api (placeholder) --- encoding/decoding.mbt | 325 +++++++++++++++++++++++++++++++++++++ encoding/encoding.mbt | 13 ++ encoding/encoding.mbti | 26 +++ encoding/encoding_test.mbt | 105 ++++++++++++ encoding/moon.pkg.json | 15 ++ encoding/types.mbt | 96 +++++++++++ 6 files changed, 580 insertions(+) create mode 100644 encoding/decoding.mbt create mode 100644 encoding/encoding.mbt create mode 100644 encoding/encoding.mbti create mode 100644 encoding/encoding_test.mbt create mode 100644 encoding/moon.pkg.json create mode 100644 encoding/types.mbt diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt new file mode 100644 index 000000000..ab5598ab6 --- /dev/null +++ b/encoding/decoding.mbt @@ -0,0 +1,325 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +const U_REP = '\u{FFFD}' + +// consider const +let utf_8_len = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, + 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +] + +fn r_utf_8(buf : Buffer, offset : Int, length : Int) -> Decode { + fn uchar(c : Int) { + Uchar(Char::from_int(c)) + } + + match length { + 1 => uchar(buf[offset].to_int()) + 2 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(((b0 & 0x1F) << 6) | (b1 & 0x3F)) + } + } + 3 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + let b2 = buf[offset + 2].to_int() + let c = ((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F)) + if (b2 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + match b0 { + 0xE0 => + if b1 < 0xA0 || 0xBF < b1 { + malformed(buf, offset, length) + } else 
{ + uchar(c) + } + 0xED => + if b1 < 0x80 || 0x9F < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + _ => + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(c) + } + } + } + } + 4 => { + let b0 = buf[offset].to_int() + let b1 = buf[offset + 1].to_int() + let b2 = buf[offset + 2].to_int() + let b3 = buf[offset + 3].to_int() + let c = ((b0 & 0x07) << 18) | + ((b1 & 0x3F) << 12) | + ((b2 & 0x3F) << 6) | + (b3 & 0x3F) + if (b3 >> 6) != 0b10 || (b2 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + match b0 { + 0xF0 => + if b1 < 0x90 || 0xBF < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + 0xF4 => + if b1 < 0x80 || 0x8F < b1 { + malformed(buf, offset, length) + } else { + uchar(c) + } + _ => + if (b1 >> 6) != 0b10 { + malformed(buf, offset, length) + } else { + uchar(c) + } + } + } + } + _ => panic() + } +} + +fn r_utf_16(buf : Buffer, offset0 : Int, offset1 : Int) -> UTF16Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let u = (b0 << 8) | b1 + if u < 0xD800 || u > 0xDFFF { + UTF16Uchar(Char::from_int(u)) + } else if u > 0xDBFF { + UTF16Malformed( + buf.to_unchecked_string(offset=@int.minimum(offset0, offset1), length=2), + ) + } else { + Hi(u) + } +} + +fn r_utf_16_lo(hi : Int, buf : Buffer, offset0 : Int, offset1 : Int) -> Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let lo = (b0 << 8) | b1 + if lo < 0xDC00 || lo > 0xDFFF { + malformed_pair( + offset0 < offset1, + hi, + buf, + @int.minimum(offset0, offset1), + 2, + ) + } else { + Uchar(Char::from_int(((hi & 0x3FF) << 10) | ((lo & 0x3FF) + 0x10000))) + } +} + +fn decode(self : Decoder) -> Decode { + (self.k)(self) +} + +fn decoder(~encoding : Encoding, src : Buffer) -> Decoder { + let i = src + let i_pos = 0 + let i_max = src.length() - 1 + let t = Buffer::from_bytes(b"\x00\x00\x00\x00") + let t_len = 0 + let t_need = 0 + let k = match encoding { + UTF8 => decode_utf_8 + UTF16 => 
decode_utf_16le + UTF16BE => decode_utf_16le + UTF16LE => decode_utf_16le + } + { i, i_pos, i_max, t, t_len, t_need, k } +} + +fn ret(self : Decoder, k : Cont, v : Decode) -> Decode { + self.k = k + v +} + +priv enum UTF16Decode { + Hi(Int) + UTF16Malformed(String) + UTF16Uchar(Char) +} + +fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { + if decoder.t_len < decoder.t_need { + decoder.ret( + decode_utf_16le, + malformed_pair(false, hi, decoder.t, 0, decoder.t_len), + ) + } else { + decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0)) + } +} + +fn t_decode_utf_16le(self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len)) + } else { + self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0)) + } +} + +fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { + match v { + UTF16Uchar(u) => Uchar(u) + UTF16Malformed(s) => Malformed(s) + Hi(hi) => { + let rem = self.i_rem() + if rem < 2 { + self.t_need(2) + t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + r_utf_16_lo(hi, self.i, j + 1, j) + } + } + } +} + +fn decode_utf_16le(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_16le) + // rem > 0 + 1 => + if rem < 2 { + self.t_need(2) + t_fill(t_decode_utf_16le, self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + // mark + self.decode_utf_16le_lo(r_utf_16(self.i, j + 1, j)) + } + _ => abort("unreachable") + } +} + +fn t_decode_utf_8(self : Decoder) -> Decode { + if self.t_len < self.t_need { + malformed(self.t, 0, self.t_len) + } else { + r_utf_8(self.t, 0, self.t_len) + } +} + +fn decode_utf_8(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_8) + // rem > 0 + 1 => { + let idx = 
self.i[self.i_pos].to_int() + let need = utf_8_len[idx] + if rem < need { + self.t_need(need) + t_fill(t_decode_utf_8, self) + } else { + let j = self.i_pos + if need == 0 { + self.i_pos = self.i_pos + 1 + self.ret(decode_utf_8, malformed(self.i, j, 1)) + } else { + self.i_pos = self.i_pos + need + self.ret(decode_utf_8, r_utf_8(self.i, j, need)) + } + } + } + _ => abort("unreachable") + } +} + +fn i_rem(self : Decoder) -> Int { + self.i_max - self.i_pos + 1 +} + +fn eoi(self : Decoder) -> Unit { + self.i = Buffer::new() + self.i_pos = 0 + self.i_max = @int.min_value +} + +fn refill(self : Decoder, k : Cont) -> Decode { + // only Buffer + self.eoi() + k(self) +} + +fn t_need(self : Decoder, need : Int) -> Unit { + self.t_len = 0 + self.t_need = need +} + +fn t_fill(k : Cont, decoder : Decoder) -> Decode { + fn blit(decoder : Decoder, l : Int) -> Unit { + decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l) + decoder.i_pos = decoder.i_pos + 1 + decoder.t_len = decoder.t_len + 1 + } + + let rem = decoder.i_rem() + if rem < 0 { // eoi + k(decoder) + } else { + let need = decoder.t_need - decoder.t_len + if rem < need { + blit(decoder, rem) + decoder.refill(@tuple.curry(t_fill)(k)) + } else { + blit(decoder, need) + k(decoder) + } + } +} + +pub fn decode_lossy(~encoding : Encoding = UTF8, src : Buffer) -> Stream { + let decoder = decoder(~encoding, src) + { decoder, lossy: true } +} + +pub fn decode_strict(~encoding : Encoding = UTF8, src : Buffer) -> Stream { + let decoder = decoder(~encoding, src) + { decoder, lossy: false } +} diff --git a/encoding/encoding.mbt b/encoding/encoding.mbt new file mode 100644 index 000000000..f307e1338 --- /dev/null +++ b/encoding/encoding.mbt @@ -0,0 +1,13 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti new file mode 100644 index 000000000..f62a27936 --- /dev/null +++ b/encoding/encoding.mbti @@ -0,0 +1,26 @@ +package moonbitlang/core/encoding + +alias @moonbitlang/core/buffer as @buffer + +// Values +fn decode_lossy(~encoding : Encoding = .., @buffer.T) -> Stream + +fn decode_strict(~encoding : Encoding = .., @buffer.T) -> Stream + +// Types and methods +pub enum Encoding { + UTF8 + UTF16 + UTF16BE + UTF16LE +} + +type Stream +impl Stream { + iter(Self) -> Iter[Char] +} + +// Type aliases + +// Traits + diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt new file mode 100644 index 000000000..64a13a3a9 --- /dev/null +++ b/encoding/encoding_test.mbt @@ -0,0 +1,105 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +test "decoding String (UTF16LE) to String" { + let src = "你好👀" + let buf = @buffer.T::from_bytes(src.to_bytes()) + inspect!( + buf.to_bytes(), + content= + #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" + , + ) + let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "decoding UTF16LE to String" { + let src = "你好👀" + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\x60\x4f") + buf.write_bytes(b"\x7d\x59") + buf.write_bytes(b"\x3d\xd8\x40\xdc") + inspect!( + buf.to_bytes(), + content= + #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" + , + ) + let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "decoding UTF8 to String" { + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\xe4\xbd\xa0") + buf.write_bytes(b"\xe5\xa5\xbd") + buf.write_bytes(b"\xf0\x9f\x91\x80") + inspect!( + buf.to_bytes(), + content= + #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80" + , + ) + let stream = @encoding.decode_lossy(encoding=UTF8, buf) + inspect!(String::from_iter(stream.iter()), content="你好👀") +} + +test "decoding encoded String (UTF16LE) to String" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\x3d\xd8\x4b\xdc\x8d\x51\xc1\x89" + , + ) + let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "decoding encoded UTF8 to String" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" + , + ) + let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + inspect!(String::from_iter(stream.iter()), content=src) +} + +test "decoding encoded UTF8" { + let src = "👋再见" + let buf = @buffer.T::new(size_hint=10) + for s in src { + 
buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" + , + ) + let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + inspect!(stream.iter().collect(), content="['👋', '再', '见']") +} diff --git a/encoding/moon.pkg.json b/encoding/moon.pkg.json new file mode 100644 index 000000000..eb6312aee --- /dev/null +++ b/encoding/moon.pkg.json @@ -0,0 +1,15 @@ +{ + "import": [ + "moonbitlang/core/builtin", + "moonbitlang/core/buffer", + "moonbitlang/core/coverage", + "moonbitlang/core/string", + "moonbitlang/core/bytes", + "moonbitlang/core/tuple", + "moonbitlang/core/array", + "moonbitlang/core/char", + "moonbitlang/core/int" + ], + "test-import": [ + ] +} diff --git a/encoding/types.mbt b/encoding/types.mbt new file mode 100644 index 000000000..072b4d720 --- /dev/null +++ b/encoding/types.mbt @@ -0,0 +1,96 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +typealias Buffer = @buffer.T + +typealias Cont = (Decoder) -> Decode + +pub enum Encoding { + UTF8 + UTF16 + UTF16BE + UTF16LE +} + +// Decoder + +priv struct Decoder { + mut i : Buffer + mut i_pos : Int + mut i_max : Int + t : Buffer + mut t_len : Int + mut t_need : Int + mut k : Cont +} + +priv enum Decode { + End + Malformed(String) + Uchar(Char) +} + +fn malformed(buf : Buffer, offset : Int, length : Int) -> Decode { + Malformed(buf.to_unchecked_string(~offset, ~length)) +} + +fn malformed_pair( + be : Bool, + hi : Int, + buf : Buffer, + offset : Int, + length : Int +) -> Decode { + let bs1 = buf.to_unchecked_string(~offset, ~length).to_bytes() + let bs0 = Buffer::new(size_hint=2) + let (j0, j1) = if be { (0, 1) } else { (1, 0) } + bs0[j0] = (hi >> 8).to_byte() + bs0[j1] = hi.land(0xFF).to_byte() + let arr = bs0.to_array() + arr.append(bs1.to_array()) + let bs = Buffer::from_array(arr) + Malformed(bs.to_unchecked_string(offset=0, length=bs.length())) +} + +// Stream + +struct Stream { + decoder : Decoder + lossy : Bool +} + +pub fn iter(self : Stream) -> Iter[Char] { + Iter::new( + fn(yield) { + loop self.decoder.decode() { + Uchar(u) => { + if yield(u) == IterEnd { + break IterEnd + } + continue self.decoder.decode() + } + Malformed(s) => { + if not(self.lossy) { + abort(s) + } + if yield(U_REP) == IterEnd { + break IterEnd + } + continue self.decoder.decode() + } + End => break IterEnd + } + }, + ) +} From 3be567b1b4d72ce245704fe6518d1f5c01f269ce Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 12:43:02 +0800 Subject: [PATCH 05/25] feat: buffer `T::from{bytes, array}` api (placeholder) --- buffer/buffer.mbt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index 89130c939..98db77203 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -100,6 +100,20 @@ pub fn T::new(size_hint~ : Int = 0) -> T { { data, len: 0, initial_data: data } } +pub fn T::from_bytes(bytes : Bytes) -> T { + 
let buf = T::new(size_hint=bytes.length()) + buf.write_bytes(bytes) + buf +} + +pub fn T::from_array(arr : Array[Byte]) -> T { + let buf = T::new(size_hint=arr.length()) + for byte in arr { + buf.write_byte(byte) + } + buf +} + ///| /// Write a string into buffer. pub fn write_string(self : T, value : String) -> Unit { From 7abfb67567104d5ee99fb1d2918adf6bc217ec13 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 12:43:17 +0800 Subject: [PATCH 06/25] feat: encoding api (placeholder) --- encoding/decoding.mbt | 25 +++++++++++++++---------- encoding/types.mbt | 26 ++++++++++++-------------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index ab5598ab6..7d1589d89 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -28,7 +28,7 @@ let utf_8_len = [ 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] -fn r_utf_8(buf : Buffer, offset : Int, length : Int) -> Decode { +fn r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { fn uchar(c : Int) { Uchar(Char::from_int(c)) } @@ -112,7 +112,7 @@ fn r_utf_8(buf : Buffer, offset : Int, length : Int) -> Decode { } } -fn r_utf_16(buf : Buffer, offset0 : Int, offset1 : Int) -> UTF16Decode { +fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { let b0 = buf[offset0].to_int() let b1 = buf[offset1].to_int() let u = (b0 << 8) | b1 @@ -127,7 +127,12 @@ fn r_utf_16(buf : Buffer, offset0 : Int, offset1 : Int) -> UTF16Decode { } } -fn r_utf_16_lo(hi : Int, buf : Buffer, offset0 : Int, offset1 : Int) -> Decode { +fn r_utf_16_lo( + hi : Int, + buf : @buffer.T, + offset0 : Int, + offset1 : Int +) -> Decode { let b0 = buf[offset0].to_int() let b1 = buf[offset1].to_int() let lo = (b0 << 8) | b1 @@ -148,11 +153,11 @@ fn decode(self : Decoder) -> Decode { (self.k)(self) } -fn decoder(~encoding : Encoding, src : Buffer) -> Decoder { +fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { let i = src let i_pos = 0 let i_max = 
src.length() - 1 - let t = Buffer::from_bytes(b"\x00\x00\x00\x00") + let t = @buffer.from_bytes(b"\x00\x00\x00\x00") let t_len = 0 let t_need = 0 let k = match encoding { @@ -276,7 +281,7 @@ fn i_rem(self : Decoder) -> Int { } fn eoi(self : Decoder) -> Unit { - self.i = Buffer::new() + self.i = @buffer.new() self.i_pos = 0 self.i_max = @int.min_value } @@ -314,12 +319,12 @@ fn t_fill(k : Cont, decoder : Decoder) -> Decode { } } -pub fn decode_lossy(~encoding : Encoding = UTF8, src : Buffer) -> Stream { - let decoder = decoder(~encoding, src) +pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { + let decoder = decoder(encoding~, src) { decoder, lossy: true } } -pub fn decode_strict(~encoding : Encoding = UTF8, src : Buffer) -> Stream { - let decoder = decoder(~encoding, src) +pub fn decode_strict(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { + let decoder = decoder(encoding~, src) { decoder, lossy: false } } diff --git a/encoding/types.mbt b/encoding/types.mbt index 072b4d720..ba476103c 100644 --- a/encoding/types.mbt +++ b/encoding/types.mbt @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-typealias Buffer = @buffer.T - typealias Cont = (Decoder) -> Decode -pub enum Encoding { +pub(all) enum Encoding { UTF8 UTF16 UTF16BE @@ -26,10 +24,10 @@ pub enum Encoding { // Decoder priv struct Decoder { - mut i : Buffer + mut i : @buffer.T mut i_pos : Int mut i_max : Int - t : Buffer + t : @buffer.T mut t_len : Int mut t_need : Int mut k : Cont @@ -41,25 +39,25 @@ priv enum Decode { Uchar(Char) } -fn malformed(buf : Buffer, offset : Int, length : Int) -> Decode { - Malformed(buf.to_unchecked_string(~offset, ~length)) +fn malformed(buf : @buffer.T, offset : Int, length : Int) -> Decode { + Malformed(buf.to_unchecked_string(offset~, length~)) } fn malformed_pair( be : Bool, hi : Int, - buf : Buffer, + buf : @buffer.T, offset : Int, length : Int ) -> Decode { - let bs1 = buf.to_unchecked_string(~offset, ~length).to_bytes() - let bs0 = Buffer::new(size_hint=2) + let bs1 = buf.to_unchecked_string(offset~, length~).to_bytes() + let bs0 = @buffer.new(size_hint=2) let (j0, j1) = if be { (0, 1) } else { (1, 0) } bs0[j0] = (hi >> 8).to_byte() bs0[j1] = hi.land(0xFF).to_byte() let arr = bs0.to_array() arr.append(bs1.to_array()) - let bs = Buffer::from_array(arr) + let bs = @buffer.from_array(arr) Malformed(bs.to_unchecked_string(offset=0, length=bs.length())) } @@ -72,10 +70,10 @@ struct Stream { pub fn iter(self : Stream) -> Iter[Char] { Iter::new( - fn(yield) { + fn(yield_) { loop self.decoder.decode() { Uchar(u) => { - if yield(u) == IterEnd { + if yield_(u) == IterEnd { break IterEnd } continue self.decoder.decode() @@ -84,7 +82,7 @@ pub fn iter(self : Stream) -> Iter[Char] { if not(self.lossy) { abort(s) } - if yield(U_REP) == IterEnd { + if yield_(U_REP) == IterEnd { break IterEnd } continue self.decoder.decode() From 7f9fbd58b701aa98aebf3d6c1b94598a56607289 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 12:43:33 +0800 Subject: [PATCH 07/25] moon info --- buffer/buffer.mbti | 2 ++ encoding/encoding.mbti | 6 +++--- 2 files changed, 5 
insertions(+), 3 deletions(-) diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index c53097c8f..019414de2 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -6,6 +6,8 @@ package moonbitlang/core/buffer type T impl T { blit(Self, Int, Self, Int, Int) -> Unit + from_array(Array[Byte]) -> Self + from_bytes(Bytes) -> Self is_empty(Self) -> Bool length(Self) -> Int new(size_hint~ : Int = ..) -> Self diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti index f62a27936..38000d3c7 100644 --- a/encoding/encoding.mbti +++ b/encoding/encoding.mbti @@ -3,12 +3,12 @@ package moonbitlang/core/encoding alias @moonbitlang/core/buffer as @buffer // Values -fn decode_lossy(~encoding : Encoding = .., @buffer.T) -> Stream +fn decode_lossy(encoding~ : Encoding = .., @buffer.T) -> Stream -fn decode_strict(~encoding : Encoding = .., @buffer.T) -> Stream +fn decode_strict(encoding~ : Encoding = .., @buffer.T) -> Stream // Types and methods -pub enum Encoding { +pub(all) enum Encoding { UTF8 UTF16 UTF16BE From 0fc9cd40d1eb646de23b7e6ea1ecc4954db39d8a Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 12:49:11 +0800 Subject: [PATCH 08/25] moon fmt with block-style --- buffer/buffer.mbt | 5 +++++ builtin/bytes.mbt | 1 + encoding/decoding.mbt | 22 ++++++++++++++++++++++ encoding/types.mbt | 8 ++++++++ int/int.mbt | 1 + string/string.mbt | 1 + 6 files changed, 38 insertions(+) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index 98db77203..ad8672bba 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -100,12 +100,14 @@ pub fn T::new(size_hint~ : Int = 0) -> T { { data, len: 0, initial_data: data } } +///| pub fn T::from_bytes(bytes : Bytes) -> T { let buf = T::new(size_hint=bytes.length()) buf.write_bytes(bytes) buf } +///| pub fn T::from_array(arr : Array[Byte]) -> T { let buf = T::new(size_hint=arr.length()) for byte in arr { @@ -169,6 +171,7 @@ pub fn write_char(self : T, value : Char) -> Unit { self.len += inc } +///| pub fn 
write_utf8_char(self : T, value : Char) -> Unit { self.grow_if_necessary(self.len + 4) let inc = self.data.set_utf8_char(self.len, value) @@ -183,6 +186,7 @@ pub fn write_byte(self : T, value : Byte) -> Unit { self.len += 1 } +///| pub fn blit(self : T, srcoff : Int, dst : T, dstoff : Int, len : Int) -> Unit { Bytes::blit(self.to_bytes(), srcoff, dst.to_bytes(), dstoff, len) } @@ -208,6 +212,7 @@ pub fn op_set(self : T, index : Int, value : Byte) -> Unit { self.data[index] = value } +///| pub fn op_get(self : T, index : Int) -> Byte { self.data[index] } diff --git a/builtin/bytes.mbt b/builtin/bytes.mbt index a77baefe4..2708d0c60 100644 --- a/builtin/bytes.mbt +++ b/builtin/bytes.mbt @@ -168,6 +168,7 @@ pub fn set_utf8_char(self : Bytes, offset : Int, value : Char) -> Int { } } +///| pub fn set_utf8_char( self : FixedArray[Byte], offset : Int, diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index 7d1589d89..e19e8fd4f 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+///| const U_REP = '\u{FFFD}' +///| // consider const let utf_8_len = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -28,6 +30,7 @@ let utf_8_len = [ 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] +///| fn r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { fn uchar(c : Int) { Uchar(Char::from_int(c)) @@ -112,6 +115,7 @@ fn r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { } } +///| fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { let b0 = buf[offset0].to_int() let b1 = buf[offset1].to_int() @@ -127,6 +131,7 @@ fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { } } +///| fn r_utf_16_lo( hi : Int, buf : @buffer.T, @@ -149,10 +154,12 @@ fn r_utf_16_lo( } } +///| fn decode(self : Decoder) -> Decode { (self.k)(self) } +///| fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { let i = src let i_pos = 0 @@ -169,17 +176,20 @@ fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { { i, i_pos, i_max, t, t_len, t_need, k } } +///| fn ret(self : Decoder, k : Cont, v : Decode) -> Decode { self.k = k v } +///| priv enum UTF16Decode { Hi(Int) UTF16Malformed(String) UTF16Uchar(Char) } +///| fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { if decoder.t_len < decoder.t_need { decoder.ret( @@ -191,6 +201,7 @@ fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { } } +///| fn t_decode_utf_16le(self : Decoder) -> Decode { if self.t_len < self.t_need { self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len)) @@ -199,6 +210,7 @@ fn t_decode_utf_16le(self : Decoder) -> Decode { } } +///| fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { match v { UTF16Uchar(u) => Uchar(u) @@ -217,6 +229,7 @@ fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { } } +///| fn decode_utf_16le(self : Decoder) -> Decode { let rem = self.i_rem() match rem.compare(0) { @@ -239,6 +252,7 @@ fn decode_utf_16le(self : Decoder) 
-> Decode { } } +///| fn t_decode_utf_8(self : Decoder) -> Decode { if self.t_len < self.t_need { malformed(self.t, 0, self.t_len) @@ -247,6 +261,7 @@ fn t_decode_utf_8(self : Decoder) -> Decode { } } +///| fn decode_utf_8(self : Decoder) -> Decode { let rem = self.i_rem() match rem.compare(0) { @@ -276,27 +291,32 @@ fn decode_utf_8(self : Decoder) -> Decode { } } +///| fn i_rem(self : Decoder) -> Int { self.i_max - self.i_pos + 1 } +///| fn eoi(self : Decoder) -> Unit { self.i = @buffer.new() self.i_pos = 0 self.i_max = @int.min_value } +///| fn refill(self : Decoder, k : Cont) -> Decode { // only Buffer self.eoi() k(self) } +///| fn t_need(self : Decoder, need : Int) -> Unit { self.t_len = 0 self.t_need = need } +///| fn t_fill(k : Cont, decoder : Decoder) -> Decode { fn blit(decoder : Decoder, l : Int) -> Unit { decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l) @@ -319,11 +339,13 @@ fn t_fill(k : Cont, decoder : Decoder) -> Decode { } } +///| pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { let decoder = decoder(encoding~, src) { decoder, lossy: true } } +///| pub fn decode_strict(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { let decoder = decoder(encoding~, src) { decoder, lossy: false } diff --git a/encoding/types.mbt b/encoding/types.mbt index ba476103c..fdd397d3d 100644 --- a/encoding/types.mbt +++ b/encoding/types.mbt @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+///| typealias Cont = (Decoder) -> Decode +///| pub(all) enum Encoding { UTF8 UTF16 @@ -23,6 +25,7 @@ pub(all) enum Encoding { // Decoder +///| priv struct Decoder { mut i : @buffer.T mut i_pos : Int @@ -33,16 +36,19 @@ priv struct Decoder { mut k : Cont } +///| priv enum Decode { End Malformed(String) Uchar(Char) } +///| fn malformed(buf : @buffer.T, offset : Int, length : Int) -> Decode { Malformed(buf.to_unchecked_string(offset~, length~)) } +///| fn malformed_pair( be : Bool, hi : Int, @@ -63,11 +69,13 @@ fn malformed_pair( // Stream +///| struct Stream { decoder : Decoder lossy : Bool } +///| pub fn iter(self : Stream) -> Iter[Char] { Iter::new( fn(yield_) { diff --git a/int/int.mbt b/int/int.mbt index 5aa82f510..753d0bb6b 100644 --- a/int/int.mbt +++ b/int/int.mbt @@ -42,6 +42,7 @@ pub fn abs(self : Int) -> Int { } } +///| pub fn minimum(self : Int, x : Int) -> Int { if self > x { x diff --git a/string/string.mbt b/string/string.mbt index a320c9761..d436ab83a 100644 --- a/string/string.mbt +++ b/string/string.mbt @@ -31,6 +31,7 @@ pub fn String::from_array(chars : Array[Char]) -> String { buf.to_string() } +///| pub fn String::from_iter(iter : Iter[Char]) -> String { let chars = iter.collect() String::from_array(chars) From aad5d6b1d68e86dc28571e196578fcf9baf8eecc Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Fri, 22 Nov 2024 13:20:36 +0800 Subject: [PATCH 09/25] some tests update --- encoding/encoding_test.mbt | 88 +++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt index 64a13a3a9..f8fab85ad 100644 --- a/encoding/encoding_test.mbt +++ b/encoding/encoding_test.mbt @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-test "decoding String (UTF16LE) to String" { +test "lossy decoding String (UTF16LE encoded) to String" { let src = "你好👀" let buf = @buffer.T::from_bytes(src.to_bytes()) inspect!( @@ -25,7 +25,7 @@ test "decoding String (UTF16LE) to String" { inspect!(String::from_iter(stream.iter()), content=src) } -test "decoding UTF16LE to String" { +test "lossy decoding UTF16LE encoded data to String" { let src = "你好👀" let buf = @buffer.T::new(size_hint=10) buf.write_bytes(b"\x60\x4f") @@ -41,7 +41,7 @@ test "decoding UTF16LE to String" { inspect!(String::from_iter(stream.iter()), content=src) } -test "decoding UTF8 to String" { +test "lossy decoding UTF8 encoded data to String" { let buf = @buffer.T::new(size_hint=10) buf.write_bytes(b"\xe4\xbd\xa0") buf.write_bytes(b"\xe5\xa5\xbd") @@ -56,7 +56,7 @@ test "decoding UTF8 to String" { inspect!(String::from_iter(stream.iter()), content="你好👀") } -test "decoding encoded String (UTF16LE) to String" { +test "lossy decoding String (UTF16LE encoded) to String" { let src = "👋再见" let buf = @buffer.T::new(size_hint=10) for s in src { @@ -72,7 +72,7 @@ test "decoding encoded String (UTF16LE) to String" { inspect!(String::from_iter(stream.iter()), content=src) } -test "decoding encoded UTF8 to String" { +test "lossy decoding UTF8 encoded data to String" { let src = "👋再见" let buf = @buffer.T::new(size_hint=10) for s in src { @@ -88,7 +88,7 @@ test "decoding encoded UTF8 to String" { inspect!(String::from_iter(stream.iter()), content=src) } -test "decoding encoded UTF8" { +test "lossy decoding UTF8 encoded data" { let src = "👋再见" let buf = @buffer.T::new(size_hint=10) for s in src { @@ -103,3 +103,79 @@ test "decoding encoded UTF8" { let stream = @encoding.decode_lossy(buf) // defaults to UTF8 inspect!(stream.iter().collect(), content="['👋', '再', '见']") } + +test "lossy decoding UTF8 encoded data with UTF16LE" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), 
+ content= + #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" + , + ) + let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + inspect!( + stream.iter().collect(), + content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']", + ) +} + +test "lossy decoding UTF16LE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) // defaults to UTF8 + inspect!( + stream.iter().collect(), + content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", + ) +} + +// test "scrictly decoding UTF8 encoded data with UTF16LE" { +// let src = "跑步🏃游泳🏊" +// let buf = @buffer.T::new(size_hint=10) +// for s in src { +// buf.write_char(s) +// } +// inspect!( +// buf.to_bytes(), +// content= +// #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" +// , +// ) +// let stream = @encoding.decode_strict(buf) // defaults to UTF8 +// inspect!( +// stream.iter().collect(), +// content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']", +// ) +// } +// +// test "scrictly decoding UTF16LE encoded data with UTF8" { +// let src = "跑步🏃游泳🏊" +// let buf = @buffer.T::new(size_hint=10) +// for s in src { +// buf.write_utf8_char(s) +// } +// inspect!( +// buf.to_bytes(), +// content= +// #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" +// , +// ) +// let stream = @encoding.decode_strict(encoding=UTF16LE, buf) +// inspect!( +// stream.iter().collect(), +// content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", +// ) +// } From 763135c0599672f6776315f876fbe09a403e8fa6 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 15:21:54 +0800 Subject: [PATCH 10/25] refactor: guard buffer op_{get, set} --- buffer/buffer.mbt | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index ad8672bba..83e4e6fcf 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -209,11 +209,15 @@ pub fn to_array(self : T) -> Array[Byte] { ///| pub fn op_set(self : T, index : Int, value : Byte) -> Unit { + let len = self.length() + guard index >= 0 && index < len self.data[index] = value } ///| pub fn op_get(self : T, index : Int) -> Byte { + let len = self.length() + guard index >= 0 && index < len self.data[index] } From bda6d066de2045dee7bc263e82b131fbbe5f90bf Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 15:22:20 +0800 Subject: [PATCH 11/25] chore --- encoding/encoding_test.mbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt index f8fab85ad..a71ee7200 100644 --- a/encoding/encoding_test.mbt +++ b/encoding/encoding_test.mbt @@ -135,7 +135,7 @@ test "lossy decoding UTF16LE encoded data with UTF8" { #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" , ) - let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) // defaults to UTF8 + let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) inspect!( stream.iter().collect(), content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", From a09296affd263c85be0ccea9e2049cdf611c1354 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 15:28:36 +0800 Subject: [PATCH 12/25] feat: use optional params instead of laballed dep in buffer method --- buffer/buffer.mbt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index 83e4e6fcf..a8404cf81 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -84,11 +84,15 @@ pub fn to_string(self : T) -> String { /// Return a new unchecked string contains the data in buffer. /// Note this function does not validate the encoding of the byte sequence, /// it simply copy the bytes into a new String. 
-pub fn to_unchecked_string( - self : T, - offset~ : Int = 0, - length~ : Int = self.len -) -> String { +pub fn to_unchecked_string(self : T, offset? : Int, length? : Int) -> String { + let offset = match offset { + None => 0 + Some(x) => x + } + let length = match length { + None => self.len + Some(x) => x + } Bytes::from_fixedarray(self.data).to_unchecked_string(offset~, length~) } From 1df6c3d2e24379d3a16311f0977b0f4553ca15b2 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 16:15:03 +0800 Subject: [PATCH 13/25] moon info --- buffer/buffer.mbti | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index 019414de2..28a79725b 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -17,7 +17,7 @@ impl T { to_array(Self) -> Array[Byte] to_bytes(Self) -> Bytes to_string(Self) -> String //deprecated - to_unchecked_string(Self, offset~ : Int = .., length~ : Int = ..) -> String + to_unchecked_string(Self, offset? : Int, length? 
: Int) -> String write_byte(Self, Byte) -> Unit write_bytes(Self, Bytes) -> Unit write_char(Self, Char) -> Unit From 471354dcba28852b8dae96f1b01d2ee7e461ddc9 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 16:48:59 +0800 Subject: [PATCH 14/25] feat: builtin try_collect --- builtin/builtin.mbti | 1 + builtin/iter.mbt | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti index d1c4ed4fd..1df5121a5 100644 --- a/builtin/builtin.mbti +++ b/builtin/builtin.mbti @@ -246,6 +246,7 @@ impl Iter { tap[T](Self[T], (T) -> Unit) -> Self[T] //deprecated to_array[T](Self[T]) -> Array[T] to_string[T : Show](Self[T]) -> String + try_collect[T, E : Error](Self[Result[T, E]]) -> Array[T]!E } impl[T : Show] Show for Iter[T] diff --git a/builtin/iter.mbt b/builtin/iter.mbt index 28e03ba78..75a2f6263 100644 --- a/builtin/iter.mbt +++ b/builtin/iter.mbt @@ -786,6 +786,19 @@ pub fn collect[T](self : Iter[T]) -> Array[T] { result } +///| +/// Collects the elements of the iterator into an array. +pub fn try_collect[T, E : Error](self : Iter[Result[T, E]]) -> Array[T]!E { + let result = [] + for a in self { + match a { + Ok(x) => result.push(x) + Err(e) => raise e + } + } + result +} + ///| /// Iter itself is an iterator. /// so that it works with array spread operator. 
e.g, `[..iter]` From fa81954c0c59b37297409339de9d703c923b5626 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 16:49:20 +0800 Subject: [PATCH 15/25] feat: StrictChars --- encoding/decoding.mbt | 11 +++--- encoding/encoding.mbti | 16 ++++++--- encoding/encoding_test.mbt | 71 ++++++++++++++++++-------------------- encoding/types.mbt | 57 +++++++++++++++++++++++------- 4 files changed, 97 insertions(+), 58 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index e19e8fd4f..e6b8dd5ef 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -340,13 +340,16 @@ fn t_fill(k : Cont, decoder : Decoder) -> Decode { } ///| -pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { +pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> LossyChars { let decoder = decoder(encoding~, src) - { decoder, lossy: true } + decoder } ///| -pub fn decode_strict(encoding~ : Encoding = UTF8, src : @buffer.T) -> Stream { +pub fn decode_strict( + encoding~ : Encoding = UTF8, + src : @buffer.T +) -> StrictChars { let decoder = decoder(encoding~, src) - { decoder, lossy: false } + decoder } diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti index 38000d3c7..efbe7c65e 100644 --- a/encoding/encoding.mbti +++ b/encoding/encoding.mbti @@ -3,11 +3,14 @@ package moonbitlang/core/encoding alias @moonbitlang/core/buffer as @buffer // Values -fn decode_lossy(encoding~ : Encoding = .., @buffer.T) -> Stream +fn decode_lossy(encoding~ : Encoding = .., @buffer.T) -> LossyChars -fn decode_strict(encoding~ : Encoding = .., @buffer.T) -> Stream +fn decode_strict(encoding~ : Encoding = .., @buffer.T) -> StrictChars // Types and methods +type DecodeError +impl Show for DecodeError + pub(all) enum Encoding { UTF8 UTF16 @@ -15,11 +18,16 @@ pub(all) enum Encoding { UTF16LE } -type Stream -impl Stream { +type LossyChars +impl LossyChars { iter(Self) -> Iter[Char] } +type StrictChars +impl StrictChars { + iter(Self) -> 
Iter[Result[Char, DecodeError]] +} + // Type aliases // Traits diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt index a71ee7200..2000d60c3 100644 --- a/encoding/encoding_test.mbt +++ b/encoding/encoding_test.mbt @@ -142,40 +142,37 @@ test "lossy decoding UTF16LE encoded data with UTF8" { ) } -// test "scrictly decoding UTF8 encoded data with UTF16LE" { -// let src = "跑步🏃游泳🏊" -// let buf = @buffer.T::new(size_hint=10) -// for s in src { -// buf.write_char(s) -// } -// inspect!( -// buf.to_bytes(), -// content= -// #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" -// , -// ) -// let stream = @encoding.decode_strict(buf) // defaults to UTF8 -// inspect!( -// stream.iter().collect(), -// content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']", -// ) -// } -// -// test "scrictly decoding UTF16LE encoded data with UTF8" { -// let src = "跑步🏃游泳🏊" -// let buf = @buffer.T::new(size_hint=10) -// for s in src { -// buf.write_utf8_char(s) -// } -// inspect!( -// buf.to_bytes(), -// content= -// #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" -// , -// ) -// let stream = @encoding.decode_strict(encoding=UTF16LE, buf) -// inspect!( -// stream.iter().collect(), -// content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", -// ) -// } +test "scrictly decoding UTF8 encoded data with UTF16LE" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" + , + ) + let stream = @encoding.decode_strict(buf) // defaults to UTF8 + inspect!(stream.iter().try_collect?(), content="Err(쏘)") +} + +test "scrictly decoding UTF16LE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + 
#|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_strict(encoding=UTF16LE, buf) + inspect!( + stream.iter().try_collect?(), + content="Ok(['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏'])", + ) +} diff --git a/encoding/types.mbt b/encoding/types.mbt index fdd397d3d..de8fa9a30 100644 --- a/encoding/types.mbt +++ b/encoding/types.mbt @@ -67,33 +67,64 @@ fn malformed_pair( Malformed(bs.to_unchecked_string(offset=0, length=bs.length())) } -// Stream +// Chars ///| -struct Stream { - decoder : Decoder - lossy : Bool -} +type LossyChars Decoder ///| -pub fn iter(self : Stream) -> Iter[Char] { +pub fn iter(self : LossyChars) -> Iter[Char] { Iter::new( fn(yield_) { - loop self.decoder.decode() { + loop self._.decode() { Uchar(u) => { if yield_(u) == IterEnd { break IterEnd } - continue self.decoder.decode() + continue self._.decode() } - Malformed(s) => { - if not(self.lossy) { - abort(s) - } + Malformed(_) => { if yield_(U_REP) == IterEnd { break IterEnd } - continue self.decoder.decode() + continue self._.decode() + } + End => break IterEnd + } + }, + ) +} + +///| +type StrictChars Decoder + +///| +type! 
DecodeError String + +///| +pub impl Show for DecodeError with output(self, logger) { + match self { + DecodeError(err) => logger.write_string(err) + } +} + +///| +pub fn iter(self : StrictChars) -> Iter[Result[Char, DecodeError]] { + Iter::new( + fn(yield_) { + loop self._.decode() { + Uchar(u) => { + if yield_(Ok(u)) == IterEnd { + break IterEnd + } + continue self._.decode() + } + Malformed(s) => { + let err = DecodeError(s) + if yield_(Err(err)) == IterEnd { + break IterEnd + } + continue self._.decode() } End => break IterEnd } From 775533a01c586b46cc61a03403ca094e46ddb0a0 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 17:25:07 +0800 Subject: [PATCH 16/25] refactor --- encoding/decoding.mbt | 373 +++++++++++++++++++++--------------------- 1 file changed, 189 insertions(+), 184 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index e6b8dd5ef..6e0d1c74c 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -16,7 +16,6 @@ const U_REP = '\u{FFFD}' ///| -// consider const let utf_8_len = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -30,6 +29,140 @@ let utf_8_len = [ 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] +///| +pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> LossyChars { + let decoder = decoder(encoding~, src) + decoder +} + +///| +pub fn decode_strict( + encoding~ : Encoding = UTF8, + src : @buffer.T +) -> StrictChars { + let decoder = decoder(encoding~, src) + decoder +} + +// Implements + +///| +fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { + let i = src + let i_pos = 0 + let i_max = src.length() - 1 + let t = @buffer.from_bytes(b"\x00\x00\x00\x00") + let t_len = 0 + let t_need = 0 + let k = match encoding { + UTF8 => decode_utf_8 + UTF16 => decode_utf_16be // TODO: BE + UTF16BE => decode_utf_16be + UTF16LE => decode_utf_16le + } + { i, i_pos, 
i_max, t, t_len, t_need, k } +} + +///| +fn decode(self : Decoder) -> Decode { + (self.k)(self) +} + +///| +fn ret(self : Decoder, k : Cont, v : Decode) -> Decode { + self.k = k + v +} + +///| +fn i_rem(self : Decoder) -> Int { + self.i_max - self.i_pos + 1 +} + +///| +fn eoi(self : Decoder) -> Unit { + self.i = @buffer.new() + self.i_pos = 0 + self.i_max = @int.min_value +} + +///| +fn refill(self : Decoder, k : Cont) -> Decode { + // only Buffer + self.eoi() + k(self) +} + +///| +fn t_need(self : Decoder, need : Int) -> Unit { + self.t_len = 0 + self.t_need = need +} + +///| +fn t_fill(k : Cont, decoder : Decoder) -> Decode { + fn blit(decoder : Decoder, l : Int) -> Unit { + decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l) + decoder.i_pos = decoder.i_pos + 1 + decoder.t_len = decoder.t_len + 1 + } + + let rem = decoder.i_rem() + if rem < 0 { // eoi + k(decoder) + } else { + let need = decoder.t_need - decoder.t_len + if rem < need { + blit(decoder, rem) + decoder.refill(@tuple.curry(t_fill)(k)) + } else { + blit(decoder, need) + k(decoder) + } + } +} + +// UTF8 + +///| +fn decode_utf_8(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_8) + // rem > 0 + 1 => { + let idx = self.i[self.i_pos].to_int() + let need = utf_8_len[idx] + if rem < need { + self.t_need(need) + t_fill(t_decode_utf_8, self) + } else { + let j = self.i_pos + if need == 0 { + self.i_pos = self.i_pos + 1 + self.ret(decode_utf_8, malformed(self.i, j, 1)) + } else { + self.i_pos = self.i_pos + need + self.ret(decode_utf_8, r_utf_8(self.i, j, need)) + } + } + } + _ => abort("unreachable") + } +} + +///| +fn t_decode_utf_8(self : Decoder) -> Decode { + if self.t_len < self.t_need { + malformed(self.t, 0, self.t_len) + } else { + r_utf_8(self.t, 0, self.t_len) + } +} + ///| fn r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { fn uchar(c : Int) { @@ -115,72 +248,7 @@ fn 
r_utf_8(buf : @buffer.T, offset : Int, length : Int) -> Decode { } } -///| -fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { - let b0 = buf[offset0].to_int() - let b1 = buf[offset1].to_int() - let u = (b0 << 8) | b1 - if u < 0xD800 || u > 0xDFFF { - UTF16Uchar(Char::from_int(u)) - } else if u > 0xDBFF { - UTF16Malformed( - buf.to_unchecked_string(offset=@int.minimum(offset0, offset1), length=2), - ) - } else { - Hi(u) - } -} - -///| -fn r_utf_16_lo( - hi : Int, - buf : @buffer.T, - offset0 : Int, - offset1 : Int -) -> Decode { - let b0 = buf[offset0].to_int() - let b1 = buf[offset1].to_int() - let lo = (b0 << 8) | b1 - if lo < 0xDC00 || lo > 0xDFFF { - malformed_pair( - offset0 < offset1, - hi, - buf, - @int.minimum(offset0, offset1), - 2, - ) - } else { - Uchar(Char::from_int(((hi & 0x3FF) << 10) | ((lo & 0x3FF) + 0x10000))) - } -} - -///| -fn decode(self : Decoder) -> Decode { - (self.k)(self) -} - -///| -fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { - let i = src - let i_pos = 0 - let i_max = src.length() - 1 - let t = @buffer.from_bytes(b"\x00\x00\x00\x00") - let t_len = 0 - let t_need = 0 - let k = match encoding { - UTF8 => decode_utf_8 - UTF16 => decode_utf_16le - UTF16BE => decode_utf_16le - UTF16LE => decode_utf_16le - } - { i, i_pos, i_max, t, t_len, t_need, k } -} - -///| -fn ret(self : Decoder, k : Cont, v : Decode) -> Decode { - self.k = k - v -} +// UTF16LE ///| priv enum UTF16Decode { @@ -189,46 +257,6 @@ priv enum UTF16Decode { UTF16Uchar(Char) } -///| -fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { - if decoder.t_len < decoder.t_need { - decoder.ret( - decode_utf_16le, - malformed_pair(false, hi, decoder.t, 0, decoder.t_len), - ) - } else { - decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0)) - } -} - -///| -fn t_decode_utf_16le(self : Decoder) -> Decode { - if self.t_len < self.t_need { - self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len)) - } else { - 
self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0)) - } -} - -///| -fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { - match v { - UTF16Uchar(u) => Uchar(u) - UTF16Malformed(s) => Malformed(s) - Hi(hi) => { - let rem = self.i_rem() - if rem < 2 { - self.t_need(2) - t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self) - } else { - let j = self.i_pos - self.i_pos = self.i_pos + 2 - r_utf_16_lo(hi, self.i, j + 1, j) - } - } - } -} - ///| fn decode_utf_16le(self : Decoder) -> Decode { let rem = self.i_rem() @@ -253,103 +281,80 @@ fn decode_utf_16le(self : Decoder) -> Decode { } ///| -fn t_decode_utf_8(self : Decoder) -> Decode { +fn t_decode_utf_16le(self : Decoder) -> Decode { if self.t_len < self.t_need { - malformed(self.t, 0, self.t_len) + self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len)) } else { - r_utf_8(self.t, 0, self.t_len) + self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0)) } } ///| -fn decode_utf_8(self : Decoder) -> Decode { - let rem = self.i_rem() - match rem.compare(0) { - // rem < 0 - -1 => Decode::End - // rem = 0 - 0 => self.refill(decode_utf_8) - // rem > 0 - 1 => { - let idx = self.i[self.i_pos].to_int() - let need = utf_8_len[idx] - if rem < need { - self.t_need(need) - t_fill(t_decode_utf_8, self) +fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode { + match v { + UTF16Uchar(u) => Uchar(u) + UTF16Malformed(s) => Malformed(s) + Hi(hi) => { + let rem = self.i_rem() + if rem < 2 { + self.t_need(2) + t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self) } else { let j = self.i_pos - if need == 0 { - self.i_pos = self.i_pos + 1 - self.ret(decode_utf_8, malformed(self.i, j, 1)) - } else { - self.i_pos = self.i_pos + need - self.ret(decode_utf_8, r_utf_8(self.i, j, need)) - } + self.i_pos = self.i_pos + 2 + r_utf_16_lo(hi, self.i, j + 1, j) } } - _ => abort("unreachable") } } ///| -fn i_rem(self : Decoder) -> Int { - self.i_max - self.i_pos + 1 -} - -///| -fn eoi(self : Decoder) -> Unit { - self.i = @buffer.new() - 
self.i_pos = 0 - self.i_max = @int.min_value -} - -///| -fn refill(self : Decoder, k : Cont) -> Decode { - // only Buffer - self.eoi() - k(self) -} - -///| -fn t_need(self : Decoder, need : Int) -> Unit { - self.t_len = 0 - self.t_need = need -} - -///| -fn t_fill(k : Cont, decoder : Decoder) -> Decode { - fn blit(decoder : Decoder, l : Int) -> Unit { - decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l) - decoder.i_pos = decoder.i_pos + 1 - decoder.t_len = decoder.t_len + 1 - } - - let rem = decoder.i_rem() - if rem < 0 { // eoi - k(decoder) +fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode { + if decoder.t_len < decoder.t_need { + decoder.ret( + decode_utf_16le, + malformed_pair(false, hi, decoder.t, 0, decoder.t_len), + ) } else { - let need = decoder.t_need - decoder.t_len - if rem < need { - blit(decoder, rem) - decoder.refill(@tuple.curry(t_fill)(k)) - } else { - blit(decoder, need) - k(decoder) - } + decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0)) } } ///| -pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> LossyChars { - let decoder = decoder(encoding~, src) - decoder +fn r_utf_16_lo( + hi : Int, + buf : @buffer.T, + offset0 : Int, + offset1 : Int +) -> Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let lo = (b0 << 8) | b1 + if lo < 0xDC00 || lo > 0xDFFF { + malformed_pair( + offset0 < offset1, + hi, + buf, + @int.minimum(offset0, offset1), + 2, + ) + } else { + Uchar(Char::from_int(((hi & 0x3FF) << 10) | ((lo & 0x3FF) + 0x10000))) + } } ///| -pub fn decode_strict( - encoding~ : Encoding = UTF8, - src : @buffer.T -) -> StrictChars { - let decoder = decoder(encoding~, src) - decoder +fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { + let b0 = buf[offset0].to_int() + let b1 = buf[offset1].to_int() + let u = (b0 << 8) | b1 + if u < 0xD800 || u > 0xDFFF { + UTF16Uchar(Char::from_int(u)) + } else if u > 0xDBFF { + UTF16Malformed( + 
buf.to_unchecked_string(offset=@int.minimum(offset0, offset1), length=2), + ) + } else { + Hi(u) + } } From 6ff5817366ac8b36079acf18350b6287c91c4277 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 17:30:31 +0800 Subject: [PATCH 17/25] feat: decode_ no labelled dependency --- encoding/decoding.mbt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index 6e0d1c74c..88461cc8a 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -30,24 +30,24 @@ let utf_8_len = [ ] ///| -pub fn decode_lossy(encoding~ : Encoding = UTF8, src : @buffer.T) -> LossyChars { - let decoder = decoder(encoding~, src) +pub fn decode_lossy(encoding : Encoding, src : @buffer.T) -> LossyChars { + let decoder = decoder(encoding, src) decoder } ///| pub fn decode_strict( - encoding~ : Encoding = UTF8, + encoding : Encoding, src : @buffer.T ) -> StrictChars { - let decoder = decoder(encoding~, src) + let decoder = decoder(encoding, src) decoder } // Implements ///| -fn decoder(encoding~ : Encoding, src : @buffer.T) -> Decoder { +fn decoder(encoding : Encoding, src : @buffer.T) -> Decoder { let i = src let i_pos = 0 let i_max = src.length() - 1 From 7688e38d136d863479ae8221868f4a90fa644530 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Mon, 25 Nov 2024 23:28:50 +0800 Subject: [PATCH 18/25] feat: support decode UTF16BE --- encoding/decoding.mbt | 60 +++++++++++++++++++++++++++- encoding/encoding_test.mbt | 81 +++++++++++++++++++++++++++++++------- 2 files changed, 126 insertions(+), 15 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index 88461cc8a..ae47594b3 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -56,7 +56,7 @@ fn decoder(encoding : Encoding, src : @buffer.T) -> Decoder { let t_need = 0 let k = match encoding { UTF8 => decode_utf_8 - UTF16 => decode_utf_16be // TODO: BE + UTF16 => decode_utf_16be UTF16BE => decode_utf_16be UTF16LE => 
decode_utf_16le } @@ -358,3 +358,61 @@ fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { Hi(u) } } + +// UTF16BE + +fn decode_utf_16be(self : Decoder) -> Decode { + let rem = self.i_rem() + match rem.compare(0) { + // rem < 0 + -1 => Decode::End + // rem = 0 + 0 => self.refill(decode_utf_16be) + // rem > 0 + 1 => { + if rem < 2 { + self.t_need(2) + t_fill(t_decode_utf_16be, self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + self.decode_utf_16be_lo(r_utf_16(self.i, j, (j + 1))) + } + } + _ => abort("unreachable") + } +} + +fn t_decode_utf_16be(self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len)) + } else { + self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1)) + } +} + +fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode { + match decode { + UTF16Uchar(x)=> self.ret(decode_utf_16be, Uchar(x)) + UTF16Malformed(x)=> self.ret(decode_utf_16be, Malformed(x)) + Hi(hi) => { + let rem = self.i_rem() + if rem < 2 { + self.t_need(2) + t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self) + } else { + let j = self.i_pos + self.i_pos = self.i_pos + 2 + self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, j, (j + 1))) + } + } + } +} + +fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode { + if self.t_len < self.t_need { + self.ret(decode_utf_16be, malformed_pair(true, hi, self.t, 0, self.t_len)) + } else { + self.ret(decode_utf_16be, r_utf_16_lo(hi, self.t, 0, 1)) + } +} diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt index 2000d60c3..5eab775f1 100644 --- a/encoding/encoding_test.mbt +++ b/encoding/encoding_test.mbt @@ -21,12 +21,11 @@ test "lossy decoding String (UTF16LE encoded) to String" { #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" , ) - let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + let stream = @encoding.decode_lossy(UTF16LE, buf) inspect!(String::from_iter(stream.iter()), content=src) } test "lossy decoding 
UTF16LE encoded data to String" { - let src = "你好👀" let buf = @buffer.T::new(size_hint=10) buf.write_bytes(b"\x60\x4f") buf.write_bytes(b"\x7d\x59") @@ -37,8 +36,43 @@ test "lossy decoding UTF16LE encoded data to String" { #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc" , ) - let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) - inspect!(String::from_iter(stream.iter()), content=src) + let stream = @encoding.decode_lossy(UTF16LE, buf) + inspect!(String::from_iter(stream.iter()), content="你好👀") +} + +test "lossy decoding UTF16BE encoded data to String" { + let buf = @buffer.T::new(size_hint=10) + buf.write_bytes(b"\xd8\x3d\xdc\x08") + buf.write_bytes(b"\xd8\x3d\xdc\x31") + buf.write_bytes(b"\xd8\x3d\xdc\x07") + buf.write_bytes(b"\xd8\x3d\xdc\x30") + inspect!( + buf.to_bytes(), + content= + #|b"\xd8\x3d\xdc\x08\xd8\x3d\xdc\x31\xd8\x3d\xdc\x07\xd8\x3d\xdc\x30" + , + ) + let stream = @encoding.decode_lossy(UTF16BE, buf) + inspect!(String::from_iter(stream.iter()), content="🐈🐱🐇🐰") +} + +test "lossy decoding UTF16 (alias of UTF16BE) encoded data to String" { + let buf = @buffer.T::new(size_hint=24) + buf.write_bytes(b"\x18\x65") + buf.write_bytes(b"\x18\x20") + buf.write_bytes(b"\x18\x73") + buf.write_bytes(b"\x18\x64") + buf.write_bytes(b"\x18\x73") + buf.write_bytes(b"\x18\x36") + buf.write_bytes(b"\x18\x20") + inspect!( + buf.to_bytes(), + content= + #|b"\x18\x65\x18\x20\x18\x73\x18\x64\x18\x73\x18\x36\x18\x20" + , + ) + let stream = @encoding.decode_lossy(UTF16, buf) + inspect!(String::from_iter(stream.iter()), content="ᡥᠠᡳᡤᡳᠶᠠ") } test "lossy decoding UTF8 encoded data to String" { @@ -52,7 +86,7 @@ test "lossy decoding UTF8 encoded data to String" { #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80" , ) - let stream = @encoding.decode_lossy(encoding=UTF8, buf) + let stream = @encoding.decode_lossy(UTF8, buf) inspect!(String::from_iter(stream.iter()), content="你好👀") } @@ -68,7 +102,7 @@ test "lossy decoding String (UTF16LE encoded) to String" { 
#|b"\x3d\xd8\x4b\xdc\x8d\x51\xc1\x89" , ) - let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + let stream = @encoding.decode_lossy(UTF16LE, buf) inspect!(String::from_iter(stream.iter()), content=src) } @@ -84,7 +118,7 @@ test "lossy decoding UTF8 encoded data to String" { #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" , ) - let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + let stream = @encoding.decode_lossy(UTF8, buf) inspect!(String::from_iter(stream.iter()), content=src) } @@ -100,7 +134,7 @@ test "lossy decoding UTF8 encoded data" { #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81" , ) - let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + let stream = @encoding.decode_lossy(UTF8, buf) inspect!(stream.iter().collect(), content="['👋', '再', '见']") } @@ -116,7 +150,7 @@ test "lossy decoding UTF8 encoded data with UTF16LE" { #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" , ) - let stream = @encoding.decode_lossy(buf) // defaults to UTF8 + let stream = @encoding.decode_lossy(UTF8, buf) inspect!( stream.iter().collect(), content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']", @@ -135,14 +169,14 @@ test "lossy decoding UTF16LE encoded data with UTF8" { #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" , ) - let stream = @encoding.decode_lossy(encoding=UTF16LE, buf) + let stream = @encoding.decode_lossy(UTF16LE, buf) inspect!( stream.iter().collect(), content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']", ) } -test "scrictly decoding UTF8 encoded data with UTF16LE" { +test "strictly decoding UTF8 encoded data with UTF16LE" { let src = "跑步🏃游泳🏊" let buf = @buffer.T::new(size_hint=10) for s in src { @@ -154,11 +188,11 @@ test "scrictly decoding UTF8 encoded data with UTF16LE" { #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf" , ) - let stream = @encoding.decode_strict(buf) // defaults to UTF8 + let stream = @encoding.decode_strict(UTF8, buf) 
inspect!(stream.iter().try_collect?(), content="Err(쏘)") } -test "scrictly decoding UTF16LE encoded data with UTF8" { +test "strictly decoding UTF16LE encoded data with UTF8" { let src = "跑步🏃游泳🏊" let buf = @buffer.T::new(size_hint=10) for s in src { @@ -170,9 +204,28 @@ test "scrictly decoding UTF16LE encoded data with UTF8" { #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" , ) - let stream = @encoding.decode_strict(encoding=UTF16LE, buf) + let stream = @encoding.decode_strict(UTF16LE, buf) inspect!( stream.iter().try_collect?(), content="Ok(['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏'])", ) } + +test "strictly decoding UTF16BE encoded data with UTF8" { + let src = "跑步🏃游泳🏊" + let buf = @buffer.T::new(size_hint=10) + for s in src { + buf.write_utf8_char(s) + } + inspect!( + buf.to_bytes(), + content= + #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a" + , + ) + let stream = @encoding.decode_strict(UTF16BE, buf) + inspect!( + stream.iter().try_collect?(), + content="Ok(['', '釦', '궥', '', '较', '', '룦', '뎳', '', '辊'])", + ) +} From a6e8c1810fcf31b9c1e881adb2dfcace75e6fb80 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 08:31:17 +0800 Subject: [PATCH 19/25] refactor: moon fmt --- encoding/decoding.mbt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index ae47594b3..b25be1cac 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -36,10 +36,7 @@ pub fn decode_lossy(encoding : Encoding, src : @buffer.T) -> LossyChars { } ///| -pub fn decode_strict( - encoding : Encoding, - src : @buffer.T -) -> StrictChars { +pub fn decode_strict(encoding : Encoding, src : @buffer.T) -> StrictChars { let decoder = decoder(encoding, src) decoder } @@ -361,6 +358,7 @@ fn r_utf_16(buf : @buffer.T, offset0 : Int, offset1 : Int) -> UTF16Decode { // UTF16BE +///| fn decode_utf_16be(self : Decoder) 
-> Decode { let rem = self.i_rem() match rem.compare(0) { @@ -369,20 +367,20 @@ fn decode_utf_16be(self : Decoder) -> Decode { // rem = 0 0 => self.refill(decode_utf_16be) // rem > 0 - 1 => { + 1 => if rem < 2 { self.t_need(2) t_fill(t_decode_utf_16be, self) } else { let j = self.i_pos self.i_pos = self.i_pos + 2 - self.decode_utf_16be_lo(r_utf_16(self.i, j, (j + 1))) + self.decode_utf_16be_lo(r_utf_16(self.i, j, j + 1)) } - } _ => abort("unreachable") } } +///| fn t_decode_utf_16be(self : Decoder) -> Decode { if self.t_len < self.t_need { self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len)) @@ -391,10 +389,11 @@ fn t_decode_utf_16be(self : Decoder) -> Decode { } } +///| fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode { match decode { - UTF16Uchar(x)=> self.ret(decode_utf_16be, Uchar(x)) - UTF16Malformed(x)=> self.ret(decode_utf_16be, Malformed(x)) + UTF16Uchar(x) => self.ret(decode_utf_16be, Uchar(x)) + UTF16Malformed(x) => self.ret(decode_utf_16be, Malformed(x)) Hi(hi) => { let rem = self.i_rem() if rem < 2 { @@ -403,12 +402,13 @@ fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode { } else { let j = self.i_pos self.i_pos = self.i_pos + 2 - self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, j, (j + 1))) + self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, j, j + 1)) } } } } +///| fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode { if self.t_len < self.t_need { self.ret(decode_utf_16be, malformed_pair(true, hi, self.t, 0, self.t_len)) From 3b6e4264c43218fb230796653de11b88bbce7ec8 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 08:32:55 +0800 Subject: [PATCH 20/25] refactor: rename --- encoding/{encoding_test.mbt => decoding_test.mbt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename encoding/{encoding_test.mbt => decoding_test.mbt} (100%) diff --git a/encoding/encoding_test.mbt b/encoding/decoding_test.mbt similarity index 100% rename from encoding/encoding_test.mbt rename to 
encoding/decoding_test.mbt From aed3e5b795fc3674af83bb04ed85d034e46eec2c Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 08:33:34 +0800 Subject: [PATCH 21/25] feat: encode init --- encoding/encoding.mbt | 18 ++++++++++++++++++ encoding/encoding_test.mbt | 24 ++++++++++++++++++++++++ encoding/moon.pkg.json | 1 + 3 files changed, 43 insertions(+) create mode 100644 encoding/encoding_test.mbt diff --git a/encoding/encoding.mbt b/encoding/encoding.mbt index f307e1338..0da1b87c9 100644 --- a/encoding/encoding.mbt +++ b/encoding/encoding.mbt @@ -11,3 +11,21 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +///| +pub fn encode(encoding : Encoding, src : String) -> Bytes! { + let buf = @buffer.T::from_bytes(src.to_bytes()) + // MoonBit String encoded UTF16LE + let chars = decode_strict(UTF16LE, buf) + let new_buf = @buffer.T::new(size_hint=buf.length()) + let write = match encoding { + UTF8 => @buffer.write_utf8_char + UTF16 => @buffer.write_char // TODO: no + UTF16BE => @buffer.write_char + UTF16LE => @buffer.write_char + } + for char in chars { + write(new_buf, char.unwrap_or_error!()) + } + new_buf.to_bytes() +} diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt new file mode 100644 index 000000000..e763ae245 --- /dev/null +++ b/encoding/encoding_test.mbt @@ -0,0 +1,24 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +test "pp" { + let src = "你好👀" + let bytes = @encoding.encode!(UTF8, src) + inspect!( + bytes, + content= + #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80" + , + ) +} diff --git a/encoding/moon.pkg.json b/encoding/moon.pkg.json index eb6312aee..b2e471183 100644 --- a/encoding/moon.pkg.json +++ b/encoding/moon.pkg.json @@ -8,6 +8,7 @@ "moonbitlang/core/tuple", "moonbitlang/core/array", "moonbitlang/core/char", + "moonbitlang/core/result", "moonbitlang/core/int" ], "test-import": [ From 816077edc796305a48f7e4d1177f0f0fcc07daf4 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 08:34:02 +0800 Subject: [PATCH 22/25] refactor: sort import --- encoding/moon.pkg.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/encoding/moon.pkg.json b/encoding/moon.pkg.json index b2e471183..58dd8726e 100644 --- a/encoding/moon.pkg.json +++ b/encoding/moon.pkg.json @@ -1,15 +1,15 @@ { "import": [ - "moonbitlang/core/builtin", + "moonbitlang/core/array", "moonbitlang/core/buffer", - "moonbitlang/core/coverage", - "moonbitlang/core/string", + "moonbitlang/core/builtin", "moonbitlang/core/bytes", - "moonbitlang/core/tuple", - "moonbitlang/core/array", "moonbitlang/core/char", + "moonbitlang/core/coverage", + "moonbitlang/core/int", "moonbitlang/core/result", - "moonbitlang/core/int" + "moonbitlang/core/string", + "moonbitlang/core/tuple" ], "test-import": [ ] From 8ae2bb9915691461ed06a1faaab81814e372f8f4 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 10:12:01 +0800 Subject: [PATCH 23/25] feat: set_utf16{le, be}_char --- builtin/builtin.mbti | 6 +++-- builtin/bytes.mbt | 62 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/builtin/builtin.mbti b/builtin/builtin.mbti index 1df5121a5..740afb865 100644 --- a/builtin/builtin.mbti +++ b/builtin/builtin.mbti @@ -668,7 
+668,9 @@ impl FixedArray { op_get[T](Self[T], Int) -> T op_set[T](Self[T], Int, T) -> Unit set[T](Self[T], Int, T) -> Unit - set_utf16_char(Self[Byte], Int, Char) -> Int + set_utf16_char(Self[Byte], Int, Char) -> Int //deprecated + set_utf16be_char(Self[Byte], Int, Char) -> Int + set_utf16le_char(Self[Byte], Int, Char) -> Int set_utf8_char(Self[Byte], Int, Char) -> Int to_json[X : ToJson](Self[X]) -> Json to_string[X : Show](Self[X]) -> String @@ -687,7 +689,7 @@ impl Bytes { op_equal(Bytes, Bytes) -> Bool op_get(Bytes, Int) -> Byte op_set(Bytes, Int, Byte) -> Unit - set_utf16_char(Bytes, Int, Char) -> Int + set_utf16_char(Bytes, Int, Char) -> Int //deprecated set_utf8_char(Bytes, Int, Char) -> Int //deprecated sub_string(Bytes, Int, Int) -> String //deprecated to_string(Bytes) -> String //deprecated diff --git a/builtin/bytes.mbt b/builtin/bytes.mbt index 2708d0c60..97501aa19 100644 --- a/builtin/bytes.mbt +++ b/builtin/bytes.mbt @@ -140,7 +140,7 @@ pub fn copy(self : Bytes) -> Bytes { } ///| -/// Fill utf8 encoded char `value` into byte sequence `self`, starting at `offset`. +/// Fill UTF8 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert deprecated "The type Bytes is about to be changed to be immutable. Use `FixedArray[Byte]` or `Buffer` instead." pub fn set_utf8_char(self : Bytes, offset : Int, value : Char) -> Int { @@ -199,9 +199,10 @@ pub fn set_utf8_char( } ///| -/// Fill utf16 encoded char `value` into byte sequence `self`, starting at `offset`. +/// Fill UTF16 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert unsafe "Panic if the [value] is out of range" +/// @alert deprecated "The type Bytes is about to be changed to be immutable. Use `FixedArray[Byte]` or `Buffer` instead." 
pub fn set_utf16_char(self : Bytes, offset : Int, value : Char) -> Int { let code = value.to_uint() if code < 0x10000 { @@ -226,6 +227,7 @@ pub fn set_utf16_char(self : Bytes, offset : Int, value : Char) -> Int { /// Fill utf16 encoded char `value` into byte sequence `self`, starting at `offset`. /// It return the length of bytes has been written. /// @alert unsafe "Panic if the [value] is out of range" +/// @alert deprecated "Use `set_utf16le_char` instead" pub fn set_utf16_char( self : FixedArray[Byte], offset : Int, @@ -250,6 +252,62 @@ pub fn set_utf16_char( } } +///| +/// Fill UTF16LE encoded char `value` into byte sequence `self`, starting at `offset`. +/// It return the length of bytes has been written. +/// @alert unsafe "Panic if the [value] is out of range" +pub fn set_utf16le_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x10000 { + self[offset] = (code & 0xFF).to_byte() + self[offset + 1] = (code >> 8).to_byte() + 2 + } else if code < 0x110000 { + let hi = code - 0x10000 + let lo = (hi >> 10) | 0xD800 + let hi = (hi & 0x3FF) | 0xDC00 + self[offset] = (lo & 0xFF).to_byte() + self[offset + 1] = (lo >> 8).to_byte() + self[offset + 2] = (hi & 0xFF).to_byte() + self[offset + 3] = (hi >> 8).to_byte() + 4 + } else { + abort("Char out of range") + } +} + +///| +/// Fill UTF16BE encoded char `value` into byte sequence `self`, starting at `offset`. +/// It return the length of bytes has been written. 
+/// @alert unsafe "Panic if the [value] is out of range" +pub fn set_utf16be_char( + self : FixedArray[Byte], + offset : Int, + value : Char +) -> Int { + let code = value.to_uint() + if code < 0x10000 { + self[offset] = (code >> 8).to_byte() + self[offset + 1] = (code & 0xFF).to_byte() + 2 + } else if code < 0x110000 { + let hi = code - 0x10000 + let lo = (hi >> 10) | 0xD800 + let hi = (hi & 0x3FF) | 0xDC00 + self[offset] = (lo >> 8).to_byte() + self[offset + 1] = (lo & 0xFF).to_byte() + self[offset + 2] = (hi >> 8).to_byte() + self[offset + 3] = (hi & 0xFF).to_byte() + 4 + } else { + abort("Char out of range") + } +} + ///| pub fn op_equal(self : Bytes, other : Bytes) -> Bool { if self.length() != other.length() { From bc6d71454fab98857bdca1170aa9accf9fae3beb Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 10:12:27 +0800 Subject: [PATCH 24/25] feat: write_utf16be_char --- buffer/buffer.mbt | 13 +++++++++++-- buffer/buffer.mbti | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/buffer/buffer.mbt b/buffer/buffer.mbt index a8404cf81..9ec49f187 100644 --- a/buffer/buffer.mbt +++ b/buffer/buffer.mbt @@ -168,14 +168,23 @@ pub fn write_sub_string( } ///| -/// Write a char into buffer. +/// Write a char into buffer as UTF16LE. pub fn write_char(self : T, value : Char) -> Unit { self.grow_if_necessary(self.len + 4) - let inc = self.data.set_utf16_char(self.len, value) + let inc = self.data.set_utf16le_char(self.len, value) self.len += inc } ///| +/// Write a char into buffer as UTF16BE. +pub fn write_utf16be_char(self : T, value : Char) -> Unit { + self.grow_if_necessary(self.len + 4) + let inc = self.data.set_utf16be_char(self.len, value) + self.len += inc +} + +///| +/// Write a char into buffer as UTF8.
pub fn write_utf8_char(self : T, value : Char) -> Unit { self.grow_if_necessary(self.len + 4) let inc = self.data.set_utf8_char(self.len, value) diff --git a/buffer/buffer.mbti b/buffer/buffer.mbti index 28a79725b..d2bf8c98a 100644 --- a/buffer/buffer.mbti +++ b/buffer/buffer.mbti @@ -25,6 +25,7 @@ impl T { write_string(Self, String) -> Unit write_sub_string(Self, String, Int, Int) -> Unit //deprecated write_substring(Self, String, Int, Int) -> Unit + write_utf16be_char(Self, Char) -> Unit write_utf8_char(Self, Char) -> Unit } impl Show for T From e8e8659e291f1e27aeac9503e37502345503f750 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Tue, 26 Nov 2024 10:12:35 +0800 Subject: [PATCH 25/25] feat: encoding --- encoding/decoding.mbt | 4 ++-- encoding/decoding_test.mbt | 20 ++++++++++---------- encoding/encoding.mbt | 16 ++++++++++------ encoding/encoding.mbti | 8 +++++--- encoding/encoding_test.mbt | 37 +++++++++++++++++++++++++++++++++++-- encoding/types.mbt | 4 ++-- 6 files changed, 64 insertions(+), 25 deletions(-) diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt index b25be1cac..19f577406 100644 --- a/encoding/decoding.mbt +++ b/encoding/decoding.mbt @@ -53,9 +53,9 @@ fn decoder(encoding : Encoding, src : @buffer.T) -> Decoder { let t_need = 0 let k = match encoding { UTF8 => decode_utf_8 - UTF16 => decode_utf_16be - UTF16BE => decode_utf_16be + UTF16 => decode_utf_16le UTF16LE => decode_utf_16le + UTF16BE => decode_utf_16be } { i, i_pos, i_max, t, t_len, t_need, k } } diff --git a/encoding/decoding_test.mbt b/encoding/decoding_test.mbt index 5eab775f1..324cfb985 100644 --- a/encoding/decoding_test.mbt +++ b/encoding/decoding_test.mbt @@ -56,19 +56,19 @@ test "lossy decoding UTF16BE encoded data to String" { inspect!(String::from_iter(stream.iter()), content="🐈🐱🐇🐰") } -test "lossy decoding UTF16 (alias of UTF16BE) encoded data to String" { - let buf = @buffer.T::new(size_hint=24) - buf.write_bytes(b"\x18\x65") - buf.write_bytes(b"\x18\x20") -
buf.write_bytes(b"\x18\x73") - buf.write_bytes(b"\x18\x64") - buf.write_bytes(b"\x18\x73") - buf.write_bytes(b"\x18\x36") - buf.write_bytes(b"\x18\x20") +test "lossy decoding UTF16 (alias of UTF16LE) encoded data to String" { + let buf = @buffer.T::new(size_hint=20) + buf.write_bytes(b"\x65\x18") + buf.write_bytes(b"\x20\x18") + buf.write_bytes(b"\x73\x18") + buf.write_bytes(b"\x64\x18") + buf.write_bytes(b"\x73\x18") + buf.write_bytes(b"\x36\x18") + buf.write_bytes(b"\x20\x18") inspect!( buf.to_bytes(), content= - #|b"\x18\x65\x18\x20\x18\x73\x18\x64\x18\x73\x18\x36\x18\x20" + #|b"\x65\x18\x20\x18\x73\x18\x64\x18\x73\x18\x36\x18\x20\x18" , ) let stream = @encoding.decode_lossy(UTF16, buf) diff --git a/encoding/encoding.mbt b/encoding/encoding.mbt index 0da1b87c9..a6afe84e4 100644 --- a/encoding/encoding.mbt +++ b/encoding/encoding.mbt @@ -13,19 +13,23 @@ // limitations under the License. ///| -pub fn encode(encoding : Encoding, src : String) -> Bytes! { +pub fn encode(encoding : Encoding, src : String) -> Bytes { + // NOTE: special case: MoonBit String are already valid UTF16(LE) bytes + match encoding { + UTF16 | UTF16LE => return src.to_bytes() + _ => () + } let buf = @buffer.T::from_bytes(src.to_bytes()) - // MoonBit String encoded UTF16LE let chars = decode_strict(UTF16LE, buf) let new_buf = @buffer.T::new(size_hint=buf.length()) let write = match encoding { UTF8 => @buffer.write_utf8_char - UTF16 => @buffer.write_char // TODO: no - UTF16BE => @buffer.write_char - UTF16LE => @buffer.write_char + UTF16BE => @buffer.write_utf16be_char + _ => abort("unreachable") } for char in chars { - write(new_buf, char.unwrap_or_error!()) + // SAFETY: Assume String are always valid UTF16LE + write(new_buf, char.unwrap()) } new_buf.to_bytes() } diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti index efbe7c65e..a28205bef 100644 --- a/encoding/encoding.mbti +++ b/encoding/encoding.mbti @@ -3,9 +3,11 @@ package moonbitlang/core/encoding alias @moonbitlang/core/buffer
as @buffer // Values -fn decode_lossy(encoding~ : Encoding = .., @buffer.T) -> LossyChars +fn decode_lossy(Encoding, @buffer.T) -> LossyChars -fn decode_strict(encoding~ : Encoding = .., @buffer.T) -> StrictChars +fn decode_strict(Encoding, @buffer.T) -> StrictChars + +fn encode(Encoding, String) -> Bytes // Types and methods type DecodeError @@ -14,8 +16,8 @@ impl Show for DecodeError pub(all) enum Encoding { UTF8 UTF16 - UTF16BE UTF16LE + UTF16BE } type LossyChars diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt index e763ae245..2ff086c9b 100644 --- a/encoding/encoding_test.mbt +++ b/encoding/encoding_test.mbt @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -test "pp" { +test "encoding String to UTF8" { let src = "你好👀" - let bytes = @encoding.encode!(UTF8, src) + let bytes = @encoding.encode(UTF8, src) inspect!( bytes, content= @@ -22,3 +22,36 @@ test "pp" { , ) } + +test "encoding String to UTF16 (alias of UTF16LE)" { + let src = "LISP programmers know the value of everything" + let bytes = @encoding.encode(UTF16, src) + inspect!( + bytes, + content= + #|b"\x4c\x00\x49\x00\x53\x00\x50\x00\x20\x00\x70\x00\x72\x00\x6f\x00\x67\x00\x72\x00\x61\x00\x6d\x00\x6d\x00\x65\x00\x72\x00\x73\x00\x20\x00\x6b\x00\x6e\x00\x6f\x00\x77\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x76\x00\x61\x00\x6c\x00\x75\x00\x65\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x65\x00\x76\x00\x65\x00\x72\x00\x79\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00" + , + ) +} + +test "encoding String to UTF16LE" { + let src = "and the cost of nothing" + let bytes = @encoding.encode(UTF16LE, src) + inspect!( + bytes, + content= + #|b"\x61\x00\x6e\x00\x64\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x63\x00\x6f\x00\x73\x00\x74\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x6e\x00\x6f\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00" + , + ) +} + +test "encoding String to UTF16BE" { + let src = "λf.(λx.f(x x))(λx.f(x x))" + let bytes =
@encoding.encode(UTF16BE, src) + inspect!( + bytes, + content= + #|b"\x03\xbb\x00\x66\x00\x2e\x00\x28\x03\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29\x00\x28\x03\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29" + , + ) +} diff --git a/encoding/types.mbt b/encoding/types.mbt index de8fa9a30..84164f2a2 100644 --- a/encoding/types.mbt +++ b/encoding/types.mbt @@ -18,9 +18,9 @@ typealias Cont = (Decoder) -> Decode ///| pub(all) enum Encoding { UTF8 - UTF16 - UTF16BE + UTF16 // alias of UTF16LE UTF16LE + UTF16BE } // Decoder