From 6fa8655e8b85c0f8358050c504d411180dc31855 Mon Sep 17 00:00:00 2001 From: Jinser Kafka Date: Wed, 27 Nov 2024 22:43:15 +0800 Subject: [PATCH] feat: new package encoding --- encoding/decoding.mbt | 458 +++++++++++++++++++++++++++++++++++++ encoding/decoding_test.mbt | 336 +++++++++++++++++++++++++++ encoding/encoding.mbt | 87 +++++++ encoding/encoding.mbti | 44 ++++ encoding/encoding_test.mbt | 57 +++++ encoding/moon.pkg.json | 4 + encoding/types.mbt | 133 +++++++++++ 7 files changed, 1119 insertions(+) create mode 100644 encoding/decoding.mbt create mode 100644 encoding/decoding_test.mbt create mode 100644 encoding/encoding.mbt create mode 100644 encoding/encoding.mbti create mode 100644 encoding/encoding_test.mbt create mode 100644 encoding/moon.pkg.json create mode 100644 encoding/types.mbt diff --git a/encoding/decoding.mbt b/encoding/decoding.mbt new file mode 100644 index 0000000..49dcb70 --- /dev/null +++ b/encoding/decoding.mbt @@ -0,0 +1,458 @@ +// Copyright 2024 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+///|
+/// Replacement character (U+FFFD) that the lossy iterator yields in place of
+/// every malformed sequence.
+const U_REP = '\u{FFFD}'
+
+///|
+/// Expected UTF-8 sequence length, indexed by the value of the first byte
+/// (256 entries). `decode_utf_8` looks the lead byte up here; an entry of `0`
+/// is reported by it as a one-byte malformed sequence.
+let utf_8_len = [
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
+  4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+]
+
+///|
+/// Decodes bytes in the given encoding into characters, replacing malformed
+/// input instead of reporting it.
+///
+/// # Parameters
+///
+/// - `encoding`: The character encoding of the input bytes.
+/// - `src`: The `Bytes` holding the encoded text.
+///
+/// # Returns
+///
+/// A `LossyChars` value whose `iter()` yields the decoded characters.
+///
+/// # Behavior
+///
+/// - Each malformed sequence reported by the decoder is yielded as one
+///   replacement character (`\u{FFFD}`), so iteration never produces an error.
+/// - NOTE(review): the returned value wraps mutable decoder state, so it looks
+///   like the characters can only be iterated once - confirm.
+///
+/// # Examples
+///
+/// ```moonbit
+/// let buf = @buffer.T::new(size_hint=10)
+/// buf.write_bytes(b"\xe4\xbd\xa0") // "你" in UTF8
+/// buf.write_bytes(b"\xe5\xa5\xbd") // "好" in UTF8
+/// buf.write_bytes(b"\xf0\x9f\x91\x80") // "👀" in UTF8
+/// let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+/// let arr = chars.iter().collect() // Array of Unicode code points: `['你', '好', '👀']`
+/// let str = String::from_array(arr) // MoonBit String (UTF16LE internally): `"你好👀"`
+/// ```
+pub fn decode_lossy(encoding : Encoding, src : Bytes) -> LossyChars {
+  let decoder = decoder(encoding, src)
+  decoder
+}
+
+///|
+/// Decodes bytes from a specified encoding into strictly decoded characters.
+///
+/// # Parameters
+///
+/// - `encoding`: The character encoding of the input `bytes`.
+/// - `src`: A `bytes` representing the encoded string in the specified format.
+///
+/// # Returns
+///
+/// A `StrictChars` iterator representing the decoded characters.
+///
+/// # Behavior
+///
+/// - Assumes all sequences in the `bytes` are valid and will raise errors if invalid sequences are encountered.
+///
+/// # Examples
+///
+/// ```moonbit
+/// let buf = @buffer.T::new(size_hint=10)
+/// buf.write_bytes(b"\xe4\xbd\xa0") // "你" in UTF8
+/// buf.write_bytes(b"\xe5\xa5\xbd") // "好" in UTF8
+/// buf.write_bytes(b"\xf0\x9f\x91\x80") // "👀" in UTF8
+/// let chars = @encoding.decode_strict(UTF8, buf.to_bytes())
+/// let arr = chars.iter().try_collect!() // Array of Unicode code points: `['你', '好', '👀']`
+/// let str = String::from_array(arr) // MoonBit String (UTF16LE internally): `"你好👀"`
+/// ```
+pub fn decode_strict(encoding : Encoding, src : Bytes) -> StrictChars {
+  let decoder = decoder(encoding, src)
+  decoder
+}
+
+// Implementations
+
+///|
+/// Builds the decoder state for `src`.
+///
+/// - `i` / `i_pos` / `i_max`: input bytes, read cursor and index of the last
+///   input byte (`-1` for empty input).
+/// - `t` / `t_len` / `t_need`: 4-byte scratch buffer used by `t_fill` for a
+///   sequence that is cut short by the end of input.
+/// - `k`: the decoding step invoked by `decode`; chosen from `encoding`
+///   (`UTF16` uses the little-endian step).
+fn decoder(encoding : Encoding, src : Bytes) -> Decoder {
+  let i = src
+  let i_pos = 0
+  let i_max = src.length() - 1
+  // NOTE(review): `t` is written to by `t_fill`; this assumes a bytes literal
+  // evaluates to a fresh buffer for every decoder - confirm.
+  let t = b"\x00\x00\x00\x00"
+  let t_len = 0
+  let t_need = 0
+  let k = match encoding {
+    UTF8 => decode_utf_8
+    UTF16 => decode_utf_16le
+    UTF16LE => decode_utf_16le
+    UTF16BE => decode_utf_16be
+  }
+  { i, i_pos, i_max, t, t_len, t_need, k }
+}
+
+///|
+/// Runs the current continuation and returns the next decoding result.
+fn decode(self : Decoder) -> Decode {
+  (self.k)(self)
+}
+
+///|
+/// Stores `k` as the step for the next `decode` call and returns `v`.
+fn ret(self : Decoder, k : Cont, v : Decode) -> Decode {
+  self.k = k
+  v
+}
+
+///|
+/// Number of unread input bytes; negative once `eoi` has been called.
+fn i_rem(self : Decoder) -> Int {
+  self.i_max - self.i_pos + 1
+}
+
+///|
+/// Marks end of input: drops the input and makes `i_rem` negative.
+fn eoi(self : Decoder) -> Unit {
+  self.i = @bytes.default()
+  self.i_pos = 0
+  self.i_max = @int.min_value
+}
+
+///|
+/// Called when the input is exhausted. The whole input is a single `Bytes`,
+/// so there is nothing to refill: signal end of input and resume with `k`.
+fn refill(self : Decoder, k : Cont) -> Decode {
+  // only Bytes
+  self.eoi()
+  k(self)
+}
+
+///|
+/// Resets the scratch buffer and records how many bytes the next sequence needs.
+fn t_need(self : Decoder, need : Int) -> Unit {
+  self.t_len = 0
+  self.t_need = need
+}
+
+///|
+/// Moves up to `t_need - t_len` input bytes into the scratch buffer `t`, then
+/// resumes with `k`. When fewer bytes are available than needed, everything
+/// that is left is moved and end of input is signalled before resuming, so
+/// `k` observes `t_len < t_need` for a truncated sequence.
+fn t_fill(k : Cont, decoder : Decoder) -> Decode {
+  // Moves `l` bytes and advances both cursors by the number of bytes moved.
+  // (They were previously advanced by 1 regardless of `l`, which under-counted
+  // multi-byte tails and counted a byte when `l` was 0.)
+  // NOTE(review): argument order of `Bytes::blit` is assumed to be
+  // (src_offset, dst, dst_offset, length) - confirm.
+  fn blit(decoder : Decoder, l : Int) -> Unit {
+    decoder.i.blit(decoder.i_pos, decoder.t, decoder.t_len, l)
+    decoder.i_pos = decoder.i_pos + l
+    decoder.t_len = decoder.t_len + l
+  }
+
+  let rem = decoder.i_rem()
+  if rem < 0 { // eoi
+    k(decoder)
+  } else {
+    let need = decoder.t_need - decoder.t_len
+    if rem < need {
+      blit(decoder, rem)
+      decoder.refill(@tuple.curry(t_fill)(k))
+    } else {
+      blit(decoder, need)
+      k(decoder)
+    }
+  }
+}
+
+// UTF8
+
+///|
+/// Decodes the next UTF-8 sequence from the input.
+///
+/// Returns `End` after end of input, a one-byte `Malformed` for a byte that
+/// cannot start a sequence, and otherwise whatever `r_utf_8` makes of the
+/// `need` bytes announced by the lead byte. A sequence cut short by the end of
+/// input is routed through the scratch buffer (`t_fill`).
+fn decode_utf_8(self : Decoder) -> Decode {
+  let rem = self.i_rem()
+  match rem.compare(0) {
+    // rem < 0
+    -1 => Decode::End
+    // rem = 0
+    0 => self.refill(decode_utf_8)
+    // rem > 0
+    1 => {
+      let idx = self.i[self.i_pos].to_int()
+      let need = utf_8_len[idx]
+      if rem < need {
+        self.t_need(need)
+        t_fill(t_decode_utf_8, self)
+      } else {
+        let j = self.i_pos
+        if need == 0 {
+          self.i_pos = self.i_pos + 1
+          self.ret(decode_utf_8, malformed(self.i, j, 1))
+        } else {
+          self.i_pos = self.i_pos + need
+          self.ret(decode_utf_8, r_utf_8(self.i, j, need))
+        }
+      }
+    }
+    _ => abort("unreachable")
+  }
+}
+
+///|
+/// Decodes the sequence collected in the scratch buffer; a short buffer means
+/// the input ended in the middle of a sequence. Goes through `ret` like the
+/// other `t_decode_*` steps so the next step is always set explicitly.
+fn t_decode_utf_8(self : Decoder) -> Decode {
+  if self.t_len < self.t_need {
+    self.ret(decode_utf_8, malformed(self.t, 0, self.t_len))
+  } else {
+    self.ret(decode_utf_8, r_utf_8(self.t, 0, self.t_len))
+  }
+}
+
+///|
+/// Reads one UTF-8 sequence of `length` bytes (1 to 4) starting at `offset`.
+/// Returns `Uchar` for a well-formed sequence and `Malformed` covering the
+/// whole sequence otherwise. The second byte is range-checked against the
+/// lead bytes 0xE0, 0xED, 0xF0 and 0xF4, which rejects overlong forms,
+/// surrogates and values above U+10FFFF.
+fn r_utf_8(buf : Bytes, offset : Int, length : Int) -> Decode {
+  fn uchar(c : Int) {
+    Uchar(Char::from_int(c))
+  }
+
+  match length {
+    1 => uchar(buf[offset].to_int())
+    2 => {
+      let b0 = buf[offset].to_int()
+      let b1 = buf[offset + 1].to_int()
+      if (b1 >> 6) != 0b10 {
+        malformed(buf, offset, length)
+      } else {
+        uchar(((b0 & 0x1F) << 6) | (b1 & 0x3F))
+      }
+    }
+    3 => {
+      let b0 = buf[offset].to_int()
+      let b1 = buf[offset + 1].to_int()
+      let b2 = buf[offset + 2].to_int()
+      let c = ((b0 & 0x0F) << 12) | (((b1 & 0x3F) << 6) | (b2 & 0x3F))
+      if (b2 >> 6) != 0b10 {
+        malformed(buf, offset, length)
+      } else {
+        match b0 {
+          0xE0 =>
+            if b1 < 0xA0 || 0xBF < b1 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+          0xED =>
+            if b1 < 0x80 || 0x9F < b1 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+          _ =>
+            if (b1 >> 6) != 0b10 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+        }
+      }
+    }
+    4 => {
+      let b0 = buf[offset].to_int()
+      let b1 = buf[offset + 1].to_int()
+      let b2 = buf[offset + 2].to_int()
+      let b3 = buf[offset + 3].to_int()
+      let c = ((b0 &
0x07) << 18) |
+        ((b1 & 0x3F) << 12) |
+        ((b2 & 0x3F) << 6) |
+        (b3 & 0x3F)
+      if (b3 >> 6) != 0b10 || (b2 >> 6) != 0b10 {
+        malformed(buf, offset, length)
+      } else {
+        match b0 {
+          0xF0 =>
+            if b1 < 0x90 || 0xBF < b1 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+          0xF4 =>
+            if b1 < 0x80 || 0x8F < b1 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+          _ =>
+            if (b1 >> 6) != 0b10 {
+              malformed(buf, offset, length)
+            } else {
+              uchar(c)
+            }
+        }
+      }
+    }
+    _ => panic()
+  }
+}
+
+// UTF16LE
+
+///|
+/// Result of reading one UTF-16 code unit: a high surrogate still waiting for
+/// its low half, a malformed unit, or a complete character.
+priv enum UTF16Decode {
+  Hi(Int)
+  UTF16Malformed(String)
+  UTF16Uchar(Char)
+}
+
+///|
+/// Decodes the next UTF-16LE character from the input. A single trailing byte
+/// is routed through the scratch buffer and ends up reported as malformed.
+fn decode_utf_16le(self : Decoder) -> Decode {
+  let rem = self.i_rem()
+  match rem.compare(0) {
+    // rem < 0
+    -1 => Decode::End
+    // rem = 0
+    0 => self.refill(decode_utf_16le)
+    // rem > 0
+    1 =>
+      if rem < 2 {
+        self.t_need(2)
+        t_fill(t_decode_utf_16le, self)
+      } else {
+        let j = self.i_pos
+        self.i_pos = self.i_pos + 2
+        // little endian: the high-order byte is the second one
+        self.decode_utf_16le_lo(r_utf_16(self.i, j + 1, j))
+      }
+    _ => abort("unreachable")
+  }
+}
+
+///|
+/// Decodes the UTF-16LE code unit collected in the scratch buffer.
+fn t_decode_utf_16le(self : Decoder) -> Decode {
+  if self.t_len < self.t_need {
+    self.ret(decode_utf_16le, malformed(self.t, 0, self.t_len))
+  } else {
+    self.decode_utf_16le_lo(r_utf_16(self.t, 1, 0))
+  }
+}
+
+///|
+/// Finishes a UTF-16LE character: passes complete results through and, after
+/// a high surrogate, reads the following code unit as its low half. Every
+/// result goes through `ret`, mirroring `decode_utf_16be_lo`.
+fn decode_utf_16le_lo(self : Decoder, v : UTF16Decode) -> Decode {
+  match v {
+    UTF16Uchar(u) => self.ret(decode_utf_16le, Uchar(u))
+    UTF16Malformed(s) => self.ret(decode_utf_16le, Malformed(s))
+    Hi(hi) => {
+      let rem = self.i_rem()
+      if rem < 2 {
+        self.t_need(2)
+        t_fill(@tuple.curry(t_decode_utf_16le_lo)(hi), self)
+      } else {
+        let j = self.i_pos
+        self.i_pos = self.i_pos + 2
+        self.ret(decode_utf_16le, r_utf_16_lo(hi, self.i, j + 1, j))
+      }
+    }
+  }
+}
+
+///|
+/// Combines `hi` with the low half collected in the scratch buffer; a short
+/// buffer means the input ended right after the high surrogate.
+fn t_decode_utf_16le_lo(hi : Int, decoder : Decoder) -> Decode {
+  if decoder.t_len < decoder.t_need {
+    decoder.ret(
+      decode_utf_16le,
+      malformed_pair(false, hi, decoder.t, 0, decoder.t_len),
+    )
+  } else {
+    decoder.ret(decode_utf_16le, r_utf_16_lo(hi, decoder.t, 1, 0))
+  }
+}
+
+///|
+/// Combines the high surrogate `hi` with the code unit whose high-order byte
+/// is at `offset0` and low-order byte at `offset1`.
+///
+/// Returns `Uchar` when that unit is a low surrogate (0xDC00..0xDFFF) and a
+/// `Malformed` pair otherwise. `offset0 < offset1` tells `malformed_pair` that
+/// the data is big-endian.
+fn r_utf_16_lo(hi : Int, buf : Bytes, offset0 : Int, offset1 : Int) -> Decode {
+  let b0 = buf[offset0].to_int()
+  let b1 = buf[offset1].to_int()
+  let lo = (b0 << 8) | b1
+  if lo < 0xDC00 || lo > 0xDFFF {
+    malformed_pair(offset0 < offset1, hi, buf, @int.min(offset0, offset1), 2)
+  } else {
+    // The 0x10000 offset must be added to the combined 20-bit value. OR-ing
+    // it into the low half loses the carry whenever bit 16 of the shifted
+    // high half is set, i.e. for every code point >= U+20000.
+    Uchar(Char::from_int((((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000))
+  }
+}
+
+///|
+/// Reads one UTF-16 code unit whose high-order byte is at `offset0` and
+/// low-order byte at `offset1`. A unit outside the surrogate range is a
+/// complete character, a low surrogate on its own is malformed, and a high
+/// surrogate is returned as `Hi` for the caller to pair up.
+fn r_utf_16(buf : Bytes, offset0 : Int, offset1 : Int) -> UTF16Decode {
+  let b0 = buf[offset0].to_int()
+  let b1 = buf[offset1].to_int()
+  let u = (b0 << 8) | b1
+  if u < 0xD800 || u > 0xDFFF {
+    UTF16Uchar(Char::from_int(u))
+  } else if u > 0xDBFF {
+    UTF16Malformed(
+      buf.to_unchecked_string(offset=@int.min(offset0, offset1), length=2),
+    )
+  } else {
+    Hi(u)
+  }
+}
+
+// UTF16BE
+
+///|
+/// Decodes the next UTF-16BE character from the input. A single trailing byte
+/// is routed through the scratch buffer and ends up reported as malformed.
+fn decode_utf_16be(self : Decoder) -> Decode {
+  let rem = self.i_rem()
+  match rem.compare(0) {
+    // rem < 0
+    -1 => Decode::End
+    // rem = 0
+    0 => self.refill(decode_utf_16be)
+    // rem > 0
+    1 =>
+      if rem < 2 {
+        self.t_need(2)
+        t_fill(t_decode_utf_16be, self)
+      } else {
+        let j = self.i_pos
+        self.i_pos = self.i_pos + 2
+        self.decode_utf_16be_lo(r_utf_16(self.i, j, j + 1))
+      }
+    _ => abort("unreachable")
+  }
+}
+
+///|
+/// Decodes the UTF-16BE code unit collected in the scratch buffer.
+fn t_decode_utf_16be(self : Decoder) -> Decode {
+  if self.t_len < self.t_need {
+    self.ret(decode_utf_16be, malformed(self.t, 0, self.t_len))
+  } else {
+    self.decode_utf_16be_lo(r_utf_16(self.t, 0, 1))
+  }
+}
+
+///|
+/// Finishes a UTF-16BE character: passes complete results through and, after
+/// a high surrogate, reads the following code unit as its low half.
+fn decode_utf_16be_lo(self : Decoder, decode : UTF16Decode) -> Decode {
+  match decode {
+    UTF16Uchar(x) => self.ret(decode_utf_16be, Uchar(x))
+    UTF16Malformed(x) => self.ret(decode_utf_16be, Malformed(x))
+    Hi(hi) => {
+      let rem = self.i_rem()
+      if rem < 2 {
+        self.t_need(2)
+        t_fill(@tuple.curry(t_decode_utf_16be_lo)(hi), self)
+      } else {
+        let j = self.i_pos
+        self.i_pos = self.i_pos + 2
+        self.ret(decode_utf_16be, r_utf_16_lo(hi, self.i, j, j + 1))
+      }
+    }
+  }
+}
+
+///|
+/// Combines `hi` with the low half collected in the scratch buffer; a short
+/// buffer means the input ended right after the high surrogate.
+fn t_decode_utf_16be_lo(hi : Int, self : Decoder) -> Decode {
+  if self.t_len < self.t_need {
+    self.ret(decode_utf_16be,
malformed_pair(true, hi, self.t, 0, self.t_len))
+  } else {
+    self.ret(decode_utf_16be, r_utf_16_lo(hi, self.t, 0, 1))
+  }
+}
diff --git a/encoding/decoding_test.mbt b/encoding/decoding_test.mbt
new file mode 100644
index 0000000..4491a42
--- /dev/null
+++ b/encoding/decoding_test.mbt
@@ -0,0 +1,336 @@
+// Copyright 2024 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// Collects a lossy decoding result into a `String`.
+fn string_from_lossy_chars(chars : @encoding.LossyChars) -> String {
+  let arr = chars.iter().collect()
+  String::from_array(arr)
+}
+
+// lossy: malformed input is expected to show up as U+FFFD
+
+// Round trip: the bytes of a String decode back to the same String as UTF16LE.
+test "lossy decoding String (UTF16LE encoded) to String (buffer.write_bytes)" {
+  let src = "你好👀"
+  let buf = @buffer.T::new(size_hint=src.to_bytes().length())
+  buf.write_bytes(src.to_bytes())
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16LE, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content=src)
+}
+
+// Same round trip, with the buffer filled one char at a time.
+test "lossy decoding String (UTF16LE encoded) to String (buffer.write_char)" {
+  let src = "👋再见"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    buf.write_char(s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x3d\xd8\x4b\xdc\x8d\x51\xc1\x89"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16LE, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content=src)
+}
+
+// Hand-written UTF16LE bytes, including one surrogate pair.
+test "lossy decoding UTF16LE encoded data to String" {
+  let buf = @buffer.T::new(size_hint=10)
+  buf.write_bytes(b"\x60\x4f")
+  buf.write_bytes(b"\x7d\x59")
+  buf.write_bytes(b"\x3d\xd8\x40\xdc")
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x60\x4f\x7d\x59\x3d\xd8\x40\xdc"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16LE, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content="你好👀")
+}
+
+// `UTF16` is decoded with the little-endian decoder.
+test "lossy decoding UTF16 (alias for UTF16LE) encoded data to String" {
+  let buf = @buffer.T::new(size_hint=20)
+  buf.write_bytes(b"\x65\x18")
+  buf.write_bytes(b"\x20\x18")
+  buf.write_bytes(b"\x73\x18")
+  buf.write_bytes(b"\x64\x18")
+  buf.write_bytes(b"\x73\x18")
+  buf.write_bytes(b"\x36\x18")
+  buf.write_bytes(b"\x20\x18")
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x65\x18\x20\x18\x73\x18\x64\x18\x73\x18\x36\x18\x20\x18"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content="ᡥᠠᡳᡤᡳᠶᠠ")
+}
+
+// Four big-endian surrogate pairs. All of them are below U+20000, so this
+// does not exercise code points in plane 2 and above.
+test "lossy decoding UTF16BE encoded data to String" {
+  let buf = @buffer.T::new(size_hint=10)
+  buf.write_bytes(b"\xd8\x3d\xdc\x08")
+  buf.write_bytes(b"\xd8\x3d\xdc\x31")
+  buf.write_bytes(b"\xd8\x3d\xdc\x07")
+  buf.write_bytes(b"\xd8\x3d\xdc\x30")
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xd8\x3d\xdc\x08\xd8\x3d\xdc\x31\xd8\x3d\xdc\x07\xd8\x3d\xdc\x30"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16BE, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content="🐈🐱🐇🐰")
+}
+
+// Hand-written UTF8 bytes: two 3-byte sequences and one 4-byte sequence.
+test "lossy decoding UTF8 encoded data to String" {
+  let buf = @buffer.T::new(size_hint=10)
+  buf.write_bytes(b"\xe4\xbd\xa0")
+  buf.write_bytes(b"\xe5\xa5\xbd")
+  buf.write_bytes(b"\xf0\x9f\x91\x80")
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content="你好👀")
+}
+
+// Round trip through `write_utf8_char`.
+test "lossy decoding UTF8 encoded bytes to String" {
+  let src = "👋再见"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+  inspect!(string_from_lossy_chars(chars), content=src)
+}
+
+// Same input as above, inspected as an array of chars.
+test "lossy decoding UTF8 encoded data" {
+  let src = "👋再见"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xf0\x9f\x91\x8b\xe5\x86\x8d\xe8\xa7\x81"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+  inspect!(chars.iter().collect(), content="['👋', '再', '见']")
+}
+
+// The tests below decode with the wrong encoding on purpose.
+
+test "lossy decoding UTF16LE encoded data with UTF8" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16le_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content="['э', 'e', 'k', '<', '�', '�', 'n', '�', '�']",
+  )
+}
+
+// The expected array contains private-use characters (U+E691, U+E6B8).
+test "lossy decoding UTF8 encoded data with UTF16LE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16LE, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content="['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏']",
+  )
+}
+
+// NOTE(review): the byte snapshot shows \x00\xd1 for '跑' (U+8DD1) where UTF-16BE
+// would be \x8d\xd1 - the high byte of BMP chars looks dropped by
+// `write_utf16be_char`; confirm against `FixedArray::set_utf16be_char`.
+test "lossy decoding UTF16BE encoded data with UTF8" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16be_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x00\xd1\x00\x65\xd8\x3c\xdf\xc3\x00\x38\x00\xf3\xd8\x3c\xdf\xca"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF8, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content=
+      #|['\x00', '�', 'e', '�', '�', '\x00', '8', '\x00', '�', '�']
+    ,
+  )
+}
+
+// The expected array contains private-use characters (U+E8B7, U+F09F, U+E6B8).
+test "lossy decoding UTF8 encoded data with UTF16BE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16BE, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content="['', '釦', '궥', '', '较', '', '룦', '뎳', '', '辊']",
+  )
+}
+
+// The expected array contains a private-use character (U+F36C).
+test "lossy decoding UTF16LE encoded data with UTF16BE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16le_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16BE, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content="['톍', '敫', '㳘', '쏟', '㡮', '', '㳘', '쫟']",
+  )
+}
+
+// The expected array contains a private-use character (U+F300).
+test "lossy decoding UTF16BE encoded data with UTF16LE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16be_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x00\xd1\x00\x65\xd8\x3c\xdf\xc3\x00\x38\x00\xf3\xd8\x3c\xdf\xca"
+    ,
+  )
+  let chars = @encoding.decode_lossy(UTF16LE, buf.to_bytes())
+  inspect!(
+    chars.iter().collect(),
+    content="['턀', '攀', '㳘', '쏟', '㠀', '', '㳘', '쫟']",
+  )
+}
+
+// strictly: the first malformed sequence is expected to surface as `Err`
+
+// The error text is the malformed bytes \xd8\xc3 viewed as one UTF-16LE unit.
+test "strictly decoding UTF16LE encoded data with UTF8" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16le_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xd1\x8d\x65\x6b\x3c\xd8\xc3\xdf\x38\x6e\xf3\x6c\x3c\xd8\xca\xdf"
+    ,
+  )
+  let chars = @encoding.decode_strict(UTF8, buf.to_bytes())
+  inspect!(chars.iter().try_collect?(), content="Err(쏘)")
+}
+
+// These UTF8 bytes happen to form valid UTF-16LE units, so decoding succeeds.
+test "strictly decoding UTF8 encoded data with UTF16LE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a"
+    ,
+  )
+  let chars = @encoding.decode_strict(UTF16LE, buf.to_bytes())
+  inspect!(
+    chars.iter().try_collect?(),
+    content="Ok(['럨', '', 'ꖭ', '鿰', '莏', '룦', '', '뎳', '鿰', '誏'])",
+  )
+}
+
+// These UTF8 bytes happen to form valid UTF-16BE units, so decoding succeeds.
+test "strictly decoding UTF8 encoded data with UTF16BE" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf8_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\xe8\xb7\x91\xe6\xad\xa5\xf0\x9f\x8f\x83\xe6\xb8\xb8\xe6\xb3\xb3\xf0\x9f\x8f\x8a"
+    ,
+  )
+  let chars = @encoding.decode_strict(UTF16BE, buf.to_bytes())
+  inspect!(
+    chars.iter().try_collect?(),
+    content="Ok(['', '釦', '궥', '', '较', '', '룦', '뎳', '', '辊'])",
+  )
+}
+
+// The error text is the malformed bytes \xd1\x00 viewed as one UTF-16LE unit.
+test "strictly decoding UTF16BE encoded data with UTF8" {
+  let src = "跑步🏃游泳🏊"
+  let buf = @buffer.T::new(size_hint=10)
+  for s in src {
+    @encoding.write_utf16be_char(buf, s)
+  }
+  inspect!(
+    buf.to_bytes(),
+    content=
+      #|b"\x00\xd1\x00\x65\xd8\x3c\xdf\xc3\x00\x38\x00\xf3\xd8\x3c\xdf\xca"
+    ,
+  )
+  let chars = @encoding.decode_strict(UTF8, buf.to_bytes())
+  inspect!(chars.iter().try_collect?(), content="Err(Ñ)")
+}
diff --git a/encoding/encoding.mbt b/encoding/encoding.mbt
new file mode 100644
index 0000000..0cff879
--- /dev/null
+++ b/encoding/encoding.mbt
@@ -0,0 +1,87 @@
+// Copyright 2024 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// Encodes a string into bytes in the requested character encoding.
+///
+/// # Parameters
+///
+/// - `encoding`: The target encoding.
+/// - `src`: The string to encode.
+///
+/// # Returns
+///
+/// The encoded `Bytes`. For `UTF16` and `UTF16LE` this is `src.to_bytes()`
+/// unchanged.
+///
+/// # Examples
+///
+/// ```moonbit
+/// let src = "Hello, World!"
+/// let encoded_bytes = encode(UTF8, src)
+/// ```
+pub fn encode(encoding : Encoding, src : String) -> Bytes {
+  let write = match encoding {
+    // NOTE: special case: MoonBit String are already valid UTF16(LE) bytes
+    UTF16 | UTF16LE => return src.to_bytes()
+    UTF8 => write_utf8_char
+    UTF16BE => write_utf16be_char
+  }
+  let utf16 = src.to_bytes()
+  let out = @buffer.T::new(size_hint=utf16.length())
+  for decoded in decode_strict(UTF16LE, utf16) {
+    // SAFETY: Assume String are always valid UTF16LE
+    // NOTE(review): `unwrap` looks like it aborts on an unpaired surrogate
+    // in `src` - confirm that such strings cannot reach this point.
+    write(out, decoded.unwrap())
+  }
+  out.to_bytes()
+}
+
+///|
+/// Adapts a "write char into FixedArray at offset" function into a
+/// "write char into buffer" function. The 4-byte scratch array is created
+/// once per adapter and reused by every call of the returned function.
+fn write_char(
+  write : (FixedArray[Byte], Int, Char) -> Int
+) -> (@buffer.T, Char) -> Unit {
+  let scratch = FixedArray::make(4, b'\x00')
+  fn(buf : @buffer.T, value : Char) -> Unit {
+    // `write` returns how many bytes of the scratch array it filled.
+    let written = write(scratch, 0, value)
+    let encoded = scratch.iter().take(written).collect()
+    buf.write_bytes(@bytes.from_array(encoded))
+  }
+}
+
+///|
+/// Writes a char into the buffer as UTF8.
+pub let write_utf8_char : (@buffer.T, Char) -> Unit = write_char(
+  FixedArray::set_utf8_char,
+)
+
+///|
+/// Writes a char into the buffer as UTF16LE.
+/// Same function as `write_utf16le_char`.
+pub let write_utf16_char : (@buffer.T, Char) -> Unit = @buffer.write_char
+
+///|
+/// Writes a char into the buffer as UTF16LE.
+pub let write_utf16le_char : (@buffer.T, Char) -> Unit = @buffer.write_char
+
+///|
+/// Writes a char into the buffer as UTF16BE.
+pub let write_utf16be_char : (@buffer.T, Char) -> Unit = write_char(
+  FixedArray::set_utf16be_char,
+)
diff --git a/encoding/encoding.mbti b/encoding/encoding.mbti
new file mode 100644
index 0000000..9881812
--- /dev/null
+++ b/encoding/encoding.mbti
@@ -0,0 +1,44 @@
+package moonbitlang/x/encoding
+
+alias @moonbitlang/core/buffer as @buffer
+
+// Values
+fn decode_lossy(Encoding, Bytes) -> LossyChars
+
+fn decode_strict(Encoding, Bytes) -> StrictChars
+
+fn encode(Encoding, String) -> Bytes
+
+fn write_utf16_char(@buffer.T, Char) -> Unit
+
+fn write_utf16be_char(@buffer.T, Char) -> Unit
+
+fn write_utf16le_char(@buffer.T, Char) -> Unit
+
+fn write_utf8_char(@buffer.T, Char) -> Unit
+
+// Types and methods
+type DecodeError
+impl Show for DecodeError
+
+pub(all) enum Encoding {
+  UTF8
+  UTF16
+  UTF16LE
+  UTF16BE
+}
+
+type LossyChars
+impl LossyChars {
+  iter(Self) -> Iter[Char]
+}
+
+type StrictChars
+impl StrictChars {
+  iter(Self) -> Iter[Result[Char, DecodeError]]
+}
+
+// Type aliases
+
+// Traits
+
diff --git a/encoding/encoding_test.mbt b/encoding/encoding_test.mbt
new file mode 100644
index 0000000..2ff086c
--- /dev/null
+++ b/encoding/encoding_test.mbt
@@ -0,0 +1,57 @@
+// Copyright 2024 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// UTF8: two 3-byte sequences and one 4-byte sequence.
+test "encoding String to UTF8" {
+  let src = "你好👀"
+  let bytes = @encoding.encode(UTF8, src)
+  inspect!(
+    bytes,
+    content=
+      #|b"\xe4\xbd\xa0\xe5\xa5\xbd\xf0\x9f\x91\x80"
+    ,
+  )
+}
+
+// `UTF16` is expected to give the same little-endian bytes as `UTF16LE`.
+test "encoding String to UTF16 (alias of UTF16LE)" {
+  let src = "LISP programmers know the value of everything"
+  let bytes = @encoding.encode(UTF16, src)
+  inspect!(
+    bytes,
+    content=
+      #|b"\x4c\x00\x49\x00\x53\x00\x50\x00\x20\x00\x70\x00\x72\x00\x6f\x00\x67\x00\x72\x00\x61\x00\x6d\x00\x6d\x00\x65\x00\x72\x00\x73\x00\x20\x00\x6b\x00\x6e\x00\x6f\x00\x77\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x76\x00\x61\x00\x6c\x00\x75\x00\x65\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x65\x00\x76\x00\x65\x00\x72\x00\x79\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00"
+    ,
+  )
+}
+
+// ASCII text: every char becomes its byte followed by \x00.
+test "encoding String to UTF16LE" {
+  let src = "and the cost of nothing"
+  let bytes = @encoding.encode(UTF16LE, src)
+  inspect!(
+    bytes,
+    content=
+      #|b"\x61\x00\x6e\x00\x64\x00\x20\x00\x74\x00\x68\x00\x65\x00\x20\x00\x63\x00\x6f\x00\x73\x00\x74\x00\x20\x00\x6f\x00\x66\x00\x20\x00\x6e\x00\x6f\x00\x74\x00\x68\x00\x69\x00\x6e\x00\x67\x00"
+    ,
+  )
+}
+
+// NOTE(review): 'λ' is U+03BB, so UTF-16BE would be \x03\xbb, but the snapshot
+// records \x00\xbb - the high byte looks dropped by the BE writer; confirm
+// against `FixedArray::set_utf16be_char` before relying on this expectation.
+test "encoding String to UTF16BE" {
+  let src = "λf.(λx.f(x x))(λx.f(x x))"
+  let bytes = @encoding.encode(UTF16BE, src)
+  inspect!(
+    bytes,
+    content=
+      #|b"\x00\xbb\x00\x66\x00\x2e\x00\x28\x00\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29\x00\x28\x00\xbb\x00\x78\x00\x2e\x00\x66\x00\x28\x00\x78\x00\x20\x00\x78\x00\x29\x00\x29"
+    ,
+  )
+}
diff --git a/encoding/moon.pkg.json b/encoding/moon.pkg.json
new file mode 100644
index 0000000..c600e42
--- /dev/null
+++ b/encoding/moon.pkg.json
@@ -0,0 +1,4 @@
+{
+  "import": [ ],
+  "test-import": [ ]
+}
diff --git a/encoding/types.mbt b/encoding/types.mbt
new file mode 100644
index 0000000..6900c80
--- /dev/null
+++ b/encoding/types.mbt
@@ -0,0 +1,133 @@
+// Copyright 2024 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use
this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// A decoding step: takes the decoder state and produces the next result.
+typealias Cont = (Decoder) -> Decode
+
+///|
+/// Character encodings understood by this package.
+pub(all) enum Encoding {
+  UTF8
+  UTF16 // alias for UTF16LE
+  UTF16LE
+  UTF16BE
+}
+
+// Decoder
+
+///|
+/// Mutable decoding state shared by all decoding steps.
+priv struct Decoder {
+  // input bytes, read cursor and index of the last input byte
+  mut i : Bytes
+  mut i_pos : Int
+  mut i_max : Int
+  // scratch buffer, number of bytes in it and number of bytes wanted
+  t : Bytes
+  mut t_len : Int
+  mut t_need : Int
+  // step to run on the next `decode` call
+  mut k : Cont
+}
+
+///|
+/// Outcome of one decoding step: end of input, a malformed sequence (carried
+/// as an unchecked string built from the offending bytes) or a character.
+priv enum Decode {
+  End
+  Malformed(String)
+  Uchar(Char)
+}
+
+///|
+/// Wraps `length` bytes of `buf` starting at `offset` as a `Malformed` result.
+fn malformed(buf : Bytes, offset : Int, length : Int) -> Decode {
+  let raw = buf.to_unchecked_string(offset=offset, length=length)
+  Malformed(raw)
+}
+
+///|
+/// Builds a `Malformed` result for a high surrogate `hi` that was not followed
+/// by a valid low surrogate: the two bytes of `hi` (big-endian order when `be`
+/// is true, little-endian otherwise) followed by the `length` bytes of `buf`
+/// at `offset`.
+fn malformed_pair(
+  be : Bool,
+  hi : Int,
+  buf : Bytes,
+  offset : Int,
+  length : Int
+) -> Decode {
+  let tail = buf.to_unchecked_string(offset=offset, length=length).to_bytes()
+  let head = b"\x00\x00"
+  let hi_byte = (hi >> 8).to_byte()
+  let lo_byte = (hi & 0xFF).to_byte()
+  if be {
+    head[0] = hi_byte
+    head[1] = lo_byte
+  } else {
+    head[1] = hi_byte
+    head[0] = lo_byte
+  }
+  let joined = @buffer.new(size_hint=head.length() + tail.length())
+  joined.write_bytes(head)
+  joined.write_bytes(tail)
+  Malformed(
+    joined.to_bytes().to_unchecked_string(offset=0, length=joined.length()),
+  )
+}
+
+// Chars
+
+///|
+/// Decoder wrapper whose iterator replaces malformed input with U+FFFD.
+type LossyChars Decoder
+
+///|
+/// Iterates over the decoded characters; every malformed sequence is yielded
+/// as `U_REP`. Iteration advances the wrapped decoder.
+pub fn iter(self : LossyChars) -> Iter[Char] {
+  Iter::new(
+    fn(yield_) {
+      loop self._.decode() {
+        End => break IterEnd
+        Malformed(_) =>
+          if yield_(U_REP) == IterEnd {
+            break IterEnd
+          } else {
+            continue self._.decode()
+          }
+        Uchar(ch) =>
+          if yield_(ch) == IterEnd {
+            break IterEnd
+          } else {
+            continue self._.decode()
+          }
+      }
+    },
+  )
+}
+
+///|
+/// Decoder wrapper whose iterator reports malformed input as errors.
+type StrictChars Decoder
+
+///|
+/// Error carrying the malformed input as an unchecked string.
+type! DecodeError String
+
+///|
+/// Shows the error by writing the carried string as is.
+pub impl Show for DecodeError with output(self, logger) {
+  match self {
+    DecodeError(message) => logger.write_string(message)
+  }
+}
+
+///|
+/// Iterates over the decoding results: `Ok(char)` for each decoded character
+/// and `Err(DecodeError)` for each malformed sequence. Iteration continues
+/// after an error unless the consumer stops it, and advances the wrapped
+/// decoder.
+pub fn iter(self : StrictChars) -> Iter[Result[Char, DecodeError]] {
+  Iter::new(
+    fn(yield_) {
+      loop self._.decode() {
+        End => break IterEnd
+        Malformed(raw) =>
+          if yield_(Err(DecodeError(raw))) == IterEnd {
+            break IterEnd
+          } else {
+            continue self._.decode()
+          }
+        Uchar(ch) =>
+          if yield_(Ok(ch)) == IterEnd {
+            break IterEnd
+          } else {
+            continue self._.decode()
+          }
+      }
+    },
+  )
+}