tokiwa-software · maxteufel · Jul 15, 2024 · Jul 11, 2024 · Jul 11, 2024 · Jul 12, 2024
diff --git a/lib/encodings/base32.fz b/lib/encodings/base32.fz
@@ -0,0 +1,169 @@
+# This file is part of the Fuzion language implementation.
+#
+# The Fuzion language implementation is free software: you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, version 3 of the License.
+#
+# The Fuzion language implementation is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+# License for more details.
+#
+# You should have received a copy of the GNU General Public License along with The
+# Fuzion language implementation.  If not, see <https://www.gnu.org/licenses/>.
+
+
+# -----------------------------------------------------------------------
+#
+#  Tokiwa Software GmbH, Germany
+#
+#  Source code of Fuzion standard library feature base32
+#
+# -----------------------------------------------------------------------
+
+# Base32 encoding and decoding as defined in RFC 4648
+# https://datatracker.ietf.org/doc/html/rfc4648#section-6
+#
+public base32 is
+
+  # the 32 characters of the encoding alphabet as bytes, the index of a
+  # character in this array is the 5-bit value that encode maps to it
+  #
+  # allows redefinition e.g. for base32hex
+  module get_alphabet => "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567".utf8.as_array
+  # name of the encoding, inserted into the error message decode creates for
+  # an invalid input character, redefined e.g. by base32hex
+  module encoding_name => "base32"
+
+  # NYI: compiler does currently not optimize features without arguments to treat them like fields
+  #
+  # the alphabet held in a field, this is what encode indexes into
+  # NOTE(review): presumably this avoids re-evaluating get_alphabet on every access -- confirm
+  alphabet := get_alphabet
+
+  # maps a valid base32 character (given as its ASCII code) to the 5 bits it
+  # represents: 'A'..'Z' give 0..25, '2'..'7' give 26..31
+  #
+  # the argument is not checked here, any value outside of 'A'..'Z' is treated
+  # like a digit, so is_valid should be tested first
+  module quintet_bits(n u8) =>
+    code := n.as_u64
+    if 65 <= n <= 90 then code - 65   # case A-Z
+    else                  code - 24   # case 2-7
+
+  # checks if a byte is a character of the base32 alphabet, i.e., the ASCII
+  # code of one of 'A'..'Z' (65..90) or '2'..'7' (50..55)
+  module is_valid(c u8) =>
+    upper_case_letter := 65 <= c <= 90
+    digit_2_to_7      := 50 <= c <= 55
+    upper_case_letter || digit_2_to_7
+
+  # Encodes a given byte sequence in base32, output is padded to multiple of 8
+  # returns a sequence of ascii values
+  #
+  # The input is processed in blocks of 5 bytes (40 bits), each complete block
+  # yields 8 output characters. The bits of a final incomplete block are filled
+  # up with zero bits and the unused output positions of that block are set to
+  # the padding character '=' (61 in ASCII).
+  #
+  public encode(data array u8) =>
+
+    # extract 8 quintets from the lower 40 bits of a 64 bit integer, most
+    # significant quintet first, and map each of them to its alphabet character
+    enc40(n u64) =>
+      array u8 8 i->
+        idx := ((n >> (35-i*5).as_u64) & 31).as_i32
+        alphabet[idx]
+
+    for
+      res Sequence u8 := [], next        # the encoded data
+      i := 0, i+1                        # index of the current input byte
+      last_n u64 := 0, i %% 5 ?  0 : n   # bits collected so far, 0 again whenever i is a multiple of 5
+      b in data
+      n := (last_n << 8) + b.as_u64      # collected bits with the current byte appended
+      next := if i%5=4 then res ++ enc40 n   # fifth byte of a block, add its 8 characters
+              else          res
+    else
+      bit_len := data.length%5 * 8       # number of bits in last input block
+
+      if bit_len = 0
+        res
+      else
+        block_len := bit_len/5 + (bit_len%%5 ? 0 : 1)  # number of characters in last block
+        # move the remaining bits to the top of the 40 bit block, keep the first
+        # block_len characters of its encoding and fill up to 8 with '=' (61)
+        res ++ (enc40 (last_n<<((u64 40)-bit_len.as_u64))).slice 0 block_len ++ (array u8 (8-block_len) _->61)
+
+
+  # Encodes a given byte sequence in base32 and returns the result as a string
+  # instead of a sequence of ascii values, output is padded to multiple of 8
+  public encode_to_string(data array u8) =>
+    encoded_bytes := encode data
+    String.type.from_bytes encoded_bytes
+
+
+  # decodes a base32 string by passing its utf8 bytes to decode,
+  # decoding is strict as required by RFC 4648
+  # lowercase letters, non alphabet characters, line breaks, missing padding cause errors
+  # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding
+  #      therefore in some cases multiple encodings can be decoded to the same data
+  #      See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5
+  public decode_str(data String) =>
+    input_bytes := data.utf8.as_array
+    decode input_bytes
+
+
+  # decodes a sequence of ASCII characters, decoding is strict as required by RFC 4648
+  # lowercase letters, non alphabet characters, line breaks, missing padding cause errors
+  # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding
+  #      therefore in some cases multiple encodings can be decoded to the same data
+  #      See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5
+  #
+  # The input is processed in blocks of 8 characters, each block is turned into
+  # a 40 bit number that is split up into 5 bytes. Padding characters are decoded
+  # as zero bits and the bytes they produce are removed from the end of the result.
+  #
+  # returns the decoded bytes, or the first error found in the first block that
+  # could not be decoded
+  #
+  public decode(data array u8) =>
+
+    # determine size of padding, i.e. number of '=' (61 in ASCII) at the end
+    #
+    # the impossible padding lengths 2 and 5 are reduced by one, this puts the
+    # first '=' in front of the accepted padding such that dec_input reports it
+    pad_size :=
+      for
+        pad_len := 0, pad_len + (data[i] = 61 ? 1 : 0)
+        i in (data.indices.reverse)
+        _ in 1..6                             # padding can not be longer than 6
+      while data[i] = 61
+      else
+        pad_len = 2 ? 1                       # padding can not be 2
+                    : (pad_len = 5 ? 4        # padding can not be 5
+                                   : pad_len)
+
+    # decode the input character at index i to its 5 bits, or create an error
+    # describing why there is no valid character at this position
+    dec_input(i) =>
+
+      if i >= data.length
+        error "length of input data is not multiple of 8, as required by RFC4648"
+      else
+        c := data[i]
+
+        # base32 alphabet character
+        if (is_valid c)
+          outcome (quintet_bits c)
+
+        # padding character =
+        else if c = 61
+          if i < data.length - pad_size
+            # only complain about pad char if length is ok, otherwise wrong length is probably more helpful
+            if data.length%%8
+              error """
+                    padding character '=' not allowed within the input data, only at the very end, \
+                    as required by RFC4648 (padding length of 2 or 5 can never occur)"""
+            else
+              error "length of input data is not multiple of 8, as required by RFC4648"
+          else outcome (u64 0)  # replace padding with zeros for decoding
+
+        # line break
+        else if c = 10 || c = 13
+          error """
+                line breaks are not allowed within encoded data, as required by RFC4648, found \
+                {if c=10 then "LF" else "CR"} at position $i"""
+
+        # other non alphabet character
+        else
+          # take up to 4 bytes starting at i and show their first codepoint in the message
+          inv_char := String.type.from_bytes (data.slice i (i+4 > data.length ? data.length : i+4))
+                            .substring_codepoint 0 1
+
+          error "invalid $encoding_name input at byte position $i, decoding to unicode character '$inv_char'"
+
+    for
+      res list u8 := nil, res ++ bytes  # contains the decoded data at the end
+      nxt := 0, nxt + 8                 # index of the first character of the current block
+      last_err := false, is_err         # true if the previous block contained an error
+      qnt_last list (outcome u64) := nil, quintets   # decoded quintets of the previous block
+
+    while nxt < data.length && !last_err
+    do
+      quintets := (nxt :: +1).map(i->dec_input i).take 8
+      is_err := (quintets ∃ el -> el.is_error)
+
+      # convert quintets in 40 bit number, break up in five bytes
+      bits := if is_err then 0
+              else quintets.map (.val)
+                           .zip (((u64 35) :: -5).take 8) (<<)
+                           .foldf (|) (u64 0)
+      bytes := [(u64 32), 24, 16, 8, 0].map (i->(bits >> i).low8bits)
+
+    else
+      if last_err
+        # report the first error of the block that stopped the loop
+        (qnt_last.filter (e -> e.is_error)).first.err
+      else
+        dump_size := 5 - ((40 - (pad_size * 5)) / 8)    # number of decoded bytes caused by zeroed padding
+        outcome (res.take res.count-dump_size).as_array # remove zero bytes caused by padding
diff --git a/lib/encodings/base32hex.fz b/lib/encodings/base32hex.fz
@@ -0,0 +1,43 @@
+# This file is part of the Fuzion language implementation.
+#
+# The Fuzion language implementation is free software: you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, version 3 of the License.
+#
+# The Fuzion language implementation is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+# License for more details.
+#
+# You should have received a copy of the GNU General Public License along with The
+# Fuzion language implementation.  If not, see <https://www.gnu.org/licenses/>.
+
+
+# -----------------------------------------------------------------------
+#
+#  Tokiwa Software GmbH, Germany
+#
+#  Source code of Fuzion standard library feature base32hex
+#
+# -----------------------------------------------------------------------
+
+# Base32hex encoding and decoding as defined in RFC 4648
+# https://datatracker.ietf.org/doc/html/rfc4648#section-7
+#
+public base32hex : base32 is
+
+  # the base32hex alphabet and encoding name, replacing those of base32
+  redef get_alphabet => "0123456789ABCDEFGHIJKLMNOPQRSTUV".utf8.as_array
+  redef encoding_name => "base32hex"
+
+  # maps a valid base32hex character (given as its ASCII code) to the 5 bits it
+  # represents: '0'..'9' give 0..9, 'A'..'V' give 10..31
+  #
+  # the argument is not checked here, any value outside of 'A'..'V' is treated
+  # like a digit
+  redef quintet_bits(n u8) =>
+    code := n.as_u64
+    if 65 <= n <= 86 then code - 55   # case A-V
+    else                  code - 48   # case 0-9
+
+  # checks if a byte is a character of the base32hex alphabet, i.e., the ASCII
+  # code of one of 'A'..'V' (65..86) or '0'..'9' (48..57)
+  redef is_valid(c u8) =>
+    letter_a_to_v := 65 <= c <= 86
+    decimal_digit := 48 <= c <= 57
+    letter_a_to_v || decimal_digit
diff --git a/tests/base32/Makefile b/tests/base32/Makefile
@@ -0,0 +1,25 @@
+# This file is part of the Fuzion language implementation.
+#
+# The Fuzion language implementation is free software: you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as published
+# by the Free Software Foundation, version 3 of the License.
+#
+# The Fuzion language implementation is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+# License for more details.
+#
+# You should have received a copy of the GNU General Public License along with The
+# Fuzion language implementation.  If not, see <https://www.gnu.org/licenses/>.
+
+
+# -----------------------------------------------------------------------
+#
+#  Tokiwa Software GmbH, Germany
+#
+#  Source code of Fuzion test Makefile
+#
+# -----------------------------------------------------------------------
+
+# NOTE(review): NAME presumably tells the shared rules included from
+# ../simple.mk which test to build and run -- confirm against simple.mk
+override NAME = base32_test
+include ../simple.mk