From 99c6c7f8bdf02535f98d1df3f81113ecfc2ee43c Mon Sep 17 00:00:00 2001 From: Simon von Hackewitz Date: Thu, 11 Jul 2024 17:43:23 +0200 Subject: [PATCH 1/4] implement encodings base32 and base32hex base32 (running its test) causes a java exception, left there for debugging --- lib/encodings/base32.fz | 169 ++++++++++++++++++ lib/encodings/base32hex.fz | 45 +++++ tests/base32/Makefile | 25 +++ tests/base32/base32_test.fz | 161 +++++++++++++++++ tests/base32/base32_test.fz.expected_err | 0 tests/base32/base32_test.fz.expected_out | 24 +++ tests/base32hex/Makefile | 25 +++ tests/base32hex/base32hex_test.fz | 161 +++++++++++++++++ .../base32hex/base32hex_test.fz.expected_err | 0 .../base32hex/base32hex_test.fz.expected_out | 24 +++ 10 files changed, 634 insertions(+) create mode 100644 lib/encodings/base32.fz create mode 100644 lib/encodings/base32hex.fz create mode 100644 tests/base32/Makefile create mode 100644 tests/base32/base32_test.fz create mode 100644 tests/base32/base32_test.fz.expected_err create mode 100644 tests/base32/base32_test.fz.expected_out create mode 100644 tests/base32hex/Makefile create mode 100644 tests/base32hex/base32hex_test.fz create mode 100644 tests/base32hex/base32hex_test.fz.expected_err create mode 100644 tests/base32hex/base32hex_test.fz.expected_out diff --git a/lib/encodings/base32.fz b/lib/encodings/base32.fz new file mode 100644 index 0000000000..761a043c8a --- /dev/null +++ b/lib/encodings/base32.fz @@ -0,0 +1,169 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see . + + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion standard library feature base32 +# +# ----------------------------------------------------------------------- + +# Base32 encoding and decoding as defined in RFC 4648 +# +public base32 is + + # allows redefinition e.g. for base32hex + module get_alphabet => "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567".utf8.as_array + module encoding_name => "base32" + + # NYI: compiler does currently not optimize features without arguments to treat them like fields + alphabet := get_alphabet + + # decode a valid base32 characters to 5 bits + # visibility module to allow redefinition in base32hex + module quintet_bits(n u8) => + if n >= 65 && n <= 90 # case A-Z + n.as_u64 - 65 + + else #if n >= 50 && n <= 55 # case 2-7 + n.as_u64 - 24 + + # checks if a character is valid in the encoding + # visibility module to allow redefinition in base32hex + module is_valid(c u8) => + (c >= 65 && c <= 90) || (c >= 50 && c <= 55) + + # Encodes a given byte sequence in base32, output is padded to multiple of 8 + # returns a sequence of ascii values + public encode(data) => + + # extract 8 quintets from 40 bit (of an 64 bit integer) + enc40(n u64) => + block := array i32 8 (i->((n >> (35-i*5).as_u64) & 31).as_i32) + block.map (x->alphabet[x]) + + for + res Sequence u8 := [], next # the encoded input data + i := 0, i+1 + last_n u64 := 0, i %% 5 ? 0 : n + b in data + n := (last_n << 8) + b.as_u64 + next := if i%5=4 then res ++ enc40 n + else res + else + bit_len := data.length%5 * 8 # number of bits in last input block + + if bit_len = 0 + res + else + block_len := bit_len/5 + (bit_len%%5 ? 
0 : 1) # number of characters in last block + res ++ (enc40 (last_n<<((u64 40)-bit_len.as_u64))).slice 0 block_len ++ (array u8 (8-block_len) _->61) + + + # Encodes a given byte sequence in base32, output is padded to multiple of 8 + # returns a string + public encode_to_string(data array u8) => + String.type.from_bytes (encode data) + + + # decodes a base32 string, decoding is strict as required by RFC 4648 + # lowercase letters, non alphabet characters, line breaks, missing padding cause errors + # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding + # therefore in some cases multiple encodings can be decoded to the same data + # See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5 + public decode_str(data String) => + decode data.utf8.as_array + + + # decodes a sequence of ASCII characters, decoding is strict as required by RFC 4648 + # lowercase letters, non alphabet characters, line breaks, missing padding cause errors + # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding + # therefore in some cases multiple encodings can be decoded to the same data + # See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5 + public decode(data) => + + # determine size of padding, i.e. number of '=' (61 in ASCII) at the end + pad_size := + for + pad_len := 0, pad_len + (data[i] = 61 ? 1 : 0) + i in (data.indices.reverse) + _ in 1..6 # padding can not be longer than 6 + while data[i] = 61 + else + pad_len = 2 ? 1 # padding can not be 2 + : (pad_len = 5 ? 
4 # padding can not be 5 + : pad_len) + + dec_input(i) => + + if i >= data.length + error "length of input data is not multiple of 8, as required by RFC4648" + else + c := data[i] + + # base32 alphabet character + if (is_valid c) + outcome (quintet_bits c) + + # padding character = + else if c = 61 + if i < data.length - pad_size + # only complain about pad char if length is ok, otherwise wrong length is probably more helpful + if data.length%%8 + error """ + padding character '=' not allowed within the input data, only at the very end, \ + as required by RFC464 (padding length of 2 or 5 can never occur)""" + else + error "length of input data is not multiple of 8, as required by RFC4648" + else outcome (u64 0) # replace padding with zeros for decoding + + # line break + else if c = 10 || c = 13 + error """ + line breaks are not allowed within encoded data, as required by RFC464, found \ + {if c=10 then "LF" else "CR"} at position $i""" + + # other non alphabet character + else + inv_char := String.type.from_bytes (data.slice i (i+4 > data.length ? 
data.length : i+4)) + .substring_codepoint 0 1 + + error "invalid $encoding_name input at byte position $i, decoding to unicode character '$inv_char'" + + for + res list u8 := nil, res ++ bytes # contains the decoded data at the end + nxt := 0, nxt + 8 + last_err := false, is_err + qnt_last list (outcome u64) := nil, quintets + + while nxt < data.length && !last_err + do + quintets := (nxt :: +1).map(i->dec_input i).take 8 + is_err := (quintets ∃ el -> el.is_error) + + # convert quintets into a 40 bit number, break it up into five bytes + bits := if is_err then 0 + else quintets.map (.val) + .zip (((u64 35) :: -5).take 8) (<<) + .foldf (|) (u64 0) + bytes := [(u64 32), 24, 16, 8, 0].map (i->(bits >> i).low8bits) + + else + if last_err + (qnt_last.filter (e -> e.is_error)).first.err + else + dump_size := 5 - ((40 - (pad_size * 5)) / 8) # number of decoded bytes caused by zeroed padding + outcome (res.take res.count-dump_size).as_array # remove zero bytes caused by padding diff --git a/lib/encodings/base32hex.fz b/lib/encodings/base32hex.fz new file mode 100644 index 0000000000..00d1b3964c --- /dev/null +++ b/lib/encodings/base32hex.fz @@ -0,0 +1,45 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>. 
+ + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion standard library feature base32hex +# +# ----------------------------------------------------------------------- + +# Base32hex encoding and decoding as defined in RFC 4648 +# +public base32hex : base32 is + + # redefine the alphabet and the encoding name inherited from base32 + redef get_alphabet => "0123456789ABCDEFGHIJKLMNOPQRSTUV".utf8.as_array + redef encoding_name => "base32hex" + + # NYI: This causes a java exception + alphabet := get_alphabet + + # decode a valid base32hex characters to 5 bits + redef quintet_bits(n u8) => + if n >= 65 && n <= 86 # case A-V + n.as_u64 - 55 + + else #if n >= 48 && n <= 57 # case 0-9 + n.as_u64 - 48 + + # checks if a character is valid in the encoding + redef is_valid(c u8) => + (c >= 65 && c <= 86) || (c >= 48 && c <= 57) diff --git a/tests/base32/Makefile b/tests/base32/Makefile new file mode 100644 index 0000000000..e01fca7f55 --- /dev/null +++ b/tests/base32/Makefile @@ -0,0 +1,25 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>. 
+ + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion test Makefile +# +# ----------------------------------------------------------------------- + +override NAME = base32_test +include ../simple.mk diff --git a/tests/base32/base32_test.fz b/tests/base32/base32_test.fz new file mode 100644 index 0000000000..5b070d5dd0 --- /dev/null +++ b/tests/base32/base32_test.fz @@ -0,0 +1,161 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>. 
+ + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion test base32 +# +# ----------------------------------------------------------------------- + +base32_test is + + my_c : choice String (array u8) is + mk_choice(x my_c) => x + + # RFC 4648 test vectors + base32_test_vectors array (tuple my_c String) := + [(mk_choice $"", ""), + (mk_choice $"f", "MY======"), + (mk_choice "fo", "MZXQ===="), + (mk_choice "foo", "MZXW6==="), + (mk_choice "foob", "MZXW6YQ="), + (mk_choice "fooba", "MZXW6YTB"), + (mk_choice "foobar", "MZXW6YTBOI======")] + + # Additional test vectors + own_test_vectors array (tuple my_c String) := + [(mk_choice [(u8 0), 0, 0, 0], "AAAAAAA="), + (mk_choice [(u8 255), 255, 255], "77776==="), + (mk_choice (array u8 5 _->255), "77777777"), + (mk_choice "123890ABCXYZabcxyz_:;>~<%&\$§!", "GEZDGOBZGBAUEQ2YLFNGCYTDPB4XUXZ2HM7H4PBFEYSMFJZB"), + (mk_choice """ + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidun\ + t ut labore et dolore magna aliquyam erat, sed dia voluptua. At vero eos et accusam et justo d\ + uo dolores et ea rebum. 
+ + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.""", + """ + JRXXEZLNEBUXA43VNUQGI33MN5ZCA43JOQQGC3LFOQWCAY3PNZZWK5DFOR2XEIDTMFSGS4DTMNUW4ZZAMVWGS5DSFQQHGZ\ + LEEBSGSYLNEBXG63TVNV4SAZLJOJWW6ZBAORSW24DPOIQGS3TWNFSHK3TUEB2XIIDMMFRG64TFEBSXIIDEN5WG64TFEBWW\ + CZ3OMEQGC3DJOF2XSYLNEBSXEYLUFQQHGZLEEBSGSYJAOZXWY5LQOR2WCLRAIF2CA5TFOJXSAZLPOMQGK5BAMFRWG5LTMF\ + WSAZLUEBVHK43UN4QGI5LPEBSG63DPOJSXGIDFOQQGKYJAOJSWE5LNFYFAUU3UMV2CAY3MNF2GCIDLMFZWIIDHOVRGK4TH\ + OJSW4LBANZXSA43FMEQHIYLLNFWWC5DBEBZWC3TDOR2XGIDFON2CATDPOJSW2IDJOBZXK3JAMRXWY33SEBZWS5BAMFWWK5BO""")] + + + + # ENCODING + + say """ + Testing base32 encoding...""" + + enc_test(test_vectors array (tuple my_c String), name String) => + for results list (outcome String) := nil, results.concat (out:nil) + tup in test_vectors + do + (plain, code_expected) := tup + code_actual := match plain + str String => encodings.base32.encode_to_string str.utf8.as_array + arr array u8 => encodings.base32.encode_to_string arr + out := + if code_actual = code_expected + outcome "ok" + else + plain_str := match plain + str String => str + arr array u8 => $arr + error "encode '$plain_str' produced '$code_actual' but should have been '$code_expected'" + else + if results ∀ (.ok) + say "$name test vectors are encoded correctly" + else + say "Failed encoding $name test vectors:" + results.filter (.is_error) + .map (.err.as_string) + .map (" "+) + .for_each say + say "" + enc_test base32_test_vectors "RFC 4648" + enc_test own_test_vectors "Additional" + + + + # DECODING + + say """ + \n + Testing base32 decoding...""" + dec_test(test_vectors array (tuple my_c String), name String) => + for results list (outcome String) := nil, results.concat (out:nil) + tup in test_vectors + do + (plain_exp, code) := tup + out := + match encodings.base32.decode code.utf8.as_array + actual array u8 => + match plain_exp + str String => + if str = String.from_bytes actual + outcome "ok" + else + error "decoding $code produced 
'{String.from_bytes actual}' but should have been '$str'" + arr array u8 => + if arr.length=actual.length && ((arr.zip actual (a,b->a=b)) ∀ x->x) + outcome "ok" + else + error "decoding $code produced '$actual' but should have been '$arr'" + e error => error "decoding failed when it should not have: {e.msg}" + else + if results ∀ (.ok) + say "$name test vectors are decoded correctly" + else + say "Failed decoding $name test vectors:" + results.filter (.is_error) + .map (.err.as_string) + .map (" "+) + .for_each say + say "" + dec_test base32_test_vectors "RFC 4648" + dec_test own_test_vectors "Additional" + + + + # ERROR MESSAGES + + say """ + \n + Test error messages when decoding broken base32... + """ + + broken_enc := [""" + MZXW + 6===""", # line break + "MZXW6==", # padding to short + "MZX=====", # invalid padding length / encoding length with valid overall length + "11111111", # non alphabet character + "88888888", # non alphabet character + "MZ=XW6==", # pad char within encoding + "MZXW6====", # padding to long + "MZX W6===", # space + "MZXW;6===", # non alphabet ascii character + "MZX🌍6==="] # non alphabet multi byte unicode character + + for t in broken_enc do + yak "$t: " + say (match encodings.base32.decode_str t + arr array u8 => String.type.from_bytes arr + e error => e.as_string) + say "" diff --git a/tests/base32/base32_test.fz.expected_err b/tests/base32/base32_test.fz.expected_err new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/base32/base32_test.fz.expected_out b/tests/base32/base32_test.fz.expected_out new file mode 100644 index 0000000000..bd163dfd57 --- /dev/null +++ b/tests/base32/base32_test.fz.expected_out @@ -0,0 +1,24 @@ +Testing base32 encoding... +RFC 4648 test vectors are encoded correctly +Additional test vectors are encoded correctly + + +Testing base32 decoding... +RFC 4648 test vectors are decoded correctly +Additional test vectors are decoded correctly + + +Test error messages when decoding broken base32... 
+ +MZXW +6===: error: line breaks are not allowed within encoded data, as required by RFC464, found LF at position 4 +MZXW6==: error: length of input data is not multiple of 8, as required by RFC4648 +MZX=====: error: padding character '=' not allowed within the input data, only at the very end, as required by RFC464 (padding length of 2 or 5 can never occur) +11111111: error: invalid base32 input at byte position 0, decoding to unicode character '1' +88888888: error: invalid base32 input at byte position 0, decoding to unicode character '8' +MZ=XW6==: error: padding character '=' not allowed within the input data, only at the very end, as required by RFC464 (padding length of 2 or 5 can never occur) +MZXW6====: error: length of input data is not multiple of 8, as required by RFC4648 +MZX W6===: error: invalid base32 input at byte position 3, decoding to unicode character ' ' +MZXW;6===: error: invalid base32 input at byte position 4, decoding to unicode character ';' +MZX🌍6===: error: invalid base32 input at byte position 3, decoding to unicode character '🌍' + diff --git a/tests/base32hex/Makefile b/tests/base32hex/Makefile new file mode 100644 index 0000000000..5ba386cf13 --- /dev/null +++ b/tests/base32hex/Makefile @@ -0,0 +1,25 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see . 
+ + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion test Makefile +# +# ----------------------------------------------------------------------- + +override NAME = base32hex_test +include ../simple.mk diff --git a/tests/base32hex/base32hex_test.fz b/tests/base32hex/base32hex_test.fz new file mode 100644 index 0000000000..e0d869e0f8 --- /dev/null +++ b/tests/base32hex/base32hex_test.fz @@ -0,0 +1,161 @@ +# This file is part of the Fuzion language implementation. +# +# The Fuzion language implementation is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License as published +# by the Free Software Foundation, version 3 of the License. +# +# The Fuzion language implementation is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +# License for more details. +# +# You should have received a copy of the GNU General Public License along with The +# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>. 
+ + +# ----------------------------------------------------------------------- +# +# Tokiwa Software GmbH, Germany +# +# Source code of Fuzion test base32hex +# +# ----------------------------------------------------------------------- + +base32hex_test is + + my_c : choice String (array u8) is + mk_choice(x my_c) => x + + # RFC 4648 test vectors + base32hex_test_vectors array (tuple my_c String) := + [(mk_choice $"", ""), + (mk_choice "f", "CO======"), + (mk_choice "fo", "CPNG===="), + (mk_choice "foo", "CPNMU==="), + (mk_choice "foob", "CPNMUOG="), + (mk_choice "fooba", "CPNMUOJ1"), + (mk_choice "foobar", "CPNMUOJ1E8======")] + + # Additional test vectors + own_test_vectors array (tuple my_c String) := + [(mk_choice [(u8 0), 0, 0, 0], "0000000="), + (mk_choice [(u8 255), 255, 255], "VVVVU==="), + (mk_choice (array u8 5 _->255), "VVVVVVVV"), + (mk_choice "123890ABCXYZabcxyz_:;>~<%&\$§!", "64P36E1P610K4GQOB5D62OJ3F1SNKNPQ7CV7SF154OIC59P1"), + (mk_choice """ + Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidun\ + t ut labore et dolore magna aliquyam erat, sed dia voluptua. At vero eos et accusam et justo d\ + uo dolores et ea rebum. 
+ + Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.""", + """ + 9HNN4PBD41KN0SRLDKG68RRCDTP20SR9EGG62RB5EGM20ORFDPPMAT35EHQN483JC5I6IS3JCDKMSPP0CLM6IT3I5GG76P\ + B441I6IOBD41N6URJLDLSI0PB9E9MMUP10EHIMQS3FE8G6IRJMD5I7ARJK41QN883CC5H6USJ541IN8834DTM6USJ541MM\ + 2PREC4G62R39E5QNIOBD41IN4OBK5GG76PB441I6IO90EPNMOTBGEHQM2BH085Q20TJ5E9NI0PBFECG6AT10C5HM6TBJC5\ + MI0PBK41L7ASRKDSG68TBF41I6UR3FE9IN6835EGG6AO90E9IM4TBD5O50KKRKCLQ20ORCD5Q6283BC5PM8837ELH6ASJ7\ + E9IMSB10DPNI0SR5C4G78OBBD5MM2T3141PM2RJ3EHQN6835EDQ20J3FE9IMQ839E1PNAR90CHNMORRI41PMIT10C5MMAT1E""")] + + + + # ENCODING + + say """ + Testing base32hex encoding...""" + + enc_test(test_vectors array (tuple my_c String), name String) => + for results list (outcome String) := nil, results.concat (out:nil) + tup in test_vectors + do + (plain, code_expected) := tup + code_actual := match plain + str String => encodings.base32hex.encode_to_string str.utf8.as_array + arr array u8 => encodings.base32hex.encode_to_string arr + out := + if code_actual = code_expected + outcome "ok" + else + plain_str := match plain + str String => str + arr array u8 => $arr + error "encode '$plain_str' produced '$code_actual' but should have been '$code_expected'" + else + if results ∀ (.ok) + say "$name test vectors are encoded correctly" + else + say "Failed encoding $name test vectors:" + results.filter (.is_error) + .map (.err.as_string) + .map (" "+) + .for_each say + say "" + enc_test base32hex_test_vectors "RFC 4648" + enc_test own_test_vectors "Additional" + + + + # DECODING + + say """ + \n + Testing base32hex decoding...""" + dec_test(test_vectors array (tuple my_c String), name String) => + for results list (outcome String) := nil, results.concat (out:nil) + tup in test_vectors + do + (plain_exp, code) := tup + out := + match encodings.base32hex.decode code.utf8.as_array + actual array u8 => + match plain_exp + str String => + if str = String.from_bytes actual + outcome "ok" + else + error 
"decoding $code produced '{String.from_bytes actual}' but should have been '$str'" + arr array u8 => + if arr.length=actual.length && ((arr.zip actual (a,b->a=b)) ∀ x->x) + outcome "ok" + else + error "decoding $code produced '$actual' but should have been '$arr'" + e error => error "decoding failed when it should not have: {e.msg}" + else + if results ∀ (.ok) + say "$name test vectors are decoded correctly" + else + say "Failed decoding $name test vectors:" + results.filter (.is_error) + .map (.err.as_string) + .map (" "+) + .for_each say + say "" + dec_test base32hex_test_vectors "RFC 4648" + dec_test own_test_vectors "Additional" + + + + # ERROR MESSAGES + + say """ + \n + Test error messages when decoding broken base32hex... + """ + + broken_enc := [""" + CPN + MU===""", # line break + "CPNMU==", # padding to short + "WWWWWWWW", # non alphabet character + "ZZZZZZZZ", # non alphabet character + "CPN=====", # invalid padding length / encoding length with valid overall length + "CPN=MU==", # pad char within encoding + "CPNMU====", # padding to long + "CPNM U===", # space + "CP;NMU===", # non alphabet ascii character + "CPNM🌍U=="] # non alphabet multi byte unicode character + + for t in broken_enc do + yak "$t: " + say (match encodings.base32hex.decode_str t + arr array u8 => String.type.from_bytes arr + e error => e.as_string) + say "" diff --git a/tests/base32hex/base32hex_test.fz.expected_err b/tests/base32hex/base32hex_test.fz.expected_err new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/base32hex/base32hex_test.fz.expected_out b/tests/base32hex/base32hex_test.fz.expected_out new file mode 100644 index 0000000000..b065916a20 --- /dev/null +++ b/tests/base32hex/base32hex_test.fz.expected_out @@ -0,0 +1,24 @@ +Testing base32hex encoding... +RFC 4648 test vectors are encoded correctly +Additional test vectors are encoded correctly + + +Testing base32hex decoding... 
+RFC 4648 test vectors are decoded correctly +Additional test vectors are decoded correctly + + +Test error messages when decoding broken base32hex... + +CPN +MU===: error: line breaks are not allowed within encoded data, as required by RFC464, found LF at position 3 +CPNMU==: error: length of input data is not multiple of 8, as required by RFC4648 +WWWWWWWW: error: invalid base32hex input at byte position 0, decoding to unicode character 'W' +ZZZZZZZZ: error: invalid base32hex input at byte position 0, decoding to unicode character 'Z' +CPN=====: error: padding character '=' not allowed within the input data, only at the very end, as required by RFC464 (padding length of 2 or 5 can never occur) +CPN=MU==: error: padding character '=' not allowed within the input data, only at the very end, as required by RFC464 (padding length of 2 or 5 can never occur) +CPNMU====: error: length of input data is not multiple of 8, as required by RFC4648 +CPNM U===: error: invalid base32hex input at byte position 4, decoding to unicode character ' ' +CP;NMU===: error: invalid base32hex input at byte position 2, decoding to unicode character ';' +CPNM🌍U==: error: invalid base32hex input at byte position 4, decoding to unicode character '🌍' + From 9e2c6111ff1c166f641d4a55c9884df96ab16b47 Mon Sep 17 00:00:00 2001 From: Simon von Hackewitz Date: Thu, 11 Jul 2024 17:54:20 +0200 Subject: [PATCH 2/4] remove bug from base32hex --- lib/encodings/base32hex.fz | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/encodings/base32hex.fz b/lib/encodings/base32hex.fz index 00d1b3964c..8ca96f8099 100644 --- a/lib/encodings/base32hex.fz +++ b/lib/encodings/base32hex.fz @@ -29,9 +29,6 @@ public base32hex : base32 is redef get_alphabet => "0123456789ABCDEFGHIJKLMNOPQRSTUV".utf8.as_array redef encoding_name => "base32hex" - # NYI: This causes a java exception - alphabet := get_alphabet - # decode a valid base32hex characters to 5 bits redef quintet_bits(n u8) => if n >= 65 && n <= 86 # case A-V 
From 3bd67a8b87adaf5d9b7dfa7e8b3955e20d9ef2cd Mon Sep 17 00:00:00 2001 From: Simon von Hackewitz Date: Fri, 12 Jul 2024 12:29:52 +0200 Subject: [PATCH 3/4] apply suggestions from code review --- lib/encodings/base32.fz | 16 ++++++++-------- lib/encodings/base32hex.fz | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/encodings/base32.fz b/lib/encodings/base32.fz index 761a043c8a..29f5db4edf 100644 --- a/lib/encodings/base32.fz +++ b/lib/encodings/base32.fz @@ -22,6 +22,7 @@ # ----------------------------------------------------------------------- # Base32 encoding and decoding as defined in RFC 4648 +# https://datatracker.ietf.org/doc/html/rfc4648#section-6 # public base32 is @@ -32,8 +33,7 @@ public base32 is # NYI: compiler does currently not optimize features without arguments to treat them like fields alphabet := get_alphabet - # decode a valid base32 characters to 5 bits - # visibility module to allow redefinition in base32hex + # decode a valid base32 character to 5 bits module quintet_bits(n u8) => if n >= 65 && n <= 90 # case A-Z n.as_u64 - 65 @@ -42,21 +42,21 @@ public base32 is n.as_u64 - 24 # checks if a character is valid in the encoding - # visibility module to allow redefinition in base32hex module is_valid(c u8) => (c >= 65 && c <= 90) || (c >= 50 && c <= 55) # Encodes a given byte sequence in base32, output is padded to multiple of 8 # returns a sequence of ascii values - public encode(data) => + public encode(data array u8) => # extract 8 quintets from 40 bit (of an 64 bit integer) enc40(n u64) => - block := array i32 8 (i->((n >> (35-i*5).as_u64) & 31).as_i32) - block.map (x->alphabet[x]) + array u8 8 i-> + idx := ((n >> (35-i*5).as_u64) & 31).as_i32 + alphabet[idx] for - res Sequence u8 := [], next # the encoded input data + res Sequence u8 := [], next # the encoded data i := 0, i+1 last_n u64 := 0, i %% 5 ? 
0 : n b in data @@ -93,7 +93,7 @@ public base32 is # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding # therefore in some cases multiple encodings can be decoded to the same data # See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5 - public decode(data) => + public decode(data array u8) => # determine size of padding, i.e. number of '=' (61 in ASCII) at the end pad_size := diff --git a/lib/encodings/base32hex.fz b/lib/encodings/base32hex.fz index 8ca96f8099..1be70403f5 100644 --- a/lib/encodings/base32hex.fz +++ b/lib/encodings/base32hex.fz @@ -22,6 +22,7 @@ # ----------------------------------------------------------------------- # Base32hex encoding and decoding as defined in RFC 4648 +# https://datatracker.ietf.org/doc/html/rfc4648#section-7 # public base32hex : base32 is @@ -29,7 +30,7 @@ public base32hex : base32 is redef get_alphabet => "0123456789ABCDEFGHIJKLMNOPQRSTUV".utf8.as_array redef encoding_name => "base32hex" - # decode a valid base32hex characters to 5 bits + # decode a valid base32hex character to 5 bits redef quintet_bits(n u8) => if n >= 65 && n <= 86 # case A-V n.as_u64 - 55 From a3d6fd1e7187dc49652486a82beca8fa26a10901 Mon Sep 17 00:00:00 2001 From: Simon von Hackewitz Date: Fri, 12 Jul 2024 15:05:26 +0200 Subject: [PATCH 4/4] improve comparisons --- lib/encodings/base32.fz | 6 +++--- lib/encodings/base32hex.fz | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/encodings/base32.fz b/lib/encodings/base32.fz index 29f5db4edf..0e614cf561 100644 --- a/lib/encodings/base32.fz +++ b/lib/encodings/base32.fz @@ -35,15 +35,15 @@ public base32 is # decode a valid base32 character to 5 bits module quintet_bits(n u8) => - if n >= 65 && n <= 90 # case A-Z + if 65 <= n <= 90 # case A-Z n.as_u64 - 65 - else #if n >= 50 && n <= 55 # case 2-7 + else #if 50 <= n <= 55 # case 2-7 n.as_u64 - 24 # checks if a character is valid in the 
encoding module is_valid(c u8) => - (c >= 65 && c <= 90) || (c >= 50 && c <= 55) + (65 <= c <= 90) || (50 <= c <= 55) # Encodes a given byte sequence in base32, output is padded to multiple of 8 # returns a sequence of ascii values diff --git a/lib/encodings/base32hex.fz b/lib/encodings/base32hex.fz index 1be70403f5..2289d9b846 100644 --- a/lib/encodings/base32hex.fz +++ b/lib/encodings/base32hex.fz @@ -32,12 +32,12 @@ public base32hex : base32 is # decode a valid base32hex character to 5 bits redef quintet_bits(n u8) => - if n >= 65 && n <= 86 # case A-V + if 65 <= n <= 86 # case A-V n.as_u64 - 55 - else #if n >= 48 && n <= 57 # case 0-9 + else #if 48 <= n <= 57 # case 0-9 n.as_u64 - 48 # checks if a character is valid in the encoding redef is_valid(c u8) => - (c >= 65 && c <= 86) || (c >= 48 && c <= 57) + (65 <= c <= 86) || (48 <= c <= 57)