Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib: add base32 and base32hex encodings #3368

Merged
merged 4 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions lib/encodings/base32.fz
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.


# -----------------------------------------------------------------------
#
# Tokiwa Software GmbH, Germany
#
# Source code of Fuzion standard library feature base32
#
# -----------------------------------------------------------------------

# Base32 encoding and decoding as defined in RFC 4648
# https://datatracker.ietf.org/doc/html/rfc4648#section-6
#
public base32 is

  # The 32 output symbols as ASCII bytes, indexed by quintet value (0..31).
  #
  # allows redefinition e.g. for base32hex
  module get_alphabet => "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567".utf8.as_array

  # Name of this encoding, used in the "invalid ... input" decoding error.
  module encoding_name => "base32"

  # NYI: compiler does currently not optimize features without arguments to treat them like fields
  # The alphabet is therefore stored in a field instead of calling
  # get_alphabet for every encoded character.
  alphabet := get_alphabet

  # decode a valid base32 character to 5 bits
  #
  # Only meaningful for characters accepted by is_valid: every byte that is
  # not in 'A'..'Z' is handled as if it were one of '2'..'7', without a check.
  module quintet_bits(n u8) =>
    if 65 <= n <= 90        # case A-Z, yields 0..25
      n.as_u64 - 65
    else #if 50 <= n <= 55  # case 2-7, yields 26..31
      n.as_u64 - 24

  # checks if a character is valid in the encoding,
  # i.e. is one of 'A'..'Z' (65..90) or '2'..'7' (50..55)
  module is_valid(c u8) =>
    (65 <= c <= 90) || (50 <= c <= 55)

  # Encodes a given byte sequence in base32, output is padded to multiple of 8
  # returns a sequence of ascii values
  #
  # The input is consumed in groups of 5 bytes (40 bits), each group yields 8
  # characters. An incomplete last group is filled up with zero bits and the
  # unused characters are replaced by '=' (61 in ASCII).
  public encode(data array u8) =>

    # extract 8 quintets from 40 bit (of an 64 bit integer)
    # and map each quintet to its character in the alphabet
    enc40(n u64) =>
      array u8 8 i->
        idx := ((n >> (35-i*5).as_u64) & 31).as_i32
        alphabet[idx]

    for
      res Sequence u8 := [], next     # the encoded data
      i := 0, i+1
      # start with an empty accumulator at the beginning of every group of
      # 5 bytes, otherwise carry over the bytes collected so far
      last_n u64 := 0, i %% 5 ? 0 : n
      b in data
      n := (last_n << 8) + b.as_u64
      # a group is complete with its fifth byte, only then output is produced
      next := if i%5=4 then res ++ enc40 n
              else res
    else
      bit_len := data.length%5 * 8   # number of bits in last input block

      if bit_len = 0
        res
      else
        block_len := bit_len/5 + (bit_len%%5 ? 0 : 1)  # number of characters in last block
        # shift the remaining bits to the top of the 40 bit group, keep the
        # characters that carry data and fill up with '=' to 8 characters
        res ++ (enc40 (last_n<<((u64 40)-bit_len.as_u64))).slice 0 block_len ++ (array u8 (8-block_len) _->61)


  # Encodes a given byte sequence in base32, output is padded to multiple of 8
  # returns a string
  public encode_to_string(data array u8) =>
    String.type.from_bytes (encode data)


  # decodes a base32 string, decoding is strict as required by RFC 4648
  # lowercase letters, non alphabet characters, line breaks, missing padding cause errors
  # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding
  #      therefore in some cases multiple encodings can be decoded to the same data
  #      See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5
  public decode_str(data String) =>
    decode data.utf8.as_array


  # decodes a sequence of ASCII characters, decoding is strict as required by RFC 4648
  # lowercase letters, non alphabet characters, line breaks, missing padding cause errors
  #
  # The result is an outcome that holds either the decoded bytes as an array
  # or the error found in the first block of 8 characters that failed to decode.
  #
  # NYI: decoding does currently not reject encodings where the padding bits have not been set to zero prior to encoding
  #      therefore in some cases multiple encodings can be decoded to the same data
  #      See RFC4648 section 3.5: https://datatracker.ietf.org/doc/html/rfc4648#section-3.5
  public decode(data array u8) =>

    # determine size of padding, i.e. number of '=' (61 in ASCII) at the end
    #
    # A padding of 2 or 5 characters can not be produced by a valid encoding.
    # These lengths are reduced by one, such that the first of the padding
    # characters is later reported as a '=' within the input data.
    pad_size :=
      for
        pad_len := 0, pad_len + (data[i] = 61 ? 1 : 0)
        i in (data.indices.reverse)
        _ in 1..6 # padding can not be longer than 6
      while data[i] = 61
      else
        pad_len = 2 ? 1 : (pad_len = 5 ? 4 : pad_len)

    # decode the character at index i to its 5 bit value, or produce an
    # error that describes why the input is invalid at this position
    dec_input(i) =>

      # reading beyond the end happens if the last block has less than 8 characters
      if i >= data.length
        error "length of input data is not multiple of 8, as required by RFC4648"
      else
        c := data[i]

        # base32 alphabet character
        if (is_valid c)
          outcome (quintet_bits c)

        # padding character =
        else if c = 61
          if i < data.length - pad_size
            # only complain about pad char if length is ok, otherwise wrong length is probably more helpful
            if data.length%%8
              error """
                padding character '=' not allowed within the input data, only at the very end, \
                as required by RFC4648 (padding length of 2 or 5 can never occur)"""
            else
              error "length of input data is not multiple of 8, as required by RFC4648"
          else outcome (u64 0) # replace padding with zeros for decoding

        # line break
        else if c = 10 || c = 13
          error """
            line breaks are not allowed within encoded data, as required by RFC4648, found \
            {if c=10 then "LF" else "CR"} at position $i"""

        # other non alphabet character
        else
          # take up to 4 bytes starting at i and show the first codepoint they decode to
          inv_char := String.type.from_bytes (data.slice i (i+4 > data.length ? data.length : i+4))
                        .substring_codepoint 0 1

          error "invalid $encoding_name input at byte position $i, decoding to unicode character '$inv_char'"

    for
      res list u8 := nil, res ++ bytes  # contains the decoded data at the end
      nxt := 0, nxt + 8                 # start index of the current block of 8 characters
      last_err := false, is_err         # did the previous block contain an error?
      qnt_last list (outcome u64) := nil, quintets  # decoded characters of the previous block

    while nxt < data.length && !last_err
    do
      quintets := (nxt :: +1).map(i->dec_input i).take 8
      is_err := (quintets ∃ el -> el.is_error)

      # convert quintets in 40 bit number, break up in five bytes
      bits := if is_err then 0
              else quintets.map (.val)
                           .zip (((u64 35) :: -5).take 8) (<<)
                           .foldf (|) (u64 0)
      bytes := [(u64 32), 24, 16, 8, 0].map (i->(bits >> i).low8bits)

    else
      if last_err
        # report the first error of the block that stopped the loop
        (qnt_last.filter (e -> e.is_error)).first.err
      else
        dump_size := 5 - ((40 - (pad_size * 5)) / 8)  # number of decoded bytes caused by zeroed padding
        outcome (res.take res.count-dump_size).as_array  # remove zero bytes caused by padding
43 changes: 43 additions & 0 deletions lib/encodings/base32hex.fz
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.


# -----------------------------------------------------------------------
#
# Tokiwa Software GmbH, Germany
#
# Source code of Fuzion standard library feature base32hex
#
# -----------------------------------------------------------------------

# Base32hex encoding and decoding as defined in RFC 4648
# https://datatracker.ietf.org/doc/html/rfc4648#section-7
#
public base32hex : base32 is

  # the "extended hex" alphabet: digits first, then the letters A-V,
  # indexed by quintet value (0..31)
  redef get_alphabet => "0123456789ABCDEFGHIJKLMNOPQRSTUV".utf8.as_array

  # name of this encoding
  redef encoding_name => "base32hex"

  # map a valid base32hex character to the 5 bits it stands for
  #
  # every byte outside of 'A'..'V' is handled as a digit '0'..'9'
  # without any further check
  redef quintet_bits(n u8) =>
    code := n.as_u64
    if 65 <= n <= 86 then code - 55   # 'A'..'V' give 10..31
    else                  code - 48   # '0'..'9' give 0..9

  # is c a character of the base32hex alphabet,
  # i.e. one of 'A'..'V' (65..86) or '0'..'9' (48..57)?
  redef is_valid(c u8) =>
    is_letter_a_to_v := 65 <= c <= 86
    is_decimal_digit := 48 <= c <= 57
    is_letter_a_to_v || is_decimal_digit
25 changes: 25 additions & 0 deletions tests/base32/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# This file is part of the Fuzion language implementation.
#
# The Fuzion language implementation is free software: you can redistribute it
# and/or modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 3 of the License.
#
# The Fuzion language implementation is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License along with The
# Fuzion language implementation. If not, see <https://www.gnu.org/licenses/>.


# -----------------------------------------------------------------------
#
# Tokiwa Software GmbH, Germany
#
# Source code of Fuzion test Makefile
#
# -----------------------------------------------------------------------

# Name of this test. `override` makes this assignment take precedence over a
# NAME passed on the make command line.
override NAME = base32_test
# Pull in the rules from simple.mk in the parent directory.
# NOTE(review): presumably simple.mk uses NAME to pick the test to build and
# run; confirm against that file.
include ../simple.mk
Loading