diff --git a/MANIFEST.txt b/MANIFEST.txt new file mode 100644 index 0000000..e999066 --- /dev/null +++ b/MANIFEST.txt @@ -0,0 +1 @@ +python/include *.txt diff --git a/README.md b/README.md index b146e74..c8c90d6 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,15 @@ USAGE ### C ### - #include +To build as a shared library in $PREFIX. + + mkdir -p $PREFIX/include/ $PREFIX/lib/ + gcc -shared -Wl,-soname,libbase92 -o $PREFIX/lib/libbase92.so -fPIC -Ic/src c/src/base92.c + cp -a c/src/base92.h $PREFIX/include/ + +Use: + + #include ... strcmp(base92encode("hello world", 11), "Fc_$aOTdKnsM*k") == 0; base92decode("Fc_$aOTdKnsM*k", &length); @@ -29,6 +37,12 @@ USAGE ### Python ### +To build: + + cd python + python setup.py build_ext --inplace + python setup.py install + Fire up your favorite python: >>> import base92 diff --git a/c/src/base92.c b/c/src/base92.c index 3906124..d5c50f7 100644 --- a/c/src/base92.c +++ b/c/src/base92.c @@ -79,9 +79,9 @@ unsigned char* base92encode(unsigned char* str, int len) { int tmp; unsigned char c; unsigned char *res; - + if (len == 0) { - return "~"; + return (unsigned char*)"~"; } // precalculate how much space we need to malloc size = (len * 8) % 13; @@ -159,17 +159,13 @@ unsigned char* base92decode(unsigned char* str, int* len) { unsigned char* res; unsigned long workspace; unsigned short wssize; - size = strlen(str); + size = strlen((char*)str); // handle small cases first - if (strcmp(str, "~") == 0 || size == 0) { + if (strcmp((char*)str, "~") == 0 || size == 0) { res = (unsigned char*)malloc(sizeof(char) * 1); res[0] = 0; return res; } - // this case does not fit the specs - if (size < 2) { - res = NULL; - } // calculate size *len = ((size/2 * 13) + (size%2 * 6)) / 8; res = (unsigned char *)malloc(sizeof(char) * (*len)); diff --git a/python/MANIFEST.txt b/python/MANIFEST.txt deleted file mode 100644 index ab30e9a..0000000 --- a/python/MANIFEST.txt +++ /dev/null @@ -1 +0,0 @@ -include *.txt diff --git a/python/README.txt b/python/README.txt index 323abcb..c326a2e 100644 --- a/python/README.txt +++ b/python/README.txt @@ -6,6 +6,17 @@ A little library for encoding byte-strings into strings easily typeable on a standard US 101-key keyboard, with strictly better information density than base64 or base85 encodings. +It is python3 compatible and has a C backend. + +-------- +BUILDING +-------- + +Compile the C extension and install. + + python setup.py build_ext --inplace # creates base92/base92_extension.so + python setup.py install + ----- USAGE ----- @@ -17,6 +28,26 @@ Fire up your favorite python:: 'hello world' >>> base92.encode('\x61\xf2\x05\x99\x42') 'DJ8gER!' + + >>> import base92.test + >>> base92.test.run() + testing and cross validating encoders and decoders from modules [, ] + selected regression tests passed + generating 10000 random byte strings + 10000 randomized X == decode(encode(X)) tests passed + performance of module on the 10000 random byte strings + - encoding: 0.00835490226746s + - decoding: 0.00846481323242s + performance of module on the 10000 random byte strings + - encoding: 1.75639009476s + - decoding: 1.28861784935s + +If the C backend is not available, the python backend will be used: + + rm -f base92/base92_extension.so + + >>> import base92 + Falling back to base92 python backend due to: No module named base92_extension We use doctests, so running the tests is as easy as executing the base92.py library file with your python. @@ -25,10 +56,7 @@ base92.py library file with your python. MISC ---- -This library is pure python: there may be a cbase92 forthcoming, -backed by a C library. - -This library has not been tested with python3. +This library has a C extension as a backend and falls back to python if the backend isn't available. There is more information available at diff --git a/python/base92/__init__.py b/python/base92/__init__.py index ad05b28..0dcf099 100644 --- a/python/base92/__init__.py +++ b/python/base92/__init__.py @@ -1,5 +1,5 @@ -''' -Import routines from base92.base92 for manipulating base92 encoded strings. +""" +Import routines from base92.cbase92 or base92.base92 for manipulating base92 encoded strings. Example: @@ -9,6 +9,18 @@ 'Fc_$aOTdKnsM*k' >>> decode(x) 'hello world' -''' +""" -from base92 import encode, decode, b92encode, b92decode, __version__ +from . import base92 + +try: + from . import cbase92 + preferred_base92 = cbase92 +except (ImportError, OSError) as e: + print('Falling back to base92 python backend due to: {}'.format(e)) + preferred_base92 = base92 + cbase92 = None + +encode = b92encode = preferred_base92.encode +decode = b92decode = preferred_base92.decode +__version__ = base92.__version__ diff --git a/python/base92/base92.py b/python/base92/base92.py index fe39c1e..1b2ce0e 100644 --- a/python/base92/base92.py +++ b/python/base92/base92.py @@ -4,16 +4,17 @@ # think this stuff is worth it, you can buy me a beer in return # - Nathan Hwang (thenoviceoof) -''' + +""" base92: a library for encoding byte strings ->>> x = encode('hello world') +>>> x = encode(b'hello world') >>> x 'Fc_$aOTdKnsM*k' >>> decode(x) 'hello world' ->>> y = encode('^\xb6;\xbb\xe0\x1e\xee\xd0\x93\xcb"\xbb\x8fZ\xcd\xc3') +>>> y = encode(b'^\xb6;\xbb\xe0\x1e\xee\xd0\x93\xcb"\xbb\x8fZ\xcd\xc3') >>> y "C=i.w6'IvB/viUpRAwco" >>> decode(y) @@ -22,14 +23,32 @@ this is a regression test >>> decode(encode('aoeuaoeuaoeu')) 'aoeuaoeuaoeu' -''' - -import math +""" __version__ = (1, 0, 3) +__all__ = [ + 'encode', + 'decode', + 'b92encode', + 'b92decode', + 'base92encode', + 'base92decode', +] + + +if bytes is str: + _chr = chr + _ord = ord +else: + import struct + _chr = struct.Struct(">B").pack + _ord = lambda v: v if isinstance(v, int) else ord(v) + del struct + + def base92_chr(val): - ''' + """ Map an integer value <91 to a char >>> base92_chr(0) @@ -46,163 +65,158 @@ def base92_chr(val): Traceback (most recent call last): ... ValueError: val must be in [0, 91) - ''' + """ if val < 0 or val >= 91: raise ValueError('val must be in [0, 91)') if val == 0: - return '!' + return 33 # b'!' # 33 == ord('!') elif val <= 61: - return chr(ord('#') + val - 1) + return 35 + val - 1 # 35 == ord('#') else: - return chr(ord('a') + val - 62) + return 97 + val - 62 # 97 == ord('a') -def base92_ord(val): - ''' + +def base92_ord(val, _excl=_ord(b'!'), _sharp=_ord(b'#'), _under=_ord(b'_'), _a=_ord(b'a'), _rcurl=_ord(b'}')): + """ Map a char to an integer - >>> base92_ord('!') + >>> base92_ord(b'!') 0 - >>> base92_ord('#') + >>> base92_ord(b'#') 1 - >>> base92_ord('_') + >>> base92_ord(b'_') 61 - >>> base92_ord('a') + >>> base92_ord(b'a') 62 - >>> base92_ord('}') + >>> base92_ord(b'}') 90 - >>> base92_ord(' ') + >>> base92_ord(b' ') Traceback (most recent call last): ... ValueError: val is not a base92 character - ''' - num = ord(val) - if val == '!': + """ + num = _ord(val) + if num == _excl: return 0 - elif ord('#') <= num and num <= ord('_'): - return num - ord('#') + 1 - elif ord('a') <= num and num <= ord('}'): - return num - ord('a') + 62 + elif _sharp <= num <= _under: + return num - _sharp + 1 + elif _a <= num <= _rcurl: + return num - _a + 62 else: raise ValueError('val is not a base92 character') -def base92_encode(bytstr): - ''' + +def encode(bytstr): + """ Take a byte-string, and encode it in base 91 - >>> base92_encode("") + >>> base92_encode(b"") '~' - >>> base92_encode("\\x00") + >>> base92_encode(b"\\x00") '!!' - >>> base92_encode("\x01") + >>> base92_encode(b"\x01") '!B' - >>> base92_encode("\xff") + >>> base92_encode(b"\xff") '|_' - >>> base92_encode("aa") + >>> base92_encode(b"aa") 'D8*' - >>> base92_encode("aaaaaaaaaaaaa") + >>> base92_encode(b"aaaaaaaaaaaaa") 'D81RPya.)hgNA(%s' >>> base92_encode([16,32,48]) "'_$," - ''' + """ # always encode *something*, in case we need to avoid empty strings if not bytstr: - return '~' + return b'~' # make sure we have a bytstr - if not isinstance(bytstr, basestring): + if isinstance(bytstr, bytes): + pass + elif isinstance(bytstr, str): + bytstr = bytstr.encode() + else: # we'll assume it's a sequence of ints - bytstr = ''.join([chr(b) for b in bytstr]) + bytstr = b''.join(_chr(b) for b in bytstr) # prime the pump - bitstr = '' - while len(bitstr) < 13 and bytstr: - bitstr += '{:08b}'.format(ord(bytstr[0])) - bytstr = bytstr[1:] - resstr = '' - while len(bitstr) > 13 or bytstr: - i = int(bitstr[:13], 2) - resstr += base92_chr(i / 91) - resstr += base92_chr(i % 91) - bitstr = bitstr[13:] - while len(bitstr) < 13 and bytstr: - bitstr += '{:08b}'.format(ord(bytstr[0])) - bytstr = bytstr[1:] - if bitstr: - if len(bitstr) < 7: - bitstr += '0' * (6 - len(bitstr)) - resstr += base92_chr(int(bitstr,2)) - else: - bitstr += '0' * (13 - len(bitstr)) - i = int(bitstr, 2) - resstr += base92_chr(i / 91) - resstr += base92_chr(i % 91) - return resstr - -def base92_decode(bstr): - ''' + nbytes = len(bytstr) + size = (nbytes * 8) % 13 + size = 2 * (nbytes * 8) // 13 + (0 if size == 0 else (1 if size < 7 else 2)) + resstr = bytearray(size) + workspace = 0 + wssize = 0 + j = 0 + for byte in bytstr: + workspace = workspace << 8 | _ord(byte) + wssize += 8 + if wssize < 13: + continue + tmp = (workspace >> (wssize - 13)) & 8191 + resstr[j] = base92_chr(tmp // 91) + j += 1 + resstr[j] = base92_chr(tmp % 91) + j += 1 + wssize -= 13 + if wssize <= 0: + pass + elif wssize < 7: + tmp = (workspace << (6 - wssize)) & 63 + resstr[j] = base92_chr(tmp) + j += 1 + else: + tmp = (workspace << (13 - wssize)) & 8191 + resstr[j] = base92_chr(tmp // 91) + j += 1 + resstr[j] = base92_chr(tmp % 91) + j += 1 + return bytes(resstr[:j]) + + +def decode(bstr): + """ Take a base92 encoded string, convert it back to a byte-string - >>> base92_decode("") + >>> base92_decode(b"") '' - >>> base92_decode("~") + >>> base92_decode(b"~") '' - >>> base92_decode("!!") + >>> base92_decode(b"!!") '\\x00' - >>> base92_decode("!B") + >>> base92_decode(b"!B") '\\x01' - >>> base92_decode("|_") + >>> base92_decode(b"|_") '\\xff' - >>> base92_decode("D8*") + >>> base92_decode(b"D8*") 'aa' - >>> base92_decode("D81RPya.)hgNA(%s") + >>> base92_decode(b"D81RPya.)hgNA(%s") 'aaaaaaaaaaaaa' - ''' - bitstr = '' - resstr = '' - if bstr == '~': - return '' + """ + if isinstance(bstr, str): + bstr = bstr.encode() + if bstr == b'~': + return b'' + nbytes = len(bstr) + size = ((nbytes // 2 * 13) + (nbytes % 2 * 6)) // 8 + resstr = bytearray(size) + workspace = 0 + wssize = 0 + j = 0 # we always have pairs of characters - for i in range(len(bstr)/2): - x = base92_ord(bstr[2*i])*91 + base92_ord(bstr[2*i+1]) - bitstr += '{:013b}'.format(x) - while 8 <= len(bitstr): - resstr += chr(int(bitstr[0:8], 2)) - bitstr = bitstr[8:] + for i in range(nbytes // 2): + workspace = (workspace << 13) | (base92_ord(bstr[2*i]) * 91 + base92_ord(bstr[2*i+1])) + wssize += 13 + while wssize >= 8: + resstr[j] = (workspace >> (wssize - 8)) & 255 + wssize -= 8 + j += 1 # if we have an extra char, check for extras - if len(bstr) % 2 == 1: - x = base92_ord(bstr[-1]) - bitstr += '{:06b}'.format(x) - while 8 <= len(bitstr): - resstr += chr(int(bitstr[0:8], 2)) - bitstr = bitstr[8:] - return resstr - -encode = base92_encode -b92encode = base92_encode - -decode = base92_decode -b92decode = base92_decode - -if __name__ == "__main__": - import doctest - doctest.testmod() - - ## more correctness tests - import hashlib - import random - def gen_bytes(s): - return hashlib.sha512(s).digest()[:random.randint(1,64)] - for i in range(10000): - s = gen_bytes(str(random.random())) - assert s == decode(encode(s)) - print('correctness spot check passed') - - ## size tests - # import base64 - # import base85 - # from pprint import pprint - # sd = [(len(base64.b64encode('a'*i)), - # len(base85.b85encode('a'*i)), - # len(encode('a'*i))) - # for i in range(1,128)] - # pprint(sd) - # print sum(a-c for a,b,c in sd)/float(len(sd)) - # print sum(b-c for a,b,c in sd)/float(len(sd)) + if nbytes % 2 == 1: + workspace = (workspace << 6) | base92_ord(bstr[-1]) + wssize += 6 + while wssize >= 8: + resstr[j] = (workspace >> (wssize - 8)) & 255 + wssize -= 8 + j += 1 + return bytes(resstr[:j]) + + +base92_encode = b92encode = encode +base92_decode = b92decode = decode diff --git a/python/base92/base92_extension.c b/python/base92/base92_extension.c new file mode 100644 index 0000000..a9687bc --- /dev/null +++ b/python/base92/base92_extension.c @@ -0,0 +1,326 @@ +// method "how to write a c-extension" copied in part from https://github.com/Blosc/python-blosc/blob/master/blosc/blosc_extension.c + +#define PY_SSIZE_T_CLEAN /* allows Py_ssize_t in s# format for parsing arguments */ +#include "Python.h" + +static PyObject *Base92Error; + +static void +base92_error(int err, const char *msg) +{ + PyErr_Format(Base92Error, "Error %d %s", err, msg); +} + +unsigned char ENCODE_MAPPING[256] = (unsigned char[]){ + 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, + 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, + 94, 95, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, + 125, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0 +}; +unsigned char DECODE_MAPPING[256] = (unsigned char[]){ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 255, 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 255, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255 +}; + +unsigned char base92chr_encode(unsigned char byt) { + return ENCODE_MAPPING[byt]; +} + +unsigned char base92chr_decode(unsigned char byt) { + return DECODE_MAPPING[byt]; +} + +PyDoc_STRVAR(encode__doc__, "encode(input: bytes) -> str -- Return encoded string.\n"); + +static PyObject * +PyBase92_encode(PyObject *self, PyObject *args) +{ + Py_buffer view; + PyObject *output; + char *output_ptr; + const uint8_t *input; + const char *format; + size_t nbytes, size; + size_t i, j; // i for raw, j for encoded + unsigned long workspace; // bits holding bin + unsigned short wssize; // number of good bits in workspace + int tmp; + unsigned char c; + + /* Accept some kind of input */ +#if PY_MAJOR_VERSION <= 2 + /* s* : bytes like object including unicode and anything that supports + * the buffer interface */ + format = "s*:encode"; +#elif PY_MAJOR_VERSION >= 3 + /* y* :bytes like object EXCLUDING unicode and anything that supports + * the buffer interface. This is the recommended way to accept binary + * data in Python 3. */ + format = "y*:encode"; +#endif + if (!PyArg_ParseTuple(args, format, &view)) + return NULL; + + nbytes = view.len; + input = (const uint8_t*)view.buf; + + if (nbytes == 0) { + PyBuffer_Release(&view); + /* Alloc memory for encoding */ + if (!(output = PyBytes_FromStringAndSize("~", 1))) + return NULL; + return output; + } + + // precalculate how much space we need to malloc + size = (nbytes * 8) % 13; + if (size == 0) { + size = 2 * ((nbytes * 8) / 13); + } else if (size < 7) { + size = 2 * ((nbytes * 8) / 13) + 1; + } else { + size = 2 * ((nbytes * 8) / 13) + 2; + } + + /* Alloc memory for encoding */ + if (!(output = PyBytes_FromStringAndSize(NULL, size))) { + PyBuffer_Release(&view); + return NULL; + } + + output_ptr = PyBytes_AS_STRING(output); + + workspace = 0; + wssize = 0; + j = 0; + for (i = 0; i < nbytes; i++) { + workspace = workspace << 8 | input[i]; + wssize += 8; + if (wssize >= 13) { + tmp = (workspace >> (wssize - 13)) & 8191; + c = base92chr_encode(tmp / 91); + if (c == 0) { + // do something, illegal character + PyBuffer_Release(&view); + Py_DECREF(output); + base92_error(0, "while encoding byte 1"); + return NULL; + } + output_ptr[j++] = c; + c = base92chr_encode(tmp % 91); + if (c == 0) { + // do something, illegal character + PyBuffer_Release(&view); + Py_DECREF(output); + base92_error(0, "while encoding byte 2"); + return NULL; + } + output_ptr[j++] = c; + wssize -= 13; + } + } + // encode a last byte + if (0 < wssize && wssize < 7) { + tmp = (workspace << (6 - wssize)) & 63; // pad the right side + c = base92chr_encode(tmp); + if (c == 0) { + // do something, illegal character + PyBuffer_Release(&view); + Py_DECREF(output); + base92_error(0, "while encoding last byte 0"); + return NULL; + } + output_ptr[j] = c; + } else if (7 <= wssize) { + tmp = (workspace << (13 - wssize)) & 8191; // pad the right side + c = base92chr_encode(tmp / 91); + if (c == 0) { + // do something, illegal character + PyBuffer_Release(&view); + Py_DECREF(output); + base92_error(0, "while encoding last byte 1"); + return NULL; + } + output_ptr[j++] = c; + c = base92chr_encode(tmp % 91); + if (c == 0) { + // do something, illegal character + PyBuffer_Release(&view); + Py_DECREF(output); + base92_error(0, "while encoding last byte 2"); + return NULL; + } + output_ptr[j] = c; + } + + PyBuffer_Release(&view); + return output; +} + + + +PyDoc_STRVAR(decode__doc__, "decode(input: str) -> bytes -- Return decoded data.\n"); + +static PyObject * +PyBase92_decode(PyObject *self, PyObject *args) +{ + Py_buffer view; + PyObject *output; + char *output_ptr; + const uint8_t *input; + const char *format; + size_t nbytes, size, i, j; + int b1, b2; + unsigned long workspace; + unsigned short wssize; + + /* Accept some kind of input */ +#if PY_MAJOR_VERSION <= 2 + /* s* : bytes like object including unicode and anything that supports + * the buffer interface */ + format = "s*:decode"; +#elif PY_MAJOR_VERSION >= 3 + /* y* :bytes like object EXCLUDING unicode and anything that supports + * the buffer interface. This is the recommended way to accept binary + * data in Python 3. */ + format = "y*:decode"; +#endif + if (!PyArg_ParseTuple(args, format, &view)) + return NULL; + + nbytes = view.len; + input = (const uint8_t*)view.buf; + + if (nbytes == 0 || (nbytes == 1 && input[0] == 126)) { // "~" + PyBuffer_Release(&view); + /* Alloc memory for decoding */ + if (!(output = PyBytes_FromStringAndSize(NULL, 0))) + return NULL; + return output; + } + + // calculate size + size = ((nbytes / 2 * 13) + (nbytes % 2 * 6)) / 8; + + /* Alloc memory for decoding */ + if (!(output = PyBytes_FromStringAndSize(NULL, size))) { + PyBuffer_Release(&view); + return NULL; + } + + output_ptr = PyBytes_AS_STRING(output); + + // handle pairs of chars + workspace = 0; + wssize = 0; + j = 0; + for (i = 0; i + 1 < nbytes; i += 2) { + b1 = base92chr_decode(input[i]); + b2 = base92chr_decode(input[i+1]); + workspace = (workspace << 13) | (b1 * 91 + b2); + wssize += 13; + while (wssize >= 8) { + output_ptr[j++] = (workspace >> (wssize - 8)) & 255; + wssize -= 8; + } + } + // handle single char + if (nbytes % 2 == 1) { + workspace = (workspace << 6) | base92chr_decode(input[nbytes - 1]); + wssize += 6; + while (wssize >= 8) { + output_ptr[j++] = (workspace >> (wssize - 8)) & 255; + wssize -= 8; + } + } + + PyBuffer_Release(&view); + return output; +} + + +static PyMethodDef base92_methods[] = +{ + {"encode", (PyCFunction)PyBase92_encode, METH_VARARGS, encode__doc__}, + {"decode", (PyCFunction)PyBase92_decode, METH_VARARGS, decode__doc__}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + + +#if PY_MAJOR_VERSION < 3 +/* Python 2 module initialization */ +PyMODINIT_FUNC +initbase92_extension(void) +{ + PyObject *m; + m = Py_InitModule("base92_extension", base92_methods); + if (m == NULL) + return; + + Base92Error = PyErr_NewException("base92_extension.error", NULL, NULL); + if (Base92Error != NULL) { + Py_INCREF(Base92Error); + PyModule_AddObject(m, "error", Base92Error); + } +} +# else +/* Python 3 module initialization */ +static struct PyModuleDef base92_def = { + PyModuleDef_HEAD_INIT, + "base92_extension", + NULL, + -1, + base92_methods +}; + +PyMODINIT_FUNC +PyInit_base92_extension(void) { + return PyModule_Create(&base92_def); +} +#endif diff --git a/python/base92/cbase92.py b/python/base92/cbase92.py new file mode 100644 index 0000000..01c8ba6 --- /dev/null +++ b/python/base92/cbase92.py @@ -0,0 +1,69 @@ +# THE BEERWARE LICENSE (Revision 42): +# wrote this file. As long as you retain this notice you +# can do whatever you want with this stuff. If we meet some day, and you +# think this stuff is worth it, you can buy me a beer in return +# - Nathan Hwang (thenoviceoof) + +''' +base92: a library for encoding byte strings + +>>> x = encode(b'hello world') +>>> str(x.decode()) +'Fc_$aOTdKnsM*k' +>>> str(decode(x).decode()) +'hello world' + +>>> y = encode(b'^\xb6;\xbb\xe0\x1e\xee\xd0\x93\xcb"\xbb\x8fZ\xcd\xc3') +>>> y +"C=i.w6'IvB/viUpRAwco" +>>> decode(y) +'^\\xb6;\\xbb\\xe0\\x1e\\xee\\xd0\\x93\\xcb"\\xbb\\x8fZ\\xcd\\xc3' + +this is a regression test +>>> str(decode(encode('aoeuaoeuaoeu')).decode()) +'aoeuaoeuaoeu' +''' + +__version__ = (1, 0, 3) + +__all__ = [ + 'encode', + 'decode', + 'b92encode', + 'b92decode', + 'base92encode', + 'base92decode', +] + +from .base92_extension import encode, decode + +base92_encode = b92encode = encode +base92_decode = b92decode = decode + + +def test(): + import doctest + doctest.testmod() + + ## more correctness tests + import random + for _ in range(10000): + s = bytes(bytearray(random.getrandbits(8) for _ in range(random.randint(0, 255)))) + assert s == decode(encode(s)), 'decode(encode({!r})) = decode({!r}) = {!r}'.format(s, encode(s), decode(encode(s))) + print('correctness spot check passed') + + ## size tests + # import base64 + # import base85 + # from pprint import pprint + # sd = [(len(base64.b64encode('a'*i)), + # len(base85.b85encode('a'*i)), + # len(encode('a'*i))) + # for i in range(1,128)] + # pprint(sd) + # print sum(a-c for a,b,c in sd)/float(len(sd)) + # print sum(b-c for a,b,c in sd)/float(len(sd)) + + +if __name__ == "__main__": + test() diff --git a/python/base92/test.py b/python/base92/test.py new file mode 100644 index 0000000..a74dab2 --- /dev/null +++ b/python/base92/test.py @@ -0,0 +1,70 @@ +import time +import random + +from . import base92, cbase92 + + +def gen_bytes(maxlen=255): + return bytes(bytearray(random.getrandbits(8) for _ in range(random.randint(0, 255)))) + + +def cross_validate(modules, to_encode, expected_encoded=None): + encoded = {m: m.encode(to_encode) for m in modules} + if expected_encoded is not None: + encoded['expected'] = expected_encoded + assert len(set(encoded.values())) == 1, 'different encodings of {!r}: {}'.format(to_encode, encoded) + decoded = {(m_enc, m_dec): m_dec.decode(data) for m_enc, data in encoded.items() for m_dec in modules} + decoded['expected'] = to_encode + assert len(set(decoded.values())) == 1, 'different decodings of {!r}: {}\nencodings: {}'.format(to_encode, decoded, encoded) + + +def run(modules=(base92, cbase92), random_count=10000, silent=False): + modules = list({m for m in modules if m}) + + if not silent: + print('testing and cross validating encoders and decoders from modules {}'.format(modules)) + + for s, e in [(b'', b'~'), (b'b', b'DL'), (b'hello world', b'Fc_$aOTdKnsM*k'), (b'\x93', b'Ub')]: + cross_validate(modules, s, e) + + if not silent: + print('selected regression tests passed\ngenerating {} random byte strings'.format(random_count)) + + # more correctness tests + + random_bytes = [gen_bytes() for _ in range(random_count)] + for s in random_bytes: + cross_validate(modules, s) + + if not silent: + print('{} randomized X == decode(encode(X)) tests passed'.format(random_count)) + + for m in modules: + enc = 0.0 + dec = 0.0 + for s in random_bytes: + start_enc = time.time() + x = m.encode(s) + stop_enc = time.time() + enc += stop_enc - start_enc + m.decode(x) + dec += time.time() - stop_enc + print('performance of module {} on the {} random byte strings'.format(m, random_count)) + print('- encoding: {}s'.format(enc)) + print('- decoding: {}s'.format(dec)) + + # size tests + # import base64 + # import base85 + # from pprint import pprint + # sd = [(len(base64.b64encode('a'*i)), + # len(base85.b85encode('a'*i)), + # len(encode('a'*i))) + # for i in range(1,128)] + # pprint(sd) + # print sum(a-c for a,b,c in sd)/float(len(sd)) + # print sum(b-c for a,b,c in sd)/float(len(sd)) + + +if __name__ == '__main__': + run() diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index 45386e6..0000000 --- a/python/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -from distutils.core import setup - -setup( - name='base92', - version='1.0.3', - author='thenoviceoof', - author_email='thenoviceoof@gmail.com', - packages=['base92'], - scripts=[], - url='https://github.com/thenoviceoof/base92', - license='LICENSE.txt', - description='A library to create base92 encoded strings', - long_description=open('README.txt').read(), - install_requires=[], -) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f460acb --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +import os +import platform +from setuptools import find_packages, setup, Extension + +CFLAGS = os.environ.get('CFLAGS', '').split() +LFLAGS = os.environ.get('LFLAGS', '').split() +def_macros = [] + +extensions = [] +if platform.system() != 'Windows': + extensions.append(Extension( + 'base92.base92_extension', + include_dirs=['python/base92'], + define_macros=def_macros, + sources=['python/base92/base92_extension.c'], + library_dirs=[], + libraries=[], + extra_link_args=LFLAGS, + extra_compile_args=CFLAGS, + )) + +setup( + name='base92', + version='1.0.3', + author='thenoviceoof', + author_email='thenoviceoof@gmail.com', + packages=find_packages(where='python'), + package_dir={'': 'python'}, + scripts=[], + url='https://github.com/thenoviceoof/base92', + license='LICENSE.txt', + description='A library to create base92 encoded strings', + long_description=open('python/README.txt').read(), + install_requires=[], + ext_modules = extensions, +)