Skip to content

Commit

Permalink
Expanding bloom export import (#45)
Browse files Browse the repository at this point in the history
* import/export of expanding and rotating blooms

* push - pop functionality; begin tests

* include the first tests

* a few more tests

* ensure correct parameters sent to parent class

* test python 3.7

* add import/export tests

* add downloads [skip-ci]
  • Loading branch information
barrust authored Nov 17, 2018
1 parent cf806c2 commit 03e1e60
Show file tree
Hide file tree
Showing 13 changed files with 291 additions and 39 deletions.
14 changes: 9 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
language: python
python:
- "2.7"
- "3.4"
- "3.5"
- "3.6"
matrix:
include:
- python: 2.7
- python: 3.4
- python: 3.5
- python: 3.6
- python: 3.7
dist: xenial
sudo: true

# install python dependencies including this package in the travis
# virtualenv
Expand Down
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ PyProbables
.. image:: https://img.shields.io/badge/license-MIT-blue.svg
:target: https://opensource.org/licenses/MIT/
:alt: License
.. image:: https://pepy.tech/badge/pyprobables
:target: https://pepy.tech/project/pyprobables
:alt: Downloads

**pyprobables** is a pure-python library for probabilistic data structures.
The goal is to provide the developer with a pure-python implementation of
Expand Down
6 changes: 4 additions & 2 deletions probables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
CountMeanSketch, CountMeanMinSketch)
from . cuckoo import (CuckooFilter, CountingCuckooFilter)
from . exceptions import (InitializationError, NotSupportedError,
ProbablesBaseException, CuckooFilterFullError)
ProbablesBaseException, CuckooFilterFullError,
RotatingBloomFilterError)

__author__ = 'Tyler Barrus'
__maintainer__ = 'Tyler Barrus'
Expand All @@ -22,4 +23,5 @@
'HeavyHitters', 'StreamThreshold', 'CuckooFilter',
'CountingCuckooFilter', 'InitializationError', 'NotSupportedError',
'ProbablesBaseException', 'CuckooFilterFullError',
'ExpandingBloomFilter', 'RotatingBloomFilter']
'ExpandingBloomFilter', 'RotatingBloomFilter',
'RotatingBloomFilterError']
15 changes: 8 additions & 7 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ class BloomFilter(BaseBloom):
Args:
est_elements (int): The number of estimated elements to be added
false_positive_rate (float): The desired false positive rate
filepath (string): Path to file to load
hex_string (string): Hex based representation to be loaded
filepath (str): Path to file to load
hex_string (str): Hex based representation to be loaded
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down Expand Up @@ -202,10 +202,10 @@ class BloomFilterOnDisk(BaseBloom):
(https://github.com/barrust/bloom)
Args:
filepath (string): Path to file to load
filepath (str): Path to file to load
est_elements (int): The number of estimated elements to be added
false_positive_rate (float): The desired false positive rate
hex_string (string): Hex based representation to be loaded
hex_string (str): Hex based representation to be loaded
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down Expand Up @@ -243,7 +243,8 @@ def __init__(self, filepath, est_elements=None, false_positive_rate=None,
self)._set_optimized_params(est_elements, fpr,
hash_function)
super(BloomFilterOnDisk,
self).__init__('reg-ondisk', est_elements, vals[1],
self).__init__('reg-ondisk', est_elements=est_elements,
false_positive_rate=vals[1],
hash_function=vals[0])
# do the on disk things
with open(filepath, 'wb') as filepointer:
Expand Down Expand Up @@ -285,8 +286,8 @@ def __load(self, filepath, hash_function=None):
self)._set_optimized_params(mybytes[0], mybytes[2],
hash_function)
super(BloomFilterOnDisk,
self).__init__('reg-ondisk', mybytes[0], vals[1],
hash_function=vals[0])
self).__init__('reg-ondisk', est_elements=mybytes[0],
false_positive_rate=vals[1], hash_function=vals[0])
self.__file_pointer = open(filepath, 'r+b')
self._bloom = mmap.mmap(self.__file_pointer.fileno(), 0)
self._on_disk = True
Expand Down
4 changes: 2 additions & 2 deletions probables/blooms/countingbloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ class CountingBloomFilter(BaseBloom):
Args:
est_elements (int): The number of estimated elements to be added
false_positive_rate (float): The desired false positive rate
filepath (string): Path to file to load
hex_string (string): Hex based representation to be loaded
filepath (str): Path to file to load
hex_string (str): Hex based representation to be loaded
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down
103 changes: 89 additions & 14 deletions probables/blooms/expandingbloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
'''
from __future__ import (unicode_literals, absolute_import, print_function)

import os
from struct import (pack, unpack, calcsize)

from . bloom import (BloomFilter)
from .. utilities import (is_valid_file)
from .. exceptions import (RotatingBloomFilterError)


class ExpandingBloomFilter(object):
Expand All @@ -17,27 +22,33 @@ class ExpandingBloomFilter(object):
Args:
est_elements (int): The number of estimated elements to be added
false_positive_rate (float): The desired false positive rate
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
ExpandingBloomFilter: An expanding Bloom Filter object
Note:
At this point, the expanding Bloom Filter does not support \
`export` or `import` '''
Initialization order of operations:
1) Filepath
2) est_elements and false_positive_rate'''

__slots__ = ['_blooms', '__fpr', '__est_elements', '__hash_func',
'__added_elements']

def __init__(self, est_elements=None, false_positive_rate=None,
hash_function=None):
filepath=None, hash_function=None):
''' initialize '''
self._blooms = list()
self.__fpr = false_positive_rate
self.__est_elements = est_elements
self.__hash_func = hash_function
self.__added_elements = 0 # total added...
# add in the initial bloom filter!
self.__add_bloom_filter()

if is_valid_file(filepath):
self.__load(filepath)
else:
# add in the initial bloom filter!
self.__add_bloom_filter()

def __contains__(self, key):
''' setup the `in` functionality '''
Expand Down Expand Up @@ -65,6 +76,10 @@ def elements_added(self):
''' int: The total number of elements added '''
return self.__added_elements

def push(self):
''' Push a new expansion (an additional Bloom Filter) onto the expanding
Bloom Filter '''
# same private helper that __check_for_growth uses when growing
self.__add_bloom_filter()

def check(self, key):
''' Check to see if the key is in the Bloom Filter
Expand Down Expand Up @@ -127,6 +142,44 @@ def __check_for_growth(self):
if self._blooms[-1].elements_added >= self.__est_elements:
self.__add_bloom_filter()

def export(self, filepath):
    ''' Export an expanding Bloom Filter, or subclass, to disk

        Args:
            filepath (str): The path of the file to export to

        Note:
            File layout: the bit array of every Bloom Filter, in list \
            order, one unsigned byte per element, followed by a `QQQf` \
            footer holding the number of Bloom Filters, the estimated \
            elements, the elements added and the false positive rate '''
    with open(filepath, 'wb') as fileobj:
        # add all the different Bloom bit arrays...
        for blm in self._blooms:
            # a count-prefixed format ('<n>B') packs exactly the same bytes
            # as 'B' repeated n times, without building a format string
            # that is as long as the bit array itself
            rep = '{0}B'.format(blm.bloom_length)
            fileobj.write(pack(rep, *blm.bloom))
        # the settings footer goes last; the loader locates it by seeking
        # back from the end of the file
        fileobj.write(pack('QQQf', len(self._blooms),
                           self.estimated_elements,
                           self.elements_added,
                           self.false_positive_rate))

def __load(self, filename):
    ''' Populate this instance from a file written by `export`

        Args:
            filename (str): Path of the file to read '''
    footer_fmt = 'QQQf'
    footer_len = calcsize(footer_fmt)
    with open(filename, 'rb') as fileobj:
        # the settings footer occupies the final bytes of the file
        fileobj.seek(-footer_len, os.SEEK_END)
        footer = unpack(footer_fmt, fileobj.read(footer_len))
        num_blooms, est_els, els_added, fpr = footer

        # rewind: the bit arrays are stored from the start of the file
        fileobj.seek(0, os.SEEK_SET)
        # replace the basic defaults with what the file recorded
        self._blooms = list()
        self.__added_elements = els_added
        self.__fpr = fpr
        self.__est_elements = est_els
        for _ in range(num_blooms):
            # build a filter with the stored settings so that its length
            # tells us how many bytes belong to this bit array
            blm = BloomFilter(est_elements=self.__est_elements,
                              false_positive_rate=self.__fpr,
                              hash_function=self.__hash_func)
            width = blm.bloom_length
            raw = fileobj.read(calcsize('B') * width)
            blm._bloom = list(unpack('B' * width, raw))
            self._blooms.append(blm)


class RotatingBloomFilter(ExpandingBloomFilter):
''' Simple Rotating Bloom Filter implementation that allows for the "older"
Expand All @@ -141,24 +194,30 @@ class RotatingBloomFilter(ExpandingBloomFilter):
max_queue_size (int): This is the number is used to determine the \
maximum number of Bloom Filters. Total elements added is based on \
`max_queue_size * est_elements`
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Note:
Initialization order of operations:
1) Filepath
2) est_elements and false_positive_rate
'''
__slots__ = ['_blooms', '__fpr', '__est_elements', '__hash_func',
'__added_elements', '_queue_size']

def __init__(self, est_elements=None, false_positive_rate=None,
max_queue_size=10, hash_function=None):
max_queue_size=10, filepath=None, hash_function=None):
''' initialize '''
super(RotatingBloomFilter,
self).__init__(est_elements=est_elements,
false_positive_rate=false_positive_rate,
filepath=filepath,
hash_function=hash_function)
self.__fpr = false_positive_rate
self.__est_elements = est_elements
self.__hash_func = hash_function
self._queue_size = max_queue_size
self.__added_elements = 0
self.__added_elements = self.elements_added
self.__est_elements = self.estimated_elements
self.__fpr = self.false_positive_rate
self.__hash_func = hash_function

@property
def max_queue_size(self):
Expand All @@ -185,18 +244,34 @@ def add_alt(self, hashes, force=False):
self._blooms[-1].add_alt(hashes)

def pop(self):
''' Pop an element off of the queue '''
''' Pop the oldest Bloom Filter off of the queue without pushing a new
Bloom Filter onto the queue
Raises:
RotatingBloomFilterError: Unable to rotate the Bloom Filter'''
if self.current_queue_size == 1:
msg = "Popping a Bloom Filter will result in an unusable system!"
raise RotatingBloomFilterError(msg)
self._blooms.pop(0)

def push(self):
''' Push a new bloom filter onto the queue and rotate if necessary '''
# force=True requests the rotation now instead of waiting for the newest
# Bloom Filter to reach its estimated number of elements
self.__rotate_bloom_filter(force=True)

def __rotate_bloom_filter(self, force=False):
''' handle determining if/when the Bloom Filter queue needs to be
rotated '''
blm = self._blooms[-1]
ready_to_rotate = blm.elements_added == blm.estimated_elements
neeeds_to_pop = self.current_queue_size < self._queue_size
if force or (ready_to_rotate and neeeds_to_pop):
no_need_to_pop = self.current_queue_size < self._queue_size
if force and no_need_to_pop:
self.__add_bloom_filter()
elif force: # must need to be pop'd first!
blm = self._blooms.pop(0)
self.__add_bloom_filter()
elif ready_to_rotate and no_need_to_pop:
self.__add_bloom_filter()
elif force or ready_to_rotate:
elif ready_to_rotate:
blm = self._blooms.pop(0)
self.__add_bloom_filter()

Expand Down
8 changes: 4 additions & 4 deletions probables/countminsketch/countminsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class CountMinSketch(object):
depth (int): The depth of the count-min sketch
confidence (float): The level of confidence desired
error_rate (float): The desired error rate
filepath (string): Path to file to load
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down Expand Up @@ -363,7 +363,7 @@ class CountMeanSketch(CountMinSketch):
depth (int): The depth of the count-min sketch
confidence (float): The level of confidence desired
error_rate (float): The desired error rate
filepath (string): Path to file to load
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down Expand Up @@ -399,7 +399,7 @@ class CountMeanMinSketch(CountMinSketch):
depth (int): The depth of the count-min sketch
confidence (float): The level of confidence desired
error_rate (float): The desired error rate
filepath (string): Path to file to load
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down Expand Up @@ -435,7 +435,7 @@ class HeavyHitters(CountMinSketch):
depth (int): The depth of the count-min sketch
confidence (float): The level of confidence desired
error_rate (float): The desired error rate
filepath (string): Path to file to load
filepath (str): Path to file to load
hash_function (function): Hashing strategy function to use \
`hf(key, number)`
Returns:
Expand Down
10 changes: 10 additions & 0 deletions probables/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,13 @@ class CuckooFilterFullError(ProbablesBaseException):
def __init__(self, message):
self.message = message
super(CuckooFilterFullError, self).__init__(self.message)


class RotatingBloomFilterError(ProbablesBaseException):
''' Exception for when a RotatingBloomFilter is unable to rotate its
Bloom Filters
Args:
message (str): The error message to be reported '''
def __init__(self, message):
self.message = message
super(RotatingBloomFilterError, self).__init__(self.message)
4 changes: 2 additions & 2 deletions probables/hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def default_md5(key, depth=1):
key (str): The element to be hashed
depth (int): The number of hash permutations to compute
Returns:
int: 64-bit hashed representation of key
list(int): List of 64-bit hashed representations of key
Note:
Returns the upper-most 64 bits '''
return md5(key).digest()
Expand All @@ -111,7 +111,7 @@ def default_sha256(key, depth=1):
key (str): The element to be hashed
depth (int): The number of hash permutations to compute
Returns:
int: 64-bit hashed representation of key
list(int): List of 64-bit hashed representations of key
Note:
Returns the upper-most 64 bits '''
return sha256(key).digest()
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
[bdist_wheel]
universal=1

[pep8]
max-line-length=120

[pycodestyle]
max-line-length=120

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def read_file(filepath):
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6'
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
test_suite = 'tests'
)
2 changes: 1 addition & 1 deletion tests/cuckoo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_cuckoo_filter_fing_msg(self):
''' test valid fingerprint size message '''
def runner():
''' runner '''
cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=5)
CuckooFilter(capacity=100, bucket_size=2, finger_size=5)

self.assertRaises(ValueError, runner)
try:
Expand Down
Loading

0 comments on commit 03e1e60

Please sign in to comment.