Skip to content

Commit

Permalink
Expanding Bloom Filter (#43)
Browse files Browse the repository at this point in the history
* first pass at an expanding bloom filter!
* document the expanding bloom filter
  • Loading branch information
barrust authored Nov 10, 2018
1 parent 9ad336c commit 022ef22
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 11 deletions.
6 changes: 6 additions & 0 deletions docs/source/code.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ BloomFilterOnDisk

For more information on all methods and properties, see `BloomFilter`_.

ExpandingBloomFilter
+++++++++++++++++++++++++++++++

.. autoclass:: probables.ExpandingBloomFilter
:members:

CountingBloomFilter
+++++++++++++++++++++++++++++++

Expand Down
11 changes: 11 additions & 0 deletions docs/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,17 @@ Bloom Filter that is run directly off of disk instead of in memory. This
can be useful for very large Bloom Filters or when needing to access many
Blooms that are exported to file.

Expanding Bloom Filter
"""""""""""""""""""""""""""""""""""""""""""""""

The **Expanding Bloom Filter** is a specialized version of the standard
Bloom Filter that automatically grows to ensure that the desired false positive
rate is not exceeded. This is ideal for situations in which the number of
elements that will be added can only be guessed at ahead of time.

At this time, it is not possible to import or export an **Expanding Bloom
Filter** but that is a planned feature.


Counting Bloom Filter
"""""""""""""""""""""""""""""""""""""""""""""""
Expand Down
8 changes: 5 additions & 3 deletions probables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
''' pyprobables module '''
from __future__ import (unicode_literals, absolute_import, print_function)
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter)
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter,
ExpandingBloomFilter)
from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold,
CountMeanSketch, CountMeanMinSketch)
from . cuckoo import (CuckooFilter, CountingCuckooFilter)
Expand All @@ -11,7 +12,7 @@
__maintainer__ = 'Tyler Barrus'
__email__ = '[email protected]'
__license__ = 'MIT'
__version__ = '0.2.0'
__version__ = '0.2.5'
__credits__ = []
__url__ = 'https://github.com/barrust/pyprobables'
__bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues'
Expand All @@ -20,4 +21,5 @@
'CountMinSketch', 'CountMeanSketch', 'CountMeanMinSketch',
'HeavyHitters', 'StreamThreshold', 'CuckooFilter',
'CountingCuckooFilter', 'InitializationError', 'NotSupportedError',
'ProbablesBaseException', 'CuckooFilterFullError']
'ProbablesBaseException', 'CuckooFilterFullError',
'ExpandingBloomFilter']
4 changes: 3 additions & 1 deletion probables/blooms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@

from . bloom import (BloomFilter, BloomFilterOnDisk)
from . countingbloom import (CountingBloomFilter)
from . expandingbloom import (ExpandingBloomFilter)

__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter']
__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter',
'ExpandingBloomFilter']
4 changes: 2 additions & 2 deletions probables/blooms/basebloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None,
self._els_added = 0
self._on_disk = False # not on disk
self.__blm_type = blm_type
if self.__blm_type in ['regular', 'reg-ondisk']:
if self.__blm_type in ['regular', 'reg-ondisk', 'expanding']:
self.__impt_type = 'B'
else:
self.__impt_type = 'I'

if blm_type in ['regular', 'reg-ondisk']:
if blm_type in ['regular', 'reg-ondisk', 'expanding']:
msg = ('Insufecient parameters to set up the Bloom Filter')
else:
msg = ('Insufecient parameters to set up the Counting Bloom '
Expand Down
122 changes: 122 additions & 0 deletions probables/blooms/expandingbloom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
''' BloomFilter, python implementation
License: MIT
Author: Tyler Barrus ([email protected])
URL: https://github.com/barrust/pyprobables
'''
from __future__ import (unicode_literals, absolute_import, print_function)

from . bloom import (BloomFilter)


class ExpandingBloomFilter(object):
    ''' Expanding Bloom Filter for use in python; it is kept as a list of
        Bloom Filters, all created with the same estimated elements, false
        positive rate and hash function. Once the newest Bloom Filter has
        had the estimated number of elements added, another one is
        appended before the next insertion.

        Args:
            est_elements (int): The number of estimated elements to be added
            false_positive_rate (float): The desired false positive rate
            hash_function (function): Hashing strategy function to use
                `hf(key, number)`
        Returns:
            ExpandingBloomFilter: An expanding Bloom Filter object
        Note:
            At this point, the expanding Bloom Filter does not support
            `export` or `import` '''

    def __init__(self, est_elements=None, false_positive_rate=None,
                 hash_function=None):
        ''' Store the settings and create the first Bloom Filter '''
        self.__est_elements = est_elements
        self.__fpr = false_positive_rate
        self.__hash_func = hash_function
        self.__added_elements = 0  # counts every call to add / add_alt
        self._blooms = []
        # there is always at least one Bloom Filter to hash and insert with
        self.__add_bloom_filter()

    def __contains__(self, key):
        ''' Support `key in filter` by delegating to `check` '''
        return self.check(key)

    @property
    def expansions(self):
        ''' int: The number of Bloom Filters beyond the first one '''
        return len(self._blooms) - 1

    @property
    def false_positive_rate(self):
        ''' float: The false positive rate given at construction '''
        return self.__fpr

    @property
    def estimated_elements(self):
        ''' int: The estimated number of elements given at construction '''
        return self.__est_elements

    @property
    def elements_added(self):
        ''' int: The number of add calls made, including those that were
            not inserted because the element was already reported present '''
        return self.__added_elements

    def __add_bloom_filter(self):
        ''' Create one more Bloom Filter with the stored settings and
            append it to the end of the list '''
        self._blooms.append(
            BloomFilter(self.__est_elements, self.__fpr, self.__hash_func))

    def __check_for_growth(self):
        ''' Append a new Bloom Filter when the newest one has already had
            the estimated number of elements added '''
        newest = self._blooms[-1]
        if newest.elements_added >= self.__est_elements:
            self.__add_bloom_filter()

    def check(self, key):
        ''' Check to see if the key is in the Bloom Filter

            Args:
                key (str): The key to check for in the Bloom Filter
            Returns:
                bool: `True` if the element is likely present; `False` if
                    definitely not present '''
        # the first Bloom Filter is used to produce the hashes for lookups
        return self.check_alt(self._blooms[0].hashes(key))

    def check_alt(self, hashes):
        ''' Check to see if the hashes are in the Bloom Filter

            Args:
                hashes (list): The hash representation to check for in the
                    Bloom Filter
            Returns:
                bool: `True` if the element is likely present; `False` if
                    definitely not present '''
        # stops at the first Bloom Filter that reports the hashes present
        return any(blm.check_alt(hashes) for blm in self._blooms)

    def add(self, key, force=False):
        ''' Add the key to the Bloom Filter

            Args:
                key (str): The element to be inserted
                force (bool): `True` will force it to be inserted, even if
                    it likely has been inserted before; `False` will only
                    insert if not found in the Bloom Filter '''
        self.add_alt(self._blooms[0].hashes(key), force)

    def add_alt(self, hashes, force=False):
        ''' Add the element represented by hashes into the Bloom Filter

            Args:
                hashes (list): A list of integers representing the key to
                    insert
                force (bool): `True` will force it to be inserted, even if
                    it likely has been inserted before; `False` will only
                    insert if not found in the Bloom Filter '''
        # the call is counted even when nothing ends up being inserted
        self.__added_elements += 1
        if not force and self.check_alt(hashes):
            return
        # grow first (if needed) so the insert lands in the newest filter
        self.__check_for_growth()
        self._blooms[-1].add_alt(hashes)
3 changes: 1 addition & 2 deletions probables/countminsketch/countminsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,7 @@ class HeavyHitters(CountMinSketch):
For width and depth, width may realistically be in the thousands \
while depth is in the single digit to teens '''

__slots__ = CountMinSketch.__slots__
__slots__.extend(['__top_x', '__top_x_size', '__num_hitters', '__smallest'])
__slots__ = ['__top_x', '__top_x_size', '__num_hitters', '__smallest']

def __init__(self, num_hitters=100, width=None, depth=None,
confidence=None, error_rate=None, filepath=None,
Expand Down
4 changes: 2 additions & 2 deletions probables/cuckoo/countingcuckoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class CountingCuckooFilter(CuckooFilter):
Returns:
CountingCuckooFilter: A Cuckoo Filter object '''

__slots__ = CuckooFilter.__slots__
__slots__.extend(['__unique_elements'])
__slots__ = ['__unique_elements', '_inserted_elements', '_bucket_size',
'__max_cuckoo_swaps', '_cuckoo_capacity', '_buckets']

def __init__(self, capacity=10000, bucket_size=4, max_swaps=500,
expansion_rate=2, auto_expand=True, finger_size=4,
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[bdist_wheel]
universal=1

[pep8]
[pycodestyle]
max-line-length=120

[flake8]
Expand Down
56 changes: 56 additions & 0 deletions tests/expandingbloom_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
''' Unittest class '''
from __future__ import (unicode_literals, absolute_import, print_function)
import unittest
from probables import (ExpandingBloomFilter)

class TestExpandingBloomFilter(unittest.TestCase):
    ''' Unit tests for the ExpandingBloomFilter '''

    def _expanded_filter(self):
        ''' build a filter sized for 25 elements, add 100 numeric keys and
            the two known phrases; assert that it expanded more than once '''
        ebf = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05)
        # expand it out some first!
        for num in range(100):
            ebf.add("{}".format(num))
        for phrase in ('this is a test', 'this is another test'):
            ebf.add(phrase)
        self.assertGreater(ebf.expansions, 1)
        return ebf

    def test_ebf_init(self):
        ''' a newly created expanding bloom filter reports no expansions '''
        ebf = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
        self.assertEqual(ebf.expansions, 0)

    def test_ebf_add_lots(self):
        ''' force-add 100 keys into a filter sized for 10 and check the
            resulting number of expansions '''
        ebf = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
        for num in range(100):
            ebf.add("{}".format(num), True)
        self.assertEqual(ebf.expansions, 9)

    def test_ebf_add_lots_without_force(self):
        ''' add 120 keys without forcing; keys already reported present
            are skipped, but every add call is still counted '''
        ebf = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05)
        # more keys than the forced test yet the same expansion count is
        # expected, since skipped keys do not fill up the newest filter
        for num in range(120):
            ebf.add("{}".format(num))
        self.assertEqual(ebf.expansions, 9)
        self.assertEqual(ebf.elements_added, 120)

    def test_ebf_check(self):
        ''' `check` finds added keys and rejects keys never added '''
        ebf = self._expanded_filter()
        self.assertEqual(ebf.check('this is a test'), True)
        self.assertEqual(ebf.check('this is another test'), True)
        self.assertEqual(ebf.check('this is yet another test'), False)
        self.assertEqual(ebf.check('this is not another test'), False)

    def test_ebf_contains(self):
        ''' the `in` operator gives the same answers as `check` '''
        ebf = self._expanded_filter()
        self.assertEqual('this is a test' in ebf, True)
        self.assertEqual('this is another test' in ebf, True)
        self.assertEqual('this is yet another test' in ebf, False)
        self.assertEqual('this is not another test' in ebf, False)

0 comments on commit 022ef22

Please sign in to comment.