From d313ebc672ce0071f68dd62c81731b5eb4f20a40 Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Mon, 12 Nov 2018 13:48:55 -0500 Subject: [PATCH] Rotating bloom (#44) --- CHANGELOG.md | 8 +++ docs/source/code.rst | 7 ++ docs/source/quickstart.rst | 11 +++ probables/__init__.py | 6 +- probables/blooms/__init__.py | 4 +- probables/blooms/expandingbloom.py | 112 +++++++++++++++++++++++++---- tests/cuckoo_test.py | 8 +-- tests/expandingbloom_test.py | 45 +++++++++++- 8 files changed, 178 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec0c52c..ab44b59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # PyProbables Changelog +### Version 0.2.6 +* Bloom Filters: + * Addition of a Rotating Bloom Filter + +### Version 0.2.5 +* Bloom Filters: + * Addition of an Expanding Bloom Filter + ### Version 0.2.0 * Use __slots__ diff --git a/docs/source/code.rst b/docs/source/code.rst index 4b5ba38..8a5c310 100644 --- a/docs/source/code.rst +++ b/docs/source/code.rst @@ -46,6 +46,13 @@ ExpandingBloomFilter .. autoclass:: probables.ExpandingBloomFilter :members: +RotatingBloomFilter ++++++++++++++++++++++++++++++++ + +.. autoclass:: probables.RotatingBloomFilter + :members: + :inherited-members: + CountingBloomFilter +++++++++++++++++++++++++++++++ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 74ac548..9c6e22a 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -93,6 +93,17 @@ determine the number of elements that will be added. At this time, it is not possible to import or export an **Expanding Bloom Filter** but that is a planned feature. +Rotating Bloom Filter +""""""""""""""""""""""""""""""""""""""""""""""" + +The **Rotating Bloom Filter** is a specialized version of the standard +Bloom Filter that rolls off earlier entries from the filter as they become more +stale. The popping of the queue can be done either programmatically or +automatically. 
+ +At this time, it is not possible to import or export a **Rotating Bloom +Filter** but that is a planned feature. + Counting Bloom Filter """"""""""""""""""""""""""""""""""""""""""""""" diff --git a/probables/__init__.py b/probables/__init__.py index b68b95b..38eb539 100644 --- a/probables/__init__.py +++ b/probables/__init__.py @@ -1,7 +1,7 @@ ''' pyprobables module ''' from __future__ import (unicode_literals, absolute_import, print_function) from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter, - ExpandingBloomFilter) + ExpandingBloomFilter, RotatingBloomFilter) from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold, CountMeanSketch, CountMeanMinSketch) from . cuckoo import (CuckooFilter, CountingCuckooFilter) @@ -12,7 +12,7 @@ __maintainer__ = 'Tyler Barrus' __email__ = 'barrust@gmail.com' __license__ = 'MIT' -__version__ = '0.2.5' +__version__ = '0.2.6' __credits__ = [] __url__ = 'https://github.com/barrust/pyprobables' __bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues' @@ -22,4 +22,4 @@ 'HeavyHitters', 'StreamThreshold', 'CuckooFilter', 'CountingCuckooFilter', 'InitializationError', 'NotSupportedError', 'ProbablesBaseException', 'CuckooFilterFullError', - 'ExpandingBloomFilter'] + 'ExpandingBloomFilter', 'RotatingBloomFilter'] diff --git a/probables/blooms/__init__.py b/probables/blooms/__init__.py index 38ae44d..565a734 100644 --- a/probables/blooms/__init__.py +++ b/probables/blooms/__init__.py @@ -3,7 +3,7 @@ from . bloom import (BloomFilter, BloomFilterOnDisk) from . countingbloom import (CountingBloomFilter) -from . expandingbloom import (ExpandingBloomFilter) +from . 
expandingbloom import (ExpandingBloomFilter, RotatingBloomFilter) __all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter', - 'ExpandingBloomFilter'] + 'ExpandingBloomFilter', 'RotatingBloomFilter'] diff --git a/probables/blooms/expandingbloom.py b/probables/blooms/expandingbloom.py index 53db77e..3398629 100644 --- a/probables/blooms/expandingbloom.py +++ b/probables/blooms/expandingbloom.py @@ -25,6 +25,9 @@ class ExpandingBloomFilter(object): At this point, the expanding Bloom Filter does not support \ `export` or `import` ''' + __slots__ = ['_blooms', '__fpr', '__est_elements', '__hash_func', + '__added_elements'] + def __init__(self, est_elements=None, false_positive_rate=None, hash_function=None): ''' initialize ''' @@ -62,16 +65,6 @@ def elements_added(self): ''' int: The total number of elements added ''' return self.__added_elements - def __add_bloom_filter(self): - ''' build a new bloom and add it on! ''' - blm = BloomFilter(self.__est_elements, self.__fpr, self.__hash_func) - self._blooms.append(blm) - - def __check_for_growth(self): - ''' detereming if the bloom filter should automatically grow ''' - if self._blooms[-1].elements_added >= self.__est_elements: - self.__add_bloom_filter() - def check(self, key): ''' Check to see if the key is in the Bloom Filter @@ -103,8 +96,8 @@ def add(self, key, force=False): Args: key (str): The element to be inserted force (bool): `True` will force it to be inserted, even if it \ - likely has been inserted before \ - `False` will only insert if not found in the Bloom Filter ''' + likely has been inserted before `False` will \ + only insert if not found in the Bloom Filter ''' hashes = self._blooms[0].hashes(key) self.add_alt(hashes, force) @@ -115,8 +108,101 @@ def add_alt(self, hashes, force=False): hashes (list): A list of integers representing the key to insert force (bool): `True` will force it to be inserted, even if \ it likely has been inserted before \ - `False` will only insert if not found in 
the Bloom Filter ''' + `False` will only insert if not found in the \ + Bloom Filter ''' self.__added_elements += 1 if force or not self.check_alt(hashes): self.__check_for_growth() self._blooms[-1].add_alt(hashes) + + def __add_bloom_filter(self): + ''' build a new bloom and add it on! ''' + blm = BloomFilter(est_elements=self.__est_elements, + false_positive_rate=self.__fpr, + hash_function=self.__hash_func) + self._blooms.append(blm) + + def __check_for_growth(self): + ''' determine if the bloom filter should automatically grow ''' + if self._blooms[-1].elements_added >= self.__est_elements: + self.__add_bloom_filter() + + +class RotatingBloomFilter(ExpandingBloomFilter): + ''' Simple Rotating Bloom Filter implementation that allows for the "older" + elements added to be removed, in chunks. As the queue fills up, those + elements inserted earlier will be bulk removed. This also provides the + user with the opportunity to force the removal instead of waiting for + the newest Bloom Filter to fill up. + + Args: + est_elements (int): The number of estimated elements to be added + false_positive_rate (float): The desired false positive rate + max_queue_size (int): The number used to determine the \ + maximum number of Bloom Filters. 
Total elements added is based on \ + `max_queue_size * est_elements` + hash_function (function): Hashing strategy function to use \ + `hf(key, number)` + ''' + __slots__ = ['__fpr', '__est_elements', '__hash_func', '_queue_size'] + + def __init__(self, est_elements=None, false_positive_rate=None, + max_queue_size=10, hash_function=None): + ''' initialize ''' + super(RotatingBloomFilter, + self).__init__(est_elements=est_elements, + false_positive_rate=false_positive_rate, + hash_function=hash_function) + # name mangling hides the parent's copies, so keep our own for rotation + self.__fpr = false_positive_rate + self.__est_elements = est_elements + self.__hash_func = hash_function + self._queue_size = max_queue_size + + @property + def max_queue_size(self): + ''' int: The maximum size for the queue ''' + return self._queue_size + + @property + def current_queue_size(self): + ''' int: The current size of the queue ''' + return len(self._blooms) + + def add_alt(self, hashes, force=False): + ''' Add the element represented by hashes into the Bloom Filter + + Args: + hashes (list): A list of integers representing the key to insert + force (bool): `True` will force it to be inserted, even if \ + it likely has been inserted before \ + `False` will only insert if not found in the \ + Bloom Filter ''' + # bump the parent's counter (mangled name) so `elements_added` is right + self._ExpandingBloomFilter__added_elements += 1 + if force or not self.check_alt(hashes): + self.__rotate_bloom_filter() + self._blooms[-1].add_alt(hashes) + + def pop(self): + ''' Force a rotation; the oldest Bloom Filter is dropped when full ''' + self.__rotate_bloom_filter(force=True) + + def __rotate_bloom_filter(self, force=False): + ''' rotate when forced or when the newest Bloom Filter is full; the + oldest Bloom Filter is dropped once the queue is at its maximum ''' + blm = self._blooms[-1] + ready_to_rotate = blm.elements_added >= blm.estimated_elements + queue_is_full = self.current_queue_size >= self._queue_size + if force or ready_to_rotate: + if queue_is_full: + # drop the oldest filter so the queue never outgrows its maximum + self._blooms.pop(0) + self.__add_bloom_filter() + + def 
__add_bloom_filter(self): + ''' build a new bloom and add it on! ''' + blm = BloomFilter(est_elements=self.__est_elements, + false_positive_rate=self.__fpr, + hash_function=self.__hash_func) + self._blooms.append(blm) diff --git a/tests/cuckoo_test.py b/tests/cuckoo_test.py index cafe17d..a7ce913 100644 --- a/tests/cuckoo_test.py +++ b/tests/cuckoo_test.py @@ -114,7 +114,7 @@ def test_cuckoo_filter_fing_size(self): ''' test bad fingerprint size < 1 ''' def runner(): ''' runner ''' - cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=0) + CuckooFilter(capacity=100, bucket_size=2, finger_size=0) self.assertRaises(ValueError, runner) @@ -122,15 +122,15 @@ def test_cuckoo_filter_fing_size_2(self): ''' test bad fingerprint size > 4 ''' def runner(): ''' runner ''' - cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=5) + CuckooFilter(capacity=100, bucket_size=2, finger_size=5) self.assertRaises(ValueError, runner) def test_cuckoo_filter_fing_size_3(self): ''' test valid fingerprint size ''' try: - cko = CuckooFilter(capacity=100, bucket_size=2, finger_size=1) - except: + CuckooFilter(capacity=100, bucket_size=2, finger_size=1) + except ValueError: self.assertEqual(True, False) self.assertEqual(True, True) diff --git a/tests/expandingbloom_test.py b/tests/expandingbloom_test.py index 7b721e8..4d1f1d4 100644 --- a/tests/expandingbloom_test.py +++ b/tests/expandingbloom_test.py @@ -2,7 +2,7 @@ ''' Unittest class ''' from __future__ import (unicode_literals, absolute_import, print_function) import unittest -from probables import (ExpandingBloomFilter) +from probables import (ExpandingBloomFilter, RotatingBloomFilter) class TestExpandingBloomFilter(unittest.TestCase): @@ -54,3 +54,46 @@ def test_ebf_contains(self): self.assertEqual('this is another test' in blm, True) self.assertEqual('this is yet another test' in blm, False) self.assertEqual('this is not another test' in blm, False) + + +class TestRotatingBloomFilter(unittest.TestCase): + + def 
test_rbf_init(self): + ''' test the initialization of a rotating bloom filter ''' + blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, + max_queue_size=10) + self.assertEqual(blm.expansions, 0) + self.assertEqual(blm.max_queue_size, 10) + + def test_rfb_rotate(self): + ''' test that the bloom filter rotates the first bloom off the stack ''' + blm = RotatingBloomFilter(est_elements=10, false_positive_rate=0.05, + max_queue_size=5) + self.assertEqual(blm.expansions, 0) + blm.add('test') + self.assertEqual(blm.expansions, 0) + for i in range(10): + blm.add('{}'.format(i), force=True) + self.assertEqual(blm.expansions, 1) + self.assertEqual(blm.current_queue_size, 2) + self.assertEqual(blm.check('test'), True) + + for i in range(10, 20): + blm.add('{}'.format(i), force=True) + self.assertEqual(blm.check('test'), True) + self.assertEqual(blm.current_queue_size, 3) + + for i in range(20, 30): + blm.add('{}'.format(i), force=True) + self.assertEqual(blm.check('test'), True) + self.assertEqual(blm.current_queue_size, 4) + + for i in range(30, 40): + blm.add('{}'.format(i), force=True) + self.assertEqual(blm.check('test'), True) + self.assertEqual(blm.current_queue_size, 5) + + for i in range(40, 50): + blm.add('{}'.format(i), force=True) + self.assertEqual(blm.check('test'), False) # it should roll off + self.assertEqual(blm.current_queue_size, 5)