-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* first pass at an expanding bloom filter! * document the expanding bloom filter
- Loading branch information
Showing
10 changed files
with
209 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
''' pyprobables module ''' | ||
from __future__ import (unicode_literals, absolute_import, print_function) | ||
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter) | ||
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter, | ||
ExpandingBloomFilter) | ||
from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold, | ||
CountMeanSketch, CountMeanMinSketch) | ||
from . cuckoo import (CuckooFilter, CountingCuckooFilter) | ||
|
@@ -11,7 +12,7 @@ | |
__maintainer__ = 'Tyler Barrus' | ||
__email__ = '[email protected]' | ||
__license__ = 'MIT' | ||
__version__ = '0.2.0' | ||
__version__ = '0.2.5' | ||
__credits__ = [] | ||
__url__ = 'https://github.com/barrust/pyprobables' | ||
__bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues' | ||
|
@@ -20,4 +21,5 @@ | |
'CountMinSketch', 'CountMeanSketch', 'CountMeanMinSketch', | ||
'HeavyHitters', 'StreamThreshold', 'CuckooFilter', | ||
'CountingCuckooFilter', 'InitializationError', 'NotSupportedError', | ||
'ProbablesBaseException', 'CuckooFilterFullError'] | ||
'ProbablesBaseException', 'CuckooFilterFullError', | ||
'ExpandingBloomFilter'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
''' BloomFilter, python implementation | ||
License: MIT | ||
Author: Tyler Barrus ([email protected]) | ||
URL: https://github.com/barrust/pyprobables | ||
''' | ||
from __future__ import (unicode_literals, absolute_import, print_function) | ||
|
||
from . bloom import (BloomFilter) | ||
|
||
|
||
class ExpandingBloomFilter(object):
    ''' Bloom Filter that grows on demand for use in python; instead of
        letting one filter fill beyond its estimated capacity, a fresh
        underlying Bloom Filter is appended once the newest one has received
        its estimated number of elements.

        Args:
            est_elements (int): The number of estimated elements to be added
            false_positive_rate (float): The desired false positive rate
            hash_function (function): Hashing strategy function to use
                `hf(key, number)`
        Returns:
            ExpandingBloomFilter: An expanding Bloom Filter object
        Note:
            At this point, the expanding Bloom Filter does not support
            `export` or `import` '''

    def __init__(self, est_elements=None, false_positive_rate=None,
                 hash_function=None):
        ''' store the settings and create the first underlying filter '''
        # every underlying filter is built from these same three settings
        self.__est_elements = est_elements
        self.__fpr = false_positive_rate
        self.__hash_func = hash_function
        # counts every add request, not only those that were inserted
        self.__added_elements = 0
        self._blooms = []
        self.__add_bloom_filter()

    def __contains__(self, key):
        ''' support `key in filter` by delegating to `check` '''
        return self.check(key)

    @property
    def expansions(self):
        ''' int: How many times the filter has grown; the initial
            underlying filter is not counted '''
        return len(self._blooms) - 1

    @property
    def false_positive_rate(self):
        ''' float: The desired false positive rate given at construction '''
        return self.__fpr

    @property
    def estimated_elements(self):
        ''' int: The estimated number of elements given at construction '''
        return self.__est_elements

    @property
    def elements_added(self):
        ''' int: The number of add requests made; a request is counted even
            when the key was already reported present and not re-inserted '''
        return self.__added_elements

    def __add_bloom_filter(self):
        ''' append a new, empty underlying Bloom Filter '''
        self._blooms.append(
            BloomFilter(self.__est_elements, self.__fpr, self.__hash_func))

    def __check_for_growth(self):
        ''' grow when the newest underlying filter has reached the
            estimated number of elements '''
        newest = self._blooms[-1]
        if newest.elements_added >= self.__est_elements:
            self.__add_bloom_filter()

    def check(self, key):
        ''' Check to see if the key is in the Bloom Filter

            Args:
                key (str): The key to check for in the Bloom Filter
            Returns:
                bool: `True` if the element is likely present; `False` if
                    definitely not present '''
        # the hashes are produced by the first underlying filter and then
        # tested against every underlying filter
        return self.check_alt(self._blooms[0].hashes(key))

    def check_alt(self, hashes):
        ''' Check to see if the hashes are in the Bloom Filter

            Args:
                hashes (list): The hash representation to check for in the
                    Bloom Filter
            Returns:
                bool: `True` if the element is likely present; `False` if
                    definitely not present '''
        # stops at the first underlying filter that reports a match
        return any(bloom.check_alt(hashes) for bloom in self._blooms)

    def add(self, key, force=False):
        ''' Add the key to the Bloom Filter

            Args:
                key (str): The element to be inserted
                force (bool): `True` will force it to be inserted, even if
                    it likely has been inserted before; `False` will only
                    insert if not found in the Bloom Filter '''
        self.add_alt(self._blooms[0].hashes(key), force)

    def add_alt(self, hashes, force=False):
        ''' Add the element represented by hashes into the Bloom Filter

            Args:
                hashes (list): A list of integers representing the key to
                    insert
                force (bool): `True` will force it to be inserted, even if
                    it likely has been inserted before; `False` will only
                    insert if not found in the Bloom Filter '''
        self.__added_elements += 1
        if not force and self.check_alt(hashes):
            # already reported present: counted above, but not re-inserted
            return
        # grow first (if needed) so the insert lands in the newest filter
        self.__check_for_growth()
        self._blooms[-1].add_alt(hashes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[bdist_wheel] | ||
universal=1 | ||
|
||
[pep8] | ||
[pycodestyle] | ||
max-line-length=120 | ||
|
||
[flake8] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# -*- coding: utf-8 -*- | ||
''' Unittest class ''' | ||
from __future__ import (unicode_literals, absolute_import, print_function) | ||
import unittest | ||
from probables import (ExpandingBloomFilter) | ||
|
||
class TestExpandingBloomFilter(unittest.TestCase):
    ''' Unit tests for the ExpandingBloomFilter '''

    @staticmethod
    def _numbered_filter(est_elements, count, force=False):
        ''' build a filter (false positive rate 0.05) and add the string
            form of each integer in `range(count)` to it '''
        ebf = ExpandingBloomFilter(est_elements=est_elements,
                                   false_positive_rate=0.05)
        for num in range(count):
            ebf.add("{}".format(num), force)
        return ebf

    def test_ebf_init(self):
        ''' a freshly built expanding bloom filter has not expanded '''
        ebf = self._numbered_filter(10, 0)
        self.assertEqual(ebf.expansions, 0)

    def test_ebf_add_lots(self):
        ''' force-adding 100 elements at 10 per filter expands 9 times '''
        ebf = self._numbered_filter(10, 100, force=True)
        self.assertEqual(ebf.expansions, 9)

    def test_ebf_add_lots_without_force(self):
        ''' add 120 elements without forcing; every request is counted '''
        ebf = self._numbered_filter(10, 120)
        # only 9 expansions are expected from 120 unforced adds, while the
        # added-elements counter still reports all 120 requests
        self.assertEqual(ebf.expansions, 9)
        self.assertEqual(ebf.elements_added, 120)

    def test_ebf_check(self):
        ''' `check` finds added keys and rejects keys never added '''
        # expand it out some first!
        ebf = self._numbered_filter(25, 100)
        ebf.add('this is a test')
        ebf.add('this is another test')
        self.assertGreater(ebf.expansions, 1)
        for present in ('this is a test', 'this is another test'):
            self.assertEqual(ebf.check(present), True)
        for absent in ('this is yet another test',
                       'this is not another test'):
            self.assertEqual(ebf.check(absent), False)

    def test_ebf_contains(self):
        ''' the `in` operator finds added keys and rejects others '''
        # expand it out some first!
        ebf = self._numbered_filter(25, 100)
        ebf.add('this is a test')
        ebf.add('this is another test')
        self.assertGreater(ebf.expansions, 1)
        for present in ('this is a test', 'this is another test'):
            self.assertEqual(present in ebf, True)
        for absent in ('this is yet another test',
                       'this is not another test'):
            self.assertEqual(absent in ebf, False)