mirror of
https://hub.njuu.cf/TheAlgorithms/Python.git
synced 2023-10-11 13:06:12 +08:00
Bloom Filter (#8615)
* Bloom filter with tests
* hash functions constant
* fix type
* isort
* passing ruff
* type hints
* type hints
* from fail to error
* capital letter
* type hints requested by bot
* descriptive name for m
* more descriptive arguments II
* moved movies_test to doctest
* commented doctest
* removed test_probability
* estimated error
* added types
* again hash_
* Update data_structures/hashing/bloom_filter.py
Co-authored-by: Christian Clauss <cclauss@me.com>
* from b to bloom
* Update data_structures/hashing/bloom_filter.py
Co-authored-by: Christian Clauss <cclauss@me.com>
* Update data_structures/hashing/bloom_filter.py
Co-authored-by: Christian Clauss <cclauss@me.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* syntax error in dict comprehension
* from goodfather to godfather
* removed Interestellar
* forgot the last Godfather
* Revert "removed Interestellar"
This reverts commit 35fa5f5c4b
.
* pretty dict
* Apply suggestions from code review
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* Update bloom_filter.py
---------
Co-authored-by: Christian Clauss <cclauss@me.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
2f9b03393c
commit
14bdd174bb
105
data_structures/hashing/bloom_filter.py
Normal file
105
data_structures/hashing/bloom_filter.py
Normal file
@ -0,0 +1,105 @@
|
||||
"""
|
||||
See https://en.wikipedia.org/wiki/Bloom_filter
|
||||
|
||||
The use of this data structure is to test membership in a set.
|
||||
Compared to Python's built-in set() it is more space-efficient.
|
||||
In the following example, only 8 bits of memory will be used:
|
||||
>>> bloom = Bloom(size=8)
|
||||
|
||||
Initially, the filter contains all zeros:
|
||||
>>> bloom.bitstring
|
||||
'00000000'
|
||||
|
||||
When an element is added, two bits are set to 1
|
||||
since there are 2 hash functions in this implementation:
|
||||
>>> "Titanic" in bloom
|
||||
False
|
||||
>>> bloom.add("Titanic")
|
||||
>>> bloom.bitstring
|
||||
'01100000'
|
||||
>>> "Titanic" in bloom
|
||||
True
|
||||
|
||||
However, sometimes only one bit is added
|
||||
because both hash functions return the same value
|
||||
>>> bloom.add("Avatar")
|
||||
>>> "Avatar" in bloom
|
||||
True
|
||||
>>> bloom.format_hash("Avatar")
|
||||
'00000100'
|
||||
>>> bloom.bitstring
|
||||
'01100100'
|
||||
|
||||
Elements that were not added should return False ...
|
||||
>>> not_present_films = ("The Godfather", "Interstellar", "Parasite", "Pulp Fiction")
|
||||
>>> {
|
||||
... film: bloom.format_hash(film) for film in not_present_films
|
||||
... } # doctest: +NORMALIZE_WHITESPACE
|
||||
{'The Godfather': '00000101',
|
||||
'Interstellar': '00000011',
|
||||
'Parasite': '00010010',
|
||||
'Pulp Fiction': '10000100'}
|
||||
>>> any(film in bloom for film in not_present_films)
|
||||
False
|
||||
|
||||
but sometimes there are false positives:
|
||||
>>> "Ratatouille" in bloom
|
||||
True
|
||||
>>> bloom.format_hash("Ratatouille")
|
||||
'01100000'
|
||||
|
||||
The probability increases with the number of elements added.
|
||||
The probability decreases with the number of bits in the bitarray.
|
||||
>>> bloom.estimated_error_rate
|
||||
0.140625
|
||||
>>> bloom.add("The Godfather")
|
||||
>>> bloom.estimated_error_rate
|
||||
0.25
|
||||
>>> bloom.bitstring
|
||||
'01100101'
|
||||
"""
|
||||
from hashlib import md5, sha256
|
||||
|
||||
# Hash constructors applied to every value. Each one selects a single bit
# position, so one element sets at most len(HASH_FUNCTIONS) bits.
HASH_FUNCTIONS = (sha256, md5)


class Bloom:
    """Bloom filter whose bit array is stored in a single Python int.

    Bit ``p`` of ``bitarray`` (value ``1 << p``) stands for position ``p``;
    in the string representations position 0 is the right-most character.
    Membership tests may yield false positives but never false negatives.
    """

    def __init__(self, size: int = 8) -> None:
        """Create an empty filter with ``size`` bit positions.

        Raises:
            ValueError: if ``size`` is smaller than 1. Positions are computed
                modulo ``size``, so a zero or negative size cannot work.
        """
        if size < 1:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        self.bitarray = 0b0
        self.size = size

    def add(self, value: str) -> None:
        """Set the bits selected by the hash functions for ``value``."""
        h = self.hash_(value)
        self.bitarray |= h

    def exists(self, value: str) -> bool:
        """Return True if every bit selected for ``value`` is already set."""
        h = self.hash_(value)
        return (h & self.bitarray) == h

    def __contains__(self, other: str) -> bool:
        """Support ``value in bloom`` by delegating to :meth:`exists`."""
        return self.exists(other)

    def format_bin(self, bitarray: int) -> str:
        """Return ``bitarray`` in binary, left-padded with zeros to ``size``."""
        res = bin(bitarray)[2:]  # drop the "0b" prefix
        return res.zfill(self.size)

    @property
    def bitstring(self) -> str:
        """Binary string showing the current state of the filter."""
        return self.format_bin(self.bitarray)

    def hash_(self, value: str) -> int:
        """Return an int mask with one bit set per hash function.

        Each digest is read as a little-endian integer and reduced modulo
        ``size`` to obtain a position. Two functions may select the same
        position, in which case the mask has fewer bits set.
        """
        res = 0b0
        for func in HASH_FUNCTIONS:
            digest = func(value.encode()).digest()
            position = int.from_bytes(digest, "little") % self.size
            res |= 1 << position
        return res

    def format_hash(self, value: str) -> str:
        """Return the bit mask of ``value`` as a zero-padded binary string."""
        return self.format_bin(self.hash_(value))

    @property
    def estimated_error_rate(self) -> float:
        """Estimate the false-positive rate of the filter.

        Computed as the fraction of set bits raised to the number of hash
        functions.
        """
        n_ones = bin(self.bitarray).count("1")
        return (n_ones / self.size) ** len(HASH_FUNCTIONS)
|
Loading…
Reference in New Issue
Block a user