URL deduplication: Bloom filter principles and a Python implementation

Date: 2022-10-01 12:02:32

https://blog.csdn.net/a1368783069/article/details/52137417

# -*- encoding: utf-8 -*-
"""This module implements a Bloom filter probabilistic data structure and
a Scalable Bloom Filter that grows in size as you add more items to it
without increasing the false positive error_rate.

Requires the bitarray library: http://pypi.python.org/pypi/bitarray/

    >>> from pybloom import BloomFilter
    >>> f = BloomFilter(capacity=10000, error_rate=0.001)
    >>> for i in range(0, f.capacity):
    ...     _ = f.add(i)
    ...
    >>> 0 in f
    True
    >>> f.capacity in f
    False
    >>> len(f) <= f.capacity
    True
    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
    True

    >>> from pybloom import ScalableBloomFilter
    >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> count = 10000
    >>> for i in range(0, count):
    ...     _ = sbf.add(i)
    ...
    >>> sbf.capacity > count
    True
    >>> len(sbf) <= count
    True
    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
    True

"""
import math
import hashlib
from struct import unpack, pack, calcsize

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')

__version__ = '1.1'
__author__ = ("Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>, "
              "Marius Eriksen <marius@monkey.org>, "
              "Alex Brasetvik <alex@brasetvik.com>")


def make_hashfuncs(num_slices, num_bits):
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = hashlib.md5
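    # the chain above picks the smallest standard digest that covers the
    # hash bits needed per salt; when one digest cannot yield num_slices
    # chunks on its own, extra salted copies of the hash are used below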
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]

    def _make_hashfuncs(key):
        # normalize any key to bytes before hashing
        key = str(key).encode("utf-8")
        rval = []
        for salt in salts:
            h = salt.copy()
            h.update(key)
            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
        del rval[num_slices:]
        return rval
    return _make_hashfuncs
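# A quick sanity check of the hash machinery (illustrative sketch; assumes
# the definitions above are importable as-is):
#
#     >>> hashes = make_hashfuncs(num_slices=5, num_bits=100)
#     >>> indexes = hashes('http://example.com')
#     >>> len(indexes), all(0 <= i < 100 for i in indexes)
#     (5, True)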

class BloomFilter(object):
    FILE_FMT = '<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
"""Implements a space-efficient probabilistic data structure capacity
this BloomFilter must be able to store at least *capacity* elements
while maintaining no more than *error_rate* chance of false
positives
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Inserting more than capacity
elements greatly increases the chance of false positives. >>> b = BloomFilter(capacity=100000, error_rate=0.001)
>>> b.add("test")
False
>>> "test" in b
True """
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(p)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(p)))
        # m ~= n * abs(ln(p)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
        # the error_rate constraint assumes a fill rate of 1/2
        # so we double the capacity to simplify the API
        bits_per_slice = int(math.ceil(
            (2 * capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
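        # worked example (arithmetic sketch, not in the original source):
        # for capacity=10000, error_rate=0.001:
        #   num_slices     = ceil(log2(1000)) = 10
        #   bits_per_slice = ceil(138155.1 / 4.8045) = 28756
        #   num_bits       = 10 * 28756 = 287560 bits, about 35 KB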
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
"""Tests a key's membership in this bloom filter. >>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> "hello" in b
True """
bits_per_slice = self.bits_per_slice
bitarray = self.bitarray
if not isinstance(key, list):
hashes = self.make_hashes(key)
else:
hashes = key
offset = 0
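        # each hash value indexes into its own slice of the bit array:
        # hash i is tested at bitarray[i * bits_per_slice + k_i], and a key
        # is reported present only if every slice has its bit set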
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
""" Adds a key to this bloom filter. If the key already exists in this
filter it will return True. Otherwise False. >>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> b.add("hello")
True """
bitarray = self.bitarray
bits_per_slice = self.bits_per_slice
hashes = self.make_hashes(key)
if not skip_check and hashes in self:
return True
if self.count > self.capacity:
raise IndexError("BloomFilter is at capacity")
offset = 0
for k in hashes:
self.bitarray[offset + k] = True
offset += bits_per_slice
self.count += 1
return False def copy(self):
"""Return a copy of this bloom filter.
"""
new_filter = BloomFilter(self.capacity, self.error_rate)
new_filter.bitarray = self.bitarray.copy()
return new_filter def union(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
return new_bloom def __or__(self, other):
return self.union(other) def intersection(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
return new_bloom def __and__(self, other):
return self.intersection(other) def tofile(self, f):
"""Write the bloom filter to file object `f'. Underlying bits
are written as machine values. This is much more space
efficient than pickling the object."""
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
self.bits_per_slice, self.capacity, self.count))
self.bitarray.tofile(f) @classmethod
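    # on-disk layout: a little-endian '<dQQQQ' header holding (error_rate,
    # num_slices, bits_per_slice, capacity, count), followed by the raw
    # bit dump written by bitarray.tofile()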
    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)
        if 0 < n < headerlen:
            raise ValueError('n too small!')
        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            filter.bitarray.fromfile(f, n - headerlen)
        else:
            filter.bitarray.fromfile(f)
        if filter.num_bits != len(filter.bitarray) and \
                (filter.num_bits + (8 - filter.num_bits % 8)
                 != len(filter.bitarray)):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)


class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but consumes memory more quickly
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
"""Implements a space-efficient probabilistic data structure that
grows as more items are added while maintaining a steady false
positive rate initial_capacity
the initial capacity of the filter
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Going over capacity greatly
increases the chance of false positives.
mode
can be either ScalableBloomFilter.SMALL_SET_GROWTH or
ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
but uses less memory. LARGE_SET_GROWTH is faster but consumes
memory faster. >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("test")
False
>>> "test" in b
True
>>> unicode_string = u'¡'
>>> b.add(unicode_string)
False
>>> unicode_string in b
True
"""
if not error_rate or error_rate < 0:
raise ValueError("Error_Rate must be a decimal less than 0.")
self._setup(mode, 0.9, initial_capacity, error_rate)
self.filters = [] def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
"""Adds a key to this bloom filter.
If the key already exists in this filter it will return True.
Otherwise False. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("hello")
False
>>> b.add("hello")
True """
if key in self:
return True
filter = self.filters[-1] if self.filters else None
if filter is None or filter.count >= filter.capacity:
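            # each new filter grows capacity geometrically (scale ** i) and
            # tightens its error rate geometrically (ratio ** i); by the
            # standard scalable-Bloom-filter analysis, summing the geometric
            # series bounds the compound false-positive rate by
            # error_rate / (1 - ratio), i.e. 10x error_rate for the default
            # ratio of 0.9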
            num_filters = len(self.filters)
            filter = BloomFilter(
                capacity=self.initial_capacity * (self.scale ** num_filters),
                error_rate=self.error_rate * (self.ratio ** num_filters))
            self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
"""Serialize this ScalableBloomFilter into the file-object
`f'."""
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
self.initial_capacity, self.error_rate)) # Write #-of-filters
f.write(pack('<l', len(self.filters))) if len(self.filters) > 0:
# Then each filter directly, with a header describing
# their lengths.
headerpos = f.tell()
headerfmt = '<' + 'Q'*(len(self.filters))
f.write('.' * calcsize(headerfmt))
filter_sizes = []
for filter in self.filters:
begin = f.tell()
filter.tofile(f)
filter_sizes.append(f.tell() - begin) f.seek(headerpos)
f.write(pack(headerfmt, *filter_sizes)) @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q' * nfilters
            header_bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, header_bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []
        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])


if __name__ == "__main__":
    import doctest
    doctest.testmod()
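
To tie the code back to the URL deduplication use case in the title, here is a minimal sketch of how a crawler might use the ScalableBloomFilter defined above to skip already-seen URLs. The dedupe_urls helper and the sample crawl_queue are illustrative additions, not part of the original pybloom source; any iterable of URL strings works the same way.

# Minimal URL-deduplication sketch built on the classes above (the helper
# and sample URLs are illustrative, not part of the original library).
def dedupe_urls(urls, error_rate=0.001):
    """Yield each URL the first time it appears, skipping probable repeats."""
    seen = ScalableBloomFilter(initial_capacity=1000, error_rate=error_rate,
                               mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for url in urls:
        # add() returns True when the key was (probably) already present
        if not seen.add(url):
            yield url

crawl_queue = [
    "http://example.com/a",
    "http://example.com/b",
    "http://example.com/a",  # exact duplicate, will be skipped
]
for url in dedupe_urls(crawl_queue):
    print(url)

Because membership tests are probabilistic, a brand-new URL can occasionally be reported as already seen (a false positive), but a URL that was added is never reported as unseen, which is exactly the trade-off a crawler wants. Persisting the seen-set between crawl runs is a matter of calling seen.tofile(fh) and ScalableBloomFilter.fromfile(fh) with a binary file handle.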

  
