URL deduplication: Bloom filter principles and a Python implementation

Date: 2022-10-01 12:02:32

https://blog.csdn.net/a1368783069/article/details/52137417

# -*- encoding: utf-8 -*-
"""This module implements a Bloom filter probabilistic data structure and
a Scalable Bloom Filter that grows in size as you add more items to it
without increasing the false positive error_rate.

Requires the bitarray library: http://pypi.python.org/pypi/bitarray/

    >>> from pybloom import BloomFilter
    >>> f = BloomFilter(capacity=10000, error_rate=0.001)
    >>> for i in range(0, f.capacity):
    ...     _ = f.add(i)
    ...
    >>> 0 in f
    True
    >>> f.capacity in f
    False
    >>> len(f) <= f.capacity
    True
    >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
    True

    >>> from pybloom import ScalableBloomFilter
    >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    >>> count = 10000
    >>> for i in range(0, count):
    ...     _ = sbf.add(i)
    ...
    >>> sbf.capacity > count
    True
    >>> len(sbf) <= count
    True
    >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
    True

"""
import math
import hashlib
from struct import unpack, pack, calcsize

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')

__version__ = '1.1'
__author__ = ("Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>, "
              "Marius Eriksen <marius@monkey.org>, "
              "Alex Brasetvik <alex@brasetvik.com>")


def make_hashfuncs(num_slices, num_bits):
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = hashlib.md5
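    # the chain above picks the smallest standard digest that covers the
    # hash bits needed per salt; when one digest cannot yield num_slices
    # chunks on its own, extra salted copies of the hash are used below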
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]

    def _make_hashfuncs(key):
        # normalize any key to bytes before hashing
        key = str(key).encode("utf-8")
        rval = []
        for salt in salts:
            h = salt.copy()
            h.update(key)
            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
        del rval[num_slices:]
        return rval
    return _make_hashfuncs
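# A quick sanity check of the hash machinery (illustrative sketch; assumes
# the definitions above are importable as-is):
#
#     >>> hashes = make_hashfuncs(num_slices=5, num_bits=100)
#     >>> indexes = hashes('http://example.com')
#     >>> len(indexes), all(0 <= i < 100 for i in indexes)
#     (5, True)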

class BloomFilter(object):
    FILE_FMT = '<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
"""Implements a space-efficient probabilistic data structure capacity
this BloomFilter must be able to store at least *capacity* elements
while maintaining no more than *error_rate* chance of false
positives
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Inserting more than capacity
elements greatly increases the chance of false positives. >>> b = BloomFilter(capacity=100000, error_rate=0.001)
>>> b.add("test")
False
>>> "test" in b
True """
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, p = error_rate, n = capacity
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(p)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(p)))
        # m ~= n * abs(ln(p)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
        # the error_rate constraint assumes a fill rate of 1/2
        # so we double the capacity to simplify the API
        bits_per_slice = int(math.ceil(
            (2 * capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
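        # worked example (arithmetic sketch, not in the original source):
        # for capacity=10000, error_rate=0.001:
        #   num_slices     = ceil(log2(1000)) = 10
        #   bits_per_slice = ceil(138155.1 / 4.8045) = 28756
        #   num_bits       = 10 * 28756 = 287560 bits, about 35 KB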
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
"""Tests a key's membership in this bloom filter. >>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> "hello" in b
True """
bits_per_slice = self.bits_per_slice
bitarray = self.bitarray
if not isinstance(key, list):
hashes = self.make_hashes(key)
else:
hashes = key
offset = 0
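        # each hash value indexes into its own slice of the bit array:
        # hash i is tested at bitarray[i * bits_per_slice + k_i], and a key
        # is reported present only if every slice has its bit set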
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
""" Adds a key to this bloom filter. If the key already exists in this
filter it will return True. Otherwise False. >>> b = BloomFilter(capacity=100)
>>> b.add("hello")
False
>>> b.add("hello")
True """
bitarray = self.bitarray
bits_per_slice = self.bits_per_slice
hashes = self.make_hashes(key)
if not skip_check and hashes in self:
return True
if self.count > self.capacity:
raise IndexError("BloomFilter is at capacity")
offset = 0
for k in hashes:
self.bitarray[offset + k] = True
offset += bits_per_slice
self.count += 1
return False def copy(self):
"""Return a copy of this bloom filter.
"""
new_filter = BloomFilter(self.capacity, self.error_rate)
new_filter.bitarray = self.bitarray.copy()
return new_filter def union(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
return new_bloom def __or__(self, other):
return self.union(other) def intersection(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
return new_bloom def __and__(self, other):
return self.intersection(other) def tofile(self, f):
"""Write the bloom filter to file object `f'. Underlying bits
are written as machine values. This is much more space
efficient than pickling the object."""
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
self.bits_per_slice, self.capacity, self.count))
self.bitarray.tofile(f) @classmethod
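    # on-disk layout: a little-endian '<dQQQQ' header holding (error_rate,
    # num_slices, bits_per_slice, capacity, count), followed by the raw
    # bit dump written by bitarray.tofile()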
    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)
        if 0 < n < headerlen:
            raise ValueError('n too small!')
        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            filter.bitarray.fromfile(f, n - headerlen)
        else:
            filter.bitarray.fromfile(f)
        if filter.num_bits != len(filter.bitarray) and \
                (filter.num_bits + (8 - filter.num_bits % 8)
                 != len(filter.bitarray)):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)


class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but consumes memory more quickly
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
"""Implements a space-efficient probabilistic data structure that
grows as more items are added while maintaining a steady false
positive rate initial_capacity
the initial capacity of the filter
error_rate
the error_rate of the filter returning false positives. This
determines the filters capacity. Going over capacity greatly
increases the chance of false positives.
mode
can be either ScalableBloomFilter.SMALL_SET_GROWTH or
ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
but uses less memory. LARGE_SET_GROWTH is faster but consumes
memory faster. >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("test")
False
>>> "test" in b
True
>>> unicode_string = u'¡'
>>> b.add(unicode_string)
False
>>> unicode_string in b
True
"""
if not error_rate or error_rate < 0:
raise ValueError("Error_Rate must be a decimal less than 0.")
self._setup(mode, 0.9, initial_capacity, error_rate)
self.filters = [] def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
"""Adds a key to this bloom filter.
If the key already exists in this filter it will return True.
Otherwise False. >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> b.add("hello")
False
>>> b.add("hello")
True """
if key in self:
return True
filter = self.filters[-1] if self.filters else None
if filter is None or filter.count >= filter.capacity:
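            # each new filter grows capacity geometrically (scale ** i) and
            # tightens its error rate geometrically (ratio ** i); by the
            # standard scalable-Bloom-filter analysis, summing the geometric
            # series bounds the compound false-positive rate by
            # error_rate / (1 - ratio), i.e. 10x error_rate for the default
            # ratio of 0.9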
            num_filters = len(self.filters)
            filter = BloomFilter(
                capacity=self.initial_capacity * (self.scale ** num_filters),
                error_rate=self.error_rate * (self.ratio ** num_filters))
            self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
"""Serialize this ScalableBloomFilter into the file-object
`f'."""
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
self.initial_capacity, self.error_rate)) # Write #-of-filters
f.write(pack('<l', len(self.filters))) if len(self.filters) > 0:
# Then each filter directly, with a header describing
# their lengths.
headerpos = f.tell()
headerfmt = '<' + 'Q'*(len(self.filters))
f.write('.' * calcsize(headerfmt))
filter_sizes = []
for filter in self.filters:
begin = f.tell()
filter.tofile(f)
filter_sizes.append(f.tell() - begin) f.seek(headerpos)
f.write(pack(headerfmt, *filter_sizes)) @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q' * nfilters
            header_bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, header_bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []
        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])


if __name__ == "__main__":
    import doctest
    doctest.testmod()
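
To tie the code back to the URL deduplication use case in the title, here is a minimal sketch of how a crawler might use the ScalableBloomFilter defined above to skip already-seen URLs. The dedupe_urls helper and the sample crawl_queue are illustrative additions, not part of the original pybloom source; any iterable of URL strings works the same way.

# Minimal URL-deduplication sketch built on the classes above (the helper
# and sample URLs are illustrative, not part of the original library).
def dedupe_urls(urls, error_rate=0.001):
    """Yield each URL the first time it appears, skipping probable repeats."""
    seen = ScalableBloomFilter(initial_capacity=1000, error_rate=error_rate,
                               mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for url in urls:
        # add() returns True when the key was (probably) already present
        if not seen.add(url):
            yield url

crawl_queue = [
    "http://example.com/a",
    "http://example.com/b",
    "http://example.com/a",  # exact duplicate, will be skipped
]
for url in dedupe_urls(crawl_queue):
    print(url)

Because membership tests are probabilistic, a brand-new URL can occasionally be reported as already seen (a false positive), but a URL that was added is never reported as unseen, which is exactly the trade-off a crawler wants. Persisting the seen-set between crawl runs is a matter of calling seen.tofile(fh) and ScalableBloomFilter.fromfile(fh) with a binary file handle.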

  
