同步操作将从 mktime/python-learn 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
#coding:utf-8# Bloom filters in Python# Adam Langley <agl@imperialviolet.org># 给CountedBloom加了一个max_count 张沈鹏 <zsp007@gmail.com># Bloom-Filter算法简介# http://www.googlechinablog.com/2007/07/bloom-filter.html# http://zh.wikipedia.org/wiki/%E5%B8%83%E9%9A%86%E8%BF%87%E6%BB%A4%E5%99%A8# 这个计算器可以帮你求最佳的参数# http://www.cc.gatech.edu/~manolios/bloom-filters/calculator.html# CountedBloom 的 buckets 参数对应于计算器的m,也就是"m denotes the number of bits in the Bloom filter"import arrayimport structmixarray = array.array ('B', '\x00' * 256)# The mixarray is based on RC4 and is used as diffusion in the hashing functiondef mixarray_init (mixarray):for i in range (256):mixarray[i] = ik = 7for j in range (4):for i in range (256):s = mixarray[i]k = (k + s) % 256mixarray[i] = mixarray[k]mixarray[k] = smixarray_init (mixarray)class Bloom (object):'''Bloom filters provide a fast and compact way of checking set membership. They do this by introducing a risk of afalse positive (but there are no false negatives).For more information see http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html'''def __init__ (self, bytes, hashes, data = None):'''@bytes is the size of the bloom filter in 8-bit bytes and @hashes is the number of hash functions to use. Consult theweb page linked above for values to use. If in doubt, bytes = num_elements and hashes = 4'''self.hashes = hashesself.bytes = bytesif data == None:self.a = self._make_array (bytes)else:assert len (data) == bytesself.a = datadef init_from_counted (self, cnt):'''Set the contents of this filter from the contents of the counted filter @cnt. You have to match sizes'''if self.bytes * 8 != (len (cnt.a) * 2):raise ValueError ('Filters are not the same size')for i in xrange (len (cnt.a)):b = cnt.a[i]b1 = (b & 0xf0) >> 4b2 = (b & 0x0f)if b1:self.a[(i * 2) // 8] |= self.bitmask[(i * 2) % 8]if b2:self.a[(i * 2 + 1) // 8] |= self.bitmask[(i * 2 + 1) % 8]def _make_array (self, size):a = array.array ('B')# stupidly, there's no good way that I can see of resizing an array without allocing a huge string to do so# thus I use this, slightly odd, method:blocklen = 256arrayblock = array.array ('B', '\x00' * blocklen)todo = sizewhile (todo >= blocklen):a.extend (arrayblock)todo -= blocklenif todo:a.extend (array.array ('B', '\x00' * todo))# now a is of the right lengthreturn adef _hashfunc (self, n, val):'''Apply the nth hash function'''global mixarrayb = [ord(x) for x in struct.pack ('I', val)]c = array.array ('B', [0, 0, 0, 0])for i in range (4):c[i] = mixarray[(b[i] + n) % 256]return struct.unpack ('I', c.tostring())[0]bitmask = [0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01]def insert (self, val):for i in range (self.hashes):n = self._hashfunc (i, val) % (self.bytes * 8)self.a[n // 8] |= self.bitmask[n % 8]def __contains__ (self, val):for i in range (self.hashes):n = self._hashfunc (i, val) % (self.bytes * 8)if not self.a[n // 8] & self.bitmask[n % 8]:return 0return 1MAX_COUNT = 15class CountedBloom (Bloom):'''Just like a Bloom filter, but provides counting (e.g. you can delete as well). This uses 4 bits per bucket, so isgenerally four times larger than the same non-counted bloom filter.'''def __init__ (self, buckets, hashes):'''Please note that @buckets must be even. Also note that with a Bloom object you give the number of *bytes* and each byte is 8 buckets. Here you're giving the number of buckets.'''assert buckets % 2 == 0self.hashes = hashesself.buckets = bucketsself.a = self._make_array (buckets // 2)def insert (self, val):masks = [(0x0f, 0xf0), (0xf0, 0x0f)]shifts = [4, 0 ]for i in range (self.hashes):n = self._hashfunc (i, val) % self.bucketsbyte = n // 2bucket = n % 2(notmask, mask) = masks[bucket]shift = shifts[bucket]bval = ((self.a[byte] & mask) >> shift)if bval < MAX_COUNT: # we shouldn't increment it if it's at the maximumbval += 1self.a[byte] = (self.a[byte] & notmask) | (bval << shift)def __contains__ (self, val):masks = [(0x0f, 0xf0), (0xf0, 0x0f)]shifts = [4, 0]for i in range (self.hashes):n = self._hashfunc (i, val) % self.bucketsbyte = n // 2bucket = n % 2(notmask, mask) = masks[bucket]shift = shifts[bucket]bval = ((self.a[byte] & mask) >> shift)if bval == 0:return 0return 1def max_count(self, val):masks = [(0x0f, 0xf0), (0xf0, 0x0f)]shifts = [4, 0]count_val = MAX_COUNTfor i in range (self.hashes):n = self._hashfunc (i, val) % self.bucketsbyte = n // 2bucket = n % 2(notmask, mask) = masks[bucket]shift = shifts[bucket]bval = ((self.a[byte] & mask) >> shift)if bval < MAX_COUNT:if bval == 0:return 0else:count_val = bvalreturn count_valdef __delitem__ (self, val):masks = [(0x0f, 0xf0), (0xf0, 0x0f)]shifts = [4, 0]for i in range (self.hashes):n = self._hashfunc (i, val) % self.bucketsbyte = n // 2bucket = n % 2(notmask, mask) = masks[bucket]shift = shifts[bucket]bval = ((self.a[byte] & mask) >> shift)if bval < MAX_COUNT: # we shouldn't decrement it if it's at the maximumbval -= 1self.a[byte] = (self.a[byte] & notmask) | (bval << shift)__all__ = ['Bloom']if __name__ == '__main__':print 'Testing bloom filter: there should be no assertion failures'a = Bloom (3, 4)a.insert (45)print a.aa.insert (17)print a.aa.insert (12)print a.aassert 45 in aassert 45 in aassert not 33 in aassert 45 in aassert 17 in aassert 12 in ac = 0for x in range (255):if x in a:c += 1print cprint float(c)/255a = CountedBloom (24, 4)a.insert (45)print a.aa.insert (17)print a.aa.insert (12)a.insert (12)print "a.max_count(12)", a.max_count(12)a.insert ("张沈鹏")a.insert ("张沈鹏")a.insert ("张沈鹏")print "a.max_count(zsp)", a.max_count(12)print a.aassert 45 in aassert 45 in aassert not 33 in aassert 45 in aassert 17 in aassert 12 in ac = 0for x in range (255):if x in a:c += 1print cprint float(c)/255del a[45]assert not 45 in aa2 = Bloom (3, 4)a2.init_from_counted (a)print a2.aassert 17 in a2assert 12 in a2assert not 45 in a
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。