-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdgobloom.go
193 lines (149 loc) · 4.61 KB
/
dgobloom.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/*
Package dgobloom implements a simple Bloom Filter for strings.
A Bloom Filter is a probablistic data structure which allows testing set membership.
A negative answer means the value is not in the set. A positive answer means the element
is probably is the set. The desired rate false positives can be set at filter construction time.
Copyright (c) 2011 Damian Gryski <[email protected]>
Licensed under the GPLv3, or at your option any later version.
*/
package dgobloom
import (
"hash"
"math"
)
// Internal routines for the bit vector
type bitvector []uint32
// get bit 'bit' in the bitvector d
func (d bitvector) get(bit uint32) uint {
shift := bit % 32
bb := d[bit/32]
bb &= (1 << shift)
return uint(bb >> shift)
}
// set bit 'bit' in the bitvector d
func (d bitvector) set(bit uint32) {
d[bit/32] |= (1 << (bit % 32))
}
// 32-bit, which is why it only goes up to 16
// return the integer >= i which is a power of two
func nextPowerOfTwo(i uint64) uint64 {
n := i - 1
n |= n >> 1
n |= n >> 2
n |= n >> 4
n |= n >> 8
n |= n >> 16
n |= n >> 32
n++
return n
}
// BloomFilter allow probabilistic membership tests
type BloomFilter interface {
// Insert an element into the set.
Insert(b []byte) bool
// Determine if an element is in the set
Exists(b []byte) bool
// Return the number of elements currently stored in the set
Elements() uint32
// Merge two bloom filters
Merge(BloomFilter)
// Compress a bloom filter
Compress()
}
// Internal struct for our bloom filter
type bloomFilter struct {
capacity uint32
elements uint32
bits uint64 // size of bit vector in bits
filter bitvector // our filter bit vector
h hash.Hash32
salts [][]byte
}
func (bf *bloomFilter) Elements() uint32 { return bf.elements }
// FilterBits returns the number of bits required for the desired capacity and false positive rate.
func FilterBits(capacity uint32, falsePositiveRate float64) uint64 {
bits := float64(capacity) * -math.Log(falsePositiveRate) / (math.Log(2.0) * math.Log(2.0)) // in bits
m := nextPowerOfTwo(uint64(bits))
if m < 1024 {
return 1024
}
return m
}
// SaltsRequired returns the number of salts required by the constructor for the desired capacity and false positive rate.
func SaltsRequired(capacity uint32, falsePositiveRate float64) uint {
m := FilterBits(capacity, falsePositiveRate)
salts := uint(0.7 * float32(float64(m)/float64(capacity)))
if salts < 2 {
return 2
}
return salts
}
func uint32ToByteArray(salt uint32) []byte {
p := make([]byte, 4)
p[0] = byte(salt >> 24)
p[1] = byte(salt >> 16)
p[2] = byte(salt >> 8)
p[3] = byte(salt)
return p
}
// NewBloomFilter returns a new bloom filter with the specified capacity and false positive rate.
// The hash function h will be salted with the array of salts.
func NewBloomFilter(capacity uint32, falsePositiveRate float64, h hash.Hash32, salts []uint32) BloomFilter {
bf := new(bloomFilter)
bf.capacity = capacity
bf.bits = FilterBits(capacity, falsePositiveRate)
bf.filter = make([]uint32, uint(bf.bits+31)/32)
bf.h = h
bf.salts = make([][]byte, len(salts))
for i, s := range salts {
bf.salts[i] = uint32ToByteArray(s)
}
return bf
}
// Insert inserts the byte array b into the bloom filter.
// If the function returns false, the capacity of the bloom filter has been reached. Further inserts will increase the rate of false positives.
func (bf *bloomFilter) Insert(b []byte) bool {
bf.elements++
for _, s := range bf.salts {
bf.h.Reset()
bf.h.Write(s)
bf.h.Write(b)
bf.filter.set(uint32(uint64(bf.h.Sum32()) % bf.bits))
}
return bf.elements < bf.capacity
}
// Exists checks the bloom filter for the byte array b
func (bf *bloomFilter) Exists(b []byte) bool {
for _, s := range bf.salts {
bf.h.Reset()
bf.h.Write(s)
bf.h.Write(b)
if bf.filter.get(uint32(uint64(bf.h.Sum32())%bf.bits)) == 0 {
return false
}
}
return true
}
// Merge adds bf2 into the current bloom filter. They must have the same dimensions and be constructed with identical random seeds.
func (bf *bloomFilter) Merge(bf2 BloomFilter) {
other := bf2.(*bloomFilter)
for i, v := range other.filter {
bf.filter[i] |= v
}
}
// Compress halves the space used by the bloom filter, at the cost of increased error rate.
func (bf *bloomFilter) Compress() {
w := len(bf.filter)
if w&(w-1) != 0 {
panic("width must be a power of two")
}
neww := w / 2
// We allocate a new array here so old space can actually be garbage collected.
// TODO(dgryski): reslice and only reallocate every few compressions
row := make([]uint32, neww)
for j := 0; j < neww; j++ {
row[j] = bf.filter[j] | bf.filter[j+neww]
}
bf.filter = row
bf.bits /= 2
}