-
Notifications
You must be signed in to change notification settings - Fork 3
/
bloomfilter.go
156 lines (139 loc) · 4.97 KB
/
bloomfilter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
package bloomfilter
// #cgo CFLAGS: -Wall
// #cgo LDFLAGS: -lm
// #include<math.h>
import "C"
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"github.com/Workiva/go-datastructures/bitarray"
)
var strategyList []Strategy = []Strategy{&Murur128Mitz32{}, &Murur128Mitz64{}}
// BloomFilter definition includes the number of hash functions, bit array, and strategy for hashing.
type BloomFilter struct {
numHashFunctions int
array bitarray.BitArray
strategy Strategy
}
// NewBloomFilter creates a new BloomFilter instance with `Murur128Mitz64` as default strategy.
func NewBloomFilter(expectedInsertions int, errRate float64) (*BloomFilter, error) {
return NewBloomFilterWithStrategy(expectedInsertions, errRate, &Murur128Mitz64{})
}
// NewBloomFilterWithStrategy creates a new BloomFilter instance with given expected insertions/capacity,
// error rate, and strategy. For now the available strategies are
// * &Murur128Mitz32{}
// * &Murur128Mitz64{}
func NewBloomFilterWithStrategy(expectedInsertions int, errRate float64, strategy Strategy) (*BloomFilter, error) {
if errRate <= 0.0 {
return nil, errors.New("error rate must be > 0.0")
}
if errRate >= 1.0 {
return nil, errors.New("error rate must be < 1.0")
}
if expectedInsertions < 0 {
return nil, errors.New("expected insertions must be >= 0")
}
if expectedInsertions == 0 {
expectedInsertions = 1
}
numBits := numOfBits(expectedInsertions, errRate)
numHashFunctions := numOfHashFunctions(expectedInsertions, numBits)
bloomFilter := &BloomFilter{
numHashFunctions: numHashFunctions,
array: bitarray.NewBitArray(uint64(math.Ceil(float64(numBits)/64.0) * 64.0)),
strategy: strategy,
}
return bloomFilter, nil
}
// FromBytes creates a new BloomFilter instance by deserializing from byte array.
func FromBytes(b []byte) (*BloomFilter, error) {
reader := bytes.NewReader(b)
// read strategy
strategyByte, err := reader.ReadByte()
if err != nil {
return nil, fmt.Errorf("failed to read strategy: %v", err)
}
strategyIndex := int(strategyByte)
if strategyIndex >= len(strategyList) {
return nil, fmt.Errorf("unknown strategy byte: %v", strategyByte)
}
strategy := strategyList[strategyIndex]
// read number of hash functions
numHashFuncByte, err := reader.ReadByte()
if err != nil {
return nil, fmt.Errorf("failed to read number of hash functions: %v", err)
}
numHashFunctions := int(numHashFuncByte)
// read bitarray capacity
numUint64Bytes, err := io.ReadAll(io.LimitReader(reader, 4))
if err != nil {
return nil, fmt.Errorf("failed to read number of bits: %v", err)
}
if len(numUint64Bytes) != 4 {
return nil, fmt.Errorf("not a valid uint32 bytes: %v", numUint64Bytes)
}
numUint64 := binary.BigEndian.Uint32(numUint64Bytes)
array := bitarray.NewBitArray(uint64(numUint64) * 64)
// put blocks back to bitarray
for blockIdx := 0; blockIdx < int(numUint64); blockIdx++ {
block, err := io.ReadAll(io.LimitReader(reader, 8))
if err != nil {
return nil, fmt.Errorf("failed to build bitarray: %v", err)
}
if len(block) != 8 {
return nil, fmt.Errorf("not a valid uint64 bytes: %v", block)
}
num := binary.BigEndian.Uint64(block)
var pos uint64 = 1 << 63
var index uint64
for i := 0; i < 64; i++ {
if num&pos > 0 {
index = uint64(blockIdx*64 + (64 - i - 1))
array.SetBit(index)
}
pos >>= 1
}
}
return &BloomFilter{
numHashFunctions: numHashFunctions,
array: array,
strategy: strategy,
}, nil
}
// Put inserts element of any type into BloomFilter.
func (bf *BloomFilter) Put(key interface{}) bool {
return bf.strategy.put(key, bf.numHashFunctions, bf.array)
}
// MightContain returns a boolean value to indicate if given element is in BloomFilter.
func (bf *BloomFilter) MightContain(key interface{}) bool {
return bf.strategy.mightContain(key, bf.numHashFunctions, bf.array)
}
// ToBytes serializes BloomFilter to byte array, which is compatible with
// Java's Guava library.
func (bf *BloomFilter) ToBytes() []byte {
buf := new(bytes.Buffer)
buf.WriteByte(byte(bf.strategy.getOrdinal()))
buf.WriteByte(byte(bf.numHashFunctions))
binary.Write(buf, binary.BigEndian, uint32(math.Ceil(float64(bf.array.Capacity())/64.0)))
for iter := bf.array.Blocks(); iter.Next(); {
_, block := iter.Value()
binary.Write(buf, binary.BigEndian, block)
}
return buf.Bytes()
}
func numOfBits(expectedInsertions int, errRate float64) int {
if errRate == 0.0 {
errRate = math.Pow(2.0, -1074.0) // the same number of Double.MIN_VALUE in Java
}
errorRate := C.double(errRate)
// Use C functions to calculate logarithm here since Go's built-in math lib doesn't give accurate result.
// See https://github.com/golang/go/issues/9546 for details.
return int(C.double(-expectedInsertions) * C.log(errorRate) / (C.log(C.double(2.0)) * C.log(C.double(2.0))))
}
func numOfHashFunctions(expectedInsertions int, numBits int) int {
return int(math.Max(1.0, float64(C.round(C.double(numBits)/C.double(expectedInsertions)*C.log(C.double(2.0))))))
}