/Users/deen/code/yugabyte-db/src/yb/util/bloom_filter.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | #ifndef YB_UTIL_BLOOM_FILTER_H |
33 | | #define YB_UTIL_BLOOM_FILTER_H |
34 | | |
35 | | #include "yb/gutil/hash/city.h" |
36 | | #include "yb/gutil/macros.h" |
37 | | #include "yb/util/bitmap.h" |
38 | | #include "yb/util/slice.h" |
39 | | |
40 | | namespace yb { |
41 | | |
42 | | // Probe calculated from a given key. This caches the calculated |
43 | | // hash values which are necessary for probing into a Bloom Filter, |
44 | | // so that when many bloom filters have to be consulted for a given |
45 | | // key, we only need to calculate the hashes once. |
46 | | // |
47 | | // This is implemented based on the idea of double-hashing from the following paper: |
48 | | // "Less Hashing, Same Performance: Building a Better Bloom Filter" |
49 | | // Kirsch and Mitzenmacher, ESA 2006 |
50 | | // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf |
51 | | // |
52 | | // Currently, the implementation uses the 64-bit City Hash. |
53 | | // TODO: an SSE CRC32 hash is probably ~20% faster. Come back to this |
54 | | // at some point. |
55 | | class BloomKeyProbe { |
56 | | public: |
57 | | // Default constructor - this is only used to instantiate an object |
58 | | // and later reassign by assignment from another instance |
59 | 0 | BloomKeyProbe() {} |
60 | | |
61 | | // Construct a probe from the given key. |
62 | | // |
63 | | // NOTE: proper operation requires that the referenced memory remain |
64 | | // valid for the lifetime of this object. |
65 | 104k | explicit BloomKeyProbe(const Slice &key) : key_(key) { |
66 | 104k | uint64_t h = util_hash::CityHash64( |
67 | 104k | reinterpret_cast<const char *>(key.data()), |
68 | 104k | key.size()); |
69 | | |
70 | | // Use the top and bottom halves of the 64-bit hash |
71 | | // as the two independent hash functions for mixing. |
72 | 104k | h_1_ = static_cast<uint32>(h); |
73 | 104k | h_2_ = static_cast<uint32>(h >> 32); |
74 | 104k | } |
75 | | |
76 | 0 | const Slice &key() const { return key_; } |
77 | | |
78 | | // The initial hash value. See MixHash() for usage example. |
79 | 104k | uint32_t initial_hash() const { |
80 | 104k | return h_1_; |
81 | 104k | } |
82 | | |
83 | | // Mix the given hash function with the second calculated hash |
84 | | // value. A sequence of independent hashes can be calculated |
85 | | // by repeatedly calling MixHash() on its previous result. |
86 | 276k | uint32_t MixHash(uint32_t h) const { |
87 | 276k | return h + h_2_; |
88 | 276k | } |
89 | | |
90 | | private: |
91 | | Slice key_; |
92 | | |
93 | | // The two hashes. |
94 | | uint32_t h_1_; |
95 | | uint32_t h_2_; |
96 | | }; |
97 | | |
98 | | // Sizing parameters for the constructor to BloomFilterBuilder. |
99 | | // This is simply to provide a nicer API than a bunch of overloaded |
100 | | // constructors. |
101 | | class BloomFilterSizing { |
102 | | public: |
103 | | // Size the bloom filter by a fixed size and false positive rate. |
104 | | // |
105 | | // Picks the number of entries to achieve the above. |
106 | | static BloomFilterSizing BySizeAndFPRate(size_t n_bytes, double fp_rate); |
107 | | |
108 | | // Size the bloom filer by an expected count and false positive rate. |
109 | | // |
110 | | // Picks the number of bytes to achieve the above. |
111 | | static BloomFilterSizing ByCountAndFPRate(size_t expected_count, double fp_rate); |
112 | | |
113 | 2 | size_t n_bytes() const { return n_bytes_; } |
114 | 2 | size_t expected_count() const { return expected_count_; } |
115 | | |
116 | | private: |
117 | | BloomFilterSizing(size_t n_bytes, size_t expected_count) : |
118 | | n_bytes_(n_bytes), |
119 | | expected_count_(expected_count) |
120 | 1 | {} |
121 | | |
122 | | size_t n_bytes_; |
123 | | size_t expected_count_; |
124 | | }; |
125 | | |
126 | | |
127 | | // Builder for a BloomFilter structure. |
128 | | class BloomFilterBuilder { |
129 | | public: |
130 | | // Create a bloom filter. |
131 | | // See BloomFilterSizing static methods to specify this argument. |
132 | | explicit BloomFilterBuilder(const BloomFilterSizing &sizing); |
133 | | |
134 | | // Clear all entries, reset insertion count. |
135 | | void Clear(); |
136 | | |
137 | | // Add the given key to the bloom filter. |
138 | | void AddKey(const BloomKeyProbe &probe); |
139 | | |
140 | | // Return an estimate of the false positive rate. |
141 | | double false_positive_rate() const; |
142 | | |
143 | 2 | size_t n_bytes() const { |
144 | 2 | return n_bits_ / 8; |
145 | 2 | } |
146 | | |
147 | 1 | size_t n_bits() const { |
148 | 1 | return n_bits_; |
149 | 1 | } |
150 | | |
151 | | // Return a slice view into this Bloom Filter, suitable for |
152 | | // writing out to a file. |
153 | 1 | const Slice slice() const { |
154 | 1 | return Slice(&bitmap_[0], n_bytes()); |
155 | 1 | } |
156 | | |
157 | | // Return the number of hashes that are calculated for each entry |
158 | | // in the bloom filter. |
159 | 1 | size_t n_hashes() const { return n_hashes_; } |
160 | | |
161 | 0 | size_t expected_count() const { return expected_count_; } |
162 | | |
163 | | // Return the number of keys inserted. |
164 | 0 | size_t count() const { return n_inserted_; } |
165 | | |
166 | | private: |
167 | | size_t n_bits_; |
168 | | std::unique_ptr<uint8_t[]> bitmap_; |
169 | | |
170 | | // The number of hash functions to compute. |
171 | | size_t n_hashes_; |
172 | | |
173 | | // The expected number of elements, for which the bloom is optimized. |
174 | | size_t expected_count_; |
175 | | |
176 | | // The number of elements inserted so far since the last Reset. |
177 | | size_t n_inserted_; |
178 | | |
179 | | DISALLOW_COPY_AND_ASSIGN(BloomFilterBuilder); |
180 | | }; |
181 | | |
182 | | // Wrapper around a byte array for reading it as a bloom filter. |
183 | | class BloomFilter { |
184 | | public: |
185 | | BloomFilter(const Slice &data, size_t n_hashes); |
186 | | |
187 | | // Return true if the filter may contain the given key. |
188 | | bool MayContainKey(const BloomKeyProbe &probe) const; |
189 | | |
190 | | private: |
191 | | friend class BloomFilterBuilder; |
192 | | static uint32_t PickBit(uint32_t hash, size_t n_bits); |
193 | | |
194 | | size_t n_bits_; |
195 | | const uint8_t *bitmap_; |
196 | | |
197 | | size_t n_hashes_; |
198 | | }; |
199 | | |
200 | | |
201 | | //////////////////////////////////////////////////////////// |
202 | | // Inline implementations |
203 | | //////////////////////////////////////////////////////////// |
204 | | |
205 | 276k | inline uint32_t BloomFilter::PickBit(uint32_t hash, size_t n_bits) { |
206 | 276k | switch (n_bits) { |
207 | | // Fast path for the default bloom filter block size. Bitwise math |
208 | | // is much faster than division. |
209 | 0 | case 4096 * 8: |
210 | 0 | return hash & (n_bits - 1); |
211 | | |
212 | 276k | default: |
213 | 276k | return hash % n_bits; |
214 | 276k | } |
215 | 276k | } |
216 | | |
217 | 2.00k | inline void BloomFilterBuilder::AddKey(const BloomKeyProbe &probe) { |
218 | 2.00k | uint32_t h = probe.initial_hash(); |
219 | 14.0k | for (size_t i = 0; i < n_hashes_; i++) { |
220 | 12.0k | uint32_t bitpos = BloomFilter::PickBit(h, n_bits_); |
221 | 12.0k | BitmapSet(&bitmap_[0], bitpos); |
222 | 12.0k | h = probe.MixHash(h); |
223 | 12.0k | } |
224 | 2.00k | n_inserted_++; |
225 | 2.00k | } |
226 | | |
227 | 102k | inline bool BloomFilter::MayContainKey(const BloomKeyProbe &probe) const { |
228 | 102k | uint32_t h = probe.initial_hash(); |
229 | | |
230 | | // Basic unrolling by 2s gives a small benefit here since the two bit positions |
231 | | // can be calculated in parallel -- it's a 50% chance that the first will be |
232 | | // set even if it's a bloom miss, in which case we can parallelize the load. |
233 | 102k | auto rem_hashes = n_hashes_; |
234 | 135k | while (rem_hashes >= 2) { |
235 | 132k | uint32_t bitpos1 = PickBit(h, n_bits_); |
236 | 132k | h = probe.MixHash(h); |
237 | 132k | uint32_t bitpos2 = PickBit(h, n_bits_); |
238 | 132k | h = probe.MixHash(h); |
239 | | |
240 | 132k | if (!BitmapTest(&bitmap_[0], bitpos1) || |
241 | 99.0k | !BitmapTest(&bitmap_[0], bitpos2)) { |
242 | 99.0k | return false; |
243 | 99.0k | } |
244 | | |
245 | 33.3k | rem_hashes -= 2; |
246 | 33.3k | } |
247 | | |
248 | 2.99k | while (rem_hashes) { |
249 | 0 | uint32_t bitpos = PickBit(h, n_bits_); |
250 | 0 | if (!BitmapTest(&bitmap_[0], bitpos)) { |
251 | 0 | return false; |
252 | 0 | } |
253 | 0 | h = probe.MixHash(h); |
254 | 0 | rem_hashes--; |
255 | 0 | } |
256 | 2.99k | return true; |
257 | 2.99k | } |
258 | | |
259 | | } // namespace yb |
260 | | |
261 | | #endif // YB_UTIL_BLOOM_FILTER_H |