YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/util/bloom_filter.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
#ifndef YB_UTIL_BLOOM_FILTER_H
33
#define YB_UTIL_BLOOM_FILTER_H
34
35
#include "yb/gutil/hash/city.h"
36
#include "yb/gutil/macros.h"
37
#include "yb/util/bitmap.h"
38
#include "yb/util/slice.h"
39
40
namespace yb {
41
42
// Probe calculated from a given key. This caches the calculated
43
// hash values which are necessary for probing into a Bloom Filter,
44
// so that when many bloom filters have to be consulted for a given
45
// key, we only need to calculate the hashes once.
46
//
47
// This is implemented based on the idea of double-hashing from the following paper:
48
//   "Less Hashing, Same Performance: Building a Better Bloom Filter"
49
//   Kirsch and Mitzenmacher, ESA 2006
50
//   http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
51
//
52
// Currently, the implementation uses the 64-bit City Hash.
53
// TODO: an SSE CRC32 hash is probably ~20% faster. Come back to this
54
// at some point.
55
class BloomKeyProbe {
56
 public:
57
  // Default constructor - this is only used to instantiate an object
58
  // and later reassign by assignment from another instance
59
0
  BloomKeyProbe() {}
60
61
  // Construct a probe from the given key.
62
  //
63
  // NOTE: proper operation requires that the referenced memory remain
64
  // valid for the lifetime of this object.
65
  explicit BloomKeyProbe(const Slice &key) : key_(key) {
66
    uint64_t h = util_hash::CityHash64(
67
      reinterpret_cast<const char *>(key.data()),
68
      key.size());
69
70
    // Use the top and bottom halves of the 64-bit hash
71
    // as the two independent hash functions for mixing.
72
    h_1_ = static_cast<uint32>(h);
73
    h_2_ = static_cast<uint32>(h >> 32);
74
  }
75
76
0
  const Slice &key() const { return key_; }
77
78
  // The initial hash value. See MixHash() for usage example.
79
  uint32_t initial_hash() const {
80
    return h_1_;
81
  }
82
83
  // Mix the given hash function with the second calculated hash
84
  // value. A sequence of independent hashes can be calculated
85
  // by repeatedly calling MixHash() on its previous result.
86
  uint32_t MixHash(uint32_t h) const {
87
    return h + h_2_;
88
  }
89
90
 private:
91
  Slice key_;
92
93
  // The two hashes.
94
  uint32_t h_1_;
95
  uint32_t h_2_;
96
};
97
98
// Sizing parameters for the constructor to BloomFilterBuilder.
99
// This is simply to provide a nicer API than a bunch of overloaded
100
// constructors.
101
class BloomFilterSizing {
102
 public:
103
  // Size the bloom filter by a fixed size and false positive rate.
104
  //
105
  // Picks the number of entries to achieve the above.
106
  static BloomFilterSizing BySizeAndFPRate(size_t n_bytes, double fp_rate);
107
108
  // Size the bloom filer by an expected count and false positive rate.
109
  //
110
  // Picks the number of bytes to achieve the above.
111
  static BloomFilterSizing ByCountAndFPRate(size_t expected_count, double fp_rate);
112
113
2
  size_t n_bytes() const { return n_bytes_; }
114
2
  size_t expected_count() const { return expected_count_; }
115
116
 private:
117
  BloomFilterSizing(size_t n_bytes, size_t expected_count) :
118
    n_bytes_(n_bytes),
119
    expected_count_(expected_count)
120
1
  {}
121
122
  size_t n_bytes_;
123
  size_t expected_count_;
124
};
125
126
127
// Builder for a BloomFilter structure.
128
class BloomFilterBuilder {
129
 public:
130
  // Create a bloom filter.
131
  // See BloomFilterSizing static methods to specify this argument.
132
  explicit BloomFilterBuilder(const BloomFilterSizing &sizing);
133
134
  // Clear all entries, reset insertion count.
135
  void Clear();
136
137
  // Add the given key to the bloom filter.
138
  void AddKey(const BloomKeyProbe &probe);
139
140
  // Return an estimate of the false positive rate.
141
  double false_positive_rate() const;
142
143
2
  size_t n_bytes() const {
144
2
    return n_bits_ / 8;
145
2
  }
146
147
  size_t n_bits() const {
148
    return n_bits_;
149
  }
150
151
  // Return a slice view into this Bloom Filter, suitable for
152
  // writing out to a file.
153
  const Slice slice() const {
154
    return Slice(&bitmap_[0], n_bytes());
155
  }
156
157
  // Return the number of hashes that are calculated for each entry
158
  // in the bloom filter.
159
  size_t n_hashes() const { return n_hashes_; }
160
161
0
  size_t expected_count() const { return expected_count_; }
162
163
  // Return the number of keys inserted.
164
0
  size_t count() const { return n_inserted_; }
165
166
 private:
167
  size_t n_bits_;
168
  std::unique_ptr<uint8_t[]> bitmap_;
169
170
  // The number of hash functions to compute.
171
  size_t n_hashes_;
172
173
  // The expected number of elements, for which the bloom is optimized.
174
  size_t expected_count_;
175
176
  // The number of elements inserted so far since the last Reset.
177
  size_t n_inserted_;
178
179
  DISALLOW_COPY_AND_ASSIGN(BloomFilterBuilder);
180
};
181
182
// Wrapper around a byte array for reading it as a bloom filter.
183
class BloomFilter {
184
 public:
185
  BloomFilter(const Slice &data, size_t n_hashes);
186
187
  // Return true if the filter may contain the given key.
188
  bool MayContainKey(const BloomKeyProbe &probe) const;
189
190
 private:
191
  friend class BloomFilterBuilder;
192
  static uint32_t PickBit(uint32_t hash, size_t n_bits);
193
194
  size_t n_bits_;
195
  const uint8_t *bitmap_;
196
197
  size_t n_hashes_;
198
};
199
200
201
////////////////////////////////////////////////////////////
202
// Inline implementations
203
////////////////////////////////////////////////////////////
204
205
inline uint32_t BloomFilter::PickBit(uint32_t hash, size_t n_bits) {
206
  switch (n_bits) {
207
    // Fast path for the default bloom filter block size. Bitwise math
208
    // is much faster than division.
209
    case 4096 * 8:
210
      return hash & (n_bits - 1);
211
212
    default:
213
      return hash % n_bits;
214
  }
215
}
216
217
inline void BloomFilterBuilder::AddKey(const BloomKeyProbe &probe) {
218
  uint32_t h = probe.initial_hash();
219
  for (size_t i = 0; i < n_hashes_; i++) {
220
    uint32_t bitpos = BloomFilter::PickBit(h, n_bits_);
221
    BitmapSet(&bitmap_[0], bitpos);
222
    h = probe.MixHash(h);
223
  }
224
  n_inserted_++;
225
}
226
227
inline bool BloomFilter::MayContainKey(const BloomKeyProbe &probe) const {
228
  uint32_t h = probe.initial_hash();
229
230
  // Basic unrolling by 2s gives a small benefit here since the two bit positions
231
  // can be calculated in parallel -- it's a 50% chance that the first will be
232
  // set even if it's a bloom miss, in which case we can parallelize the load.
233
  auto rem_hashes = n_hashes_;
234
  while (rem_hashes >= 2) {
235
    uint32_t bitpos1 = PickBit(h, n_bits_);
236
    h = probe.MixHash(h);
237
    uint32_t bitpos2 = PickBit(h, n_bits_);
238
    h = probe.MixHash(h);
239
240
    if (!BitmapTest(&bitmap_[0], bitpos1) ||
241
        !BitmapTest(&bitmap_[0], bitpos2)) {
242
      return false;
243
    }
244
245
    rem_hashes -= 2;
246
  }
247
248
  while (rem_hashes) {
249
    uint32_t bitpos = PickBit(h, n_bits_);
250
    if (!BitmapTest(&bitmap_[0], bitpos)) {
251
      return false;
252
    }
253
    h = probe.MixHash(h);
254
    rem_hashes--;
255
  }
256
  return true;
257
}
258
259
} // namespace yb
260
261
#endif // YB_UTIL_BLOOM_FILTER_H