YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/gutil/hash/hash.h
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright (C) 1999 and onwards Google, Inc.
3
//
4
// The following only applies to changes made to this file as part of YugaByte development.
5
//
6
// Portions Copyright (c) YugaByte, Inc.
7
//
8
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
9
// in compliance with the License.  You may obtain a copy of the License at
10
//
11
// http://www.apache.org/licenses/LICENSE-2.0
12
//
13
// Unless required by applicable law or agreed to in writing, software distributed under the License
14
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
15
// or implied.  See the License for the specific language governing permissions and limitations
16
// under the License.
17
//
18
//
19
// This file contains routines for hashing and fingerprinting.
20
//
21
// A hash function takes an arbitrary input bitstring (string, char*,
22
// number) and turns it into a hash value (a fixed-size number) such
23
// that unequal input values have a high likelihood of generating
24
// unequal hash values.  A fingerprint is a hash whose design is
25
// biased towards avoiding hash collisions, possibly at the expense of
26
// other characteristics such as execution speed.
27
//
28
// In general, if you are only using the hash values inside a single
29
// executable -- you're not writing the values to disk, and you don't
30
// depend on another instance of your program, running on another
31
// machine, generating the same hash values as you -- you want to use
32
// a HASH.  Otherwise, you want to use a FINGERPRINT.
33
//
34
// RECOMMENDED HASH FOR STRINGS:    GoodFastHash
35
//
36
// It is a functor, so you can use it like this:
37
//     hash_map<string, xxx, GoodFastHash<string> >
38
//     hash_set<char *, GoodFastHash<char*> >
39
//
40
// RECOMMENDED HASH FOR NUMBERS:    hash<>
41
//
42
// Note that this is likely the identity hash, so if your
43
// numbers are "non-random" (especially in the low bits), another
44
// choice is better.  You can use it like this:
45
//     hash_map<int, xxx>
46
//     hash_set<uint64>
47
//
48
// RECOMMENDED HASH FOR POINTERS:    hash<>
49
//
50
// This is also likely the identity hash.
51
//
52
// RECOMMENDED HASH FOR STRUCTS:    hash<some_fingerprint(struct)>
53
//
54
// Take a fingerprint of the struct, and use that as the key.
55
// For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) };
56
//    uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data),
57
//                                     sizeof(hash_data));
58
//    hash_map[fprint] = whatever;
59
//
60
// RECOMMENDED FINGERPRINT:         Fingerprint2011
61
//
62
// (In util/hash/fingerprint2011.h)
63
// In particular, do *not* use Fingerprint in new code; it has
64
// problems with excess collisions.
65
//
66
// OTHER HASHES AND FINGERPRINTS:
67
//
68
//
69
// The wiki page also has good advice for when to use a fingerprint vs
70
// a hash.
71
//
72
//
73
// Note: if your file declares hash_map<string, ...> or
74
// hash_set<string>, it will use the default hash function,
75
// hash<string>.  This is not a great choice.  Always provide an
76
// explicit functor, such as GoodFastHash, as a template argument.
77
// (Either way, you will need to #include this file to get the
78
// necessary definition.)
79
//
80
// Some of the hash functions below are documented to be fixed
81
// forever; the rest (whether they're documented as so or not) may
82
// change over time.  If you require a hash function that does not
83
// change over time, you should have unittests enforcing this
84
// property.  We already have several such functions; see
85
// hash_unittest.cc for the details and unittests.
86
87
#ifndef YB_GUTIL_HASH_HASH_H
88
#define YB_GUTIL_HASH_HASH_H
89
90
#include <stddef.h>
91
#include <stdint.h>     // for uintptr_t
92
#include <string.h>
93
#include <algorithm>
94
#include <string>
95
#include <utility>
96
97
#include "yb/gutil/casts.h"
98
#include "yb/gutil/int128.h"
99
#include "yb/gutil/integral_types.h"
100
#include "yb/gutil/macros.h"
101
#include "yb/gutil/port.h"
102
#include "yb/gutil/hash/city.h"
103
#include "yb/gutil/hash/hash128to64.h"
104
#include "yb/gutil/hash/jenkins.h"
105
#include "yb/gutil/hash/jenkins_lookup2.h"
106
#include "yb/gutil/hash/legacy_hash.h"
107
#include "yb/gutil/hash/string_hash.h"
108
109
// ----------------------------------------------------------------------
110
// Fingerprint()
111
//   Not recommended for new code.  Instead, use Fingerprint2011(),
112
//   a higher-quality and faster hash function.  See fingerprint2011.h.
113
//
114
//   Fingerprinting a string (or char*) will never return 0 or 1,
115
//   in case you want a couple of special values.  However,
116
//   fingerprinting a numeric type may produce 0 or 1.
117
//
118
//   The hash mapping of Fingerprint() will never change.
119
//
120
//   Note: AVOID USING FINGERPRINT if at all possible.  Use
121
//   Fingerprint2011 (in fingerprint2011.h) instead.
122
//   Fingerprint() is susceptible to collisions for even short
123
//   strings with low edit distance; see
124
//   Example collisions:
125
//     "01056/02" vs. "11057/02"
126
//     "LTA 02" vs. "MTA 12"
127
//   The same study found only one collision each for CityHash64() and
128
//   MurmurHash64(), from more than 2^32 inputs, and on medium-length
129
//   strings with large edit distances.These issues, among others,
130
//   led to the recommendation that new code should avoid Fingerprint().
131
// ----------------------------------------------------------------------
132
extern uint64 FingerprintReferenceImplementation(const char *s, uint32 len);
133
extern uint64 FingerprintInterleavedImplementation(const char *s, uint32 len);
134
0
inline uint64 Fingerprint(const char *s, uint32 len) {
135
0
  if (sizeof(s) == 8) {  // 64-bit systems have 8-byte pointers.
136
0
    // The better choice when we have a decent number of registers.
137
0
    return FingerprintInterleavedImplementation(s, len);
138
0
  } else {
139
0
    return FingerprintReferenceImplementation(s, len);
140
0
  }
141
0
}
142
143
// Routine that combines together the hi/lo part of a fingerprint
144
// and changes the result appropriately to avoid returning 0/1.
145
0
inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) {
146
0
  uint64 result = (static_cast<uint64>(hi) << 32) | static_cast<uint64>(lo);
147
0
  if ((hi == 0) && (lo < 2)) {
148
0
    result ^= GG_ULONGLONG(0x130f9bef94a0a928);
149
0
  }
150
0
  return result;
151
0
}
Unexecuted instantiation: CombineFingerprintHalves(unsigned int, unsigned int)
Unexecuted instantiation: CombineFingerprintHalves(unsigned int, unsigned int)
152
153
0
inline uint64 Fingerprint(const std::string& s) {
154
0
  return Fingerprint(s.data(), static_cast<uint32>(s.size()));
155
0
}
156
20.9M
inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) {
157
20.9M
  return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c);
158
20.9M
}
159
0
inline uint64 Fingerprint(schar c) {
160
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
161
0
}
162
0
inline uint64 Fingerprint(char c) {
163
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
164
0
}
165
0
inline uint64 Fingerprint(uint16 c) {
166
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
167
0
}
168
0
inline uint64 Fingerprint(int16 c) {
169
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
170
0
}
171
0
inline uint64 Fingerprint(uint32 c) {
172
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
173
0
}
174
0
inline uint64 Fingerprint(int32 c) {
175
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
176
0
}
177
0
inline uint64 Fingerprint(uint64 c) {
178
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
179
0
}
180
0
inline uint64 Fingerprint(int64 c) {
181
0
  return Hash64NumWithSeed(static_cast<uint64>(c), MIX64);
182
0
}
183
184
// This concatenates two 64-bit fingerprints. It is a convenience function to
185
// get a fingerprint for a combination of already fingerprinted components.
186
// It assumes that each input is already a good fingerprint itself.
187
// Note that this is legacy code and new code should use its replacement
188
// FingerprintCat2011().
189
//
190
// Note that in general it's impossible to construct Fingerprint(str)
191
// from the fingerprints of substrings of str.  One shouldn't expect
192
// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate
193
// anything about Fingerprint(StrCat(x, y)).
194
0
inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) {
195
0
  return Hash64NumWithSeed(fp1, fp2);
196
0
}
197
198
// If you want an excellent string hash function, and you don't mind if it
199
// might change when you sync and recompile, please use GoodFastHash<>.
200
// For most applications, GoodFastHash<> is a good choice, better than
201
// hash<string> or hash<char*> or similar.  GoodFastHash<> can change
202
// from time to time and may differ across platforms, and we'll strive
203
// to keep improving it.
204
//
205
// By the way, when deleting the contents of a hash_set of pointers, it is
206
// unsafe to delete *iterator because the hash function may be called on
207
// the next iterator advance.  Use STLDeleteContainerPointers().
208
209
template<class X> struct GoodFastHash;
210
211
// This intended to be a "good" hash function.  It may change from time to time.
212
template<> struct GoodFastHash<char*> {
213
0
  size_t operator()(const char* s) const {
214
0
    return HashStringThoroughly(s, strlen(s));
215
0
  }
216
  // Less than operator for MSVC.
217
0
  bool operator()(const char* a, const char* b) const {
218
0
    return strcmp(a, b) < 0;
219
0
  }
220
  static const size_t bucket_size = 4;  // These are required by MSVC
221
  static const size_t min_buckets = 8;  // 4 and 8 are defaults.
222
};
223
224
// This intended to be a "good" hash function.  It may change from time to time.
225
template<> struct GoodFastHash<const char*> {
226
0
  size_t operator()(const char* s) const {
227
0
    return HashStringThoroughly(s, strlen(s));
228
0
  }
229
  // Less than operator for MSVC.
230
0
  bool operator()(const char* a, const char* b) const {
231
0
    return strcmp(a, b) < 0;
232
0
  }
233
  static const size_t bucket_size = 4;  // These are required by MSVC
234
  static const size_t min_buckets = 8;  // 4 and 8 are defaults.
235
};
236
237
// This intended to be a "good" hash function.  It may change from time to time.
238
template<class _CharT, class _Traits, class _Alloc>
239
struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > {
240
  size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
241
    return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
242
  }
243
  // Less than operator for MSVC.
244
  bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
245
                  const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
246
    return a < b;
247
  }
248
  static const size_t bucket_size = 4;  // These are required by MSVC
249
  static const size_t min_buckets = 8;  // 4 and 8 are defaults.
250
};
251
252
// This intended to be a "good" hash function.  It may change from time to time.
253
template<class _CharT, class _Traits, class _Alloc>
254
struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > {
255
  size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const {
256
    return HashStringThoroughly(k.data(), k.length() * sizeof(k[0]));
257
  }
258
  // Less than operator for MSVC.
259
  bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a,
260
                  const std::basic_string<_CharT, _Traits, _Alloc>& b) const {
261
    return a < b;
262
  }
263
  static const size_t bucket_size = 4;  // These are required by MSVC
264
  static const size_t min_buckets = 8;  // 4 and 8 are defaults.
265
};
266
267
#endif  // YB_GUTIL_HASH_HASH_H