/Users/deen/code/yugabyte-db/src/yb/gutil/hash/hash.h
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // Copyright (C) 1999 and onwards Google, Inc. |
3 | | // |
4 | | // The following only applies to changes made to this file as part of YugaByte development. |
5 | | // |
6 | | // Portions Copyright (c) YugaByte, Inc. |
7 | | // |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
9 | | // in compliance with the License. You may obtain a copy of the License at |
10 | | // |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // |
13 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
14 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
15 | | // or implied. See the License for the specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // |
19 | | // This file contains routines for hashing and fingerprinting. |
20 | | // |
21 | | // A hash function takes an arbitrary input bitstring (string, char*, |
22 | | // number) and turns it into a hash value (a fixed-size number) such |
23 | | // that unequal input values have a high likelihood of generating |
24 | | // unequal hash values. A fingerprint is a hash whose design is |
25 | | // biased towards avoiding hash collisions, possibly at the expense of |
26 | | // other characteristics such as execution speed. |
27 | | // |
28 | | // In general, if you are only using the hash values inside a single |
29 | | // executable -- you're not writing the values to disk, and you don't |
30 | | // depend on another instance of your program, running on another |
31 | | // machine, generating the same hash values as you -- you want to use |
32 | | // a HASH. Otherwise, you want to use a FINGERPRINT. |
33 | | // |
34 | | // RECOMMENDED HASH FOR STRINGS: GoodFastHash |
35 | | // |
36 | | // It is a functor, so you can use it like this: |
37 | | // hash_map<string, xxx, GoodFastHash<string> > |
38 | | // hash_set<char *, GoodFastHash<char*> > |
39 | | // |
40 | | // RECOMMENDED HASH FOR NUMBERS: hash<> |
41 | | // |
42 | | // Note that this is likely the identity hash, so if your |
43 | | // numbers are "non-random" (especially in the low bits), another |
44 | | // choice is better. You can use it like this: |
45 | | // hash_map<int, xxx> |
46 | | // hash_set<uint64> |
47 | | // |
48 | | // RECOMMENDED HASH FOR POINTERS: hash<> |
49 | | // |
50 | | // This is also likely the identity hash. |
51 | | // |
52 | | // RECOMMENDED HASH FOR STRUCTS: hash<some_fingerprint(struct)> |
53 | | // |
54 | | // Take a fingerprint of the struct, and use that as the key. |
55 | | // For instance: const uint64 hash_data[] = { s.foo, bit_cast<uint64>(s.bar) }; |
56 | | // uint64 fprint = <fingerprint fn>(reinterpret_cast<const char*>(hash_data), |
57 | | // sizeof(hash_data)); |
58 | | // hash_map[fprint] = whatever; |
59 | | // |
60 | | // RECOMMENDED FINGERPRINT: Fingerprint2011 |
61 | | // |
62 | | // (In util/hash/fingerprint2011.h) |
63 | | // In particular, do *not* use Fingerprint in new code; it has |
64 | | // problems with excess collisions. |
65 | | // |
66 | | // OTHER HASHES AND FINGERPRINTS: |
67 | | // |
68 | | // |
69 | | // The wiki page also has good advice for when to use a fingerprint vs |
70 | | // a hash. |
71 | | // |
72 | | // |
73 | | // Note: if your file declares hash_map<string, ...> or |
74 | | // hash_set<string>, it will use the default hash function, |
75 | | // hash<string>. This is not a great choice. Always provide an |
76 | | // explicit functor, such as GoodFastHash, as a template argument. |
77 | | // (Either way, you will need to #include this file to get the |
78 | | // necessary definition.) |
79 | | // |
80 | | // Some of the hash functions below are documented to be fixed |
81 | | // forever; the rest (whether they're documented as so or not) may |
82 | | // change over time. If you require a hash function that does not |
83 | | // change over time, you should have unittests enforcing this |
84 | | // property. We already have several such functions; see |
85 | | // hash_unittest.cc for the details and unittests. |
86 | | |
87 | | #ifndef YB_GUTIL_HASH_HASH_H |
88 | | #define YB_GUTIL_HASH_HASH_H |
89 | | |
90 | | #include <stddef.h> |
91 | | #include <stdint.h> // for uintptr_t |
92 | | #include <string.h> |
93 | | #include <algorithm> |
94 | | #include <string> |
95 | | #include <utility> |
96 | | |
97 | | #include "yb/gutil/casts.h" |
98 | | #include "yb/gutil/int128.h" |
99 | | #include "yb/gutil/integral_types.h" |
100 | | #include "yb/gutil/macros.h" |
101 | | #include "yb/gutil/port.h" |
102 | | #include "yb/gutil/hash/city.h" |
103 | | #include "yb/gutil/hash/hash128to64.h" |
104 | | #include "yb/gutil/hash/jenkins.h" |
105 | | #include "yb/gutil/hash/jenkins_lookup2.h" |
106 | | #include "yb/gutil/hash/legacy_hash.h" |
107 | | #include "yb/gutil/hash/string_hash.h" |
108 | | |
109 | | // ---------------------------------------------------------------------- |
110 | | // Fingerprint() |
111 | | // Not recommended for new code. Instead, use Fingerprint2011(), |
112 | | // a higher-quality and faster hash function. See fingerprint2011.h. |
113 | | // |
114 | | // Fingerprinting a string (or char*) will never return 0 or 1, |
115 | | // in case you want a couple of special values. However, |
116 | | // fingerprinting a numeric type may produce 0 or 1. |
117 | | // |
118 | | // The hash mapping of Fingerprint() will never change. |
119 | | // |
120 | | // Note: AVOID USING FINGERPRINT if at all possible. Use |
121 | | // Fingerprint2011 (in fingerprint2011.h) instead. |
122 | | // Fingerprint() is susceptible to collisions for even short |
123 | | // strings with low edit distance; see |
124 | | // Example collisions: |
125 | | // "01056/02" vs. "11057/02" |
126 | | // "LTA 02" vs. "MTA 12" |
127 | | // The same study found only one collision each for CityHash64() and |
128 | | // MurmurHash64(), from more than 2^32 inputs, and on medium-length |
129 | | // strings with large edit distances.These issues, among others, |
130 | | // led to the recommendation that new code should avoid Fingerprint(). |
131 | | // ---------------------------------------------------------------------- |
132 | | extern uint64 FingerprintReferenceImplementation(const char *s, uint32 len); |
133 | | extern uint64 FingerprintInterleavedImplementation(const char *s, uint32 len); |
134 | 0 | inline uint64 Fingerprint(const char *s, uint32 len) { |
135 | 0 | if (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers. |
136 | 0 | // The better choice when we have a decent number of registers. |
137 | 0 | return FingerprintInterleavedImplementation(s, len); |
138 | 0 | } else { |
139 | 0 | return FingerprintReferenceImplementation(s, len); |
140 | 0 | } |
141 | 0 | } |
142 | | |
143 | | // Routine that combines together the hi/lo part of a fingerprint |
144 | | // and changes the result appropriately to avoid returning 0/1. |
145 | 0 | inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) { |
146 | 0 | uint64 result = (static_cast<uint64>(hi) << 32) | static_cast<uint64>(lo); |
147 | 0 | if ((hi == 0) && (lo < 2)) { |
148 | 0 | result ^= GG_ULONGLONG(0x130f9bef94a0a928); |
149 | 0 | } |
150 | 0 | return result; |
151 | 0 | } Unexecuted instantiation: CombineFingerprintHalves(unsigned int, unsigned int) Unexecuted instantiation: CombineFingerprintHalves(unsigned int, unsigned int) |
152 | | |
153 | 0 | inline uint64 Fingerprint(const std::string& s) { |
154 | 0 | return Fingerprint(s.data(), static_cast<uint32>(s.size())); |
155 | 0 | } |
156 | 20.9M | inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) { |
157 | 20.9M | return Hash64StringWithSeed(s.data(), static_cast<uint32>(s.size()), c); |
158 | 20.9M | } |
159 | 0 | inline uint64 Fingerprint(schar c) { |
160 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
161 | 0 | } |
162 | 0 | inline uint64 Fingerprint(char c) { |
163 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
164 | 0 | } |
165 | 0 | inline uint64 Fingerprint(uint16 c) { |
166 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
167 | 0 | } |
168 | 0 | inline uint64 Fingerprint(int16 c) { |
169 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
170 | 0 | } |
171 | 0 | inline uint64 Fingerprint(uint32 c) { |
172 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
173 | 0 | } |
174 | 0 | inline uint64 Fingerprint(int32 c) { |
175 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
176 | 0 | } |
177 | 0 | inline uint64 Fingerprint(uint64 c) { |
178 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
179 | 0 | } |
180 | 0 | inline uint64 Fingerprint(int64 c) { |
181 | 0 | return Hash64NumWithSeed(static_cast<uint64>(c), MIX64); |
182 | 0 | } |
183 | | |
184 | | // This concatenates two 64-bit fingerprints. It is a convenience function to |
185 | | // get a fingerprint for a combination of already fingerprinted components. |
186 | | // It assumes that each input is already a good fingerprint itself. |
187 | | // Note that this is legacy code and new code should use its replacement |
188 | | // FingerprintCat2011(). |
189 | | // |
190 | | // Note that in general it's impossible to construct Fingerprint(str) |
191 | | // from the fingerprints of substrings of str. One shouldn't expect |
192 | | // FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate |
193 | | // anything about Fingerprint(StrCat(x, y)). |
194 | 0 | inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) { |
195 | 0 | return Hash64NumWithSeed(fp1, fp2); |
196 | 0 | } |
197 | | |
198 | | // If you want an excellent string hash function, and you don't mind if it |
199 | | // might change when you sync and recompile, please use GoodFastHash<>. |
200 | | // For most applications, GoodFastHash<> is a good choice, better than |
201 | | // hash<string> or hash<char*> or similar. GoodFastHash<> can change |
202 | | // from time to time and may differ across platforms, and we'll strive |
203 | | // to keep improving it. |
204 | | // |
205 | | // By the way, when deleting the contents of a hash_set of pointers, it is |
206 | | // unsafe to delete *iterator because the hash function may be called on |
207 | | // the next iterator advance. Use STLDeleteContainerPointers(). |
208 | | |
209 | | template<class X> struct GoodFastHash; |
210 | | |
211 | | // This intended to be a "good" hash function. It may change from time to time. |
212 | | template<> struct GoodFastHash<char*> { |
213 | 0 | size_t operator()(const char* s) const { |
214 | 0 | return HashStringThoroughly(s, strlen(s)); |
215 | 0 | } |
216 | | // Less than operator for MSVC. |
217 | 0 | bool operator()(const char* a, const char* b) const { |
218 | 0 | return strcmp(a, b) < 0; |
219 | 0 | } |
220 | | static const size_t bucket_size = 4; // These are required by MSVC |
221 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
222 | | }; |
223 | | |
224 | | // This intended to be a "good" hash function. It may change from time to time. |
225 | | template<> struct GoodFastHash<const char*> { |
226 | 0 | size_t operator()(const char* s) const { |
227 | 0 | return HashStringThoroughly(s, strlen(s)); |
228 | 0 | } |
229 | | // Less than operator for MSVC. |
230 | 0 | bool operator()(const char* a, const char* b) const { |
231 | 0 | return strcmp(a, b) < 0; |
232 | 0 | } |
233 | | static const size_t bucket_size = 4; // These are required by MSVC |
234 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
235 | | }; |
236 | | |
237 | | // This intended to be a "good" hash function. It may change from time to time. |
238 | | template<class _CharT, class _Traits, class _Alloc> |
239 | | struct GoodFastHash<std::basic_string<_CharT, _Traits, _Alloc> > { |
240 | | size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { |
241 | | return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); |
242 | | } |
243 | | // Less than operator for MSVC. |
244 | | bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, |
245 | | const std::basic_string<_CharT, _Traits, _Alloc>& b) const { |
246 | | return a < b; |
247 | | } |
248 | | static const size_t bucket_size = 4; // These are required by MSVC |
249 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
250 | | }; |
251 | | |
252 | | // This intended to be a "good" hash function. It may change from time to time. |
253 | | template<class _CharT, class _Traits, class _Alloc> |
254 | | struct GoodFastHash<const std::basic_string<_CharT, _Traits, _Alloc> > { |
255 | | size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { |
256 | | return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); |
257 | | } |
258 | | // Less than operator for MSVC. |
259 | | bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, |
260 | | const std::basic_string<_CharT, _Traits, _Alloc>& b) const { |
261 | | return a < b; |
262 | | } |
263 | | static const size_t bucket_size = 4; // These are required by MSVC |
264 | | static const size_t min_buckets = 8; // 4 and 8 are defaults. |
265 | | }; |
266 | | |
267 | | #endif // YB_GUTIL_HASH_HASH_H |