/Users/deen/code/yugabyte-db/src/yb/rocksdb/memtable/hash_linklist_rep.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | |
21 | | #ifndef ROCKSDB_LITE |
22 | | #include "yb/rocksdb/memtable/hash_linklist_rep.h" |
23 | | |
24 | | #include <algorithm> |
25 | | #include <atomic> |
26 | | #include "yb/rocksdb/memtablerep.h" |
27 | | #include "yb/rocksdb/util/arena.h" |
28 | | #include "yb/util/slice.h" |
29 | | #include "yb/rocksdb/slice_transform.h" |
30 | | #include "yb/rocksdb/port/port.h" |
31 | | #include "yb/rocksdb/util/histogram.h" |
32 | | #include "yb/rocksdb/util/murmurhash.h" |
33 | | #include "yb/rocksdb/db/memtable.h" |
34 | | #include "yb/rocksdb/db/skiplist.h" |
35 | | |
36 | | namespace rocksdb { |
37 | | namespace { |
38 | | |
39 | | typedef const char* Key; |
40 | | typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList; |
41 | | typedef std::atomic<void*> Pointer; |
42 | | |
43 | | // A data structure used as the header of the linked list in a hash bucket.
44 | | struct BucketHeader { |
45 | | Pointer next; |
46 | | std::atomic<uint32_t> num_entries; |
47 | | |
48 | | explicit BucketHeader(void* n, uint32_t count) |
49 | 79.1k | : next(n), num_entries(count) {} |
50 | | |
51 | 1.75M | bool IsSkipListBucket() { |
52 | 1.75M | return next.load(std::memory_order_relaxed) == this; |
53 | 1.75M | } |
54 | | |
55 | 2.47M | uint32_t GetNumEntries() const { |
56 | 2.47M | return num_entries.load(std::memory_order_relaxed); |
57 | 2.47M | } |
58 | | |
59 | | // REQUIRES: called from single-threaded Insert() |
60 | 529k | void IncNumEntries() { |
61 | | // Only one thread can write at a time, so there is no need for an
62 | | // atomic increment. Update it with a relaxed load and store.
63 | 529k | num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed); |
64 | 529k | } |
65 | | }; |
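
To illustrate why the relaxed load/store pair in IncNumEntries() is safe, here is a minimal standalone sketch (the SingleWriterCounter type is hypothetical, not part of this file): with exactly one writer, a plain load-add-store cannot lose an update, and readers tolerate a momentarily stale count.

    #include <atomic>
    #include <cstdint>

    // Hypothetical single-writer counter, mirroring IncNumEntries().
    struct SingleWriterCounter {
      std::atomic<uint32_t> n{0};

      // Writer-only: no fetch_add needed, because no other thread ever
      // stores to n concurrently, so the load-add-store cannot race.
      void Inc() {
        n.store(n.load(std::memory_order_relaxed) + 1,
                std::memory_order_relaxed);
      }

      // Readers may observe a slightly stale value, which is acceptable
      // for the bucket-sizing heuristics this count feeds.
      uint32_t Get() const { return n.load(std::memory_order_relaxed); }
    };
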
66 | | |
67 | | // A data structure used as the header of the skip list in a hash bucket.
68 | | struct SkipListBucketHeader { |
69 | | BucketHeader Counting_header; |
70 | | MemtableSkipList skip_list; |
71 | | |
72 | | explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp, |
73 | | MemTableAllocator* allocator, uint32_t count) |
74 | | : Counting_header(this, // Pointing to itself to indicate header type. |
75 | | count), |
76 | 52 | skip_list(cmp, allocator) {} |
77 | | }; |
78 | | |
79 | | struct Node { |
80 | | // Accessors/mutators for links. Wrapped in methods so we can |
81 | | // add the appropriate barriers as necessary. |
82 | 1.05M | Node* Next() { |
83 | | // Use an 'acquire load' so that we observe a fully initialized |
84 | | // version of the returned Node. |
85 | 1.05M | return next_.load(std::memory_order_acquire); |
86 | 1.05M | } |
87 | 110k | void SetNext(Node* x) { |
88 | | // Use a 'release store' so that anybody who reads through this |
89 | | // pointer observes a fully initialized version of the inserted node. |
90 | 110k | next_.store(x, std::memory_order_release); |
91 | 110k | } |
92 | | // No-barrier variants that can be safely used in a few locations. |
93 | 0 | Node* NoBarrier_Next() { |
94 | 0 | return next_.load(std::memory_order_relaxed); |
95 | 0 | } |
96 | | |
97 | 301k | void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } |
98 | | |
99 | | // Default constructor, needed for the placement new below.
100 | 719k | Node() {} |
101 | | |
102 | | private: |
103 | | std::atomic<Node*> next_; |
104 | | |
105 | | // Prohibit copying, due to the variable-length key storage below.
106 | | Node(const Node&) = delete; |
107 | | Node& operator=(const Node&) = delete; |
108 | | |
109 | | public: |
110 | | char key[1]; |
111 | | }; |
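
The Next()/SetNext() pair above uses the classic release/acquire publication pattern. A minimal sketch of the idea, using a hypothetical Payload type rather than this file's Node:

    #include <atomic>
    #include <cassert>

    struct Payload { int value = 0; };
    std::atomic<Payload*> slot{nullptr};

    void Writer() {
      Payload* p = new Payload;
      p->value = 42;                             // initialize first...
      slot.store(p, std::memory_order_release);  // ...then publish
    }

    void Reader() {
      // The acquire load synchronizes with the release store: if the
      // pointer is visible, the initialized value is visible too, so a
      // reader never observes a half-built Payload.
      if (Payload* p = slot.load(std::memory_order_acquire)) {
        assert(p->value == 42);
      }
    }
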
112 | | |
113 | | // Memory structure of the mem table: |
114 | | // It is a hash table where each bucket points to one entry, a linked list,
115 | | // or a skip list. To track the total number of records in a bucket, and to
116 | | // decide whether it should switch to a skip list, a header is added to
117 | | // record the number of entries in the bucket.
118 | | // |
119 | | // |
120 | | // +-----> NULL Case 1. Empty bucket |
121 | | // | |
122 | | // | |
123 | | // | +---> +-------+ |
124 | | // | | | Next +--> NULL |
125 | | // | | +-------+ |
126 | | // +-----+ | | | | Case 2. One Entry in bucket. |
127 | | // | +-+ | | Data | next pointer points to |
128 | | //  +-----+ | | | NULL. In all other cases the
129 | | // | | | | | next pointer is not NULL. |
130 | | // +-----+ | +-------+ |
131 | | // | +---+ |
132 | | // +-----+ +-> +-------+ +> +-------+ +-> +-------+ |
133 | | // | | | | Next +--+ | Next +--+ | Next +-->NULL |
134 | | // +-----+ | +-------+ +-------+ +-------+ |
135 | | // | +-----+ | Count | | | | | |
136 | | // +-----+ +-------+ | Data | | Data | |
137 | | // | | | | | | |
138 | | // +-----+ Case 3. | | | | |
139 | | // | | A header +-------+ +-------+ |
140 | | // +-----+ points to |
141 | | // | | a linked list. Count indicates total number |
142 | | // +-----+ of rows in this bucket. |
143 | | // | | |
144 | | // +-----+ +-> +-------+ <--+ |
145 | | // | | | | Next +----+ |
146 | | // +-----+ | +-------+ Case 4. A header points to a skip |
147 | | // | +----+ | Count | list and next pointer points to |
148 | | // +-----+ +-------+ itself, to distinguish case 3 or 4. |
149 | | //  | | | | Count is still kept to indicate the
150 | | //  +-----+ | Skip +--> total number of entries in the bucket
151 | | //  | | | List | Data for debugging purposes.
152 | | // | | | +--> |
153 | | // +-----+ | | |
154 | | // | | +-------+ |
155 | | // +-----+ |
156 | | // |
157 | | // We don't have a data race when changing cases because:
158 | | // (1) When changing from case 2 to case 3, we create a new bucket header,
159 | | //     put the single node there first without changing the original node,
160 | | //     and do a release store when changing the bucket pointer. A reader who
161 | | //     sees a stale bucket pointer still reads a valid node; a reader who
162 | | //     sees the new value sees a fully initialized header because of the
163 | | //     release store.
164 | | // (2) When changing from case 3 to case 4, a new header is created with the
165 | | //     skip list pointing to the data, before doing a release store to change
166 | | //     the bucket pointer. The old header and nodes are never changed, so any
167 | | //     reader that sees any of those existing pointers is guaranteed to be
168 | | //     able to iterate to the end of the linked list.
169 | | // (3) The header's next pointer in case 3 might change, but it is never
170 | | //     equal to the header itself, so whether a reader sees a stale or a
171 | | //     newer value, it can correctly distinguish case 3 from case 4.
171 | | // |
172 | | // We use case 2 because we want the format to be efficient when bucket
173 | | // utilization is relatively low. If we used case 3 for single-entry
174 | | // buckets, we would waste 12 bytes for every entry, which can be a
175 | | // significant decrease in memory utilization.
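
To make the four cases concrete, the following simplified sketch shows how a reader would classify a bucket slot; it mirrors the logic of GetSkipListBucketHeader() and GetLinkListFirstNode() below (the Classify() helper is hypothetical, for illustration only):

    #include <atomic>

    enum class BucketKind { kEmpty, kSingleNode, kLinkedList, kSkipList };

    // 'slot' is what the bucket array entry points to (or nullptr).
    BucketKind Classify(std::atomic<void*>* slot) {
      if (slot == nullptr) {
        return BucketKind::kEmpty;            // Case 1
      }
      void* first = slot->load(std::memory_order_relaxed);
      if (first == nullptr) {
        // A lone Node's next pointer (its first field) is nullptr;
        // a header's next never is.
        return BucketKind::kSingleNode;       // Case 2
      }
      if (first == static_cast<void*>(slot)) {
        // A header whose next points to itself marks a skip-list bucket.
        return BucketKind::kSkipList;         // Case 4
      }
      return BucketKind::kLinkedList;         // Case 3
    }
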
176 | | class HashLinkListRep : public MemTableRep { |
177 | | public: |
178 | | HashLinkListRep(const MemTableRep::KeyComparator& compare, |
179 | | MemTableAllocator* allocator, const SliceTransform* transform, |
180 | | size_t bucket_size, uint32_t threshold_use_skiplist, |
181 | | size_t huge_page_tlb_size, Logger* logger, |
182 | | int bucket_entries_logging_threshold, |
183 | | bool if_log_bucket_dist_when_flash); |
184 | | |
185 | | KeyHandle Allocate(const size_t len, char** buf) override; |
186 | | |
187 | | void Insert(KeyHandle handle) override; |
188 | | |
189 | | bool Contains(const char* key) const override; |
190 | | |
191 | | size_t ApproximateMemoryUsage() override; |
192 | | |
193 | | virtual void Get(const LookupKey& k, void* callback_args, |
194 | | bool (*callback_func)(void* arg, |
195 | | const char* entry)) override; |
196 | | |
197 | | virtual ~HashLinkListRep(); |
198 | | |
199 | | MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; |
200 | | |
201 | | virtual MemTableRep::Iterator* GetDynamicPrefixIterator( |
202 | | Arena* arena = nullptr) override; |
203 | | |
204 | | private: |
205 | | friend class DynamicIterator; |
206 | | |
207 | | size_t bucket_size_; |
208 | | |
209 | | // Maps slices (which are transformed user keys) to buckets of keys sharing |
210 | | // the same transform. |
211 | | Pointer* buckets_; |
212 | | |
213 | | const uint32_t threshold_use_skiplist_; |
214 | | |
215 | | // The user-supplied transform whose domain is the user keys. |
216 | | const SliceTransform* transform_; |
217 | | |
218 | | const MemTableRep::KeyComparator& compare_; |
219 | | |
220 | | Logger* logger_; |
221 | | int bucket_entries_logging_threshold_; |
222 | | bool if_log_bucket_dist_when_flash_; |
223 | | |
224 | | bool LinkListContains(Node* head, const Slice& key) const; |
225 | | |
226 | | SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer) |
227 | | const; |
228 | | |
229 | | Node* GetLinkListFirstNode(Pointer* first_next_pointer) const; |
230 | | |
231 | 1.74M | Slice GetPrefix(const Slice& internal_key) const { |
232 | 1.74M | return transform_->Transform(ExtractUserKey(internal_key)); |
233 | 1.74M | } |
234 | | |
235 | 2.18M | size_t GetHash(const Slice& slice) const { |
236 | 2.18M | return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) % |
237 | 2.18M | bucket_size_; |
238 | 2.18M | } |
239 | | |
240 | 1.46M | Pointer* GetBucket(size_t i) const { |
241 | 1.46M | return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire)); |
242 | 1.46M | } |
243 | | |
244 | 1.46M | Pointer* GetBucket(const Slice& slice) const { |
245 | 1.46M | return GetBucket(GetHash(slice)); |
246 | 1.46M | } |
247 | | |
248 | 350 | bool Equal(const Slice& a, const Key& b) const { |
249 | 350 | return (compare_(b, a) == 0); |
250 | 350 | } |
251 | | |
252 | 322 | bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } |
253 | | |
254 | 757k | bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const { |
255 | | // nullptr n is considered infinite |
256 | 757k | return (n != nullptr) && (compare_(n->key, internal_key) < 0); |
257 | 757k | } |
258 | | |
259 | 91.6k | bool KeyIsAfterNode(const Key& key, const Node* n) const { |
260 | | // nullptr n is considered infinite |
261 | 91.6k | return (n != nullptr) && (compare_(n->key, key) < 0); |
262 | 91.6k | } |
263 | | |
264 | | |
265 | | Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; |
266 | | |
267 | | class FullListIterator : public MemTableRep::Iterator { |
268 | | public: |
269 | | explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) |
270 | 753 | : iter_(list), full_list_(list), allocator_(allocator) {} |
271 | | |
272 | 753 | virtual ~FullListIterator() { |
273 | 753 | } |
274 | | |
275 | | // Returns true iff the iterator is positioned at a valid node. |
276 | 1.25M | bool Valid() const override { return iter_.Valid(); } |
277 | | |
278 | | // Returns the key at the current position. |
279 | | // REQUIRES: Valid() |
280 | 627k | const char* key() const override { |
281 | 627k | assert(Valid()); |
282 | 0 | return iter_.key(); |
283 | 627k | } |
284 | | |
285 | | // Advances to the next position. |
286 | | // REQUIRES: Valid() |
287 | 313k | void Next() override { |
288 | 313k | assert(Valid()); |
289 | 0 | iter_.Next(); |
290 | 313k | } |
291 | | |
292 | | // Advances to the previous position. |
293 | | // REQUIRES: Valid() |
294 | 0 | void Prev() override { |
295 | 0 | assert(Valid()); |
296 | 0 | iter_.Prev(); |
297 | 0 | } |
298 | | |
299 | | // Advance to the first entry with a key >= target |
300 | | virtual void Seek(const Slice& internal_key, |
301 | 428 | const char* memtable_key) override { |
302 | 428 | const char* encoded_key = |
303 | 428 | (memtable_key != nullptr) ? |
304 | 428 | memtable_key : EncodeKey(&tmp_, internal_key);
305 | 428 | iter_.Seek(encoded_key); |
306 | 428 | } |
307 | | |
308 | | // Position at the first entry in collection. |
309 | | // Final state of iterator is Valid() iff collection is not empty. |
310 | 325 | void SeekToFirst() override { iter_.SeekToFirst(); } |
311 | | |
312 | | // Position at the last entry in collection. |
313 | | // Final state of iterator is Valid() iff collection is not empty. |
314 | 0 | void SeekToLast() override { iter_.SeekToLast(); } |
315 | | private: |
316 | | MemtableSkipList::Iterator iter_; |
317 | | // To destruct with the iterator. |
318 | | std::unique_ptr<MemtableSkipList> full_list_; |
319 | | std::unique_ptr<Allocator> allocator_; |
320 | | std::string tmp_; // For passing to EncodeKey |
321 | | }; |
322 | | |
323 | | class LinkListIterator : public MemTableRep::Iterator { |
324 | | public: |
325 | | explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep, |
326 | | Node* head) |
327 | | : hash_link_list_rep_(hash_link_list_rep), |
328 | | head_(head), |
329 | 764 | node_(nullptr) {} |
330 | | |
331 | 764 | virtual ~LinkListIterator() {} |
332 | | |
333 | | // Returns true iff the iterator is positioned at a valid node. |
334 | 1.31M | bool Valid() const override { return node_ != nullptr; } |
335 | | |
336 | | // Returns the key at the current position. |
337 | | // REQUIRES: Valid() |
338 | 412k | const char* key() const override { |
339 | 412k | assert(Valid()); |
340 | 0 | return node_->key; |
341 | 412k | } |
342 | | |
343 | | // Advances to the next position. |
344 | | // REQUIRES: Valid() |
345 | 301k | void Next() override { |
346 | 301k | assert(Valid()); |
347 | 0 | node_ = node_->Next(); |
348 | 301k | } |
349 | | |
350 | | // Advances to the previous position. |
351 | | // REQUIRES: Valid() |
352 | 0 | void Prev() override { |
353 | | // Prefix iterator does not support total order. |
354 | | // We simply set the iterator to invalid state |
355 | 0 | Reset(nullptr); |
356 | 0 | } |
357 | | |
358 | | // Advance to the first entry with a key >= target |
359 | | virtual void Seek(const Slice& internal_key, |
360 | 300k | const char* memtable_key) override { |
361 | 300k | node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, |
362 | 300k | internal_key); |
363 | 300k | } |
364 | | |
365 | | // Position at the first entry in collection. |
366 | | // Final state of iterator is Valid() iff collection is not empty. |
367 | 0 | void SeekToFirst() override { |
368 | | // Prefix iterator does not support total order. |
369 | | // We simply set the iterator to invalid state |
370 | 0 | Reset(nullptr); |
371 | 0 | } |
372 | | |
373 | | // Position at the last entry in collection. |
374 | | // Final state of iterator is Valid() iff collection is not empty. |
375 | 0 | void SeekToLast() override { |
376 | | // Prefix iterator does not support total order. |
377 | | // We simply set the iterator to invalid state |
378 | 0 | Reset(nullptr); |
379 | 0 | } |
380 | | |
381 | | protected: |
382 | 300k | void Reset(Node* head) { |
383 | 300k | head_ = head; |
384 | 300k | node_ = nullptr; |
385 | 300k | } |
386 | | private: |
387 | | friend class HashLinkListRep; |
388 | | const HashLinkListRep* const hash_link_list_rep_; |
389 | | Node* head_; |
390 | | Node* node_; |
391 | | |
392 | 443 | virtual void SeekToHead() { |
393 | 443 | node_ = head_; |
394 | 443 | } |
395 | | }; |
396 | | |
397 | | class DynamicIterator : public HashLinkListRep::LinkListIterator { |
398 | | public: |
399 | | explicit DynamicIterator(const HashLinkListRep& memtable_rep) |
400 | | : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr), |
401 | 103 | memtable_rep_(memtable_rep) {} |
402 | | |
403 | | // Advance to the first entry with a key >= target |
404 | 300k | void Seek(const Slice& k, const char* memtable_key) override { |
405 | 300k | auto transformed = memtable_rep_.GetPrefix(k); |
406 | 300k | auto* bucket = memtable_rep_.GetBucket(transformed); |
407 | | |
408 | 300k | SkipListBucketHeader* skip_list_header = |
409 | 300k | memtable_rep_.GetSkipListBucketHeader(bucket); |
410 | 300k | if (skip_list_header != nullptr) { |
411 | | // The bucket is organized as a skip list |
412 | 2 | if (!skip_list_iter_) { |
413 | 2 | skip_list_iter_.reset( |
414 | 2 | new MemtableSkipList::Iterator(&skip_list_header->skip_list)); |
415 | 2 | } else { |
416 | 0 | skip_list_iter_->SetList(&skip_list_header->skip_list); |
417 | 0 | } |
418 | 2 | if (memtable_key != nullptr) { |
419 | 0 | skip_list_iter_->Seek(memtable_key); |
420 | 2 | } else { |
421 | 2 | IterKey encoded_key; |
422 | 2 | encoded_key.EncodeLengthPrefixedKey(k); |
423 | 2 | skip_list_iter_->Seek(encoded_key.GetKey().cdata()); |
424 | 2 | } |
425 | 300k | } else { |
426 | | // The bucket is organized as a linked list |
427 | 300k | skip_list_iter_.reset(); |
428 | 300k | Reset(memtable_rep_.GetLinkListFirstNode(bucket)); |
429 | 300k | HashLinkListRep::LinkListIterator::Seek(k, memtable_key); |
430 | 300k | } |
431 | 300k | } |
432 | | |
433 | 1.31M | bool Valid() const override { |
434 | 1.31M | if (skip_list_iter_) { |
435 | 9 | return skip_list_iter_->Valid(); |
436 | 9 | } |
437 | 1.31M | return HashLinkListRep::LinkListIterator::Valid(); |
438 | 1.31M | } |
439 | | |
440 | 410k | const char* key() const override { |
441 | 410k | if (skip_list_iter_) { |
442 | 14 | return skip_list_iter_->key(); |
443 | 14 | } |
444 | 410k | return HashLinkListRep::LinkListIterator::key(); |
445 | 410k | } |
446 | | |
447 | 300k | void Next() override { |
448 | 300k | if (skip_list_iter_) { |
449 | 7 | skip_list_iter_->Next(); |
450 | 300k | } else { |
451 | 300k | HashLinkListRep::LinkListIterator::Next(); |
452 | 300k | } |
453 | 300k | } |
454 | | |
455 | | private: |
456 | | // the underlying memtable |
457 | | const HashLinkListRep& memtable_rep_; |
458 | | std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_; |
459 | | }; |
460 | | |
461 | | class EmptyIterator : public MemTableRep::Iterator { |
462 | | // This is used when there wasn't a bucket. It is cheaper than |
463 | | // instantiating an empty bucket over which to iterate. |
464 | | public: |
465 | 0 | EmptyIterator() { } |
466 | 0 | bool Valid() const override { return false; } |
467 | 0 | const char* key() const override { |
468 | 0 | assert(false); |
469 | 0 | return nullptr; |
470 | 0 | } |
471 | 0 | void Next() override {} |
472 | 0 | void Prev() override {} |
473 | | virtual void Seek(const Slice& user_key, |
474 | 0 | const char* memtable_key) override {} |
475 | 0 | void SeekToFirst() override {} |
476 | 0 | void SeekToLast() override {} |
477 | | |
478 | | private: |
479 | | }; |
480 | | }; |
481 | | |
482 | | HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, |
483 | | MemTableAllocator* allocator, |
484 | | const SliceTransform* transform, |
485 | | size_t bucket_size, |
486 | | uint32_t threshold_use_skiplist, |
487 | | size_t huge_page_tlb_size, Logger* logger, |
488 | | int bucket_entries_logging_threshold, |
489 | | bool if_log_bucket_dist_when_flash) |
490 | | : MemTableRep(allocator), |
491 | | bucket_size_(bucket_size), |
492 | | // Threshold to use skip list doesn't make sense if less than 3, so we |
493 | | // force it to be minimum of 3 to simplify implementation. |
494 | | threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)), |
495 | | transform_(transform), |
496 | | compare_(compare), |
497 | | logger_(logger), |
498 | | bucket_entries_logging_threshold_(bucket_entries_logging_threshold), |
499 | 695 | if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { |
500 | 695 | char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, |
501 | 695 | huge_page_tlb_size, logger); |
502 | | |
503 | 695 | buckets_ = new (mem) Pointer[bucket_size]; |
504 | | |
505 | 303k | for (size_t i = 0; i < bucket_size_; ++i) {
506 | 302k | buckets_[i].store(nullptr, std::memory_order_relaxed); |
507 | 302k | } |
508 | 695 | } |
509 | | |
510 | 695 | HashLinkListRep::~HashLinkListRep() { |
511 | 695 | } |
512 | | |
513 | 719k | KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { |
514 | 719k | char* mem = allocator_->AllocateAligned(sizeof(Node) + len); |
515 | 719k | Node* x = new (mem) Node(); |
516 | 719k | *buf = x->key; |
517 | 719k | return static_cast<void*>(x); |
518 | 719k | } |
519 | | |
520 | | SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( |
521 | 1.27M | Pointer* first_next_pointer) const { |
522 | 1.27M | if (first_next_pointer == nullptr) { |
523 | 287 | return nullptr; |
524 | 287 | } |
525 | 1.27M | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { |
526 | | // Single entry bucket |
527 | 190k | return nullptr; |
528 | 190k | } |
529 | | // Counting header |
530 | 1.08M | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
531 | 1.08M | if (header->IsSkipListBucket()) { |
532 | 859k | assert(header->GetNumEntries() > threshold_use_skiplist_); |
533 | 0 | auto* skip_list_bucket_header = |
534 | 859k | reinterpret_cast<SkipListBucketHeader*>(header); |
535 | 859k | assert(skip_list_bucket_header->Counting_header.next.load( |
536 | 859k | std::memory_order_relaxed) == header); |
537 | 0 | return skip_list_bucket_header; |
538 | 859k | } |
539 | 221k | assert(header->GetNumEntries() <= threshold_use_skiplist_); |
540 | 0 | return nullptr; |
541 | 1.08M | } |
542 | | |
543 | 412k | Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { |
544 | 412k | if (first_next_pointer == nullptr) { |
545 | 287 | return nullptr; |
546 | 287 | } |
547 | 411k | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { |
548 | | // Single entry bucket |
549 | 190k | return reinterpret_cast<Node*>(first_next_pointer); |
550 | 190k | } |
551 | | // Counting header |
552 | 221k | BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
553 | 221k | if (!header->IsSkipListBucket()) { |
554 | 221k | assert(header->GetNumEntries() <= threshold_use_skiplist_); |
555 | 0 | return reinterpret_cast<Node*>( |
556 | 221k | header->next.load(std::memory_order_acquire)); |
557 | 221k | } |
558 | 0 | assert(header->GetNumEntries() > threshold_use_skiplist_); |
559 | 0 | return nullptr; |
560 | 221k | } |
561 | | |
562 | 719k | void HashLinkListRep::Insert(KeyHandle handle) { |
563 | 719k | Node* x = static_cast<Node*>(handle); |
564 | 719k | assert(!Contains(x->key)); |
565 | 0 | Slice internal_key = GetLengthPrefixedSlice(x->key); |
566 | 719k | auto transformed = GetPrefix(internal_key); |
567 | 719k | auto& bucket = buckets_[GetHash(transformed)]; |
568 | 719k | Pointer* first_next_pointer = |
569 | 719k | static_cast<Pointer*>(bucket.load(std::memory_order_relaxed)); |
570 | | |
571 | 719k | if (first_next_pointer == nullptr) { |
572 | | // Case 1. Empty bucket.
573 | | // NoBarrier_SetNext() suffices since the release store to the bucket
574 | | // pointer below publishes "x".
575 | 189k | x->NoBarrier_SetNext(nullptr); |
576 | 189k | bucket.store(x, std::memory_order_release); |
577 | 189k | return; |
578 | 189k | } |
579 | | |
580 | 529k | BucketHeader* header = nullptr; |
581 | 529k | if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { |
582 | | // Case 2. Only one entry in the bucket.
583 | | // Need to convert it to a counting bucket, turning it into case 3.
584 | 79.0k | Node* first = reinterpret_cast<Node*>(first_next_pointer); |
585 | | // Need to add a bucket header. |
586 | | // We have to convert it to a bucket with a header before inserting the
587 | | // new node. Otherwise, we might need to change the next pointer of first.
588 | | // In that case, a reader might see that the next pointer is NULL and
589 | | // wrongly think the node is a bucket header.
590 | 79.0k | auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader)); |
591 | 79.0k | header = new (mem) BucketHeader(first, 1); |
592 | 79.0k | bucket.store(header, std::memory_order_release); |
593 | 450k | } else { |
594 | 450k | header = reinterpret_cast<BucketHeader*>(first_next_pointer); |
595 | 450k | if (header->IsSkipListBucket()) { |
596 | | // Case 4. Bucket is already a skip list |
597 | 418k | assert(header->GetNumEntries() > threshold_use_skiplist_); |
598 | 0 | auto* skip_list_bucket_header = |
599 | 418k | reinterpret_cast<SkipListBucketHeader*>(header); |
600 | | // Only one thread can execute Insert() at a time, so there is no need
601 | | // for an atomic increment.
602 | 418k | skip_list_bucket_header->Counting_header.IncNumEntries(); |
603 | 418k | skip_list_bucket_header->skip_list.Insert(x->key); |
604 | 418k | return; |
605 | 418k | } |
606 | 450k | } |
607 | | |
608 | 111k | if (bucket_entries_logging_threshold_ > 0 && |
609 | 111k | header->GetNumEntries() == |
610 | 111k | static_cast<uint32_t>(bucket_entries_logging_threshold_)) { |
611 | 2.04k | RINFO(logger_, "HashLinkedList bucket %" ROCKSDB_PRIszt |
612 | 2.04k | " has more than %d " |
613 | 2.04k | "entries. Key to insert: %s", |
614 | 2.04k | GetHash(transformed), header->GetNumEntries(), |
615 | 2.04k | GetLengthPrefixedSlice(x->key).ToString(true).c_str()); |
616 | 2.04k | } |
617 | | |
618 | 111k | if (header->GetNumEntries() == threshold_use_skiplist_) { |
619 | | // Case 3. The number of entries reaches the threshold, so we need to
620 | | // convert the bucket to a skip list.
621 | 52 | LinkListIterator bucket_iter( |
622 | 52 | this, reinterpret_cast<Node*>( |
623 | 52 | first_next_pointer->load(std::memory_order_relaxed))); |
624 | 52 | auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader)); |
625 | 52 | SkipListBucketHeader* new_skip_list_header = new (mem) |
626 | 52 | SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1); |
627 | 52 | auto& skip_list = new_skip_list_header->skip_list; |
628 | | |
629 | | // Add all current entries to the skip list |
630 | 260 | for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
631 | 208 | skip_list.Insert(bucket_iter.key()); |
632 | 208 | } |
633 | | |
634 | | // insert the new entry |
635 | 52 | skip_list.Insert(x->key); |
636 | | // Set the bucket |
637 | 52 | bucket.store(new_skip_list_header, std::memory_order_release); |
638 | 111k | } else { |
639 | | // Case 5. Insert into the sorted linked list without changing the
640 | | // header.
641 | 111k | Node* first = |
642 | 111k | reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed)); |
643 | 111k | assert(first != nullptr); |
644 | | // Advance the counter, unless the bucket is about to be converted to a
645 | | // skip list. In that case we must make sure the count never exceeds
646 | | // threshold_use_skiplist_, so readers never cast the header to the wrong format.
647 | 0 | header->IncNumEntries(); |
648 | | |
649 | 111k | Node* cur = first; |
650 | 111k | Node* prev = nullptr; |
651 | 263k | while (true) { |
652 | 263k | if (cur == nullptr) { |
653 | 110k | break; |
654 | 110k | } |
655 | 152k | Node* next = cur->Next(); |
656 | | // Make sure the list is sorted.
657 | | // If cur is the first node or next is nullptr, this is trivially satisfied.
658 | 152k | assert((cur == first) || (next == nullptr) || |
659 | 152k | KeyIsAfterNode(next->key, cur)); |
660 | 152k | if (KeyIsAfterNode(internal_key, cur)) { |
661 | | // Keep searching in this list |
662 | 152k | prev = cur; |
663 | 152k | cur = next; |
664 | 152k | } else { |
665 | 322 | break; |
666 | 322 | } |
667 | 152k | } |
668 | | |
669 | | // Our data structure does not allow duplicate insertion |
670 | 111k | assert(cur == nullptr || !Equal(x->key, cur->key)); |
671 | | |
672 | | // NoBarrier_SetNext() suffices since the release store that links "x"
673 | | // into the list below publishes it.
674 | 0 | x->NoBarrier_SetNext(cur); |
675 | | |
676 | 111k | if (prev) { |
677 | 110k | prev->SetNext(x); |
678 | 110k | } else { |
679 | 210 | header->next.store(static_cast<void*>(x), std::memory_order_release); |
680 | 210 | } |
681 | 111k | } |
682 | 111k | } |
683 | | |
684 | 719k | bool HashLinkListRep::Contains(const char* key) const { |
685 | 719k | Slice internal_key = GetLengthPrefixedSlice(key); |
686 | | |
687 | 719k | auto transformed = GetPrefix(internal_key); |
688 | 719k | auto bucket = GetBucket(transformed); |
689 | 719k | if (bucket == nullptr) { |
690 | 189k | return false; |
691 | 189k | } |
692 | | |
693 | 529k | SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket); |
694 | 529k | if (skip_list_header != nullptr) { |
695 | 418k | return skip_list_header->skip_list.Contains(key); |
696 | 418k | } else { |
697 | 111k | return LinkListContains(GetLinkListFirstNode(bucket), internal_key); |
698 | 111k | } |
699 | 529k | } |
700 | | |
701 | 721k | size_t HashLinkListRep::ApproximateMemoryUsage() { |
702 | | // Memory is always allocated from the allocator. |
703 | 721k | return 0; |
704 | 721k | } |
705 | | |
706 | | void HashLinkListRep::Get(const LookupKey& k, void* callback_args, |
707 | 441k | bool (*callback_func)(void* arg, const char* entry)) { |
708 | 441k | auto transformed = transform_->Transform(k.user_key()); |
709 | 441k | auto bucket = GetBucket(transformed); |
710 | | |
711 | 441k | auto* skip_list_header = GetSkipListBucketHeader(bucket); |
712 | 441k | if (skip_list_header != nullptr) {
713 | | // Is a skip list |
714 | 441k | MemtableSkipList::Iterator iter(&skip_list_header->skip_list); |
715 | 441k | for (iter.Seek(k.memtable_key().cdata()); |
716 | 441k | iter.Valid() && callback_func(callback_args, iter.key());
717 | 441k | iter.Next()) {
718 | 0 | } |
719 | 18.4E | } else { |
720 | 18.4E | auto* link_list_head = GetLinkListFirstNode(bucket); |
721 | 18.4E | if (link_list_head != nullptr) { |
722 | 218 | LinkListIterator iter(this, link_list_head); |
723 | 218 | for (iter.Seek(k.internal_key(), nullptr); |
724 | 218 | iter.Valid() && callback_func(callback_args, iter.key());
725 | 218 | iter.Next()) {
726 | 0 | } |
727 | 218 | } |
728 | 18.4E | } |
729 | 441k | } |
730 | | |
731 | 753 | MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { |
732 | | // allocate a new arena of similar size to the one currently in use |
733 | 753 | Arena* new_arena = new Arena(allocator_->BlockSize()); |
734 | 753 | auto list = new MemtableSkipList(compare_, new_arena); |
735 | 753 | HistogramImpl keys_per_bucket_hist; |
736 | | |
737 | 3.73k | for (size_t i = 0; i < bucket_size_; ++i) {
738 | 2.98k | int count = 0; |
739 | 2.98k | auto* bucket = GetBucket(i); |
740 | 2.98k | if (bucket != nullptr) { |
741 | 779 | auto* skip_list_header = GetSkipListBucketHeader(bucket); |
742 | 779 | if (skip_list_header != nullptr) { |
743 | | // Is a skip list |
744 | 388 | MemtableSkipList::Iterator itr(&skip_list_header->skip_list); |
745 | 358k | for (itr.SeekToFirst(); itr.Valid(); itr.Next()358k ) { |
746 | 358k | list->Insert(itr.key()); |
747 | 358k | count++; |
748 | 358k | } |
749 | 391 | } else { |
750 | 391 | auto* link_list_head = GetLinkListFirstNode(bucket); |
751 | 391 | if (link_list_head != nullptr) { |
752 | 391 | LinkListIterator itr(this, link_list_head); |
753 | 1.23k | for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
754 | 848 | list->Insert(itr.key()); |
755 | 848 | count++; |
756 | 848 | } |
757 | 391 | } |
758 | 391 | } |
759 | 779 | } |
760 | 2.98k | if (if_log_bucket_dist_when_flash_) { |
761 | 2.98k | keys_per_bucket_hist.Add(count); |
762 | 2.98k | } |
763 | 2.98k | } |
764 | 753 | if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) { |
765 | 753 | RINFO(logger_, "hashLinkedList Entry distribution among buckets: %s", |
766 | 753 | keys_per_bucket_hist.ToString().c_str()); |
767 | 753 | } |
768 | | |
769 | 753 | if (alloc_arena == nullptr) { |
770 | 0 | return new FullListIterator(list, new_arena); |
771 | 753 | } else { |
772 | 753 | auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator)); |
773 | 753 | return new (mem) FullListIterator(list, new_arena); |
774 | 753 | } |
775 | 753 | } |
776 | | |
777 | | MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator( |
778 | 103 | Arena* alloc_arena) { |
779 | 103 | if (alloc_arena == nullptr) { |
780 | 0 | return new DynamicIterator(*this); |
781 | 103 | } else { |
782 | 103 | auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator)); |
783 | 103 | return new (mem) DynamicIterator(*this); |
784 | 103 | } |
785 | 103 | } |
786 | | |
787 | | bool HashLinkListRep::LinkListContains(Node* head, |
788 | 111k | const Slice& user_key) const { |
789 | 111k | Node* x = FindGreaterOrEqualInBucket(head, user_key); |
790 | 111k | return (x != nullptr && Equal(user_key, x->key));
791 | 111k | } |
792 | | |
793 | | Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, |
794 | 411k | const Slice& key) const { |
795 | 411k | Node* x = head; |
796 | 715k | while (true) { |
797 | 715k | if (x == nullptr) { |
798 | 111k | return x; |
799 | 111k | } |
800 | 604k | Node* next = x->Next(); |
801 | | // Make sure the list is sorted.
802 | | // If x points to head or next is nullptr, this is trivially satisfied.
803 | 604k | assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x)); |
804 | 604k | if (KeyIsAfterNode(key, x)) { |
805 | | // Keep searching in this list |
806 | 304k | x = next; |
807 | 304k | } else { |
808 | 300k | break; |
809 | 300k | } |
810 | 604k | } |
811 | 300k | return x; |
812 | 411k | } |
813 | | |
814 | | } // anon namespace |
815 | | |
816 | | MemTableRep* HashLinkListRepFactory::CreateMemTableRep( |
817 | | const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, |
818 | 695 | const SliceTransform* transform, Logger* logger) { |
819 | 695 | return new HashLinkListRep(compare, allocator, transform, bucket_count_, |
820 | 695 | threshold_use_skiplist_, huge_page_tlb_size_, |
821 | 695 | logger, bucket_entries_logging_threshold_, |
822 | 695 | if_log_bucket_dist_when_flash_); |
823 | 695 | } |
824 | | |
825 | | MemTableRepFactory* NewHashLinkListRepFactory( |
826 | | size_t bucket_count, size_t huge_page_tlb_size, |
827 | | int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash, |
828 | 279 | uint32_t threshold_use_skiplist) { |
829 | 279 | return new HashLinkListRepFactory( |
830 | 279 | bucket_count, threshold_use_skiplist, huge_page_tlb_size, |
831 | 279 | bucket_entries_logging_threshold, if_log_bucket_dist_when_flash); |
832 | 279 | } |
833 | | |
834 | | } // namespace rocksdb |
835 | | #endif // ROCKSDB_LITE |
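
For context, a hypothetical configuration sketch showing how this memtable rep is typically selected. It assumes the factory's default arguments and the usual include paths in this tree; the rep only makes sense together with a prefix extractor, since buckets are keyed by the transformed prefix:

    #include "yb/rocksdb/options.h"
    #include "yb/rocksdb/memtablerep.h"
    #include "yb/rocksdb/slice_transform.h"

    rocksdb::Options MakeHashLinkListOptions() {
      rocksdb::Options options;
      // Buckets are keyed by the transformed (prefix) user key.
      options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
      options.memtable_factory.reset(
          rocksdb::NewHashLinkListRepFactory(/* bucket_count = */ 50000));
      return options;
    }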