YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/memtable/hash_linklist_rep.cc
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under the BSD-style license found in the
//  LICENSE file in the root directory of this source tree. An additional grant
//  of patent rights can be found in the PATENTS file in the same directory.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//

#ifndef ROCKSDB_LITE
#include "yb/rocksdb/memtable/hash_linklist_rep.h"

#include <algorithm>
#include <atomic>
#include "yb/rocksdb/memtablerep.h"
#include "yb/rocksdb/util/arena.h"
#include "yb/util/slice.h"
#include "yb/rocksdb/slice_transform.h"
#include "yb/rocksdb/port/port.h"
#include "yb/rocksdb/util/histogram.h"
#include "yb/rocksdb/util/murmurhash.h"
#include "yb/rocksdb/db/memtable.h"
#include "yb/rocksdb/db/skiplist.h"

namespace rocksdb {
namespace {

typedef const char* Key;
typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
typedef std::atomic<void*> Pointer;

// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  Pointer next;
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  bool IsSkipListBucket() {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread can write at a time, so there is no need for an atomic
    // increment; a relaxed load and store suffice.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};

// A data structure used as the header of a skip list of a hash bucket.
struct SkipListBucketHeader {
  BucketHeader Counting_header;
  MemtableSkipList skip_list;

  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
                                MemTableAllocator* allocator, uint32_t count)
      : Counting_header(this,  // Pointing to itself to indicate header type.
                        count),
        skip_list(cmp, allocator) {}
};

struct Node {
  // Accessors/mutators for links.  Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() {
    return next_.load(std::memory_order_relaxed);
  }

  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }

  // Needed for the placement new below, which is fine.
  Node() {}

 private:
  std::atomic<Node*> next_;

  // Prohibit copying due to the flexible-size key array below.
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  char key[1];
};
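
// Note: Next() and SetNext() form a release/acquire pair: a reader that
// observes a pointer stored with 'release' also observes every write that
// initialized the pointed-to Node, including its key bytes. This is what
// makes lock-free readers safe against the single concurrent writer.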

// Memory structure of the mem table:
// It is a hash table; each bucket points to one entry, a linked list, or a
// skip list. In order to track the total number of records in a bucket and
// determine whether it should switch to a skip list, a header is added to
// indicate the number of entries in the bucket.
//
//
//          +-----> NULL    Case 1. Empty bucket
//          |
//          |
//          | +---> +-------+
//          | |     | Next  +--> NULL
//          | |     +-------+
//  +-----+ | |     |       |  Case 2. One Entry in bucket.
//  |     +-+ |     | Data  |          next pointer points to
//  +-----+   |     |       |          NULL. In all other cases
//  |     |   |     |       |          the next pointer is not NULL.
//  +-----+   |     +-------+
//  |     +---+
//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
//  +-----+     |   +-------+     +-------+      +-------+
//  |     +-----+   | Count |     |       |      |       |
//  +-----+         +-------+     | Data  |      | Data  |
//  |     |                       |       |      |       |
//  +-----+          Case 3.      |       |      |       |
//  |     |          A header     +-------+      +-------+
//  +-----+          points to
//  |     |          a linked list. Count indicates the total number
//  +-----+          of rows in this bucket.
//  |     |
//  +-----+    +-> +-------+ <--+
//  |     |    |   | Next  +----+
//  +-----+    |   +-------+   Case 4. A header points to a skip
//  |     +----+   | Count |           list and the next pointer points to
//  +-----+        +-------+           itself, to distinguish case 3 from 4.
//  |     |        |       |           Count is still kept, to indicate the
//  +-----+        | Skip +-->         total number of entries in the bucket
//  |     |        | List  |   Data    for debugging purposes.
//  |     |        |      +-->
//  +-----+        |       |
//  |     |        +-------+
//  +-----+
//
// There is no data race when changing cases because:
// (1) When changing from case 2->3, we create a new bucket header, put the
//     single node there first without changing the original node, and do a
//     release store when changing the bucket pointer. A reader who sees a
//     stale value of the bucket pointer will read this node, while a reader
//     who sees the new value reads correctly because of the release store.
// (2) When changing from case 3->4, a new header is created with the skip
//     list pointing to the data, before doing a release store to change the
//     bucket pointer. The old header and nodes are never changed, so any
//     reader that sees any of those existing pointers is guaranteed to be
//     able to iterate to the end of the linked list.
// (3) The header's next pointer in case 3 might change, but it is never equal
//     to the header itself, so no matter whether a reader sees a stale or a
//     newer value, it can correctly distinguish case 3 from case 4.
//
// The reason we use case 2 is that we want the format to be efficient when
// the utilization of buckets is relatively low. If we used case 3 for
// single-entry buckets, we would waste 12 bytes for every entry, which can be
// a significant decrease in memory utilization.
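//
// For illustration, a condensed sketch of how a reader classifies a bucket,
// mirroring the checks in GetSkipListBucketHeader() and GetLinkListFirstNode()
// below (`bucket` is the Pointer* loaded from the bucket array):
//
//   if (bucket == nullptr) {
//     // Case 1: empty bucket.
//   } else if (bucket->load(std::memory_order_relaxed) == nullptr) {
//     // Case 2: `bucket` is actually the single Node; its next pointer,
//     // which is the first member of both Node and BucketHeader, is NULL.
//   } else if (reinterpret_cast<BucketHeader*>(bucket)->IsSkipListBucket()) {
//     // Case 4: the header of a SkipListBucketHeader (next points to itself).
//   } else {
//     // Case 3: a BucketHeader in front of a sorted linked list.
//   }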
class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare,
                  MemTableAllocator* allocator, const SliceTransform* transform,
                  size_t bucket_size, uint32_t threshold_use_skiplist,
                  size_t huge_page_tlb_size, Logger* logger,
                  int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  KeyHandle Allocate(const size_t len, char** buf) override;

  void Insert(KeyHandle handle) override;

  bool Contains(const char* key) const override;

  size_t ApproximateMemoryUsage() override;

  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg,
                                         const char* entry)) override;

  virtual ~HashLinkListRep();

  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;

  virtual MemTableRep::Iterator* GetDynamicPrefixIterator(
       Arena* arena = nullptr) override;

 private:
  friend class DynamicIterator;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  Pointer* buckets_;

  const uint32_t threshold_use_skiplist_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  const MemTableRep::KeyComparator& compare_;

  Logger* logger_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

  bool LinkListContains(Node* head, const Slice& key) const;

  SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
      const;

  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return MurmurHash(slice.data(), static_cast<int>(slice.size()), 0) %
           bucket_size_;
  }

  Pointer* GetBucket(size_t i) const {
    return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
  }

  Pointer* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }


  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;

  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
        : iter_(list), full_list_(list), allocator_(allocator) {}

    virtual ~FullListIterator() {
    }

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& internal_key,
                      const char* memtable_key) override {
      const char* encoded_key =
          (memtable_key != nullptr) ?
              memtable_key : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }
   private:
    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Allocator> allocator_;
    std::string tmp_;       // For passing to EncodeKey
  };

  class LinkListIterator : public MemTableRep::Iterator {
   public:
    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
                              Node* head)
        : hash_link_list_rep_(hash_link_list_rep),
          head_(head),
          node_(nullptr) {}

    virtual ~LinkListIterator() {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return node_ != nullptr; }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    virtual void Seek(const Slice& internal_key,
                      const char* memtable_key) override {
      node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_,
                                                              internal_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override {
      // Prefix iterator does not support total order.
      // We simply set the iterator to invalid state
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }
   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;

    virtual void SeekToHead() {
      node_ = head_;
    }
  };

  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(const HashLinkListRep& memtable_rep)
        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    void Seek(const Slice& k, const char* memtable_key) override {
      auto transformed = memtable_rep_.GetPrefix(k);
      auto* bucket = memtable_rep_.GetBucket(transformed);

      SkipListBucketHeader* skip_list_header =
          memtable_rep_.GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // The bucket is organized as a skip list
        if (!skip_list_iter_) {
          skip_list_iter_.reset(
              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
        } else {
          skip_list_iter_->SetList(&skip_list_header->skip_list);
        }
        if (memtable_key != nullptr) {
          skip_list_iter_->Seek(memtable_key);
        } else {
          IterKey encoded_key;
          encoded_key.EncodeLengthPrefixedKey(k);
          skip_list_iter_->Seek(encoded_key.GetKey().cdata());
        }
      } else {
        // The bucket is organized as a linked list
        skip_list_iter_.reset();
        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
      }
    }

    bool Valid() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->Valid();
      }
      return HashLinkListRep::LinkListIterator::Valid();
    }

    const char* key() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->key();
      }
      return HashLinkListRep::LinkListIterator::key();
    }

    void Next() override {
      if (skip_list_iter_) {
        skip_list_iter_->Next();
      } else {
        HashLinkListRep::LinkListIterator::Next();
      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };

  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() { }
    bool Valid() const override { return false; }
    const char* key() const override {
      assert(false);
      return nullptr;
    }
    void Next() override {}
    void Prev() override {}
    virtual void Seek(const Slice& user_key,
                      const char* memtable_key) override {}
    void SeekToFirst() override {}
    void SeekToLast() override {}

   private:
  };
};

HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare,
                                 MemTableAllocator* allocator,
                                 const SliceTransform* transform,
                                 size_t bucket_size,
                                 uint32_t threshold_use_skiplist,
                                 size_t huge_page_tlb_size, Logger* logger,
                                 int bucket_entries_logging_threshold,
                                 bool if_log_bucket_dist_when_flash)
    : MemTableRep(allocator),
      bucket_size_(bucket_size),
      // A skip-list threshold of less than 3 doesn't make sense, so we force
      // a minimum of 3 to simplify the implementation.
      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
                                      huge_page_tlb_size, logger);

  buckets_ = new (mem) Pointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].store(nullptr, std::memory_order_relaxed);
  }
}

HashLinkListRep::~HashLinkListRep() {
}

KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
  Node* x = new (mem) Node();
  *buf = x->key;
  return static_cast<void*>(x);
}
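
// Note on the Allocate()/Insert() contract: the KeyHandle returned above is
// the Node itself. The caller writes the length-prefixed key into *buf (which
// aliases x->key) and later passes the same handle to Insert() below, which
// recovers the Node with a static_cast.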

SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
    Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return nullptr;
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (header->IsSkipListBucket()) {
    assert(header->GetNumEntries() > threshold_use_skiplist_);
    auto* skip_list_bucket_header =
        reinterpret_cast<SkipListBucketHeader*>(header);
    assert(skip_list_bucket_header->Counting_header.next.load(
               std::memory_order_relaxed) == header);
    return skip_list_bucket_header;
  }
  assert(header->GetNumEntries() <= threshold_use_skiplist_);
  return nullptr;
}

Node* HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return reinterpret_cast<Node*>(first_next_pointer);
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (!header->IsSkipListBucket()) {
    assert(header->GetNumEntries() <= threshold_use_skiplist_);
    return reinterpret_cast<Node*>(
        header->next.load(std::memory_order_acquire));
  }
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  return nullptr;
}

void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(nullptr);
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket
    // Need to convert it to a counting bucket, turning it into case 3.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to first convert it to a bucket with a header before inserting
    // the new node. Otherwise, we might need to change the next pointer of
    // first. In that case, a reader might see the next pointer as NULL and
    // wrongly think the node is a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at a time, so there is no need
      // for an atomic increment.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    RINFO(logger_, "HashLinkedList bucket %" ROCKSDB_PRIszt
                  " has more than %d "
                  "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. The number of entries has reached the threshold, so we need to
    // convert to a skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // insert the new entry
    skip_list.Insert(x->key);
    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Still case 3: insert into the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance the counter unless the bucket needs to be converted to a skip
    // list. In that case, we need to make sure the previous count never
    // exceeds threshold_use_skiplist_, to prevent readers from casting to
    // the wrong format.
    header->IncNumEntries();

    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the lists are sorted.
      // If cur points to first or next is nullptr, it is trivially satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev[i].
    x->NoBarrier_SetNext(cur);

    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}
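
// Illustrative lifecycle of a single bucket under repeated Insert() calls,
// assuming the minimum threshold_use_skiplist_ of 3:
//   insert #1: empty bucket (case 1); the Node itself is stored as the bucket
//              pointer, with a NULL next pointer (now case 2).
//   insert #2: a BucketHeader with count 1 is published first, then the new
//              node is linked into the sorted list (case 3, count becomes 2).
//   insert #3: still case 3; count becomes 3, matching the threshold.
//   insert #4: count == threshold triggers the conversion: all existing nodes
//              plus the new key are inserted into a fresh SkipListBucketHeader,
//              which is then published (case 4). Subsequent inserts go
//              directly into the skip list.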

bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);

  auto transformed = GetPrefix(internal_key);
  auto bucket = GetBucket(transformed);
  if (bucket == nullptr) {
    return false;
  }

  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    return skip_list_header->skip_list.Contains(key);
  } else {
    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
  }
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the allocator.
  return 0;
}

void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg, const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  auto bucket = GetBucket(transformed);

  auto* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    // Is a skip list
    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
    for (iter.Seek(k.memtable_key().cdata());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  } else {
    auto* link_list_head = GetLinkListFirstNode(bucket);
    if (link_list_head != nullptr) {
      LinkListIterator iter(this, link_list_head);
      for (iter.Seek(k.internal_key(), nullptr);
           iter.Valid() && callback_func(callback_args, iter.key());
           iter.Next()) {
      }
    }
  }
}
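
// Note: Get() seeks to the first entry at or after the lookup key within the
// target bucket and keeps invoking callback_func until it returns false or
// the bucket is exhausted; the caller's callback decides when to stop.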

MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(allocator_->BlockSize());
  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
    auto* bucket = GetBucket(i);
    if (bucket != nullptr) {
      auto* skip_list_header = GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // Is a skip list
        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      } else {
        auto* link_list_head = GetLinkListFirstNode(bucket);
        if (link_list_head != nullptr) {
          LinkListIterator itr(this, link_list_head);
          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
            list->Insert(itr.key());
            count++;
          }
        }
      }
    }
    if (if_log_bucket_dist_when_flash_) {
      keys_per_bucket_hist.Add(count);
    }
  }
  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
    RINFO(logger_, "hashLinkedList Entry distribution among buckets: %s",
        keys_per_bucket_hist.ToString().c_str());
  }

  if (alloc_arena == nullptr) {
    return new FullListIterator(list, new_arena);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
    return new (mem) FullListIterator(list, new_arena);
  }
}
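
// Note: GetIterator() provides total-order iteration by copying every entry
// from every bucket into a freshly allocated skip list, so it is O(n log n)
// in the number of entries. The FullListIterator takes ownership of both the
// new skip list and its arena, which are freed when the iterator is
// destroyed.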

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
    Arena* alloc_arena) {
  if (alloc_arena == nullptr) {
    return new DynamicIterator(*this);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
    return new (mem) DynamicIterator(*this);
  }
}

bool HashLinkListRep::LinkListContains(Node* head,
                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the lists are sorted.
    // If x points to head or next is nullptr, it is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

} // anon namespace

MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator,
    const SliceTransform* transform, Logger* logger) {
  return new HashLinkListRep(compare, allocator, transform, bucket_count_,
                             threshold_use_skiplist_, huge_page_tlb_size_,
                             logger, bucket_entries_logging_threshold_,
                             if_log_bucket_dist_when_flash_);
}

MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
    uint32_t threshold_use_skiplist) {
  return new HashLinkListRepFactory(
      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
}
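
// Illustrative usage (a minimal sketch; the prefix length and tuning values
// below are assumptions for the example, not taken from this file):
//
//   rocksdb::Options options;
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
//   options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(
//       /* bucket_count */ 50000,
//       /* huge_page_tlb_size */ 0,
//       /* bucket_entries_logging_threshold */ 4096,
//       /* if_log_bucket_dist_when_flash */ true,
//       /* threshold_use_skiplist */ 256));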

} // namespace rocksdb

#endif  // ROCKSDB_LITE