YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/rocksdb/memtablerep.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
// This source code is licensed under the BSD-style license found in the
3
// LICENSE file in the root directory of this source tree. An additional grant
4
// of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
// This file contains the interface that must be implemented by any collection
21
// to be used as the backing store for a MemTable. Such a collection must
22
// satisfy the following properties:
23
//  (1) It does not store duplicate items.
24
//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
25
//     equality.
26
//  (3) It can be accessed concurrently by multiple readers and can support
27
//     snapshots during reads. However, it needn't support multiple concurrent writes.
28
//  (4) Items are never deleted.
29
// The liberal use of assertions is encouraged to enforce (1).
30
//
31
// The factory will be passed a MemTableAllocator object when a new MemTableRep
32
// is requested.
33
//
34
// Users can implement their own memtable representations. We include three
35
// types built in:
36
//  - SkipListRep: This is the default; it is backed by a skip list.
37
//  - HashSkipListRep: The memtable rep that is best used for keys that are
38
//  structured like "prefix:suffix" where iteration within a prefix is
39
//  common and iteration across different prefixes is rare. It is backed by
40
//  a hash map where each bucket is a skip list.
41
//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
42
// vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
43
// has been called, the vector will only be sorted once. It is optimized for
44
// random-write-heavy workloads.
45
//
46
// The last two implementations are designed for situations in which
47
// iteration over the entire collection is rare since doing so requires all the
48
// keys to be copied into a sorted data structure.
49
#ifndef YB_ROCKSDB_MEMTABLEREP_H
50
#define YB_ROCKSDB_MEMTABLEREP_H
51
52
#pragma once
53
54
#include <stdint.h>
55
#include <stdlib.h>
56
57
#include <memory>
58
59
#include "yb/util/slice.h"
60
#include "yb/util/strongly_typed_bool.h"
61
62
namespace rocksdb {
63
64
class Arena;
65
class MemTableAllocator;
66
class LookupKey;
67
class SliceTransform;
68
class Logger;
69
70
typedef void* KeyHandle;
71
72
class MemTableRep {
73
 public:
74
  // KeyComparator provides a means to compare keys, which are internal keys
75
  // concatenated with values.
76
  class KeyComparator {
77
   public:
78
    // Compare a and b. Return a negative value if a is less than b, 0 if they
79
    // are equal, and a positive value if a is greater than b
80
    virtual int operator()(const char* prefix_len_key1,
81
                           const char* prefix_len_key2) const = 0;
82
83
    virtual int operator()(const char* prefix_len_key,
84
                           const Slice& key) const = 0;
85
86
56.7M
    virtual ~KeyComparator() { }
87
  };
88
89
381k
  explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {}
90
91
  // Allocate a buf of len size for storing key. The idea is that a
92
  // specific memtable representation knows its underlying data structure
93
  // better. By allowing it to allocate memory, it can possibly put
94
  // correlated stuff in consecutive memory area to make processor
95
  // prefetching more efficient.
96
  virtual KeyHandle Allocate(const size_t len, char** buf);
97
98
  // Insert key into the collection. (The caller will pack key and value into a
99
  // single buffer and pass that in as the parameter to Insert).
100
  // REQUIRES: nothing that compares equal to key is currently in the
101
  // collection, and no concurrent modifications to the table in progress
102
  virtual void Insert(KeyHandle handle) = 0;
103
104
  // Like Insert(handle), but may be called concurrent with other calls
105
  // to InsertConcurrently for other handles
106
0
  virtual void InsertConcurrently(KeyHandle handle) {
107
0
#ifndef ROCKSDB_LITE
108
0
    throw std::runtime_error("concurrent insert not supported");
109
0
#else
110
0
    abort();
111
0
#endif
112
0
  }
113
114
21
  virtual bool Erase(KeyHandle handle, const KeyComparator& comparator) {
115
21
    return false;
116
21
  }
117
118
  // Returns true iff an entry that compares equal to key is in the collection.
119
  virtual bool Contains(const char* key) const = 0;
120
121
  // Notify this table rep that it will no longer be added to. By default,
122
  // does nothing.  After MarkReadOnly() is called, this table rep will
123
  // not be written to (ie No more calls to Allocate(), Insert(),
124
  // or any writes done directly to entries accessed through the iterator.)
125
25.4k
  virtual void MarkReadOnly() { }
126
127
// Look up key from the mem table. Starting with the first key in the mem
128
// table whose user_key matches the given k, call callback_func() with
129
// callback_args directly forwarded as the first parameter, and the mem table
130
// key as the second parameter. If the callback returns false, terminate;
131
// otherwise, go on to the next key.
132
  //
133
// It's safe for Get() to terminate after having finished all the potential
134
// keys for the k.user_key(), or not.
135
  //
136
// Default:
137
// the default Get() implementation dynamically constructs an iterator,
138
// seeks, and calls the callback function.
139
  virtual void Get(const LookupKey& k, void* callback_args,
140
                   bool (*callback_func)(void* arg, const char* entry));
141
142
  virtual uint64_t ApproximateNumEntries(const Slice& start_ikey,
143
0
                                         const Slice& end_key) {
144
0
    return 0;
145
0
  }
146
147
  // Report an approximation of how much memory has been used other than memory
148
  // that was allocated through the allocator.  Safe to call from any thread.
149
  virtual size_t ApproximateMemoryUsage() = 0;
150
151
362k
  virtual ~MemTableRep() { }
152
153
  // Iteration over the contents of the collection.
154
  class Iterator {
155
   public:
156
    // Initialize an iterator over the specified collection.
157
    // The returned iterator is not valid.
158
    // explicit Iterator(const MemTableRep* collection);
159
31.2M
    virtual ~Iterator() {}
160
161
    // Returns true iff the iterator is positioned at a valid node.
162
    virtual bool Valid() const = 0;
163
164
    // Returns the key at the current position.
165
    // REQUIRES: Valid()
166
    virtual const char* key() const = 0;
167
168
    // Advances to the next position.
169
    // REQUIRES: Valid()
170
    virtual void Next() = 0;
171
172
    // Advances to the previous position.
173
    // REQUIRES: Valid()
174
    virtual void Prev() = 0;
175
176
    // Advance to the first entry with a key >= target
177
    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
178
179
    // Position at the first entry in collection.
180
    // Final state of iterator is Valid() iff collection is not empty.
181
    virtual void SeekToFirst() = 0;
182
183
    // Position at the last entry in collection.
184
    // Final state of iterator is Valid() iff collection is not empty.
185
    virtual void SeekToLast() = 0;
186
  };
187
188
  // Return an iterator over the keys in this representation.
189
  // arena: If not null, the arena needs to be used to allocate the Iterator.
190
  //        When destroying the iterator, the caller will not call "delete"
191
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
192
  //        all the states but those allocated in arena.
193
  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
194
195
  // Return an iterator that has a special Seek semantics. The result of
196
  // a Seek might only include keys with the same prefix as the target key.
197
  // arena: If not null, the arena is used to allocate the Iterator.
198
  //        When destroying the iterator, the caller will not call "delete"
199
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
200
  //        all the states but those allocated in arena.
201
3.19k
  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
202
3.19k
    return GetIterator(arena);
203
3.19k
  }
204
205
  // Return true if the current MemTableRep supports merge operator.
206
  // Default: true
207
530
  virtual bool IsMergeOperatorSupported() const { return true; }
208
209
  // Return true if the current MemTableRep supports snapshot
210
  // Default: true
211
346k
  virtual bool IsSnapshotSupported() const { return true; }
212
213
 protected:
214
  // When *key is an internal key concatenated with the value, returns the
215
  // user key.
216
  virtual Slice UserKey(const char* key) const;
217
218
  MemTableAllocator* allocator_;
219
};
220
221
// This is the base class for all factories that are used by RocksDB to create
222
// new MemTableRep objects
223
class MemTableRepFactory {
224
 public:
225
3.24M
  virtual ~MemTableRepFactory() {}
226
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
227
                                         MemTableAllocator*,
228
                                         const SliceTransform*,
229
                                         Logger* logger) = 0;
230
  virtual const char* Name() const = 0;
231
232
  // Return true if the current MemTableRep supports concurrent inserts
233
  // Default: false
234
2
  virtual bool IsInsertConcurrentlySupported() const { return false; }
235
236
0
  virtual bool IsInMemoryEraseSupported() const { return false; }
237
};
238
239
YB_STRONGLY_TYPED_BOOL(ConcurrentWrites);
240
241
// This uses a skip list to store keys. It is the default.
242
//
243
// Parameters:
244
//   lookahead: If non-zero, each iterator's seek operation will start the
245
//     search from the previously visited record (doing at most 'lookahead'
246
//     steps). This is an optimization for the access pattern including many
247
//     seeks with consecutive keys.
248
class SkipListFactory : public MemTableRepFactory {
249
 public:
250
  explicit SkipListFactory(
251
      size_t lookahead = 0, ConcurrentWrites concurrent_writes = ConcurrentWrites::kTrue)
252
3.27M
      : lookahead_(lookahead), concurrent_writes_(concurrent_writes) {}
253
254
  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
255
                                 MemTableAllocator*,
256
                                 const SliceTransform*,
257
                                 Logger* logger) override;
258
3.05M
  const char* Name() const override { return "SkipListFactory"; }
259
260
384
  bool IsInsertConcurrentlySupported() const override { return concurrent_writes_; }
261
262
0
  bool IsInMemoryEraseSupported() const override { return !concurrent_writes_; }
263
264
 private:
265
  const size_t lookahead_;
266
  const ConcurrentWrites concurrent_writes_;
267
};
268
269
class CDSSkipListFactory : public MemTableRepFactory {
270
 public:
271
  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
272
                                 MemTableAllocator*,
273
                                 const SliceTransform*,
274
                                 Logger* logger) override;
275
276
0
  const char* Name() const override { return "CDSSkipListFactory"; }
277
278
0
  bool IsInsertConcurrentlySupported() const override { return true; }
279
};
280
281
#ifndef ROCKSDB_LITE
282
// This creates MemTableReps that are backed by an std::vector. On iteration,
283
// the vector is sorted. This is useful for workloads where iteration is very
284
// rare and writes are generally not issued after reads begin.
285
//
286
// Parameters:
287
//   count: Passed to the constructor of the underlying std::vector of each
288
//     VectorRep. On initialization, the underlying array will be at least count
289
//     bytes reserved for usage.
290
class VectorRepFactory : public MemTableRepFactory {
291
  const size_t count_;
292
293
 public:
294
107
  explicit VectorRepFactory(size_t count = 0) : count_(count) { }
295
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
296
                                         MemTableAllocator*,
297
                                         const SliceTransform*,
298
                                         Logger* logger) override;
299
1.54k
  virtual const char* Name() const override {
300
1.54k
    return "VectorRepFactory";
301
1.54k
  }
302
};
303
304
// This class contains a fixed array of buckets, each
305
// pointing to a skiplist (null if the bucket is empty).
306
// bucket_count: number of fixed array buckets
307
// skiplist_height: the max height of the skiplist
308
// skiplist_branching_factor: probabilistic size ratio between adjacent
309
//                            link lists in the skiplist
310
extern MemTableRepFactory* NewHashSkipListRepFactory(
311
    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
312
    int32_t skiplist_branching_factor = 4
313
);
314
315
// The factory is to create memtables based on a hash table:
316
// it contains a fixed array of buckets, each pointing to either a linked list
317
// or a skip list if number of entries inside the bucket exceeds
318
// threshold_use_skiplist.
319
// @bucket_count: number of fixed array buckets
320
// @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
321
//                      Otherwise from huge page TLB. The user needs to reserve
322
//                      huge pages for it to be allocated, like:
323
//                          sysctl -w vm.nr_hugepages=20
324
//                      See linux doc Documentation/vm/hugetlbpage.txt
325
// @bucket_entries_logging_threshold: if number of entries in one bucket
326
//                                    exceeds this number, log about it.
327
// @if_log_bucket_dist_when_flash: if true, log distribution of number of
328
//                                 entries when flushing.
329
// @threshold_use_skiplist: a bucket switches to skip list if number of
330
//                          entries exceed this parameter.
331
extern MemTableRepFactory* NewHashLinkListRepFactory(
332
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
333
    int bucket_entries_logging_threshold = 4096,
334
    bool if_log_bucket_dist_when_flash = true,
335
    uint32_t threshold_use_skiplist = 256);
336
337
#endif  // ROCKSDB_LITE
338
}  // namespace rocksdb
339
340
#endif // YB_ROCKSDB_MEMTABLEREP_H