/Users/deen/code/yugabyte-db/src/yb/rocksdb/memtablerep.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // This file contains the interface that must be implemented by any collection |
21 | | // to be used as the backing store for a MemTable. Such a collection must |
22 | | // satisfy the following properties: |
23 | | // (1) It does not store duplicate items. |
24 | | // (2) It uses MemTableRep::KeyComparator to compare items for iteration and |
25 | | // equality. |
26 |  | // (3) It can be accessed concurrently by multiple readers and can support |
27 |  | // writes during reads. However, it needn't support multiple concurrent writes. |
28 | | // (4) Items are never deleted. |
29 | | // The liberal use of assertions is encouraged to enforce (1). |
30 | | // |
31 |  | // The factory will be passed a MemTableAllocator object when a new MemTableRep |
32 | | // is requested. |
33 | | // |
34 | | // Users can implement their own memtable representations. We include three |
35 | | // types built in: |
36 | | // - SkipListRep: This is the default; it is backed by a skip list. |
37 | | // - HashSkipListRep: The memtable rep that is best used for keys that are |
38 | | // structured like "prefix:suffix" where iteration within a prefix is |
39 | | // common and iteration across different prefixes is rare. It is backed by |
40 | | // a hash map where each bucket is a skip list. |
41 | | // - VectorRep: This is backed by an unordered std::vector. On iteration, the |
42 | | // vector is sorted. It is intelligent about sorting; once the MarkReadOnly() |
43 | | // has been called, the vector will only be sorted once. It is optimized for |
44 | | // random-write-heavy workloads. |
45 | | // |
46 |  | // The last two implementations are designed for situations in which |
47 | | // iteration over the entire collection is rare since doing so requires all the |
48 | | // keys to be copied into a sorted data structure. |
49 | | #ifndef YB_ROCKSDB_MEMTABLEREP_H |
50 | | #define YB_ROCKSDB_MEMTABLEREP_H |
51 | | |
52 | | #pragma once |
53 | | |
54 | | #include <stdint.h> |
55 | | #include <stdlib.h> |
56 | | |
57 | | #include <memory> |
58 | | |
59 | | #include "yb/util/slice.h" |
60 | | #include "yb/util/strongly_typed_bool.h" |
61 | | |
62 | | namespace rocksdb { |
63 | | |
64 | | class Arena; |
65 | | class MemTableAllocator; |
66 | | class LookupKey; |
67 | | class SliceTransform; |
68 | | class Logger; |
69 | | |
70 | | typedef void* KeyHandle; |
71 | | |
// MemTableRep is the pluggable backing store of a MemTable (see the file
// header comment for the required properties: no duplicates, ordering via
// KeyComparator, concurrent readers, no deletion of items).
class MemTableRep {
 public:
  // KeyComparator provides a means to compare keys, which are internal keys
  // concatenated with values.
  class KeyComparator {
   public:
    // Compare two length-prefixed memtable keys. Return a negative value if
    // key1 is less than key2, 0 if they are equal, and a positive value if
    // key1 is greater than key2.
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const = 0;

    // Same ordering contract, but the right-hand side is a plain internal-key
    // slice rather than a length-prefixed buffer.
    virtual int operator()(const char* prefix_len_key,
                           const Slice& key) const = 0;

    virtual ~KeyComparator() { }
  };

  // allocator: raw pointer stored in allocator_; not owned by this class
  // (lifetime is managed by the caller — presumably the enclosing MemTable).
  explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {}

  // Allocate a buf of len size for storing key. The idea is that a
  // specific memtable representation knows its underlying data structure
  // better. By allowing it to allocate memory, it can possibly put
  // correlated stuff in consecutive memory area to make processor
  // prefetching more efficient.
  // Returns an opaque handle to pass back to Insert(); *buf points at the
  // writable key storage.
  virtual KeyHandle Allocate(const size_t len, char** buf);

  // Insert key into the collection. (The caller will pack key and value into a
  // single buffer and pass that in as the parameter to Insert).
  // REQUIRES: nothing that compares equal to key is currently in the
  // collection, and no concurrent modifications to the table in progress.
  virtual void Insert(KeyHandle handle) = 0;

  // Like Insert(handle), but may be called concurrent with other calls
  // to InsertConcurrently for other handles.
  // Default: concurrent insert is unsupported — throws, or aborts in LITE
  // builds (where exceptions are not used).
  virtual void InsertConcurrently(KeyHandle handle) {
#ifndef ROCKSDB_LITE
    throw std::runtime_error("concurrent insert not supported");
#else
    abort();
#endif
  }

  // Attempt to remove the entry identified by handle, comparing entries with
  // comparator. Default: erase is unsupported and false is returned; see
  // MemTableRepFactory::IsInMemoryEraseSupported().
  virtual bool Erase(KeyHandle handle, const KeyComparator& comparator) {
    return false;
  }

  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* key) const = 0;

  // Notify this table rep that it will no longer be added to. By default,
  // does nothing. After MarkReadOnly() is called, this table rep will
  // not be written to (ie No more calls to Allocate(), Insert(),
  // or any writes done directly to entries accessed through the iterator.)
  virtual void MarkReadOnly() { }

  // Look up key from the mem table, starting from the first key in the mem
  // table whose user_key matches the one given in k; call the function
  // callback_func() with callback_args directly forwarded as the first
  // parameter, and the mem table key as the second parameter. If the return
  // value is false, then terminates. Otherwise, go through the next key.
  //
  // It's safe for Get() to terminate after having finished all the potential
  // key for the k.user_key(), or not.
  //
  // Default:
  // Get() function with a default value of dynamically construct an iterator,
  // seek and call the call back function.
  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg, const char* entry));

  // Approximate count of entries between start_ikey and end_key.
  // Default: 0, i.e. no estimate available.
  virtual uint64_t ApproximateNumEntries(const Slice& start_ikey,
                                         const Slice& end_key) {
    return 0;
  }

  // Report an approximation of how much memory has been used other than memory
  // that was allocated through the allocator. Safe to call from any thread.
  virtual size_t ApproximateMemoryUsage() = 0;

  virtual ~MemTableRep() { }

  // Iteration over the contents of a skip collection
  class Iterator {
   public:
    // Initialize an iterator over the specified collection.
    // The returned iterator is not valid.
    // explicit Iterator(const MemTableRep* collection);
    virtual ~Iterator() {}

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const = 0;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const = 0;

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() = 0;

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() = 0;

    // Advance to the first entry with a key >= target.
    // memtable_key, when non-null, is presumably the pre-encoded
    // length-prefixed form of internal_key — confirm against implementations.
    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() = 0;

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() = 0;
  };

  // Return an iterator over the keys in this representation.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the states but those allocated in arena.
  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;

  // Return an iterator that has a special Seek semantics. The result of
  // a Seek might only include keys with the same prefix as the target key.
  // arena: If not null, the arena is used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the states but those allocated in arena.
  // Default: falls back to the ordinary full-order iterator.
  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
    return GetIterator(arena);
  }

  // Return true if the current MemTableRep supports merge operator.
  // Default: true
  virtual bool IsMergeOperatorSupported() const { return true; }

  // Return true if the current MemTableRep supports snapshot
  // Default: true
  virtual bool IsSnapshotSupported() const { return true; }

 protected:
  // When *key is an internal key concatenated with the value, returns the
  // user key.
  virtual Slice UserKey(const char* key) const;

  // Not owned; supplied at construction (see constructor note above).
  MemTableAllocator* allocator_;
};
220 | | |
221 | | // This is the base class for all factories that are used by RocksDB to create |
222 | | // new MemTableRep objects |
// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects
class MemTableRepFactory {
 public:
  virtual ~MemTableRepFactory() {}

  // Create a new rep using the given key comparator, allocator, and prefix
  // extractor. Returns a raw pointer; the caller presumably takes ownership —
  // confirm against call sites.
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         MemTableAllocator*,
                                         const SliceTransform*,
                                         Logger* logger) = 0;

  // Human-readable factory name (used for identification/logging).
  virtual const char* Name() const = 0;

  // Return true if the current MemTableRep supports concurrent inserts
  // Default: false
  virtual bool IsInsertConcurrentlySupported() const { return false; }

  // Return true if reps created by this factory support MemTableRep::Erase().
  // Default: false
  virtual bool IsInMemoryEraseSupported() const { return false; }
};
238 | | |
239 | | YB_STRONGLY_TYPED_BOOL(ConcurrentWrites); |
240 | | |
241 | | // This uses a skip list to store keys. It is the default. |
242 | | // |
243 | | // Parameters: |
244 | | // lookahead: If non-zero, each iterator's seek operation will start the |
245 | | // search from the previously visited record (doing at most 'lookahead' |
246 | | // steps). This is an optimization for the access pattern including many |
247 | | // seeks with consecutive keys. |
class SkipListFactory : public MemTableRepFactory {
 public:
  // lookahead: seek optimization knob — see the class comment above.
  // concurrent_writes: kTrue (default) enables concurrent inserts but
  //     disables in-memory erase; kFalse is the reverse (the two accessors
  //     below return opposite values of this flag).
  explicit SkipListFactory(
      size_t lookahead = 0, ConcurrentWrites concurrent_writes = ConcurrentWrites::kTrue)
      : lookahead_(lookahead), concurrent_writes_(concurrent_writes) {}

  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                 MemTableAllocator*,
                                 const SliceTransform*,
                                 Logger* logger) override;

  const char* Name() const override { return "SkipListFactory"; }

  // Concurrent inserts are available exactly when configured for them.
  bool IsInsertConcurrentlySupported() const override { return concurrent_writes_; }

  // Erase is only safe without concurrent writers.
  bool IsInMemoryEraseSupported() const override { return !concurrent_writes_; }

 private:
  const size_t lookahead_;
  const ConcurrentWrites concurrent_writes_;
};
268 | | |
// Factory for a skip-list rep that always supports concurrent inserts.
// NOTE(review): "CDS" presumably refers to the libcds (Concurrent Data
// Structures) skip list — confirm in the implementation file.
class CDSSkipListFactory : public MemTableRepFactory {
 public:
  MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                 MemTableAllocator*,
                                 const SliceTransform*,
                                 Logger* logger) override;

  const char* Name() const override { return "CDSSkipListFactory"; }

  // Unlike the base default, concurrent inserts are always supported here.
  bool IsInsertConcurrentlySupported() const override { return true; }
};
280 | | |
281 | | #ifndef ROCKSDB_LITE |
282 | | // This creates MemTableReps that are backed by an std::vector. On iteration, |
283 | | // the vector is sorted. This is useful for workloads where iteration is very |
284 | | // rare and writes are generally not issued after reads begin. |
285 | | // |
286 | | // Parameters: |
287 | | // count: Passed to the constructor of the underlying std::vector of each |
288 | | // VectorRep. On initialization, the underlying array will be at least count |
289 | | // bytes reserved for usage. |
class VectorRepFactory : public MemTableRepFactory {
  // Initial reservation passed to each created VectorRep; see the class
  // comment above for the exact semantics.
  const size_t count_;

 public:
  explicit VectorRepFactory(size_t count = 0) : count_(count) { }

  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         MemTableAllocator*,
                                         const SliceTransform*,
                                         Logger* logger) override;

  virtual const char* Name() const override {
    return "VectorRepFactory";
  }
};
303 | | |
304 | | // This class contains a fixed array of buckets, each |
305 | | // pointing to a skiplist (null if the bucket is empty). |
306 | | // bucket_count: number of fixed array buckets |
307 | | // skiplist_height: the max height of the skiplist |
308 | | // skiplist_branching_factor: probabilistic size ratio between adjacent |
309 | | // link lists in the skiplist |
310 | | extern MemTableRepFactory* NewHashSkipListRepFactory( |
311 | | size_t bucket_count = 1000000, int32_t skiplist_height = 4, |
312 | | int32_t skiplist_branching_factor = 4 |
313 | | ); |
314 | | |
315 | | // The factory is to create memtables based on a hash table: |
316 | | // it contains a fixed array of buckets, each pointing to either a linked list |
317 | | // or a skip list if number of entries inside the bucket exceeds |
318 | | // threshold_use_skiplist. |
319 | | // @bucket_count: number of fixed array buckets |
320 | | // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc. |
321 | | // Otherwise from huge page TLB. The user needs to reserve |
322 | | // huge pages for it to be allocated, like: |
323 | | // sysctl -w vm.nr_hugepages=20 |
324 | | // See linux doc Documentation/vm/hugetlbpage.txt |
325 | | // @bucket_entries_logging_threshold: if number of entries in one bucket |
326 | | // exceeds this number, log about it. |
327 | | // @if_log_bucket_dist_when_flash: if true, log distribution of number of |
328 | | // entries when flushing. |
329 | | // @threshold_use_skiplist: a bucket switches to skip list if number of |
330 | | // entries exceed this parameter. |
331 | | extern MemTableRepFactory* NewHashLinkListRepFactory( |
332 | | size_t bucket_count = 50000, size_t huge_page_tlb_size = 0, |
333 | | int bucket_entries_logging_threshold = 4096, |
334 | | bool if_log_bucket_dist_when_flash = true, |
335 | | uint32_t threshold_use_skiplist = 256); |
336 | | |
337 | | #endif // ROCKSDB_LITE |
338 | | } // namespace rocksdb |
339 | | |
340 | | #endif // YB_ROCKSDB_MEMTABLEREP_H |