/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/inlineskiplist.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional |
4 | | // grant of patent rights can be found in the PATENTS file in the same |
5 | | // directory. |
6 | | // |
7 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. Use of |
8 | | // this source code is governed by a BSD-style license that can be found |
9 | | // in the LICENSE file. See the AUTHORS file for names of contributors. |
10 | | // |
11 | | // InlineSkipList is derived from SkipList (skiplist.h), but it optimizes |
12 | | // the memory layout by requiring that the key storage be allocated through |
13 | | // the skip list instance. For the common case of SkipList<const char*, |
14 | | // Cmp> this saves 1 pointer per skip list node and gives better cache |
15 | | // locality, at the expense of wasted padding from using AllocateAligned |
16 | | // instead of Allocate for the keys. The unused padding will be from |
17 | | // 0 to sizeof(void*)-1 bytes, and the space savings are sizeof(void*) |
18 | | // bytes, so despite the padding the space used is always less than |
19 | | // SkipList<const char*, ..>. |
20 | | // |
21 | | // Thread safety ------------- |
22 | | // |
23 | | // Writes via Insert require external synchronization, most likely a mutex. |
24 | | // InsertConcurrently can be safely called concurrently with reads and |
25 | | // with other concurrent inserts. Reads require a guarantee that the |
26 | | // InlineSkipList will not be destroyed while the read is in progress. |
27 | | // Apart from that, reads progress without any internal locking or |
28 | | // synchronization. |
29 | | // |
30 | | // Invariants: |
31 | | // |
32 | | // (1) Allocated nodes are never deleted until the InlineSkipList is |
33 | | // destroyed. This is trivially guaranteed by the code since we never |
34 | | // delete any skip list nodes. |
35 | | // |
36 | | // (2) The contents of a Node except for the next/prev pointers are |
37 | | // immutable after the Node has been linked into the InlineSkipList. |
38 | | // Only Insert() modifies the list, and it is careful to initialize a |
39 | | // node and use release-stores to publish the nodes in one or more lists. |
40 | | // |
41 | | // ... prev vs. next pointer ordering ... |
42 | | // |
43 | | |
44 | | // |
45 | | // The following only applies to changes made to this file as part of YugaByte development. |
46 | | // |
47 | | // Portions Copyright (c) YugaByte, Inc. |
48 | | // |
49 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
50 | | // in compliance with the License. You may obtain a copy of the License at |
51 | | // |
52 | | // http://www.apache.org/licenses/LICENSE-2.0 |
53 | | // |
54 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
55 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
56 | | // or implied. See the License for the specific language governing permissions and limitations |
57 | | // under the License. |
58 | | // |
59 | | |
60 | | #ifndef YB_ROCKSDB_DB_INLINESKIPLIST_H |
61 | | #define YB_ROCKSDB_DB_INLINESKIPLIST_H |
62 | | |
63 | | #pragma once |
64 | | |
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include <atomic>

#include "yb/rocksdb/util/allocator.h"
#include "yb/rocksdb/util/random.h"
72 | | |
73 | | namespace rocksdb { |
74 | | |
75 | | template <class Comparator> |
76 | | class InlineSkipList { |
77 | | private: |
78 | | struct Node; |
79 | | |
80 | | public: |
81 | | // Create a new InlineSkipList object that will use "cmp" for comparing |
82 | | // keys, and will allocate memory using "*allocator". Objects allocated |
83 | | // in the allocator must remain allocated for the lifetime of the |
84 | | // skiplist object. |
85 | | explicit InlineSkipList(Comparator cmp, Allocator* allocator, |
86 | | int32_t max_height = 12, |
87 | | int32_t branching_factor = 4); |
88 | | |
89 | | // Allocates a key and a skip-list node, returning a pointer to the key |
90 | | // portion of the node. This method is thread-safe if the allocator |
91 | | // is thread-safe. |
92 | | char* AllocateKey(size_t key_size); |
93 | | |
94 | | // Inserts a key allocated by AllocateKey, after the actual key value |
95 | | // has been filled in. |
96 | | // |
97 | | // REQUIRES: nothing that compares equal to key is currently in the list. |
98 | | // REQUIRES: no concurrent calls to INSERT |
99 | | void Insert(const char* key); |
100 | | |
101 | | // Like Insert, but external synchronization is not required. |
102 | | void InsertConcurrently(const char* key); |
103 | | |
104 | 173 | bool Erase(const char* key, Comparator cmp) { |
105 | 173 | return false; |
106 | 173 | } |
107 | | |
108 | | // Returns true iff an entry that compares equal to key is in the list. |
109 | | bool Contains(const char* key) const; |
110 | | |
111 | | // Return estimated number of entries smaller than `key`. |
112 | | uint64_t EstimateCount(const char* key) const; |
113 | | |
114 | | // Iteration over the contents of a skip list |
115 | | class Iterator { |
116 | | public: |
117 | | // Initialize an iterator over the specified list. |
118 | | // The returned iterator is not valid. |
119 | | explicit Iterator(const InlineSkipList* list); |
120 | | |
121 | | // Change the underlying skiplist used for this iterator |
122 | | // This enables us not changing the iterator without deallocating |
123 | | // an old one and then allocating a new one |
124 | | void SetList(const InlineSkipList* list); |
125 | | |
126 | | // Returns true iff the iterator is positioned at a valid node. |
127 | | bool Valid() const; |
128 | | |
129 | | // Returns the key at the current position. |
130 | | // REQUIRES: Valid() |
131 | | const char* key() const; |
132 | | |
133 | | // Advances to the next position. |
134 | | // REQUIRES: Valid() |
135 | | void Next(); |
136 | | |
137 | | // Advances to the previous position. |
138 | | // REQUIRES: Valid() |
139 | | void Prev(); |
140 | | |
141 | | // Advance to the first entry with a key >= target |
142 | | void Seek(const char* target); |
143 | | |
144 | | // Position at the first entry in list. |
145 | | // Final state of iterator is Valid() iff list is not empty. |
146 | | void SeekToFirst(); |
147 | | |
148 | | // Position at the last entry in list. |
149 | | // Final state of iterator is Valid() iff list is not empty. |
150 | | void SeekToLast(); |
151 | | |
152 | | private: |
153 | | const InlineSkipList* list_; |
154 | | Node* node_; |
155 | | // Intentionally copyable |
156 | | }; |
157 | | |
158 | | private: |
159 | | enum MaxPossibleHeightEnum : uint16_t { kMaxPossibleHeight = 32 }; |
160 | | |
161 | | const uint16_t kMaxHeight_; |
162 | | const uint16_t kBranching_; |
163 | | const uint32_t kScaledInverseBranching_; |
164 | | |
165 | | // Immutable after construction |
166 | | Comparator const compare_; |
167 | | Allocator* const allocator_; // Allocator used for allocations of nodes |
168 | | |
169 | | Node* const head_; |
170 | | |
171 | | // Modified only by Insert(). Read racily by readers, but stale |
172 | | // values are ok. |
173 | | std::atomic<int> max_height_; // Height of the entire list |
174 | | |
175 | | // Used for optimizing sequential insert patterns. Tricky. prev_height_ |
176 | | // of zero means prev_ is undefined. Otherwise: prev_[i] for i up |
177 | | // to max_height_ - 1 (inclusive) is the predecessor of prev_[0], and |
178 | | // prev_height_ is the height of prev_[0]. prev_[0] can only be equal |
179 | | // to head when max_height_ and prev_height_ are both 1. |
180 | | Node** prev_; |
181 | | std::atomic<int32_t> prev_height_; |
182 | | |
183 | 68.6M | inline int GetMaxHeight() const { |
184 | 68.6M | return max_height_.load(std::memory_order_relaxed); |
185 | 68.6M | } |
186 | | |
187 | | int RandomHeight(); |
188 | | |
189 | | Node* AllocateNode(size_t key_size, int height); |
190 | | |
191 | 23.0M | bool Equal(const char* a, const char* b) const { |
192 | 23.0M | return (compare_(a, b) == 0); |
193 | 23.0M | } |
194 | | |
195 | | // Return true if key is greater than the data stored in "n". Null n |
196 | | // is considered infinite. |
197 | | bool KeyIsAfterNode(const char* key, Node* n) const; |
198 | | |
199 | | // Returns the earliest node with a key >= key. |
200 | | // Return nullptr if there is no such node. |
201 | | Node* FindGreaterOrEqual(const char* key) const; |
202 | | |
203 | | // Return the latest node with a key < key. |
204 | | // Return head_ if there is no such node. |
205 | | // Fills prev[level] with pointer to previous node at "level" for every |
206 | | // level in [0..max_height_-1], if prev is non-null. |
207 | | Node* FindLessThan(const char* key, Node** prev = nullptr) const; |
208 | | |
209 | | // Return the last node in the list. |
210 | | // Return head_ if list is empty. |
211 | | Node* FindLast() const; |
212 | | |
213 | | // Traverses a single level of the list, setting *out_prev to the last |
214 | | // node before the key and *out_next to the first node after. Assumes |
215 | | // that the key is not present in the skip list. On entry, before should |
216 | | // point to a node that is before the key, and after should point to |
217 | | // a node that is after the key. after should be nullptr if a good after |
218 | | // node isn't conveniently available. |
219 | | void FindLevelSplice(const char* key, Node* before, Node* after, int level, |
220 | | Node** out_prev, Node** out_next); |
221 | | |
222 | | // No copying allowed |
223 | | InlineSkipList(const InlineSkipList&); |
224 | | InlineSkipList& operator=(const InlineSkipList&); |
225 | | }; |
226 | | |
227 | | // Implementation details follow |
228 | | |
229 | | // The Node data type is more of a pointer into custom-managed memory than |
230 | | // a traditional C++ struct. The key is stored in the bytes immediately |
231 | | // after the struct, and the next_ pointers for nodes with height > 1 are |
232 | | // stored immediately _before_ the struct. This avoids the need to include |
233 | | // any pointer or sizing data, which reduces per-node memory overheads. |
234 | | template <class Comparator> |
235 | | struct InlineSkipList<Comparator>::Node { |
236 | | // Stores the height of the node in the memory location normally used for |
237 | | // next_[0]. This is used for passing data from AllocateKey to Insert. |
238 | 35.7M | void StashHeight(const int height) { |
239 | 35.7M | static_assert(sizeof(int) <= sizeof(next_[0]), "Too small height holder"); |
240 | 35.7M | memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int)); |
241 | 35.7M | } |
242 | | |
243 | | // Retrieves the value passed to StashHeight. Undefined after a call |
244 | | // to SetNext or NoBarrier_SetNext. |
245 | 35.6M | int UnstashHeight() const { |
246 | 35.6M | int rv; |
247 | 35.6M | memcpy(&rv, &next_[0], sizeof(int)); |
248 | 35.6M | return rv; |
249 | 35.6M | } |
250 | | |
251 | 2.42G | const char* Key() const { return reinterpret_cast<const char*>(&next_[1]); } |
252 | | |
253 | | // Accessors/mutators for links. Wrapped in methods so we can add |
254 | | // the appropriate barriers as necessary, and perform the necessary |
255 | | // addressing trickery for storing links below the Node in memory. |
256 | 791M | Node* Next(int n) { |
257 | 791M | assert(n >= 0); |
258 | | // Use an 'acquire load' so that we observe a fully initialized |
259 | | // version of the returned Node. |
260 | 0 | return (next_[-n].load(std::memory_order_acquire)); |
261 | 791M | } |
262 | | |
263 | 47.3M | void SetNext(int n, Node* x) { |
264 | 47.3M | assert(n >= 0); |
265 | | // Use a 'release store' so that anybody who reads through this |
266 | | // pointer observes a fully initialized version of the inserted node. |
267 | 0 | next_[-n].store(x, std::memory_order_release); |
268 | 47.3M | } |
269 | | |
270 | 698k | bool CASNext(int n, Node* expected, Node* x) { |
271 | 698k | assert(n >= 0); |
272 | 0 | return next_[-n].compare_exchange_strong(expected, x); |
273 | 698k | } |
274 | | |
275 | | // No-barrier variants that can be safely used in a few locations. |
276 | 81.9M | Node* NoBarrier_Next(int n) { |
277 | 81.9M | assert(n >= 0); |
278 | 0 | return next_[-n].load(std::memory_order_relaxed); |
279 | 81.9M | } |
280 | | |
281 | 47.5M | void NoBarrier_SetNext(int n, Node* x) { |
282 | 47.5M | assert(n >= 0); |
283 | 0 | next_[-n].store(x, std::memory_order_relaxed); |
284 | 47.5M | } |
285 | | |
286 | | private: |
287 | | // next_[0] is the lowest level link (level 0). Higher levels are |
288 | | // stored _earlier_, so level 1 is at next_[-1]. |
289 | | std::atomic<Node*> next_[1]; |
290 | | }; |
291 | | |
292 | | template <class Comparator> |
293 | | inline InlineSkipList<Comparator>::Iterator::Iterator( |
294 | 15.5M | const InlineSkipList* list) { |
295 | 15.5M | SetList(list); |
296 | 15.5M | } |
297 | | |
298 | | template <class Comparator> |
299 | | inline void InlineSkipList<Comparator>::Iterator::SetList( |
300 | 15.5M | const InlineSkipList* list) { |
301 | 15.5M | list_ = list; |
302 | 15.5M | node_ = nullptr; |
303 | 15.5M | } |
304 | | |
305 | | template <class Comparator> |
306 | 167M | inline bool InlineSkipList<Comparator>::Iterator::Valid() const { |
307 | 167M | return node_ != nullptr; |
308 | 167M | } |
309 | | |
310 | | template <class Comparator> |
311 | 84.3M | inline const char* InlineSkipList<Comparator>::Iterator::key() const { |
312 | 84.3M | assert(Valid()); |
313 | 0 | return node_->Key(); |
314 | 84.3M | } |
315 | | |
316 | | template <class Comparator> |
317 | 32.7M | inline void InlineSkipList<Comparator>::Iterator::Next() { |
318 | 32.7M | assert(Valid()); |
319 | 0 | node_ = node_->Next(0); |
320 | 32.7M | } |
321 | | |
322 | | template <class Comparator> |
323 | 820k | inline void InlineSkipList<Comparator>::Iterator::Prev() { |
324 | | // Instead of using explicit "prev" links, we just search for the |
325 | | // last node that falls before key. |
326 | 820k | assert(Valid()); |
327 | 0 | node_ = list_->FindLessThan(node_->Key()); |
328 | 820k | if (node_ == list_->head_) { |
329 | 80.8k | node_ = nullptr; |
330 | 80.8k | } |
331 | 820k | } |
332 | | |
333 | | template <class Comparator> |
334 | 15.8M | inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) { |
335 | 15.8M | node_ = list_->FindGreaterOrEqual(target); |
336 | 15.8M | } |
337 | | |
338 | | template <class Comparator> |
339 | 297k | inline void InlineSkipList<Comparator>::Iterator::SeekToFirst() { |
340 | 297k | node_ = list_->head_->Next(0); |
341 | 297k | } |
342 | | |
343 | | template <class Comparator> |
344 | 267k | inline void InlineSkipList<Comparator>::Iterator::SeekToLast() { |
345 | 267k | node_ = list_->FindLast(); |
346 | 267k | if (node_ == list_->head_) { |
347 | 4.34k | node_ = nullptr; |
348 | 4.34k | } |
349 | 267k | } |
350 | | |
351 | | template <class Comparator> |
352 | 35.6M | int InlineSkipList<Comparator>::RandomHeight() { |
353 | 35.6M | auto rnd = Random::GetTLSInstance(); |
354 | | |
355 | | // Increase height with probability 1 in kBranching |
356 | 35.6M | int height = 1; |
357 | 47.5M | while (height < kMaxHeight_ && height < kMaxPossibleHeight47.5M && |
358 | 47.5M | rnd->Next() < kScaledInverseBranching_47.5M ) { |
359 | 11.8M | height++; |
360 | 11.8M | } |
361 | 35.6M | assert(height > 0); |
362 | 0 | assert(height <= kMaxHeight_); |
363 | 0 | assert(height <= kMaxPossibleHeight); |
364 | 0 | return height; |
365 | 35.6M | } |
366 | | |
367 | | template <class Comparator> |
368 | | bool InlineSkipList<Comparator>::KeyIsAfterNode(const char* key, |
369 | 1.57G | Node* n) const { |
370 | | // nullptr n is considered infinite |
371 | 1.57G | return (n != nullptr) && (compare_(n->Key(), key) < 0)1.56G ; |
372 | 1.57G | } |
373 | | |
374 | | template <class Comparator> |
375 | | typename InlineSkipList<Comparator>::Node* |
376 | 15.8M | InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const { |
377 | | // Note: It looks like we could reduce duplication by implementing |
378 | | // this function as FindLessThan(key)->Next(0), but we wouldn't be able |
379 | | // to exit early on equality and the result wouldn't even be correct. |
380 | | // A concurrent insert might occur after FindLessThan(key) but before |
381 | | // we get a chance to call Next(0). |
382 | 15.8M | Node* x = head_; |
383 | 15.8M | int level = GetMaxHeight() - 1; |
384 | 15.8M | Node* last_bigger = nullptr; |
385 | 326M | while (true326M ) { |
386 | 326M | Node* next = x->Next(level); |
387 | | // Make sure the lists are sorted |
388 | 326M | assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); |
389 | | // Make sure we haven't overshot during our search |
390 | 0 | assert(x == head_ || KeyIsAfterNode(key, x)); |
391 | 326M | int cmp = (next == nullptr || next == last_bigger317M ) |
392 | 326M | ? 135.8M |
393 | 326M | : compare_(next->Key(), key)290M ; |
394 | 328M | if (cmp == 0326M || (cmp > 0 && level == 0108M )) { |
395 | 15.8M | return next; |
396 | 310M | } else if (cmp < 0) { |
397 | | // Keep searching in this list |
398 | 228M | x = next; |
399 | 228M | } else { |
400 | | // Switch to next list, reuse compare_() result |
401 | 82.5M | last_bigger = next; |
402 | 82.5M | level--; |
403 | 82.5M | } |
404 | 326M | } |
405 | 15.8M | } |
406 | | |
407 | | template <class Comparator> |
408 | | typename InlineSkipList<Comparator>::Node* |
409 | 17.4M | InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const { |
410 | 17.4M | Node* x = head_; |
411 | 17.4M | int level = GetMaxHeight() - 1; |
412 | | // KeyIsAfter(key, last_not_after) is definitely false |
413 | 17.4M | Node* last_not_after = nullptr; |
414 | 381M | while (true381M ) { |
415 | 381M | Node* next = x->Next(level); |
416 | 381M | assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); |
417 | 0 | assert(x == head_ || KeyIsAfterNode(key, x)); |
418 | 381M | if (next != last_not_after && KeyIsAfterNode(key, next)341M ) { |
419 | | // Keep searching in this list |
420 | 262M | x = next; |
421 | 262M | } else { |
422 | 118M | if (prev != nullptr) { |
423 | 115M | prev[level] = x; |
424 | 115M | } |
425 | 118M | if (level == 0) { |
426 | 17.4M | return x; |
427 | 101M | } else { |
428 | | // Switch to next list, reuse KeyIUsAfterNode() result |
429 | 101M | last_not_after = next; |
430 | 101M | level--; |
431 | 101M | } |
432 | 118M | } |
433 | 381M | } |
434 | 17.4M | } |
435 | | |
436 | | template <class Comparator> |
437 | | typename InlineSkipList<Comparator>::Node* |
438 | 267k | InlineSkipList<Comparator>::FindLast() const { |
439 | 267k | Node* x = head_; |
440 | 267k | int level = GetMaxHeight() - 1; |
441 | 2.36M | while (true) { |
442 | 2.36M | Node* next = x->Next(level); |
443 | 2.36M | if (next == nullptr) { |
444 | 791k | if (level == 0) { |
445 | 267k | return x; |
446 | 523k | } else { |
447 | | // Switch to next list |
448 | 523k | level--; |
449 | 523k | } |
450 | 1.57M | } else { |
451 | 1.57M | x = next; |
452 | 1.57M | } |
453 | 2.36M | } |
454 | 267k | } |
455 | | |
456 | | template <class Comparator> |
457 | 44 | uint64_t InlineSkipList<Comparator>::EstimateCount(const char* key) const { |
458 | 44 | uint64_t count = 0; |
459 | | |
460 | 44 | Node* x = head_; |
461 | 44 | int level = GetMaxHeight() - 1; |
462 | 401 | while (true) { |
463 | 401 | assert(x == head_ || compare_(x->Key(), key) < 0); |
464 | 0 | Node* next = x->Next(level); |
465 | 401 | if (next == nullptr || compare_(next->Key(), key) >= 0344 ) { |
466 | 182 | if (level == 0) { |
467 | 44 | return count; |
468 | 138 | } else { |
469 | | // Switch to next list |
470 | 138 | count *= kBranching_; |
471 | 138 | level--; |
472 | 138 | } |
473 | 219 | } else { |
474 | 219 | x = next; |
475 | 219 | count++; |
476 | 219 | } |
477 | 401 | } |
478 | 44 | } |
479 | | |
480 | | template <class Comparator> |
481 | | InlineSkipList<Comparator>::InlineSkipList(const Comparator cmp, |
482 | | Allocator* allocator, |
483 | | int32_t max_height, |
484 | | int32_t branching_factor) |
485 | | : kMaxHeight_(max_height), |
486 | | kBranching_(branching_factor), |
487 | | kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), |
488 | | compare_(cmp), |
489 | | allocator_(allocator), |
490 | | head_(AllocateNode(0, max_height)), |
491 | | max_height_(1), |
492 | 45.8k | prev_height_(1) { |
493 | 45.8k | assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height)); |
494 | 0 | assert(branching_factor > 1 && |
495 | 45.8k | kBranching_ == static_cast<uint32_t>(branching_factor)); |
496 | 0 | assert(kScaledInverseBranching_ > 0); |
497 | | // Allocate the prev_ Node* array, directly from the passed-in allocator. |
498 | | // prev_ does not need to be freed, as its life cycle is tied up with |
499 | | // the allocator as a whole. |
500 | 0 | prev_ = reinterpret_cast<Node**>( |
501 | 45.8k | allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); |
502 | 595k | for (int i = 0; i < kMaxHeight_; i++549k ) { |
503 | 549k | head_->SetNext(i, nullptr); |
504 | 549k | prev_[i] = head_; |
505 | 549k | } |
506 | 45.8k | } |
507 | | |
508 | | template <class Comparator> |
509 | 35.6M | char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) { |
510 | 35.6M | return const_cast<char*>(AllocateNode(key_size, RandomHeight())->Key()); |
511 | 35.6M | } |
512 | | |
513 | | template <class Comparator> |
514 | | typename InlineSkipList<Comparator>::Node* |
515 | 35.6M | InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) { |
516 | 35.6M | auto prefix = sizeof(std::atomic<Node*>) * (height - 1); |
517 | | |
518 | | // prefix is space for the height - 1 pointers that we store before |
519 | | // the Node instance (next_[-(height - 1) .. -1]). Node starts at |
520 | | // raw + prefix, and holds the bottom-mode (level 0) skip list pointer |
521 | | // next_[0]. key_size is the bytes for the key, which comes just after |
522 | | // the Node. |
523 | 35.6M | char* raw = allocator_->AllocateAligned(prefix + sizeof(Node) + key_size); |
524 | 35.6M | Node* x = reinterpret_cast<Node*>(raw + prefix); |
525 | | |
526 | | // Once we've linked the node into the skip list we don't actually need |
527 | | // to know its height, because we can implicitly use the fact that we |
528 | | // traversed into a node at level h to known that h is a valid level |
529 | | // for that node. We need to convey the height to the Insert step, |
530 | | // however, so that it can perform the proper links. Since we're not |
531 | | // using the pointers at the moment, StashHeight temporarily borrow |
532 | | // storage from next_[0] for that purpose. |
533 | 35.6M | x->StashHeight(height); |
534 | 35.6M | return x; |
535 | 35.6M | } |
536 | | |
537 | | template <class Comparator> |
538 | 35.1M | void InlineSkipList<Comparator>::Insert(const char* key) { |
539 | | // InsertConcurrently often can't maintain the prev_ invariants, so |
540 | | // it just sets prev_height_ to zero, letting us know that we should |
541 | | // ignore it. A relaxed load suffices here because write thread |
542 | | // synchronization separates Insert calls from InsertConcurrently calls. |
543 | 35.1M | auto prev_height = prev_height_.load(std::memory_order_relaxed); |
544 | | |
545 | | // fast path for sequential insertion |
546 | 35.1M | if (prev_height > 0 && !KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0))35.1M && |
547 | 35.1M | (26.6M prev_[0] == head_26.6M || KeyIsAfterNode(key, prev_[0])26.6M )) { |
548 | 18.5M | assert(prev_[0] != head_ || (prev_height == 1 && GetMaxHeight() == 1)); |
549 | | |
550 | | // Outside of this method prev_[1..max_height_] is the predecessor |
551 | | // of prev_[0], and prev_height_ refers to prev_[0]. Inside Insert |
552 | | // prev_[0..max_height - 1] is the predecessor of key. Switch from |
553 | | // the external state to the internal |
554 | 24.7M | for (int i = 1; i < prev_height; i++6.16M ) { |
555 | 6.16M | prev_[i] = prev_[0]; |
556 | 6.16M | } |
557 | 18.5M | } else { |
558 | | // TODO(opt): we could use a NoBarrier predecessor search as an |
559 | | // optimization for architectures where memory_order_acquire needs |
560 | | // a synchronization instruction. Doesn't matter on x86 |
561 | 16.5M | FindLessThan(key, prev_); |
562 | 16.5M | } |
563 | | |
564 | | // Our data structure does not allow duplicate insertion |
565 | 0 | assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->Key())); |
566 | | |
567 | | // Find the Node that we placed before the key in AllocateKey |
568 | 0 | Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1; |
569 | 35.1M | int height = x->UnstashHeight(); |
570 | 35.1M | assert(height >= 1 && height <= kMaxHeight_); |
571 | | |
572 | 35.1M | if (height > GetMaxHeight()) { |
573 | 160k | for (int i = GetMaxHeight(); i < height; i++92.0k ) { |
574 | 92.0k | prev_[i] = head_; |
575 | 92.0k | } |
576 | | |
577 | | // It is ok to mutate max_height_ without any synchronization |
578 | | // with concurrent readers. A concurrent reader that observes |
579 | | // the new value of max_height_ will see either the old value of |
580 | | // new level pointers from head_ (nullptr), or a new value set in |
581 | | // the loop below. In the former case the reader will |
582 | | // immediately drop to the next level since nullptr sorts after all |
583 | | // keys. In the latter case the reader will use the new node. |
584 | 68.7k | max_height_.store(height, std::memory_order_relaxed); |
585 | 68.7k | } |
586 | | |
587 | 81.9M | for (int i = 0; i < height; i++46.8M ) { |
588 | | // NoBarrier_SetNext() suffices since we will add a barrier when |
589 | | // we publish a pointer to "x" in prev[i]. |
590 | 46.8M | x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i)); |
591 | 46.8M | prev_[i]->SetNext(i, x); |
592 | 46.8M | } |
593 | 35.1M | prev_[0] = x; |
594 | 35.1M | prev_height_.store(height, std::memory_order_relaxed); |
595 | 35.1M | } |
596 | | |
597 | | template <class Comparator> |
598 | | void InlineSkipList<Comparator>::FindLevelSplice(const char* key, Node* before, |
599 | | Node* after, int level, |
600 | | Node** out_prev, |
601 | 3.59M | Node** out_next) { |
602 | 11.0M | while (true11.0M ) { |
603 | 11.0M | Node* next = before->Next(level); |
604 | 11.0M | assert(before == head_ || next == nullptr || |
605 | 11.0M | KeyIsAfterNode(next->Key(), before)); |
606 | 0 | assert(before == head_ || KeyIsAfterNode(key, before)); |
607 | 11.0M | if (next == after || !KeyIsAfterNode(key, next)10.0M ) { |
608 | | // found it |
609 | 3.60M | *out_prev = before; |
610 | 3.60M | *out_next = next; |
611 | 3.60M | return; |
612 | 3.60M | } |
613 | 7.43M | before = next; |
614 | 7.43M | } |
615 | 3.59M | } |
616 | | |
617 | | template <class Comparator> |
618 | 525k | void InlineSkipList<Comparator>::InsertConcurrently(const char* key) { |
619 | 525k | Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1; |
620 | 525k | int height = x->UnstashHeight(); |
621 | 525k | assert(height >= 1 && height <= kMaxHeight_); |
622 | | |
623 | | // We don't have a lock-free algorithm for updating prev_, but we do have |
624 | | // the option of invalidating the entire sequential-insertion cache. |
625 | | // prev_'s invariant is that prev_[i] (i > 0) is the predecessor of |
626 | | // prev_[0] at that level. We're only going to violate that if height |
627 | | // > 1 and key lands after prev_[height - 1] but before prev_[0]. |
628 | | // Comparisons are pretty expensive, so an easier version is to just |
629 | | // clear the cache if height > 1. We only write to prev_height_ if the |
630 | | // nobody else has, to avoid invalidating the root of the skip list in |
631 | | // all of the other CPU caches. |
632 | 525k | if (height > 1 && prev_height_.load(std::memory_order_relaxed) != 0131k ) { |
633 | 2.14k | prev_height_.store(0, std::memory_order_relaxed); |
634 | 2.14k | } |
635 | | |
636 | 525k | int max_height = max_height_.load(std::memory_order_relaxed); |
637 | 525k | while (height > max_height) { |
638 | 227 | if (max_height_.compare_exchange_strong(max_height, height)) { |
639 | | // successfully updated it |
640 | 227 | max_height = height; |
641 | 227 | break; |
642 | 227 | } |
643 | | // else retry, possibly exiting the loop because somebody else |
644 | | // increased it |
645 | 227 | } |
646 | 525k | assert(max_height <= kMaxPossibleHeight); |
647 | | |
648 | 0 | Node* prev[kMaxPossibleHeight + 1]; |
649 | 525k | Node* next[kMaxPossibleHeight + 1]; |
650 | 525k | prev[max_height] = head_; |
651 | 525k | next[max_height] = nullptr; |
652 | 4.13M | for (int i = max_height - 1; i >= 0; --i3.61M ) { |
653 | 3.61M | FindLevelSplice(key, prev[i + 1], next[i + 1], i, &prev[i], &next[i]); |
654 | 3.61M | } |
655 | 1.22M | for (int i = 0; i < height; ++i700k ) { |
656 | 700k | while (true698k ) { |
657 | 700k | x->NoBarrier_SetNext(i, next[i]); |
658 | 702k | if (prev[i]->CASNext(i, next[i], x)700k ) { |
659 | | // success |
660 | 702k | break; |
661 | 702k | } |
662 | | // CAS failed, we need to recompute prev and next. It is unlikely |
663 | | // to be helpful to try to use a different level as we redo the |
664 | | // search, because it should be unlikely that lots of nodes have |
665 | | // been inserted between prev[i] and next[i]. No point in using |
666 | | // next[i] as the after hint, because we know it is stale. |
667 | 18.4E | FindLevelSplice(key, prev[i], nullptr, i, &prev[i], &next[i]); |
668 | 18.4E | } |
669 | 700k | } |
670 | 525k | } |
671 | | |
672 | | template <class Comparator> |
673 | 0 | bool InlineSkipList<Comparator>::Contains(const char* key) const { |
674 | 0 | Node* x = FindGreaterOrEqual(key); |
675 | 0 | if (x != nullptr && Equal(key, x->Key())) { |
676 | 0 | return true; |
677 | 0 | } else { |
678 | 0 | return false; |
679 | 0 | } |
680 | 0 | } |
681 | | |
682 | | } // namespace rocksdb |
683 | | |
684 | | #endif // YB_ROCKSDB_DB_INLINESKIPLIST_H |