/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | // |
24 | | // Decodes the blocks generated by block_builder.cc. |
25 | | |
26 | | #include "yb/rocksdb/table/block.h" |
27 | | |
28 | | #include <algorithm> |
29 | | #include <string> |
30 | | #include <unordered_map> |
31 | | #include <vector> |
32 | | |
33 | | #include <glog/logging.h> |
34 | | |
35 | | #include "yb/rocksdb/comparator.h" |
36 | | #include "yb/rocksdb/table/block_hash_index.h" |
37 | | #include "yb/rocksdb/table/block_internal.h" |
38 | | #include "yb/rocksdb/table/block_prefix_index.h" |
39 | | #include "yb/rocksdb/table/format.h" |
40 | | #include "yb/rocksdb/util/coding.h" |
41 | | #include "yb/rocksdb/util/perf_context_imp.h" |
42 | | |
43 | | #include "yb/util/result.h" |
44 | | #include "yb/util/stats/perf_step_timer.h" |
45 | | |
46 | | namespace rocksdb { |
47 | | |
48 | | namespace { |
49 | | |
50 | | // Empty block consists of (see comments inside block_builder.cc for block structure): |
51 | | // - 0 data keys |
52 | | // - uint32 for single restart point (first restart point is always 0 and present in block) |
53 | | // - num_restarts: uint32 |
54 | | const size_t kMinBlockSize = 2*sizeof(uint32_t); |
55 | | |
56 | | } // namespace |
57 | | |
58 | | // Helper routine: decode the next block entry starting at "p", |
59 | | // storing the number of shared key bytes, non_shared key bytes, |
60 | | // and the length of the value in "*shared", "*non_shared", and |
61 | | // "*value_length", respectively. Will not derefence past "limit". |
62 | | // |
63 | | // If any errors are detected, returns nullptr. Otherwise, returns a |
64 | | // pointer to the key delta (just past the three decoded values). |
65 | | static inline const char* DecodeEntry(const char* p, const char* limit, |
66 | | uint32_t* shared, |
67 | | uint32_t* non_shared, |
68 | 859M | uint32_t* value_length) { |
69 | 859M | if (limit - p < 3) return nullptr; |
70 | 859M | *shared = reinterpret_cast<const unsigned char*>(p)[0]; |
71 | 859M | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; |
72 | 859M | *value_length = reinterpret_cast<const unsigned char*>(p)[2]; |
73 | 859M | if ((*shared | *non_shared | *value_length) < 128) { |
74 | | // Fast path: all three values are encoded in one byte each |
75 | 822M | p += 3; |
76 | 37.3M | } else { |
77 | 37.3M | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; |
78 | 37.3M | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; |
79 | 37.3M | if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; |
80 | 859M | } |
81 | | |
82 | 859M | if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { |
83 | 0 | return nullptr; |
84 | 0 | } |
85 | 859M | return p; |
86 | 859M | } |
87 | | |
88 | | // Decodes restart key size (key_size) and value size (value_size) starting at `p` and returns |
89 | | // pointer to the next byte after decoded data. Expects restart key to be stored fully without |
90 | | // reusing bytes from previous key (see BlockBuilder::Add for more details). |
91 | | // This function should not read at or beyond `limit`. |
92 | | // Returns nullptr in case of decode failure. |
93 | | static inline const char* DecodeRestartEntry( |
94 | | const KeyValueEncodingFormat key_value_encoding_format, const char* p, const char* limit, |
95 | 274M | const char* read_allowed_from, uint32_t* key_size) { |
96 | 274M | uint32_t value_size; |
97 | 274M | switch (key_value_encoding_format) { |
98 | 255M | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: { |
99 | 255M | uint32_t shared_prefix_size; |
100 | 255M | auto* result = DecodeEntry(p, limit, &shared_prefix_size, key_size, &value_size); |
101 | 255M | return result && (shared_prefix_size == 0) ? result : nullptr; |
102 | 0 | } |
103 | 19.3M | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: { |
104 | | // We declare output variables for DecodeEntryThreeSharedParts, but since we are |
105 | | // decoding restart key and it is stored fully - we are only interested in non_shared_1_size |
106 | | // argument that stores full key size in this case (see BlockBuilder::Add for more details). |
107 | | // So, we just pass key_size as non_shared_1_size to DecodeEntryThreeSharedParts. |
108 | 19.3M | uint32_t shared_prefix_size, non_shared_2_size, shared_last_component_size; |
109 | 19.3M | bool is_something_shared; |
110 | 19.3M | int64_t non_shared_1_size_delta, non_shared_2_size_delta; |
111 | 19.3M | uint64_t shared_last_component_increase; |
112 | | |
113 | 19.3M | auto* result = DecodeEntryThreeSharedParts( |
114 | 19.3M | p, limit, read_allowed_from, &shared_prefix_size, key_size, &non_shared_1_size_delta, |
115 | 19.3M | &is_something_shared, &non_shared_2_size, &non_shared_2_size_delta, |
116 | 19.3M | &shared_last_component_size, &shared_last_component_increase, &value_size); |
117 | 19.3M | if (PREDICT_FALSE(!result || is_something_shared)) { |
118 | | // This means corruption or decode failure, restart key is stored fully without reusing any |
119 | | // data from the previous key. |
120 | 0 | return nullptr; |
121 | 0 | } |
122 | 19.3M | return result; |
123 | 19.3M | } |
124 | 0 | } |
125 | 0 | FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format); |
126 | 0 | } |
127 | | |
128 | 288M | void BlockIter::Next() { |
129 | 288M | assert(Valid()); |
130 | 288M | ParseNextKey(); |
131 | 288M | } |
132 | | |
133 | 1.87M | void BlockIter::Prev() { |
134 | 1.87M | assert(Valid()); |
135 | | |
136 | | // Scan backwards to a restart point before current_ |
137 | 1.87M | const uint32_t original = current_; |
138 | 2.06M | while (GetRestartPoint(restart_index_) >= original) { |
139 | 366k | if (restart_index_ == 0) { |
140 | | // No more entries |
141 | 183k | current_ = restarts_; |
142 | 183k | restart_index_ = num_restarts_; |
143 | 183k | return; |
144 | 183k | } |
145 | 183k | restart_index_--; |
146 | 183k | } |
147 | | |
148 | 1.69M | SeekToRestartPoint(restart_index_); |
149 | 57.6M | do { |
150 | | // Loop until end of current entry hits the start of original entry |
151 | 57.6M | } while (ParseNextKey() && NextEntryOffset() < original); |
152 | 1.69M | } |
153 | | |
154 | | void BlockIter::Initialize( |
155 | | const Comparator* comparator, const char* data, |
156 | | const KeyValueEncodingFormat key_value_encoding_format, uint32_t restarts, |
157 | 34.8M | uint32_t num_restarts, BlockHashIndex* hash_index, BlockPrefixIndex* prefix_index) { |
158 | 34.8M | DCHECK(data_ == nullptr); // Ensure it is called only once |
159 | 34.8M | DCHECK_GT(num_restarts, 0); // Ensure the param is valid |
160 | | |
161 | 34.8M | comparator_ = comparator; |
162 | 34.8M | data_ = data; |
163 | 34.8M | key_value_encoding_format_ = key_value_encoding_format; |
164 | 34.8M | restarts_ = restarts; |
165 | 34.8M | num_restarts_ = num_restarts; |
166 | 34.8M | current_ = restarts_; |
167 | 34.8M | restart_index_ = num_restarts_; |
168 | 34.8M | hash_index_ = hash_index; |
169 | 34.8M | prefix_index_ = prefix_index; |
170 | 34.8M | } |
171 | | |
172 | | |
173 | 58.3M | void BlockIter::Seek(const Slice& target) { |
174 | 58.3M | PERF_TIMER_GUARD(block_seek_nanos); |
175 | 58.3M | if (data_ == nullptr) { // Not init yet |
176 | 0 | return; |
177 | 0 | } |
178 | 58.3M | uint32_t index = 0; |
179 | 58.3M | bool ok = false; |
180 | 58.3M | if (prefix_index_) { |
181 | 102k | ok = PrefixSeek(target, &index); |
182 | 58.2M | } else { |
183 | 799k | ok = hash_index_ ? HashSeek(target, &index) |
184 | 57.4M | : BinarySeek(target, 0, num_restarts_ - 1, &index); |
185 | 58.2M | } |
186 | | |
187 | 58.3M | if (!ok) { |
188 | 202k | return; |
189 | 202k | } |
190 | 58.1M | SeekToRestartPoint(index); |
191 | | // Linear search (within restart block) for first key >= target |
192 | | |
193 | 252M | while (true) { |
194 | 252M | if (!ParseNextKey() || Compare(key_.GetKey(), target) >= 0) { |
195 | 58.1M | return; |
196 | 58.1M | } |
197 | 252M | } |
198 | 58.1M | } |
199 | | |
200 | 3.09M | void BlockIter::SeekToFirst() { |
201 | 3.09M | if (data_ == nullptr) { // Not init yet |
202 | 0 | return; |
203 | 0 | } |
204 | 3.09M | SeekToRestartPoint(0); |
205 | 3.09M | ParseNextKey(); |
206 | 3.09M | } |
207 | | |
208 | 517k | void BlockIter::SeekToLast() { |
209 | 517k | if (data_ == nullptr) { // Not init yet |
210 | 0 | return; |
211 | 0 | } |
212 | 517k | SeekToRestartPoint(num_restarts_ - 1); |
213 | 11.1M | while (ParseNextKey() && NextEntryOffset() < restarts_) { |
214 | | // Keep skipping |
215 | 10.6M | } |
216 | 517k | } |
217 | | |
218 | | |
219 | | namespace { |
220 | | |
221 | 0 | Status BadBlockContentsError() { |
222 | 0 | return STATUS(Corruption, "bad block contents"); |
223 | 0 | } |
224 | | |
225 | 2 | Status BadEntryInBlockError(const std::string& error_details) { |
226 | 2 | return STATUS(Corruption, yb::Format("bad entry in block: $0", error_details)); |
227 | 2 | } |
228 | | |
229 | | } // namespace |
230 | | |
231 | 2 | void BlockIter::SetError(const Status& error) { |
232 | 2 | current_ = restarts_; |
233 | 2 | restart_index_ = num_restarts_; |
234 | 2 | status_ = error; |
235 | 2 | key_.Clear(); |
236 | 2 | value_.clear(); |
237 | 2 | } |
238 | | |
239 | 2 | void BlockIter::CorruptionError(const std::string& error_details) { |
240 | 2 | SetError(BadEntryInBlockError(error_details)); |
241 | 2 | } |
242 | | |
243 | | // This function decodes next key-value pair starting at p and encoded with three_shared_parts |
244 | | // delta-encoding algorithm (see ThreeSharedPartsEncoder inside block_builder.cc). |
245 | | // limit specifies exclusive upper bound on where we allowed to decode from. |
246 | | // read_allowed_from specifies exclusive upper bound on where we allowed to read data from (used |
247 | | // for performance optimization in some cases to read by multi-bytes chunks), but still only data |
248 | | // before the limit will be used for decoding. |
249 | | // |
250 | | // The function relies on *key to contain previous decoded key and updates it with a next one. |
251 | | // *value is set to Slice pointing to corresponding key's value. |
252 | | // |
253 | | // Returns whether decoding was successful. |
254 | | inline bool ParseNextKeyThreeSharedParts( |
255 | 4.54M | const char* p, const char* limit, const char* read_allowed_from, IterKey* key, Slice* value) { |
256 | 4.54M | uint32_t shared_prefix_size, non_shared_1_size, non_shared_2_size, shared_last_component_size, |
257 | 4.54M | value_size; |
258 | 4.54M | bool is_something_shared; |
259 | 4.54M | int64_t non_shared_1_size_delta, non_shared_2_size_delta; |
260 | 4.54M | uint64_t shared_last_component_increase; |
261 | | |
262 | 4.54M | p = DecodeEntryThreeSharedParts( |
263 | 4.54M | p, limit, read_allowed_from, &shared_prefix_size, &non_shared_1_size, |
264 | 4.54M | &non_shared_1_size_delta, &is_something_shared, &non_shared_2_size, &non_shared_2_size_delta, |
265 | 4.54M | &shared_last_component_size, &shared_last_component_increase, &value_size); |
266 | 4.54M | if (p == nullptr) { |
267 | 0 | return false; |
268 | 0 | } |
269 | | |
270 | 4.54M | if (PREDICT_FALSE(!is_something_shared)) { |
271 | | // If this key doesn't share any bytes with prev key then we don't need |
272 | | // to decode it and can use its address in the block directly. |
273 | 3.44M | key->SetKey(Slice(p, non_shared_1_size), false /* copy */); |
274 | 3.44M | *value = Slice(p + non_shared_1_size, value_size); |
275 | 3.44M | return true; |
276 | 3.44M | } |
277 | | |
278 | | // The start offset of the shared middle part of the previous key. |
279 | 1.09M | const auto prev_shared_middle_start = |
280 | 1.09M | shared_prefix_size + non_shared_1_size - non_shared_1_size_delta; |
281 | 1.09M | const auto prev_non_shared_2_size = non_shared_2_size - non_shared_2_size_delta; |
282 | 1.09M | const auto prev_size_except_middle_shared = prev_shared_middle_start + prev_non_shared_2_size + |
283 | 1.09M | shared_last_component_size; |
284 | | |
285 | 1.09M | const auto key_size = static_cast<uint32_t>(key->Size()); |
286 | | |
287 | 1.09M | if (key_size < prev_size_except_middle_shared) { |
288 | 0 | return false; |
289 | 0 | } |
290 | | |
291 | 1.09M | const auto shared_middle_size = key_size - prev_size_except_middle_shared; |
292 | | |
293 | 1.09M | if ((shared_prefix_size + shared_middle_size + shared_last_component_size) == 0) { |
294 | | // This is an error, because is_something_shared is true.x |
295 | 0 | return false; |
296 | 0 | } |
297 | | |
298 | 1.09M | key->Update( |
299 | 1.09M | p, shared_prefix_size, non_shared_1_size, static_cast<uint32_t>(prev_shared_middle_start), |
300 | 1.09M | static_cast<uint32_t>(shared_middle_size), non_shared_2_size, shared_last_component_size, |
301 | 1.09M | shared_last_component_increase); |
302 | 1.09M | *value = Slice(p + non_shared_1_size + non_shared_2_size, value_size); |
303 | 1.09M | return true; |
304 | 1.09M | } |
305 | | |
306 | 612M | bool BlockIter::ParseNextKey() { |
307 | 612M | current_ = NextEntryOffset(); |
308 | 612M | const char* p = data_ + current_; |
309 | 612M | const char* limit = data_ + restarts_; // Restarts come right after data |
310 | 612M | if (p >= limit) { |
311 | | // No more entries to return. Mark as invalid. |
312 | 3.17M | current_ = restarts_; |
313 | 3.17M | restart_index_ = num_restarts_; |
314 | 3.17M | return false; |
315 | 3.17M | } |
316 | | |
317 | | // Decode next entry |
318 | 609M | bool valid_encoding_type = false; |
319 | 609M | switch (key_value_encoding_format_) { |
320 | 604M | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: { |
321 | 604M | valid_encoding_type = true; |
322 | 604M | uint32_t shared, non_shared, value_length; |
323 | 604M | p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); |
324 | 604M | if (p == nullptr || key_.Size() < shared) { |
325 | 2 | CorruptionError(yb::Format( |
326 | 2 | "p: $0, key_.Size(): $1, shared: $2", static_cast<const void*>(p), key_.Size(), |
327 | 2 | shared)); |
328 | 2 | return false; |
329 | 2 | } |
330 | 604M | if (shared == 0) { |
331 | | // If this key dont share any bytes with prev key then we dont need |
332 | | // to decode it and can use it's address in the block directly. |
333 | 152M | key_.SetKey(Slice(p, non_shared), false /* copy */); |
334 | 452M | } else { |
335 | | // This key share `shared` bytes with prev key, we need to decode it |
336 | 452M | key_.TrimAppend(shared, p, non_shared); |
337 | 452M | } |
338 | 604M | value_ = Slice(p + non_shared, value_length); |
339 | 604M | break; |
340 | 604M | } |
341 | 4.54M | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: { |
342 | 4.54M | valid_encoding_type = true; |
343 | 4.54M | if (!ParseNextKeyThreeSharedParts(p, limit, data_, &key_, &value_)) { |
344 | 0 | CorruptionError("ParseNextKeyThreeSharedParts failed"); |
345 | 0 | return false; |
346 | 0 | } |
347 | 4.54M | break; |
348 | 4.54M | } |
349 | 609M | } |
350 | | |
351 | 609M | if (!valid_encoding_type) { |
352 | 0 | FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format_); |
353 | 0 | } |
354 | | |
355 | | // Restore the invariant that restart_index_ is the index of restart block in which current_ |
356 | | // falls. |
357 | 637M | while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { |
358 | 28.1M | ++restart_index_; |
359 | 28.1M | } |
360 | 609M | return true; |
361 | 609M | } |
362 | | |
363 | | // Binary search in restart array to find the first restart point |
364 | | // with a key >= target (TODO: this comment is inaccurate) |
365 | | bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, |
366 | 58.0M | uint32_t* index) { |
367 | 58.0M | assert(left <= right); |
368 | | |
369 | 332M | while (left < right) { |
370 | 274M | uint32_t mid = (left + right + 1) / 2; |
371 | 274M | uint32_t region_offset = GetRestartPoint(mid); |
372 | 274M | uint32_t key_size; |
373 | 274M | const char* key_ptr = DecodeRestartEntry( |
374 | 274M | key_value_encoding_format_, data_ + region_offset, data_ + restarts_, data_, &key_size); |
375 | 274M | if (key_ptr == nullptr) { |
376 | 0 | CorruptionError("DecodeRestartEntry failed"); |
377 | 0 | return false; |
378 | 0 | } |
379 | 274M | Slice mid_key(key_ptr, key_size); |
380 | 274M | int cmp = Compare(mid_key, target); |
381 | 274M | if (cmp < 0) { |
382 | | // Key at "mid" is smaller than "target". Therefore all |
383 | | // blocks before "mid" are uninteresting. |
384 | 137M | left = mid; |
385 | 136M | } else if (cmp > 0) { |
386 | | // Key at "mid" is >= "target". Therefore all blocks at or |
387 | | // after "mid" are uninteresting. |
388 | 135M | right = mid - 1; |
389 | 951k | } else { |
390 | 951k | left = right = mid; |
391 | 951k | } |
392 | 274M | } |
393 | | |
394 | 58.0M | *index = left; |
395 | 58.0M | return true; |
396 | 58.0M | } |
397 | | |
398 | | // Compare target key and the block key of the block of `block_index`. |
399 | | // Return -1 if error. |
400 | 471k | int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { |
401 | 471k | uint32_t region_offset = GetRestartPoint(block_index); |
402 | 471k | uint32_t key_size; |
403 | 471k | const char* key_ptr = DecodeRestartEntry( |
404 | 471k | key_value_encoding_format_, data_ + region_offset, data_ + restarts_, data_, &key_size); |
405 | 471k | if (key_ptr == nullptr) { |
406 | 0 | CorruptionError("DecodeRestartEntry failed"); |
407 | 0 | return 1; // Return target is smaller |
408 | 0 | } |
409 | 471k | Slice block_key(key_ptr, key_size); |
410 | 471k | return Compare(block_key, target); |
411 | 471k | } |
412 | | |
413 | | // Binary search in block_ids to find the first block |
414 | | // with a key >= target |
415 | | bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, |
416 | | uint32_t left, uint32_t right, |
417 | 100k | uint32_t* index) { |
418 | 100k | assert(left <= right); |
419 | 100k | uint32_t left_bound = left; |
420 | | |
421 | 427k | while (left <= right) { |
422 | 427k | uint32_t mid = (left + right) / 2; |
423 | | |
424 | 427k | int cmp = CompareBlockKey(block_ids[mid], target); |
425 | 427k | if (!status_.ok()) { |
426 | 0 | return false; |
427 | 0 | } |
428 | 427k | if (cmp < 0) { |
429 | | // Key at "target" is larger than "mid". Therefore all |
430 | | // blocks before or at "mid" are uninteresting. |
431 | 155k | left = mid + 1; |
432 | 272k | } else { |
433 | | // Key at "target" is <= "mid". Therefore all blocks |
434 | | // after "mid" are uninteresting. |
435 | | // If there is only one block left, we found it. |
436 | 272k | if (left == right) break; |
437 | 172k | right = mid; |
438 | 172k | } |
439 | 427k | } |
440 | | |
441 | 100k | if (left == right) { |
442 | | // In one of the two following cases: |
443 | | // (1) left is the first one of block_ids |
444 | | // (2) there is a gap of blocks between block of `left` and `left-1`. |
445 | | // we can further distinguish the case of key in the block or key not |
446 | | // existing, by comparing the target key and the key of the previous |
447 | | // block to the left of the block found. |
448 | 100k | if (block_ids[left] > 0 && |
449 | 90.6k | (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && |
450 | 43.3k | CompareBlockKey(block_ids[left] - 1, target) > 0) { |
451 | 1 | current_ = restarts_; |
452 | 1 | return false; |
453 | 1 | } |
454 | | |
455 | 100k | *index = block_ids[left]; |
456 | 100k | return true; |
457 | 8 | } else { |
458 | 8 | assert(left > right); |
459 | | // Mark iterator invalid |
460 | 8 | current_ = restarts_; |
461 | 8 | return false; |
462 | 8 | } |
463 | 100k | } |
464 | | |
465 | 799k | bool BlockIter::HashSeek(const Slice& target, uint32_t* index) { |
466 | 799k | assert(hash_index_); |
467 | 799k | auto restart_index = hash_index_->GetRestartIndex(target); |
468 | 799k | if (restart_index == nullptr) { |
469 | 199k | current_ = restarts_; |
470 | 199k | return false; |
471 | 199k | } |
472 | | |
473 | | // the elements in restart_array[index : index + num_blocks] |
474 | | // are all with same prefix. We'll do binary search in that small range. |
475 | 600k | auto left = restart_index->first_index; |
476 | 600k | auto right = restart_index->first_index + restart_index->num_blocks - 1; |
477 | 600k | return BinarySeek(target, left, right, index); |
478 | 600k | } |
479 | | |
480 | 102k | bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { |
481 | 102k | assert(prefix_index_); |
482 | 102k | uint32_t* block_ids = nullptr; |
483 | 102k | uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); |
484 | | |
485 | 102k | if (num_blocks == 0) { |
486 | 2.17k | current_ = restarts_; |
487 | 2.17k | return false; |
488 | 100k | } else { |
489 | 100k | return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); |
490 | 100k | } |
491 | 102k | } |
492 | | |
493 | 38.7M | uint32_t Block::NumRestarts() const { |
494 | 38.7M | assert(size_ >= kMinBlockSize); |
495 | 38.7M | return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
496 | 38.7M | } |
497 | | |
498 | | Block::Block(BlockContents&& contents) |
499 | | : contents_(std::move(contents)), |
500 | | data_(contents_.data.cdata()), |
501 | 3.88M | size_(contents_.data.size()) { |
502 | 3.88M | if (size_ < sizeof(uint32_t)) { |
503 | 0 | size_ = 0; // Error marker |
504 | 3.88M | } else { |
505 | 3.88M | restart_offset_ = |
506 | 3.88M | static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t); |
507 | 3.88M | if (restart_offset_ > size_ - sizeof(uint32_t)) { |
508 | | // The size is too small for NumRestarts() and therefore |
509 | | // restart_offset_ wrapped around. |
510 | 0 | size_ = 0; |
511 | 0 | } |
512 | 3.88M | } |
513 | 3.88M | } |
514 | | |
515 | | InternalIterator* Block::NewIterator( |
516 | | const Comparator* cmp, const KeyValueEncodingFormat key_value_encoding_format, BlockIter* iter, |
517 | 34.8M | bool total_order_seek) { |
518 | 34.8M | if (size_ < kMinBlockSize) { |
519 | 0 | if (iter != nullptr) { |
520 | 0 | iter->SetStatus(BadBlockContentsError()); |
521 | 0 | return iter; |
522 | 0 | } else { |
523 | 0 | return NewErrorInternalIterator(BadBlockContentsError()); |
524 | 0 | } |
525 | 34.8M | } |
526 | 34.8M | const uint32_t num_restarts = NumRestarts(); |
527 | 34.8M | if (num_restarts == 0) { |
528 | 0 | if (iter != nullptr) { |
529 | 0 | iter->SetStatus(Status::OK()); |
530 | 0 | return iter; |
531 | 0 | } else { |
532 | 0 | return NewEmptyInternalIterator(); |
533 | 0 | } |
534 | 34.8M | } else { |
535 | 34.8M | BlockHashIndex* hash_index_ptr = |
536 | 34.7M | total_order_seek ? nullptr : hash_index_.get(); |
537 | 34.8M | BlockPrefixIndex* prefix_index_ptr = |
538 | 34.7M | total_order_seek ? nullptr : prefix_index_.get(); |
539 | | |
540 | 34.8M | if (iter != nullptr) { |
541 | 19.7M | iter->Initialize(cmp, data_, key_value_encoding_format, restart_offset_, num_restarts, |
542 | 19.7M | hash_index_ptr, prefix_index_ptr); |
543 | 15.1M | } else { |
544 | 15.1M | iter = new BlockIter(cmp, data_, key_value_encoding_format, restart_offset_, num_restarts, |
545 | 15.1M | hash_index_ptr, prefix_index_ptr); |
546 | 15.1M | } |
547 | 34.8M | } |
548 | | |
549 | 34.8M | return iter; |
550 | 34.8M | } |
551 | | |
552 | 4 | void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { |
553 | 4 | hash_index_.reset(hash_index); |
554 | 4 | } |
555 | | |
556 | 1.65k | void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) { |
557 | 1.65k | prefix_index_.reset(prefix_index); |
558 | 1.65k | } |
559 | | |
560 | 29 | size_t Block::ApproximateMemoryUsage() const { |
561 | 29 | size_t usage = usable_size(); |
562 | 29 | if (hash_index_) { |
563 | 0 | usage += hash_index_->ApproximateMemoryUsage(); |
564 | 0 | } |
565 | 29 | if (prefix_index_) { |
566 | 0 | usage += prefix_index_->ApproximateMemoryUsage(); |
567 | 0 | } |
568 | 29 | return usage; |
569 | 29 | } |
570 | | |
571 | | yb::Result<Slice> Block::GetMiddleKey( |
572 | 58 | const KeyValueEncodingFormat key_value_encoding_format) const { |
573 | 58 | if (size_ < kMinBlockSize) { |
574 | 0 | return BadBlockContentsError(); |
575 | 58 | } else if (size_ == kMinBlockSize) { |
576 | 2 | return STATUS(Incomplete, "Empty block"); |
577 | 2 | } |
578 | | |
579 | 56 | const auto restart_idx = (NumRestarts() - 1) / 2; |
580 | | |
581 | 56 | const auto entry_offset = DecodeFixed32(data_ + restart_offset_ + restart_idx * sizeof(uint32_t)); |
582 | 56 | uint32_t key_size; |
583 | 56 | const char* key_ptr = DecodeRestartEntry( |
584 | 56 | key_value_encoding_format, data_ + entry_offset, data_ + restart_offset_, data_, &key_size); |
585 | 56 | if (key_ptr == nullptr) { |
586 | 0 | return BadEntryInBlockError("DecodeRestartEntry failed"); |
587 | 0 | } |
588 | 56 | return Slice(key_ptr, key_size); |
589 | 56 | } |
590 | | |
591 | | } // namespace rocksdb |