/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | // |
24 | | // Decodes the blocks generated by block_builder.cc. |
25 | | |
26 | | #include "yb/rocksdb/table/block.h" |
27 | | |
28 | | #include <algorithm> |
29 | | #include <string> |
30 | | #include <unordered_map> |
31 | | #include <vector> |
32 | | |
33 | | #include <glog/logging.h> |
34 | | |
35 | | #include "yb/rocksdb/comparator.h" |
36 | | #include "yb/rocksdb/table/block_hash_index.h" |
37 | | #include "yb/rocksdb/table/block_internal.h" |
38 | | #include "yb/rocksdb/table/block_prefix_index.h" |
39 | | #include "yb/rocksdb/table/format.h" |
40 | | #include "yb/rocksdb/util/coding.h" |
41 | | #include "yb/rocksdb/util/perf_context_imp.h" |
42 | | |
43 | | #include "yb/util/result.h" |
44 | | #include "yb/util/stats/perf_step_timer.h" |
45 | | |
46 | | namespace rocksdb { |
47 | | |
48 | | namespace { |
49 | | |
50 | | // Empty block consists of (see comments inside block_builder.cc for block structure): |
51 | | // - 0 data keys |
52 | | // - uint32 for single restart point (first restart point is always 0 and present in block) |
53 | | // - num_restarts: uint32 |
54 | | const size_t kMinBlockSize = 2*sizeof(uint32_t); |
55 | | |
56 | | } // namespace |
57 | | |
58 | | // Helper routine: decode the next block entry starting at "p", |
59 | | // storing the number of shared key bytes, non_shared key bytes, |
60 | | // and the length of the value in "*shared", "*non_shared", and |
61 | | // "*value_length", respectively. Will not derefence past "limit". |
62 | | // |
63 | | // If any errors are detected, returns nullptr. Otherwise, returns a |
64 | | // pointer to the key delta (just past the three decoded values). |
65 | | static inline const char* DecodeEntry(const char* p, const char* limit, |
66 | | uint32_t* shared, |
67 | | uint32_t* non_shared, |
68 | 2.96G | uint32_t* value_length) { |
69 | 2.96G | if (limit - p < 3) return nullptr0 ; |
70 | 2.96G | *shared = reinterpret_cast<const unsigned char*>(p)[0]; |
71 | 2.96G | *non_shared = reinterpret_cast<const unsigned char*>(p)[1]; |
72 | 2.96G | *value_length = reinterpret_cast<const unsigned char*>(p)[2]; |
73 | 2.96G | if ((*shared | *non_shared | *value_length) < 128) { |
74 | | // Fast path: all three values are encoded in one byte each |
75 | 2.90G | p += 3; |
76 | 2.90G | } else { |
77 | 62.0M | if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr2 ; |
78 | 62.0M | if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr0 ; |
79 | 62.0M | if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr0 ; |
80 | 62.0M | } |
81 | | |
82 | 2.96G | if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) { |
83 | 0 | return nullptr; |
84 | 0 | } |
85 | 2.96G | return p; |
86 | 2.96G | } |
87 | | |
88 | | // Decodes restart key size (key_size) and value size (value_size) starting at `p` and returns |
89 | | // pointer to the next byte after decoded data. Expects restart key to be stored fully without |
90 | | // reusing bytes from previous key (see BlockBuilder::Add for more details). |
91 | | // This function should not read at or beyond `limit`. |
92 | | // Returns nullptr in case of decode failure. |
93 | | static inline const char* DecodeRestartEntry( |
94 | | const KeyValueEncodingFormat key_value_encoding_format, const char* p, const char* limit, |
95 | 1.12G | const char* read_allowed_from, uint32_t* key_size) { |
96 | 1.12G | uint32_t value_size; |
97 | 1.12G | switch (key_value_encoding_format) { |
98 | 1.10G | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: { |
99 | 1.10G | uint32_t shared_prefix_size; |
100 | 1.10G | auto* result = DecodeEntry(p, limit, &shared_prefix_size, key_size, &value_size); |
101 | 18.4E | return result1.10G && (shared_prefix_size == 0)1.10G ? result1.10G : nullptr; |
102 | 0 | } |
103 | 19.3M | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: { |
104 | | // We declare output variables for DecodeEntryThreeSharedParts, but since we are |
105 | | // decoding restart key and it is stored fully - we are only interested in non_shared_1_size |
106 | | // argument that stores full key size in this case (see BlockBuilder::Add for more details). |
107 | | // So, we just pass key_size as non_shared_1_size to DecodeEntryThreeSharedParts. |
108 | 19.3M | uint32_t shared_prefix_size, non_shared_2_size, shared_last_component_size; |
109 | 19.3M | bool is_something_shared; |
110 | 19.3M | int64_t non_shared_1_size_delta, non_shared_2_size_delta; |
111 | 19.3M | uint64_t shared_last_component_increase; |
112 | | |
113 | 19.3M | auto* result = DecodeEntryThreeSharedParts( |
114 | 19.3M | p, limit, read_allowed_from, &shared_prefix_size, key_size, &non_shared_1_size_delta, |
115 | 19.3M | &is_something_shared, &non_shared_2_size, &non_shared_2_size_delta, |
116 | 19.3M | &shared_last_component_size, &shared_last_component_increase, &value_size); |
117 | 19.3M | if (PREDICT_FALSE(!result || is_something_shared)) { |
118 | | // This means corruption or decode failure, restart key is stored fully without reusing any |
119 | | // data from the previous key. |
120 | 0 | return nullptr; |
121 | 0 | } |
122 | 19.3M | return result; |
123 | 19.3M | } |
124 | 1.12G | } |
125 | 0 | FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format); |
126 | 0 | } |
127 | | |
128 | 736M | void BlockIter::Next() { |
129 | 736M | assert(Valid()); |
130 | 0 | ParseNextKey(); |
131 | 736M | } |
132 | | |
133 | 7.06M | void BlockIter::Prev() { |
134 | 7.06M | assert(Valid()); |
135 | | |
136 | | // Scan backwards to a restart point before current_ |
137 | 0 | const uint32_t original = current_; |
138 | 7.53M | while (GetRestartPoint(restart_index_) >= original) { |
139 | 989k | if (restart_index_ == 0) { |
140 | | // No more entries |
141 | 518k | current_ = restarts_; |
142 | 518k | restart_index_ = num_restarts_; |
143 | 518k | return; |
144 | 518k | } |
145 | 471k | restart_index_--; |
146 | 471k | } |
147 | | |
148 | 6.55M | SeekToRestartPoint(restart_index_); |
149 | 97.7M | do { |
150 | | // Loop until end of current entry hits the start of original entry |
151 | 97.7M | } while (ParseNextKey() && NextEntryOffset() < original97.7M ); |
152 | 6.55M | } |
153 | | |
154 | | void BlockIter::Initialize( |
155 | | const Comparator* comparator, const char* data, |
156 | | const KeyValueEncodingFormat key_value_encoding_format, uint32_t restarts, |
157 | 83.4M | uint32_t num_restarts, BlockHashIndex* hash_index, BlockPrefixIndex* prefix_index) { |
158 | 83.4M | DCHECK(data_ == nullptr); // Ensure it is called only once |
159 | 83.4M | DCHECK_GT(num_restarts, 0); // Ensure the param is valid |
160 | | |
161 | 83.4M | comparator_ = comparator; |
162 | 83.4M | data_ = data; |
163 | 83.4M | key_value_encoding_format_ = key_value_encoding_format; |
164 | 83.4M | restarts_ = restarts; |
165 | 83.4M | num_restarts_ = num_restarts; |
166 | 83.4M | current_ = restarts_; |
167 | 83.4M | restart_index_ = num_restarts_; |
168 | 83.4M | hash_index_ = hash_index; |
169 | 83.4M | prefix_index_ = prefix_index; |
170 | 83.4M | } |
171 | | |
172 | | |
173 | 231M | void BlockIter::Seek(const Slice& target) { |
174 | 231M | PERF_TIMER_GUARD(block_seek_nanos); |
175 | 231M | if (data_ == nullptr) { // Not init yet |
176 | 0 | return; |
177 | 0 | } |
178 | 231M | uint32_t index = 0; |
179 | 231M | bool ok = false; |
180 | 231M | if (prefix_index_) { |
181 | 101k | ok = PrefixSeek(target, &index); |
182 | 231M | } else { |
183 | 231M | ok = hash_index_ ? HashSeek(target, &index)799k |
184 | 231M | : BinarySeek(target, 0, num_restarts_ - 1, &index)231M ; |
185 | 231M | } |
186 | | |
187 | 231M | if (!ok) { |
188 | 202k | return; |
189 | 202k | } |
190 | 231M | SeekToRestartPoint(index); |
191 | | // Linear search (within restart block) for first key >= target |
192 | | |
193 | 1.02G | while (true1.02G ) { |
194 | 1.02G | if (!ParseNextKey() || Compare(key_.GetKey(), target) >= 01.02G ) { |
195 | 231M | return; |
196 | 231M | } |
197 | 1.02G | } |
198 | 231M | } |
199 | | |
200 | 4.15M | void BlockIter::SeekToFirst() { |
201 | 4.15M | if (data_ == nullptr) { // Not init yet |
202 | 0 | return; |
203 | 0 | } |
204 | 4.15M | SeekToRestartPoint(0); |
205 | 4.15M | ParseNextKey(); |
206 | 4.15M | } |
207 | | |
208 | 3.09M | void BlockIter::SeekToLast() { |
209 | 3.09M | if (data_ == nullptr) { // Not init yet |
210 | 0 | return; |
211 | 0 | } |
212 | 3.09M | SeekToRestartPoint(num_restarts_ - 1); |
213 | 22.7M | while (ParseNextKey() && NextEntryOffset() < restarts_22.7M ) { |
214 | | // Keep skipping |
215 | 19.7M | } |
216 | 3.09M | } |
217 | | |
218 | | |
219 | | namespace { |
220 | | |
221 | 0 | Status BadBlockContentsError() { |
222 | 0 | return STATUS(Corruption, "bad block contents"); |
223 | 0 | } |
224 | | |
225 | 2 | Status BadEntryInBlockError(const std::string& error_details) { |
226 | 2 | return STATUS(Corruption, yb::Format("bad entry in block: $0", error_details)); |
227 | 2 | } |
228 | | |
229 | | } // namespace |
230 | | |
231 | 2 | void BlockIter::SetError(const Status& error) { |
232 | 2 | current_ = restarts_; |
233 | 2 | restart_index_ = num_restarts_; |
234 | 2 | status_ = error; |
235 | 2 | key_.Clear(); |
236 | 2 | value_.clear(); |
237 | 2 | } |
238 | | |
239 | 2 | void BlockIter::CorruptionError(const std::string& error_details) { |
240 | 2 | SetError(BadEntryInBlockError(error_details)); |
241 | 2 | } |
242 | | |
243 | | // This function decodes next key-value pair starting at p and encoded with three_shared_parts |
244 | | // delta-encoding algorithm (see ThreeSharedPartsEncoder inside block_builder.cc). |
245 | | // limit specifies exclusive upper bound on where we allowed to decode from. |
246 | | // read_allowed_from specifies exclusive upper bound on where we allowed to read data from (used |
247 | | // for performance optimization in some cases to read by multi-bytes chunks), but still only data |
248 | | // before the limit will be used for decoding. |
249 | | // |
250 | | // The function relies on *key to contain previous decoded key and updates it with a next one. |
251 | | // *value is set to Slice pointing to corresponding key's value. |
252 | | // |
253 | | // Returns whether decoding was successful. |
254 | | inline bool ParseNextKeyThreeSharedParts( |
255 | 4.53M | const char* p, const char* limit, const char* read_allowed_from, IterKey* key, Slice* value) { |
256 | 4.53M | uint32_t shared_prefix_size, non_shared_1_size, non_shared_2_size, shared_last_component_size, |
257 | 4.53M | value_size; |
258 | 4.53M | bool is_something_shared; |
259 | 4.53M | int64_t non_shared_1_size_delta, non_shared_2_size_delta; |
260 | 4.53M | uint64_t shared_last_component_increase; |
261 | | |
262 | 4.53M | p = DecodeEntryThreeSharedParts( |
263 | 4.53M | p, limit, read_allowed_from, &shared_prefix_size, &non_shared_1_size, |
264 | 4.53M | &non_shared_1_size_delta, &is_something_shared, &non_shared_2_size, &non_shared_2_size_delta, |
265 | 4.53M | &shared_last_component_size, &shared_last_component_increase, &value_size); |
266 | 4.53M | if (p == nullptr) { |
267 | 0 | return false; |
268 | 0 | } |
269 | | |
270 | 4.53M | if (PREDICT_FALSE(!is_something_shared)) { |
271 | | // If this key doesn't share any bytes with prev key then we don't need |
272 | | // to decode it and can use its address in the block directly. |
273 | 3.44M | key->SetKey(Slice(p, non_shared_1_size), false /* copy */); |
274 | 3.44M | *value = Slice(p + non_shared_1_size, value_size); |
275 | 3.44M | return true; |
276 | 3.44M | } |
277 | | |
278 | | // The start offset of the shared middle part of the previous key. |
279 | 1.09M | const auto prev_shared_middle_start = |
280 | 1.09M | shared_prefix_size + non_shared_1_size - non_shared_1_size_delta; |
281 | 1.09M | const auto prev_non_shared_2_size = non_shared_2_size - non_shared_2_size_delta; |
282 | 1.09M | const auto prev_size_except_middle_shared = prev_shared_middle_start + prev_non_shared_2_size + |
283 | 1.09M | shared_last_component_size; |
284 | | |
285 | 1.09M | const auto key_size = static_cast<uint32_t>(key->Size()); |
286 | | |
287 | 1.09M | if (key_size < prev_size_except_middle_shared) { |
288 | 0 | return false; |
289 | 0 | } |
290 | | |
291 | 1.09M | const auto shared_middle_size = key_size - prev_size_except_middle_shared; |
292 | | |
293 | 1.09M | if ((shared_prefix_size + shared_middle_size + shared_last_component_size) == 0) { |
294 | | // This is an error, because is_something_shared is true.x |
295 | 0 | return false; |
296 | 0 | } |
297 | | |
298 | 1.09M | key->Update( |
299 | 1.09M | p, shared_prefix_size, non_shared_1_size, static_cast<uint32_t>(prev_shared_middle_start), |
300 | 1.09M | static_cast<uint32_t>(shared_middle_size), non_shared_2_size, shared_last_component_size, |
301 | 1.09M | shared_last_component_increase); |
302 | 1.09M | *value = Slice(p + non_shared_1_size + non_shared_2_size, value_size); |
303 | 1.09M | return true; |
304 | 1.09M | } |
305 | | |
306 | 1.88G | bool BlockIter::ParseNextKey() { |
307 | 1.88G | current_ = NextEntryOffset(); |
308 | 1.88G | const char* p = data_ + current_; |
309 | 1.88G | const char* limit = data_ + restarts_; // Restarts come right after data |
310 | 1.88G | if (p >= limit) { |
311 | | // No more entries to return. Mark as invalid. |
312 | 6.15M | current_ = restarts_; |
313 | 6.15M | restart_index_ = num_restarts_; |
314 | 6.15M | return false; |
315 | 6.15M | } |
316 | | |
317 | | // Decode next entry |
318 | 1.87G | bool valid_encoding_type = false; |
319 | 1.87G | switch (key_value_encoding_format_) { |
320 | 1.87G | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: { |
321 | 1.87G | valid_encoding_type = true; |
322 | 1.87G | uint32_t shared, non_shared, value_length; |
323 | 1.87G | p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); |
324 | 1.87G | if (p == nullptr1.87G || key_.Size() < shared) { |
325 | 2 | CorruptionError(yb::Format( |
326 | 2 | "p: $0, key_.Size(): $1, shared: $2", static_cast<const void*>(p), key_.Size(), |
327 | 2 | shared)); |
328 | 2 | return false; |
329 | 2 | } |
330 | 1.87G | if (shared == 0) { |
331 | | // If this key dont share any bytes with prev key then we dont need |
332 | | // to decode it and can use it's address in the block directly. |
333 | 439M | key_.SetKey(Slice(p, non_shared), false /* copy */); |
334 | 1.43G | } else { |
335 | | // This key share `shared` bytes with prev key, we need to decode it |
336 | 1.43G | key_.TrimAppend(shared, p, non_shared); |
337 | 1.43G | } |
338 | 1.87G | value_ = Slice(p + non_shared, value_length); |
339 | 1.87G | break; |
340 | 1.87G | } |
341 | 4.53M | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: { |
342 | 4.53M | valid_encoding_type = true; |
343 | 4.53M | if (!ParseNextKeyThreeSharedParts(p, limit, data_, &key_, &value_)) { |
344 | 0 | CorruptionError("ParseNextKeyThreeSharedParts failed"); |
345 | 0 | return false; |
346 | 0 | } |
347 | 4.53M | break; |
348 | 4.53M | } |
349 | 1.87G | } |
350 | | |
351 | 1.87G | if (!valid_encoding_type) { |
352 | 0 | FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format_); |
353 | 0 | } |
354 | | |
355 | | // Restore the invariant that restart_index_ is the index of restart block in which current_ |
356 | | // falls. |
357 | 1.93G | while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_1.78G ) { |
358 | 56.8M | ++restart_index_; |
359 | 56.8M | } |
360 | 1.87G | return true; |
361 | 1.87G | } |
362 | | |
363 | | // Binary search in restart array to find the first restart point |
364 | | // with a key >= target (TODO: this comment is inaccurate) |
365 | | bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, |
366 | 231M | uint32_t* index) { |
367 | 231M | assert(left <= right); |
368 | | |
369 | 1.36G | while (left < right) { |
370 | 1.12G | uint32_t mid = (left + right + 1) / 2; |
371 | 1.12G | uint32_t region_offset = GetRestartPoint(mid); |
372 | 1.12G | uint32_t key_size; |
373 | 1.12G | const char* key_ptr = DecodeRestartEntry( |
374 | 1.12G | key_value_encoding_format_, data_ + region_offset, data_ + restarts_, data_, &key_size); |
375 | 1.12G | if (key_ptr == nullptr) { |
376 | 0 | CorruptionError("DecodeRestartEntry failed"); |
377 | 0 | return false; |
378 | 0 | } |
379 | 1.12G | Slice mid_key(key_ptr, key_size); |
380 | 1.12G | int cmp = Compare(mid_key, target); |
381 | 1.12G | if (cmp < 0) { |
382 | | // Key at "mid" is smaller than "target". Therefore all |
383 | | // blocks before "mid" are uninteresting. |
384 | 588M | left = mid; |
385 | 588M | } else if (541M cmp > 0541M ) { |
386 | | // Key at "mid" is >= "target". Therefore all blocks at or |
387 | | // after "mid" are uninteresting. |
388 | 543M | right = mid - 1; |
389 | 18.4E | } else { |
390 | 18.4E | left = right = mid; |
391 | 18.4E | } |
392 | 1.12G | } |
393 | | |
394 | 231M | *index = left; |
395 | 231M | return true; |
396 | 231M | } |
397 | | |
398 | | // Compare target key and the block key of the block of `block_index`. |
399 | | // Return -1 if error. |
400 | 467k | int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { |
401 | 467k | uint32_t region_offset = GetRestartPoint(block_index); |
402 | 467k | uint32_t key_size; |
403 | 467k | const char* key_ptr = DecodeRestartEntry( |
404 | 467k | key_value_encoding_format_, data_ + region_offset, data_ + restarts_, data_, &key_size); |
405 | 467k | if (key_ptr == nullptr) { |
406 | 0 | CorruptionError("DecodeRestartEntry failed"); |
407 | 0 | return 1; // Return target is smaller |
408 | 0 | } |
409 | 467k | Slice block_key(key_ptr, key_size); |
410 | 467k | return Compare(block_key, target); |
411 | 467k | } |
412 | | |
413 | | // Binary search in block_ids to find the first block |
414 | | // with a key >= target |
415 | | bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, |
416 | | uint32_t left, uint32_t right, |
417 | 99.5k | uint32_t* index) { |
418 | 99.5k | assert(left <= right); |
419 | 0 | uint32_t left_bound = left; |
420 | | |
421 | 424k | while (left <= right) { |
422 | 424k | uint32_t mid = (left + right) / 2; |
423 | | |
424 | 424k | int cmp = CompareBlockKey(block_ids[mid], target); |
425 | 424k | if (!status_.ok()) { |
426 | 0 | return false; |
427 | 0 | } |
428 | 424k | if (cmp < 0) { |
429 | | // Key at "target" is larger than "mid". Therefore all |
430 | | // blocks before or at "mid" are uninteresting. |
431 | 154k | left = mid + 1; |
432 | 270k | } else { |
433 | | // Key at "target" is <= "mid". Therefore all blocks |
434 | | // after "mid" are uninteresting. |
435 | | // If there is only one block left, we found it. |
436 | 270k | if (left == right) break99.5k ; |
437 | 170k | right = mid; |
438 | 170k | } |
439 | 424k | } |
440 | | |
441 | 99.5k | if (left == right) { |
442 | | // In one of the two following cases: |
443 | | // (1) left is the first one of block_ids |
444 | | // (2) there is a gap of blocks between block of `left` and `left-1`. |
445 | | // we can further distinguish the case of key in the block or key not |
446 | | // existing, by comparing the target key and the key of the previous |
447 | | // block to the left of the block found. |
448 | 99.5k | if (block_ids[left] > 0 && |
449 | 99.5k | (90.1k left == left_bound90.1k || block_ids[left - 1] != block_ids[left] - 162.0k ) && |
450 | 99.5k | CompareBlockKey(block_ids[left] - 1, target) > 043.0k ) { |
451 | 1 | current_ = restarts_; |
452 | 1 | return false; |
453 | 1 | } |
454 | | |
455 | 99.5k | *index = block_ids[left]; |
456 | 99.5k | return true; |
457 | 99.5k | } else { |
458 | 30 | assert(left > right); |
459 | | // Mark iterator invalid |
460 | 0 | current_ = restarts_; |
461 | 30 | return false; |
462 | 30 | } |
463 | 99.5k | } |
464 | | |
465 | 799k | bool BlockIter::HashSeek(const Slice& target, uint32_t* index) { |
466 | 799k | assert(hash_index_); |
467 | 0 | auto restart_index = hash_index_->GetRestartIndex(target); |
468 | 799k | if (restart_index == nullptr) { |
469 | 199k | current_ = restarts_; |
470 | 199k | return false; |
471 | 199k | } |
472 | | |
473 | | // the elements in restart_array[index : index + num_blocks] |
474 | | // are all with same prefix. We'll do binary search in that small range. |
475 | 600k | auto left = restart_index->first_index; |
476 | 600k | auto right = restart_index->first_index + restart_index->num_blocks - 1; |
477 | 600k | return BinarySeek(target, left, right, index); |
478 | 799k | } |
479 | | |
480 | 101k | bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { |
481 | 101k | assert(prefix_index_); |
482 | 0 | uint32_t* block_ids = nullptr; |
483 | 101k | uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); |
484 | | |
485 | 101k | if (num_blocks == 0) { |
486 | 2.17k | current_ = restarts_; |
487 | 2.17k | return false; |
488 | 99.5k | } else { |
489 | 99.5k | return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); |
490 | 99.5k | } |
491 | 101k | } |
492 | | |
493 | 87.7M | uint32_t Block::NumRestarts() const { |
494 | 87.7M | assert(size_ >= kMinBlockSize); |
495 | 0 | return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); |
496 | 87.7M | } |
497 | | |
498 | | Block::Block(BlockContents&& contents) |
499 | | : contents_(std::move(contents)), |
500 | | data_(contents_.data.cdata()), |
501 | 4.33M | size_(contents_.data.size()) { |
502 | 4.33M | if (size_ < sizeof(uint32_t)) { |
503 | 0 | size_ = 0; // Error marker |
504 | 4.33M | } else { |
505 | 4.33M | restart_offset_ = |
506 | 4.33M | static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t); |
507 | 4.33M | if (restart_offset_ > size_ - sizeof(uint32_t)) { |
508 | | // The size is too small for NumRestarts() and therefore |
509 | | // restart_offset_ wrapped around. |
510 | 0 | size_ = 0; |
511 | 0 | } |
512 | 4.33M | } |
513 | 4.33M | } |
514 | | |
515 | | InternalIterator* Block::NewIterator( |
516 | | const Comparator* cmp, const KeyValueEncodingFormat key_value_encoding_format, BlockIter* iter, |
517 | 83.4M | bool total_order_seek) { |
518 | 83.4M | if (size_ < kMinBlockSize) { |
519 | 0 | if (iter != nullptr) { |
520 | 0 | iter->SetStatus(BadBlockContentsError()); |
521 | 0 | return iter; |
522 | 0 | } else { |
523 | 0 | return NewErrorInternalIterator(BadBlockContentsError()); |
524 | 0 | } |
525 | 0 | } |
526 | 83.4M | const uint32_t num_restarts = NumRestarts(); |
527 | 83.4M | if (num_restarts == 0) { |
528 | 0 | if (iter != nullptr) { |
529 | 0 | iter->SetStatus(Status::OK()); |
530 | 0 | return iter; |
531 | 0 | } else { |
532 | 0 | return NewEmptyInternalIterator(); |
533 | 0 | } |
534 | 83.4M | } else { |
535 | 83.4M | BlockHashIndex* hash_index_ptr = |
536 | 83.4M | total_order_seek ? nullptr83.2M : hash_index_.get()137k ; |
537 | 83.4M | BlockPrefixIndex* prefix_index_ptr = |
538 | 83.4M | total_order_seek ? nullptr83.2M : prefix_index_.get()138k ; |
539 | | |
540 | 83.4M | if (iter != nullptr) { |
541 | 28.3M | iter->Initialize(cmp, data_, key_value_encoding_format, restart_offset_, num_restarts, |
542 | 28.3M | hash_index_ptr, prefix_index_ptr); |
543 | 55.1M | } else { |
544 | 55.1M | iter = new BlockIter(cmp, data_, key_value_encoding_format, restart_offset_, num_restarts, |
545 | 55.1M | hash_index_ptr, prefix_index_ptr); |
546 | 55.1M | } |
547 | 83.4M | } |
548 | | |
549 | 83.4M | return iter; |
550 | 83.4M | } |
551 | | |
552 | 4 | void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { |
553 | 4 | hash_index_.reset(hash_index); |
554 | 4 | } |
555 | | |
556 | 1.66k | void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) { |
557 | 1.66k | prefix_index_.reset(prefix_index); |
558 | 1.66k | } |
559 | | |
560 | 29 | size_t Block::ApproximateMemoryUsage() const { |
561 | 29 | size_t usage = usable_size(); |
562 | 29 | if (hash_index_) { |
563 | 0 | usage += hash_index_->ApproximateMemoryUsage(); |
564 | 0 | } |
565 | 29 | if (prefix_index_) { |
566 | 0 | usage += prefix_index_->ApproximateMemoryUsage(); |
567 | 0 | } |
568 | 29 | return usage; |
569 | 29 | } |
570 | | |
571 | | yb::Result<Slice> Block::GetMiddleKey( |
572 | 155 | const KeyValueEncodingFormat key_value_encoding_format) const { |
573 | 155 | if (size_ < kMinBlockSize) { |
574 | 0 | return BadBlockContentsError(); |
575 | 155 | } else if (size_ == kMinBlockSize) { |
576 | 2 | return STATUS(Incomplete, "Empty block"); |
577 | 2 | } |
578 | | |
579 | 153 | const auto restart_idx = (NumRestarts() - 1) / 2; |
580 | | |
581 | 153 | const auto entry_offset = DecodeFixed32(data_ + restart_offset_ + restart_idx * sizeof(uint32_t)); |
582 | 153 | uint32_t key_size; |
583 | 153 | const char* key_ptr = DecodeRestartEntry( |
584 | 153 | key_value_encoding_format, data_ + entry_offset, data_ + restart_offset_, data_, &key_size); |
585 | 153 | if (key_ptr == nullptr) { |
586 | 0 | return BadEntryInBlockError("DecodeRestartEntry failed"); |
587 | 0 | } |
588 | 153 | return Slice(key_ptr, key_size); |
589 | 153 | } |
590 | | |
591 | | } // namespace rocksdb |