/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_builder.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | // |
24 | | // BlockBuilder generates blocks where keys are prefix-compressed: |
25 | | // |
26 | | // When we store a key, we drop the prefix shared with the previous |
27 | | // string. This helps reduce the space requirement significantly. |
28 | | // Furthermore, once every K keys, we do not apply the prefix |
29 | | // compression and store the entire key. We call this a "restart |
30 | | // point". The tail end of the block stores the offsets of all of the |
31 | | // restart points, and can be used to do a binary search when looking |
32 | | // for a particular key. Values are stored as-is (without compression) |
33 | | // immediately following the corresponding key. |
34 | | // |
35 | | // An entry for a particular key-value pair has the form: |
36 | | // shared_bytes: varint32 |
37 | | // unshared_bytes: varint32 |
38 | | // value_length: varint32 |
39 | | // key_delta: char[unshared_bytes] |
40 | | // value: char[value_length] |
41 | | // shared_bytes == 0 for restart points. |
42 | | // |
43 | | // The trailer of the block has the form: |
44 | | // restarts: uint32[num_restarts] |
45 | | // num_restarts: uint32 |
46 | | // restarts[i] contains the offset within the block of the ith restart point. |
47 | | |
48 | | #include "yb/rocksdb/table/block_builder.h" |
49 | | |
50 | | #include <assert.h> |
51 | | |
52 | | #include <algorithm> |
53 | | |
54 | | #include "yb/rocksdb/comparator.h" |
55 | | #include "yb/rocksdb/db/dbformat.h" |
56 | | #include "yb/rocksdb/table/block_builder_internal.h" |
57 | | #include "yb/rocksdb/util/coding.h" |
58 | | |
59 | | #include "yb/util/string_util.h" |
60 | | |
61 | | namespace rocksdb { |
62 | | |
63 | | BlockBuilder::BlockBuilder( |
64 | | int block_restart_interval, const KeyValueEncodingFormat key_value_encoding_format, |
65 | | const bool use_delta_encoding) |
66 | | : block_restart_interval_(block_restart_interval), |
67 | | use_delta_encoding_(use_delta_encoding), |
68 | | key_value_encoding_format_(key_value_encoding_format), |
69 | | restarts_(), |
70 | | counter_(0), |
71 | 330k | finished_(false) { |
72 | 330k | assert(block_restart_interval_ >= 1); |
73 | 330k | restarts_.push_back(0); // First restart point is at offset 0 |
74 | 330k | } |
75 | | |
76 | 2.53M | void BlockBuilder::Reset() { |
77 | 2.53M | buffer_.clear(); |
78 | 2.53M | restarts_.clear(); |
79 | 2.53M | restarts_.push_back(0); // First restart point is at offset 0 |
80 | 2.53M | counter_ = 0; |
81 | 2.53M | finished_ = false; |
82 | 2.53M | last_key_.clear(); |
83 | 2.53M | } |
84 | | |
85 | 279M | size_t BlockBuilder::CurrentSizeEstimate() const { |
86 | 279M | size_t size = buffer_.size(); // Raw data buffer. |
87 | 279M | if (!finished_) { |
88 | | // Restarts haven't been flushed to buffer yet. |
89 | 279M | size += restarts_.size() * sizeof(uint32_t) + // Restart array. |
90 | 279M | sizeof(uint32_t); // Restart array length. |
91 | 279M | } |
92 | 279M | return size; |
93 | 279M | } |
94 | | |
95 | | size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) |
96 | 92.9M | const { |
97 | 92.9M | size_t estimate = CurrentSizeEstimate(); |
98 | 92.9M | estimate += key.size() + value.size(); |
99 | 92.9M | if (counter_ >= block_restart_interval_) { |
100 | 7.48M | estimate += sizeof(uint32_t); // a new restart entry. |
101 | 7.48M | } |
102 | | |
103 | 92.9M | estimate += sizeof(int32_t); // varint for shared prefix length. |
104 | 92.9M | estimate += VarintLength(key.size()); // varint for key length. |
105 | 92.9M | estimate += VarintLength(value.size()); // varint for value length. |
106 | | |
107 | 92.9M | return estimate; |
108 | 92.9M | } |
109 | | |
110 | 2.52M | size_t BlockBuilder::NumKeys() const { |
111 | 2.52M | return restarts_.size() * block_restart_interval_ + counter_; |
112 | 2.52M | } |
113 | | |
114 | | namespace { |
115 | | |
116 | | // Find maximum common substring starting at the same position in both input strings. |
117 | | // Returns pair of substring index and size. |
118 | | inline std::pair<size_t, size_t> FindMaxSharedSubstringAtTheSamePos( |
119 | 232k | const uint8_t* lhs, const uint8_t* rhs, const size_t min_size) { |
120 | 232k | size_t max_shared_size = 0; |
121 | 232k | const auto* offset_of_max_shared = lhs; |
122 | 232k | { |
123 | 232k | size_t shared_size = 0; |
124 | 232k | const auto lhs_limit = lhs + min_size; |
125 | 7.98M | for (auto *p1 = lhs, *p2 = rhs; p1 < lhs_limit; ++p1, ++p2) { |
126 | 7.75M | if (*p1 == *p2) { |
127 | 627k | shared_size++; |
128 | 7.12M | } else { |
129 | 7.12M | if (shared_size > max_shared_size) { |
130 | 12.0k | max_shared_size = shared_size; |
131 | 12.0k | offset_of_max_shared = p1 - shared_size; |
132 | 12.0k | } |
133 | 7.12M | shared_size = 0; |
134 | 7.12M | } |
135 | 7.75M | } |
136 | 232k | } |
137 | 232k | DCHECK(strings::memeq(offset_of_max_shared, offset_of_max_shared - lhs + rhs, |
138 | 232k | max_shared_size)); |
139 | 232k | return std::make_pair(offset_of_max_shared - lhs, max_shared_size); |
140 | 232k | } |
141 | | |
142 | | inline ComponentSizes NothingToShare( |
143 | 155k | const size_t prev_key_non_shared_1_size, const size_t non_shared_1_size) { |
144 | 155k | return ComponentSizes { |
145 | 155k | .prev_key_non_shared_1_size = prev_key_non_shared_1_size, |
146 | 155k | .non_shared_1_size = non_shared_1_size, |
147 | 155k | .shared_middle_size = 0, |
148 | 155k | .prev_key_non_shared_2_size = 0, |
149 | 155k | .non_shared_2_size = 0, |
150 | 155k | }; |
151 | 155k | } |
152 | | |
153 | | // Trying to find max shared_middle matching the following pattern: |
154 | | // lhs = <lhs_non_shared_1><shared_middle><lhs_non_shared_2> |
155 | | // rhs = <rhs_non_shared_1><shared_middle><rhs_non_shared_2> |
156 | | // Returns sizes of the above components. |
157 | | // |
158 | | // For runtime efficiency we assume that either lhs_key_non_shared_1 and rhs_non_shared_1 or |
159 | | // lhs_non_shared_2 and rhs_non_shared_2 are of the same size (and this happens in most cases for |
160 | | // docdb encoded keys), so we can find shared_middle in linear time by trying to search at the same |
161 | | // positions in maximum prefixes and maximum suffixes of lhs and rhs. |
162 | 163k | inline ComponentSizes FindMaxSharedMiddle(const Slice& lhs, const Slice& rhs) { |
163 | 163k | const auto lhs_size = lhs.size(); |
164 | 163k | const auto rhs_size = rhs.size(); |
165 | | |
166 | 163k | size_t min_length; |
167 | 163k | std::pair<size_t, size_t> max_shared; |
168 | 163k | bool from_left = true; |
169 | 163k | if (lhs_size == rhs_size) { |
170 | 93.7k | min_length = rhs_size; |
171 | 93.7k | max_shared = FindMaxSharedSubstringAtTheSamePos(lhs.data(), rhs.data(), min_length); |
172 | 69.3k | } else { |
173 | 69.3k | const uint8_t* lhs_suffix_start; |
174 | 69.3k | const uint8_t* rhs_suffix_start; |
175 | 69.3k | if (lhs_size > rhs_size) { |
176 | 4.57k | min_length = rhs_size; |
177 | 4.57k | lhs_suffix_start = lhs.end() - min_length; |
178 | 4.57k | rhs_suffix_start = rhs.data(); |
179 | 64.7k | } else { |
180 | 64.7k | min_length = lhs_size; |
181 | 64.7k | lhs_suffix_start = lhs.data(); |
182 | 64.7k | rhs_suffix_start = rhs.end() - min_length; |
183 | 64.7k | } |
184 | 69.3k | max_shared = FindMaxSharedSubstringAtTheSamePos(lhs.data(), rhs.data(), min_length); |
185 | | |
186 | | // Search max shared substring in the max suffixes of the same size. |
187 | 69.3k | const auto max_shared_from_right = FindMaxSharedSubstringAtTheSamePos( |
188 | 69.3k | lhs_suffix_start, rhs_suffix_start, min_length); |
189 | 69.3k | if (max_shared_from_right.second > max_shared.second) { |
190 | 1.21k | from_left = false; |
191 | 1.21k | max_shared = max_shared_from_right; |
192 | 1.21k | } |
193 | 69.3k | } |
194 | | |
195 | 163k | if (max_shared.second == 0) { |
196 | 155k | return NothingToShare(lhs.size(), rhs.size()); |
197 | 155k | } |
198 | | |
199 | 7.34k | if (from_left) { |
200 | 6.12k | const auto non_shared_1_plus_shared_middle_size = max_shared.first + max_shared.second; |
201 | 6.12k | return ComponentSizes { |
202 | 6.12k | .prev_key_non_shared_1_size = max_shared.first, |
203 | 6.12k | .non_shared_1_size = max_shared.first, |
204 | 6.12k | .shared_middle_size = max_shared.second, |
205 | 6.12k | .prev_key_non_shared_2_size = lhs_size - non_shared_1_plus_shared_middle_size, |
206 | 6.12k | .non_shared_2_size = rhs_size - non_shared_1_plus_shared_middle_size, |
207 | 6.12k | }; |
208 | 1.21k | } else { |
209 | 1.21k | const auto shared_middle_plus_non_shared_2_size = min_length - max_shared.first; |
210 | 1.21k | const auto non_shared_2_size = shared_middle_plus_non_shared_2_size - max_shared.second; |
211 | 1.21k | return ComponentSizes { |
212 | 1.21k | .prev_key_non_shared_1_size = lhs_size - shared_middle_plus_non_shared_2_size, |
213 | 1.21k | .non_shared_1_size = rhs_size - shared_middle_plus_non_shared_2_size, |
214 | 1.21k | .shared_middle_size = max_shared.second, |
215 | 1.21k | .prev_key_non_shared_2_size = non_shared_2_size, |
216 | 1.21k | .non_shared_2_size = non_shared_2_size, |
217 | 1.21k | }; |
218 | 1.21k | } |
219 | 7.34k | } |
220 | | |
221 | | inline void CalculateLastInternalComponentReuse( |
222 | | const Slice& prev_key, const Slice& key, const size_t min_length, |
223 | | const size_t shared_prefix_size, size_t* last_internal_component_reuse_size, |
224 | 163k | bool* is_last_internal_component_inc) { |
225 | 163k | DCHECK_EQ(min_length, std::min(prev_key.size(), key.size())); |
226 | 163k | if (min_length >= shared_prefix_size + kLastInternalComponentSize) { |
227 | 9.23k | const auto prev_last_internal_comp = |
228 | 9.23k | DecodeFixed64(prev_key.end() - kLastInternalComponentSize); |
229 | 9.23k | const auto cur_last_internal_comp = DecodeFixed64(key.end() - kLastInternalComponentSize); |
230 | 9.23k | if (cur_last_internal_comp == prev_last_internal_comp + 0x100) { |
231 | 0 | *is_last_internal_component_inc = true; |
232 | 0 | *last_internal_component_reuse_size = kLastInternalComponentSize; |
233 | 9.23k | } else { |
234 | 9.23k | *is_last_internal_component_inc = false; |
235 | 9.23k | if (cur_last_internal_comp == prev_last_internal_comp) { |
236 | 1.75k | *last_internal_component_reuse_size = kLastInternalComponentSize; |
237 | 7.47k | } else { |
238 | 7.47k | *last_internal_component_reuse_size = 0; |
239 | 7.47k | } |
240 | 9.23k | } |
241 | 153k | } else { |
242 | 153k | *is_last_internal_component_inc = false; |
243 | 153k | *last_internal_component_reuse_size = 0; |
244 | 153k | } |
245 | 163k | } |
246 | | |
247 | | // Specific delta encoding optimized for the following keys pattern: |
248 | | // |
249 | | // Prev key: |
250 | | // <shared_prefix>[<prev_key_non_shared_1>[<shared_middle>[<prev_key_non_shared_2>]]] |
251 | | // [<last_internal_component_to_reuse>] |
252 | | // |
253 | | // Current key: |
254 | | // <shared_prefix>[<non_shared_1>[<shared_middle>[<non_shared_2>]]] |
255 | | // [<last_internal_component_to_reuse> (optionally incremented)] |
256 | | // |
257 | | // <last_internal_component_to_reuse> is either empty (that means not reused) or contains rocksdb |
258 | | // internal key suffix (see rocksdb/db/dbformat.cc - PackSequenceAndType) of size |
259 | | // kLastInternalComponentSize in case it is reused as is or sequence number is incremented from |
260 | | // prev key. |
261 | | // |
262 | | // Based on prev_key and key calculates sizes of shared/non-shared key components mentioned above. |
263 | | // Tries to maximize shared_prefix and shared_middle sizes. |
264 | | class ThreeSharedPartsEncoder { |
265 | | public: |
266 | | ThreeSharedPartsEncoder(const Slice& prev_key, const Slice& key) |
267 | | : prev_key_(prev_key), |
268 | | prev_key_size_(prev_key.size()), |
269 | | key_(key), |
270 | | key_size_(key.size()), |
271 | | rest_components_sizes_{ |
272 | | .prev_key_non_shared_1_size = prev_key_size_, |
273 | | .non_shared_1_size = key_size_, |
274 | 96.6M | } {} |
275 | | |
276 | 163k | inline void FindComponents(const size_t min_length, const size_t shared_prefix_size) { |
277 | 163k | DCHECK_EQ(min_length, std::min(prev_key_.size(), key_.size())); |
278 | 163k | CalculateLastInternalComponentReuse( |
279 | 163k | prev_key_, key_, min_length, shared_prefix_size, &last_internal_component_reuse_size_, |
280 | 163k | &is_last_internal_component_inc_); |
281 | | |
282 | 163k | const auto prev_key_between_shared = Slice( |
283 | 163k | prev_key_.data() + shared_prefix_size, |
284 | 163k | prev_key_.end() - last_internal_component_reuse_size_); |
285 | 163k | const auto cur_key_between_shared = |
286 | 163k | Slice(key_.data() + shared_prefix_size, key_.end() - last_internal_component_reuse_size_); |
287 | | |
288 | 163k | rest_components_sizes_ = FindMaxSharedMiddle(prev_key_between_shared, cur_key_between_shared); |
289 | 163k | #ifndef DEBUG |
290 | 163k | auto check_result = |
291 | 163k | rest_components_sizes_.DebugVerify(prev_key_between_shared, cur_key_between_shared); |
292 | 163k | if (!check_result.ok()) { |
293 | 0 | LOG(FATAL) << check_result << ": " << rest_components_sizes_.ToString(); |
294 | 0 | } |
295 | 163k | #endif |
296 | 163k | } |
297 | | |
298 | | // Encodes components sizes + non-shared key parts and value into buffer. |
299 | | // See EncodeThreeSharedPartsSizes for description of how we encode component sizes. |
300 | 480k | inline void Encode(size_t shared_prefix_size, const Slice& value, std::string* buffer) { |
301 | 480k | const auto value_size = value.size(); |
302 | | |
303 | 480k | EncodeThreeSharedPartsSizes( |
304 | 480k | shared_prefix_size, last_internal_component_reuse_size_, is_last_internal_component_inc_, |
305 | 480k | rest_components_sizes_, prev_key_size_, key_size_, value_size, buffer); |
306 | | |
307 | 480k | yb::EnlargeBufferIfNeeded( |
308 | 480k | buffer, buffer->size() + rest_components_sizes_.non_shared_1_size + |
309 | 480k | rest_components_sizes_.non_shared_2_size + value_size); |
310 | | |
311 | | // Add key deltas to buffer followed by value. |
312 | 480k | buffer->append(key_.cdata() + shared_prefix_size, rest_components_sizes_.non_shared_1_size); |
313 | 480k | buffer->append( |
314 | 480k | key_.cend() - last_internal_component_reuse_size_ - |
315 | 480k | rest_components_sizes_.non_shared_2_size, |
316 | 480k | rest_components_sizes_.non_shared_2_size); |
317 | 480k | buffer->append(value.cdata(), value_size); |
318 | 480k | } |
319 | | |
320 | | private: |
321 | | const Slice prev_key_; |
322 | | const size_t prev_key_size_ = 0; |
323 | | const Slice key_; |
324 | | const size_t key_size_ = 0; |
325 | | |
326 | | size_t last_internal_component_reuse_size_ = 0; |
327 | | bool is_last_internal_component_inc_ = false; |
328 | | ComponentSizes rest_components_sizes_; |
329 | | }; |
330 | | |
331 | | } // namespace |
332 | | |
333 | 2.74M | Slice BlockBuilder::Finish() { |
334 | | // Append restart array |
335 | 14.6M | for (size_t i = 0; i < restarts_.size(); i++) { |
336 | 11.9M | PutFixed32(&buffer_, restarts_[i]); |
337 | 11.9M | } |
338 | 2.74M | PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size())); |
339 | 2.74M | finished_ = true; |
340 | 2.74M | return Slice(buffer_); |
341 | 2.74M | } |
342 | | |
343 | 96.6M | void BlockBuilder::Add(const Slice& key, const Slice& value) { |
344 | 96.6M | const Slice prev_key_piece(last_key_); |
345 | 96.6M | assert(!finished_); |
346 | 96.6M | assert(counter_ <= block_restart_interval_); |
347 | | |
348 | 96.6M | size_t shared_prefix_size = 0; // number of prefix bytes shared with prev key |
349 | | |
350 | | // Initial values to be used on restarts or when delta encoding is off. |
351 | 96.6M | ThreeSharedPartsEncoder three_shared_parts_encoder(prev_key_piece, key); |
352 | | |
353 | 96.6M | if (counter_ >= block_restart_interval_) { |
354 | | // Restart compression |
355 | 9.16M | restarts_.push_back(static_cast<uint32_t>(buffer_.size())); |
356 | 9.16M | counter_ = 0; |
357 | 87.4M | } else if (use_delta_encoding_) { |
358 | | // See how much sharing to do with previous string |
359 | 86.5M | const size_t min_length = std::min(prev_key_piece.size(), key.size()); |
360 | 86.5M | shared_prefix_size = |
361 | 86.5M | strings::MemoryDifferencePos(prev_key_piece.data(), key.data(), min_length); |
362 | 86.5M | bool valid_encoding_format = false; |
363 | 86.5M | switch (key_value_encoding_format_) { |
364 | 86.4M | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: |
365 | 86.4M | valid_encoding_format = true; |
366 | 86.4M | break; |
367 | 163k | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: |
368 | 163k | valid_encoding_format = true; |
369 | 163k | three_shared_parts_encoder.FindComponents(min_length, shared_prefix_size); |
370 | 163k | break; |
371 | 86.6M | } |
372 | 86.6M | if (!valid_encoding_format) { |
373 | 0 | FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format_); |
374 | 0 | } |
375 | 86.6M | } |
376 | | |
377 | 28.5k | DVLOG_WITH_FUNC(4) << "key: " << Slice(key).ToDebugHexString() << " size: " << key.size() |
378 | 28.5k | << " offset: " << buffer_.size() << " counter: " << counter_; |
379 | | |
380 | 96.7M | const size_t after_shared_prefix_size = key.size() - shared_prefix_size; |
381 | | |
382 | 96.7M | switch (key_value_encoding_format_) { |
383 | | // If we don't use key delta encoding, we still use the same encoding format, but shared |
384 | | // components sizes are 0. |
385 | 96.1M | case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: { |
386 | | // Add "<shared_prefix_size><after_shared_prefix_size><value_size>" to buffer_ |
387 | 96.1M | PutVarint32(&buffer_, static_cast<uint32_t>(shared_prefix_size)); |
388 | 96.1M | PutVarint32(&buffer_, static_cast<uint32_t>(after_shared_prefix_size)); |
389 | 96.1M | PutVarint32(&buffer_, static_cast<uint32_t>(value.size())); |
390 | | |
391 | | // Add string delta to buffer_ followed by value |
392 | 96.1M | buffer_.append(key.cdata() + shared_prefix_size, after_shared_prefix_size); |
393 | 96.1M | buffer_.append(value.cdata(), value.size()); |
394 | 96.1M | break; |
395 | 0 | } |
396 | 480k | case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: { |
397 | 480k | three_shared_parts_encoder.Encode(shared_prefix_size, value, &buffer_); |
398 | 480k | break; |
399 | 96.6M | } |
400 | 96.6M | } |
401 | | |
402 | | // Update state |
403 | 96.6M | last_key_.resize(shared_prefix_size); |
404 | 96.6M | last_key_.append(key.cdata() + shared_prefix_size, after_shared_prefix_size); |
405 | | |
406 | 96.6M | assert(Slice(last_key_) == key); |
407 | 96.6M | counter_++; |
408 | 96.6M | } |
409 | | |
410 | | } // namespace rocksdb |