YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_builder.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under the BSD-style license found in the
3
//  LICENSE file in the root directory of this source tree. An additional grant
4
//  of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
21
// Use of this source code is governed by a BSD-style license that can be
22
// found in the LICENSE file. See the AUTHORS file for names of contributors.
23
//
24
// BlockBuilder generates blocks where keys are prefix-compressed:
25
//
26
// When we store a key, we drop the prefix shared with the previous
27
// string.  This helps reduce the space requirement significantly.
28
// Furthermore, once every K keys, we do not apply the prefix
29
// compression and store the entire key.  We call this a "restart
30
// point".  The tail end of the block stores the offsets of all of the
31
// restart points, and can be used to do a binary search when looking
32
// for a particular key.  Values are stored as-is (without compression)
33
// immediately following the corresponding key.
34
//
35
// An entry for a particular key-value pair has the form:
36
//     shared_bytes: varint32
37
//     unshared_bytes: varint32
38
//     value_length: varint32
39
//     key_delta: char[unshared_bytes]
40
//     value: char[value_length]
41
// shared_bytes == 0 for restart points.
42
//
43
// The trailer of the block has the form:
44
//     restarts: uint32[num_restarts]
45
//     num_restarts: uint32
46
// restarts[i] contains the offset within the block of the ith restart point.
47
48
#include "yb/rocksdb/table/block_builder.h"
49
50
#include <assert.h>
51
52
#include <algorithm>
53
54
#include "yb/rocksdb/comparator.h"
55
#include "yb/rocksdb/db/dbformat.h"
56
#include "yb/rocksdb/table/block_builder_internal.h"
57
#include "yb/rocksdb/util/coding.h"
58
59
#include "yb/util/string_util.h"
60
61
namespace rocksdb {
62
63
BlockBuilder::BlockBuilder(
64
    int block_restart_interval, const KeyValueEncodingFormat key_value_encoding_format,
65
    const bool use_delta_encoding)
66
    : block_restart_interval_(block_restart_interval),
67
      use_delta_encoding_(use_delta_encoding),
68
      key_value_encoding_format_(key_value_encoding_format),
69
      restarts_(),
70
      counter_(0),
71
352k
      finished_(false) {
72
352k
  assert(block_restart_interval_ >= 1);
73
0
  restarts_.push_back(0);       // First restart point is at offset 0
74
352k
}
75
76
2.86M
void BlockBuilder::Reset() {
77
2.86M
  buffer_.clear();
78
2.86M
  restarts_.clear();
79
2.86M
  restarts_.push_back(0);       // First restart point is at offset 0
80
2.86M
  counter_ = 0;
81
2.86M
  finished_ = false;
82
2.86M
  last_key_.clear();
83
2.86M
}
84
85
544M
size_t BlockBuilder::CurrentSizeEstimate() const {
86
544M
  size_t size = buffer_.size(); // Raw data buffer.
87
544M
  if (!finished_) {
88
    // Restarts haven't been flushed to buffer yet.
89
543M
    size += restarts_.size() * sizeof(uint32_t) +    // Restart array.
90
543M
            sizeof(uint32_t);                        // Restart array length.
91
543M
  }
92
544M
  return size;
93
544M
}
94
95
size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
96
181M
  const {
97
181M
  size_t estimate = CurrentSizeEstimate();
98
181M
  estimate += key.size() + value.size();
99
181M
  if (counter_ >= block_restart_interval_) {
100
13.0M
    estimate += sizeof(uint32_t); // a new restart entry.
101
13.0M
  }
102
103
181M
  estimate += sizeof(int32_t); // varint for shared prefix length.
104
181M
  estimate += VarintLength(key.size()); // varint for key length.
105
181M
  estimate += VarintLength(value.size()); // varint for value length.
106
107
181M
  return estimate;
108
181M
}
109
110
2.93M
size_t BlockBuilder::NumKeys() const {
111
2.93M
  return restarts_.size() * block_restart_interval_ + counter_;
112
2.93M
}
113
114
namespace {
115
116
// Find maximum common substring starting at the same position in both input strings.
117
// Returns pair of substring index and size.
118
inline std::pair<size_t, size_t> FindMaxSharedSubstringAtTheSamePos(
119
232k
    const uint8_t* lhs, const uint8_t* rhs, const size_t min_size) {
120
232k
  size_t max_shared_size = 0;
121
232k
  const auto* offset_of_max_shared = lhs;
122
232k
  {
123
232k
    size_t shared_size = 0;
124
232k
    const auto lhs_limit = lhs + min_size;
125
7.88M
    for (auto *p1 = lhs, *p2 = rhs; p1 < lhs_limit; 
++p1, ++p27.65M
) {
126
7.65M
      if (*p1 == *p2) {
127
624k
        shared_size++;
128
7.02M
      } else {
129
7.02M
        if (shared_size > max_shared_size) {
130
11.8k
          max_shared_size = shared_size;
131
11.8k
          offset_of_max_shared = p1 - shared_size;
132
11.8k
        }
133
7.02M
        shared_size = 0;
134
7.02M
      }
135
7.65M
    }
136
232k
  }
137
232k
  DCHECK(strings::memeq(offset_of_max_shared, offset_of_max_shared - lhs + rhs,
138
232k
      max_shared_size));
139
232k
  return std::make_pair(offset_of_max_shared - lhs, max_shared_size);
140
232k
}
141
142
inline ComponentSizes NothingToShare(
143
155k
    const size_t prev_key_non_shared_1_size, const size_t non_shared_1_size) {
144
155k
  return ComponentSizes {
145
155k
    .prev_key_non_shared_1_size = prev_key_non_shared_1_size,
146
155k
    .non_shared_1_size = non_shared_1_size,
147
155k
    .shared_middle_size = 0,
148
155k
    .prev_key_non_shared_2_size = 0,
149
155k
    .non_shared_2_size = 0,
150
155k
  };
151
155k
}
152
153
// Trying to find max shared_middle matching the following pattern:
154
//   lhs = <lhs_non_shared_1><shared_middle><lhs_non_shared_2>
155
//   rhs = <rhs_non_shared_1><shared_middle><rhs_non_shared_2>
156
// Returns sizes of the above components.
157
//
158
// For runtime efficiency we assume that either lhs_key_non_shared_1 and rhs_non_shared_1 or
159
// lhs_non_shared_2 and rhs_non_shared_2 are of the same size (and this happens in most cases for
160
// docdb encoded keys), so we can find shared_middle in linear time by trying to search at the same
161
// positions in maximum prefixes and maximum suffixes of lhs and rhs.
162
163k
inline ComponentSizes FindMaxSharedMiddle(const Slice& lhs, const Slice& rhs) {
163
163k
  const auto lhs_size = lhs.size();
164
163k
  const auto rhs_size = rhs.size();
165
166
163k
  size_t min_length;
167
163k
  std::pair<size_t, size_t> max_shared;
168
163k
  bool from_left = true;
169
163k
  if (lhs_size == rhs_size) {
170
93.7k
    min_length = rhs_size;
171
93.7k
    max_shared = FindMaxSharedSubstringAtTheSamePos(lhs.data(), rhs.data(), min_length);
172
93.7k
  } else {
173
69.3k
    const uint8_t* lhs_suffix_start;
174
69.3k
    const uint8_t* rhs_suffix_start;
175
69.3k
    if (lhs_size > rhs_size) {
176
4.68k
      min_length = rhs_size;
177
4.68k
      lhs_suffix_start = lhs.end() - min_length;
178
4.68k
      rhs_suffix_start = rhs.data();
179
64.6k
    } else {
180
64.6k
      min_length = lhs_size;
181
64.6k
      lhs_suffix_start = lhs.data();
182
64.6k
      rhs_suffix_start = rhs.end() - min_length;
183
64.6k
    }
184
69.3k
    max_shared = FindMaxSharedSubstringAtTheSamePos(lhs.data(), rhs.data(), min_length);
185
186
    // Search max shared substring in the max suffixes of the same size.
187
69.3k
    const auto max_shared_from_right = FindMaxSharedSubstringAtTheSamePos(
188
69.3k
        lhs_suffix_start, rhs_suffix_start, min_length);
189
69.3k
    if (max_shared_from_right.second > max_shared.second) {
190
1.24k
      from_left = false;
191
1.24k
      max_shared = max_shared_from_right;
192
1.24k
    }
193
69.3k
  }
194
195
163k
  if (max_shared.second == 0) {
196
155k
    return NothingToShare(lhs.size(), rhs.size());
197
155k
  }
198
199
7.23k
  if (from_left) {
200
5.98k
    const auto non_shared_1_plus_shared_middle_size = max_shared.first + max_shared.second;
201
5.98k
    return ComponentSizes {
202
5.98k
      .prev_key_non_shared_1_size = max_shared.first,
203
5.98k
      .non_shared_1_size = max_shared.first,
204
5.98k
      .shared_middle_size = max_shared.second,
205
5.98k
      .prev_key_non_shared_2_size = lhs_size - non_shared_1_plus_shared_middle_size,
206
5.98k
      .non_shared_2_size = rhs_size - non_shared_1_plus_shared_middle_size,
207
5.98k
    };
208
5.98k
  } else {
209
1.24k
    const auto shared_middle_plus_non_shared_2_size = min_length - max_shared.first;
210
1.24k
    const auto non_shared_2_size = shared_middle_plus_non_shared_2_size - max_shared.second;
211
1.24k
    return ComponentSizes {
212
1.24k
      .prev_key_non_shared_1_size = lhs_size - shared_middle_plus_non_shared_2_size,
213
1.24k
      .non_shared_1_size = rhs_size - shared_middle_plus_non_shared_2_size,
214
1.24k
      .shared_middle_size = max_shared.second,
215
1.24k
      .prev_key_non_shared_2_size = non_shared_2_size,
216
1.24k
      .non_shared_2_size = non_shared_2_size,
217
1.24k
    };
218
1.24k
  }
219
7.23k
}
220
221
inline void CalculateLastInternalComponentReuse(
222
    const Slice& prev_key, const Slice& key, const size_t min_length,
223
    const size_t shared_prefix_size, size_t* last_internal_component_reuse_size,
224
163k
    bool* is_last_internal_component_inc) {
225
163k
  DCHECK_EQ(min_length, std::min(prev_key.size(), key.size()));
226
163k
  if (min_length >= shared_prefix_size + kLastInternalComponentSize) {
227
9.18k
    const auto prev_last_internal_comp =
228
9.18k
        DecodeFixed64(prev_key.end() - kLastInternalComponentSize);
229
9.18k
    const auto cur_last_internal_comp = DecodeFixed64(key.end() - kLastInternalComponentSize);
230
9.18k
    if (cur_last_internal_comp == prev_last_internal_comp + 0x100) {
231
0
      *is_last_internal_component_inc = true;
232
0
      *last_internal_component_reuse_size = kLastInternalComponentSize;
233
9.18k
    } else {
234
9.18k
      *is_last_internal_component_inc = false;
235
9.18k
      if (cur_last_internal_comp == prev_last_internal_comp) {
236
1.76k
        *last_internal_component_reuse_size = kLastInternalComponentSize;
237
7.42k
      } else {
238
7.42k
        *last_internal_component_reuse_size = 0;
239
7.42k
      }
240
9.18k
    }
241
153k
  } else {
242
153k
    *is_last_internal_component_inc = false;
243
153k
    *last_internal_component_reuse_size = 0;
244
153k
  }
245
163k
}
246
247
// Specific delta encoding optimized for the following keys pattern:
248
//
249
// Prev key:
250
// <shared_prefix>[<prev_key_non_shared_1>[<shared_middle>[<prev_key_non_shared_2>]]]
251
//   [<last_internal_component_to_reuse>]
252
//
253
// Current key:
254
// <shared_prefix>[<non_shared_1>[<shared_middle>[<non_shared_2>]]]
255
//   [<last_internal_component_to_reuse> (optionally incremented)]
256
//
257
// <last_internal_component_to_reuse> is either empty (that means not reused) or contains rocksdb
258
// internal key suffix (see rocksdb/db/dbformat.cc - PackSequenceAndType) of size
259
// kLastInternalComponentSize in case it is reused as is or sequence number is incremented from
260
// prev key.
261
//
262
// Based on prev_key and key calculates sizes of shared/non-shared key components mentioned above.
263
// Tries to maximize shared_prefix and shared_middle sizes.
264
class ThreeSharedPartsEncoder {
265
 public:
266
  ThreeSharedPartsEncoder(const Slice& prev_key, const Slice& key)
267
      : prev_key_(prev_key),
268
        prev_key_size_(prev_key.size()),
269
        key_(key),
270
        key_size_(key.size()),
271
        rest_components_sizes_{
272
            .prev_key_non_shared_1_size = prev_key_size_,
273
            .non_shared_1_size = key_size_,
274
184M
        } {}
275
276
163k
  inline void FindComponents(const size_t min_length, const size_t shared_prefix_size) {
277
163k
    DCHECK_EQ(min_length, std::min(prev_key_.size(), key_.size()));
278
163k
    CalculateLastInternalComponentReuse(
279
163k
        prev_key_, key_, min_length, shared_prefix_size, &last_internal_component_reuse_size_,
280
163k
        &is_last_internal_component_inc_);
281
282
163k
    const auto prev_key_between_shared = Slice(
283
163k
        prev_key_.data() + shared_prefix_size,
284
163k
        prev_key_.end() - last_internal_component_reuse_size_);
285
163k
    const auto cur_key_between_shared =
286
163k
        Slice(key_.data() + shared_prefix_size, key_.end() - last_internal_component_reuse_size_);
287
288
163k
    rest_components_sizes_ = FindMaxSharedMiddle(prev_key_between_shared, cur_key_between_shared);
289
163k
#ifndef DEBUG
290
163k
    auto check_result =
291
163k
        rest_components_sizes_.DebugVerify(prev_key_between_shared, cur_key_between_shared);
292
163k
    if (!check_result.ok()) {
293
0
      LOG(FATAL) << check_result << ": " << rest_components_sizes_.ToString();
294
0
    }
295
163k
#endif
296
163k
  }
297
298
  // Encodes components sizes + non-shared key parts and value into buffer.
299
  // See EncodeThreeSharedPartsSizes for description of how we encode component sizes.
300
480k
  inline void Encode(size_t shared_prefix_size, const Slice& value, std::string* buffer) {
301
480k
    const auto value_size = value.size();
302
303
480k
    EncodeThreeSharedPartsSizes(
304
480k
        shared_prefix_size, last_internal_component_reuse_size_, is_last_internal_component_inc_,
305
480k
        rest_components_sizes_, prev_key_size_, key_size_, value_size, buffer);
306
307
480k
    yb::EnlargeBufferIfNeeded(
308
480k
        buffer, buffer->size() + rest_components_sizes_.non_shared_1_size +
309
480k
                    rest_components_sizes_.non_shared_2_size + value_size);
310
311
    // Add key deltas to buffer followed by value.
312
480k
    buffer->append(key_.cdata() + shared_prefix_size, rest_components_sizes_.non_shared_1_size);
313
480k
    buffer->append(
314
480k
        key_.cend() - last_internal_component_reuse_size_ -
315
480k
            rest_components_sizes_.non_shared_2_size,
316
480k
        rest_components_sizes_.non_shared_2_size);
317
480k
    buffer->append(value.cdata(), value_size);
318
480k
  }
319
320
 private:
321
  const Slice prev_key_;
322
  const size_t prev_key_size_ = 0;
323
  const Slice key_;
324
  const size_t key_size_ = 0;
325
326
  size_t last_internal_component_reuse_size_ = 0;
327
  bool is_last_internal_component_inc_ = false;
328
  ComponentSizes rest_components_sizes_;
329
};
330
331
} // namespace
332
333
3.09M
Slice BlockBuilder::Finish() {
334
  // Append restart array
335
21.0M
  for (size_t i = 0; i < restarts_.size(); 
i++17.9M
) {
336
17.9M
    PutFixed32(&buffer_, restarts_[i]);
337
17.9M
  }
338
3.09M
  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
339
3.09M
  finished_ = true;
340
3.09M
  return Slice(buffer_);
341
3.09M
}
342
343
184M
void BlockBuilder::Add(const Slice& key, const Slice& value) {
344
184M
  const Slice prev_key_piece(last_key_);
345
184M
  assert(!finished_);
346
0
  assert(counter_ <= block_restart_interval_);
347
348
0
  size_t shared_prefix_size = 0; // number of prefix bytes shared with prev key
349
350
  // Initial values to be used on restarts or when delta encoding is off.
351
184M
  ThreeSharedPartsEncoder three_shared_parts_encoder(prev_key_piece, key);
352
353
184M
  if (counter_ >= block_restart_interval_) {
354
    // Restart compression
355
14.8M
    restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
356
14.8M
    counter_ = 0;
357
169M
  } else if (use_delta_encoding_) {
358
    // See how much sharing to do with previous string
359
169M
    const size_t min_length = std::min(prev_key_piece.size(), key.size());
360
169M
    shared_prefix_size =
361
169M
        strings::MemoryDifferencePos(prev_key_piece.data(), key.data(), min_length);
362
169M
    bool valid_encoding_format = false;
363
169M
    switch (key_value_encoding_format_) {
364
168M
      case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix:
365
168M
        valid_encoding_format = true;
366
168M
        break;
367
163k
      case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts:
368
163k
        valid_encoding_format = true;
369
163k
        three_shared_parts_encoder.FindComponents(min_length, shared_prefix_size);
370
163k
        break;
371
169M
    }
372
169M
    if (!valid_encoding_format) {
373
0
      FATAL_INVALID_ENUM_VALUE(KeyValueEncodingFormat, key_value_encoding_format_);
374
0
    }
375
169M
  }
376
377
18.4E
  DVLOG_WITH_FUNC(4) << "key: " << Slice(key).ToDebugHexString() << " size: " << key.size()
378
18.4E
                    << " offset: " << buffer_.size() << " counter: " << counter_;
379
380
184M
  const size_t after_shared_prefix_size = key.size() - shared_prefix_size;
381
382
184M
  switch (key_value_encoding_format_) {
383
    // If we don't use key delta encoding, we still use the same encoding format, but shared
384
    // components sizes are 0.
385
184M
    case KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix: {
386
      // Add "<shared_prefix_size><after_shared_prefix_size><value_size>" to buffer_
387
184M
      PutVarint32(&buffer_, static_cast<uint32_t>(shared_prefix_size));
388
184M
      PutVarint32(&buffer_, static_cast<uint32_t>(after_shared_prefix_size));
389
184M
      PutVarint32(&buffer_, static_cast<uint32_t>(value.size()));
390
391
      // Add string delta to buffer_ followed by value
392
184M
      buffer_.append(key.cdata() + shared_prefix_size, after_shared_prefix_size);
393
184M
      buffer_.append(value.cdata(), value.size());
394
184M
      break;
395
0
    }
396
480k
    case KeyValueEncodingFormat::kKeyDeltaEncodingThreeSharedParts: {
397
480k
      three_shared_parts_encoder.Encode(shared_prefix_size, value, &buffer_);
398
480k
      break;
399
0
    }
400
184M
  }
401
402
  // Update state
403
184M
  last_key_.resize(shared_prefix_size);
404
184M
  last_key_.append(key.cdata() + shared_prefix_size, after_shared_prefix_size);
405
406
184M
  assert(Slice(last_key_) == key);
407
0
  counter_++;
408
184M
}
409
410
}  // namespace rocksdb