YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_prefix_index.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2
// This source code is licensed under the BSD-style license found in the
3
// LICENSE file in the root directory of this source tree. An additional grant
4
// of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
21
#include "yb/rocksdb/table/block_prefix_index.h"
22
23
#include <vector>
24
25
#include "yb/util/slice.h"
26
#include "yb/rocksdb/slice_transform.h"
27
#include "yb/rocksdb/util/arena.h"
28
#include "yb/rocksdb/util/coding.h"
29
#include "yb/rocksdb/util/hash.h"
30
31
namespace rocksdb {
32
33
531k
inline uint32_t Hash(const Slice& s) {
34
531k
  return rocksdb::Hash(s.data(), s.size(), 0);
35
531k
}
36
37
530k
inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
38
530k
  return Hash(prefix) % num_buckets;
39
530k
}
40
41
// The prefix block index is simply a bucket array, with each entry pointing to
42
// the blocks that span the prefixes hashed to this bucket.
43
//
44
// To reduce memory footprint, if there is only one block per bucket, the entry
45
// stores the block id directly. If there are more than one blocks per bucket,
46
// because of hash collision or a single prefix spanning multiple blocks,
47
// the entry points to an array of block ids. The block array is an array of
48
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
49
// by the block ids.
50
//
51
// To differentiate the two cases, the high order bit of the entry indicates
52
// whether it is a 'pointer' into a separate block array.
53
// 0x7FFFFFFF is reserved for empty bucket.
54
55
const uint32_t kNoneBlock = 0x7FFFFFFF;
56
const uint32_t kBlockArrayMask = 0x80000000;
57
58
102k
inline bool IsNone(uint32_t block_id) {
59
102k
  return block_id == kNoneBlock;
60
102k
}
61
62
100k
inline bool IsBlockId(uint32_t block_id) {
63
100k
  return (block_id & kBlockArrayMask) == 0;
64
100k
}
65
66
77.0k
inline uint32_t DecodeIndex(uint32_t block_id) {
67
77.0k
  uint32_t index = block_id ^ kBlockArrayMask;
68
77.0k
  assert(index < kBlockArrayMask);
69
77.0k
  return index;
70
77.0k
}
71
72
100k
inline uint32_t EncodeIndex(uint32_t index) {
73
100k
  assert(index < kBlockArrayMask);
74
100k
  return index | kBlockArrayMask;
75
100k
}
76
77
// temporary storage for prefix information during index building
78
struct PrefixRecord {
79
  Slice prefix;
80
  uint32_t start_block;
81
  uint32_t end_block;
82
  uint32_t num_blocks;
83
  PrefixRecord* next;
84
};
85
86
class BlockPrefixIndex::Builder {
87
 public:
88
  explicit Builder(const SliceTransform* internal_prefix_extractor)
89
1.65k
      : internal_prefix_extractor_(internal_prefix_extractor) {}
90
91
  void Add(const Slice& key_prefix, uint32_t start_block,
92
427k
           uint32_t num_blocks) {
93
427k
    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
94
427k
      arena_.AllocateAligned(sizeof(PrefixRecord)));
95
427k
    record->prefix = key_prefix;
96
427k
    record->start_block = start_block;
97
427k
    record->end_block = start_block + num_blocks - 1;
98
427k
    record->num_blocks = num_blocks;
99
427k
    prefixes_.push_back(record);
100
427k
  }
101
102
1.65k
  BlockPrefixIndex* Finish() {
103
    // For now, use roughly 1:1 prefix to bucket ratio.
104
1.65k
    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
105
106
    // Collect prefix records that hash to the same bucket, into a single
107
    // linklist.
108
1.65k
    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
109
1.65k
    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
110
428k
    for (PrefixRecord* current : prefixes_) {
111
428k
      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
112
      // merge the prefix block span if the first block of this prefix is
113
      // connected to the last block of the previous prefix.
114
428k
      PrefixRecord* prev = prefixes_per_bucket[bucket];
115
428k
      if (prev) {
116
157k
        assert(current->start_block >= prev->end_block);
117
157k
        auto distance = current->start_block - prev->end_block;
118
157k
        if (distance <= 1) {
119
46.2k
          prev->end_block = current->end_block;
120
46.2k
          prev->num_blocks = prev->end_block - prev->start_block + 1;
121
46.2k
          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
122
46.2k
          continue;
123
46.2k
        }
124
382k
      }
125
382k
      current->next = prev;
126
382k
      prefixes_per_bucket[bucket] = current;
127
382k
      num_blocks_per_bucket[bucket] += current->num_blocks;
128
382k
    }
129
130
    // Calculate the block array buffer size
131
1.65k
    uint32_t total_block_array_entries = 0;
132
436k
    for (uint32_t i = 0; i < num_buckets; i++) {
133
435k
      uint32_t num_blocks = num_blocks_per_bucket[i];
134
435k
      if (num_blocks > 1) {
135
100k
        total_block_array_entries += (num_blocks + 1);
136
100k
      }
137
435k
    }
138
139
    // Populate the final prefix block index
140
1.65k
    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
141
1.65k
    uint32_t* buckets = new uint32_t[num_buckets];
142
1.65k
    uint32_t offset = 0;
143
435k
    for (uint32_t i = 0; i < num_buckets; i++) {
144
433k
      uint32_t num_blocks = num_blocks_per_bucket[i];
145
433k
      if (num_blocks == 0) {
146
159k
        assert(prefixes_per_bucket[i] == nullptr);
147
159k
        buckets[i] = kNoneBlock;
148
274k
      } else if (num_blocks == 1) {
149
174k
        assert(prefixes_per_bucket[i] != nullptr);
150
174k
        assert(prefixes_per_bucket[i]->next == nullptr);
151
174k
        buckets[i] = prefixes_per_bucket[i]->start_block;
152
99.9k
      } else {
153
99.9k
        assert(prefixes_per_bucket[i] != nullptr);
154
99.9k
        buckets[i] = EncodeIndex(offset);
155
99.9k
        block_array_buffer[offset] = num_blocks;
156
99.9k
        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
157
99.9k
        auto current = prefixes_per_bucket[i];
158
        // populate block ids from largest to smallest
159
312k
        while (current != nullptr) {
160
456k
          for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
161
243k
            *last_block = current->end_block - iter;
162
243k
            last_block--;
163
243k
          }
164
212k
          current = current->next;
165
212k
        }
166
99.9k
        assert(last_block == &block_array_buffer[offset]);
167
99.9k
        offset += (num_blocks + 1);
168
99.9k
      }
169
433k
    }
170
171
1.65k
    assert(offset == total_block_array_entries);
172
173
1.65k
    return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
174
1.65k
                                buckets, total_block_array_entries,
175
1.65k
                                block_array_buffer);
176
1.65k
  }
177
178
 private:
179
  const SliceTransform* internal_prefix_extractor_;
180
181
  std::vector<PrefixRecord*> prefixes_;
182
  Arena arena_;
183
};
184
185
186
Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
187
                                const Slice& prefixes, const Slice& prefix_meta,
188
1.65k
                                BlockPrefixIndex** prefix_index) {
189
1.65k
  uint64_t pos = 0;
190
1.65k
  auto meta_pos = prefix_meta;
191
1.65k
  Status s;
192
1.65k
  Builder builder(internal_prefix_extractor);
193
194
431k
  while (!meta_pos.empty()) {
195
429k
    uint32_t prefix_size = 0;
196
429k
    uint32_t entry_index = 0;
197
429k
    uint32_t num_blocks = 0;
198
429k
    if (!GetVarint32(&meta_pos, &prefix_size) ||
199
426k
        !GetVarint32(&meta_pos, &entry_index) ||
200
425k
        !GetVarint32(&meta_pos, &num_blocks)) {
201
0
      s = STATUS(Corruption,
202
0
          "Corrupted prefix meta block: unable to read from it.");
203
0
      break;
204
0
    }
205
429k
    if (pos + prefix_size > prefixes.size()) {
206
0
      s = STATUS(Corruption,
207
0
        "Corrupted prefix meta block: size inconsistency.");
208
0
      break;
209
0
    }
210
429k
    Slice prefix(prefixes.data() + pos, prefix_size);
211
429k
    builder.Add(prefix, entry_index, num_blocks);
212
213
429k
    pos += prefix_size;
214
429k
  }
215
216
1.65k
  if (s.ok() && pos != prefixes.size()) {
217
0
    s = STATUS(Corruption, "Corrupted prefix meta block");
218
0
  }
219
220
1.65k
  if (s.ok()) {
221
1.65k
    *prefix_index = builder.Finish();
222
1.65k
  }
223
224
1.65k
  return s;
225
1.65k
}
226
227
uint32_t BlockPrefixIndex::GetBlocks(const Slice& key,
228
102k
                                     uint32_t** blocks) {
229
102k
  Slice prefix = internal_prefix_extractor_->Transform(key);
230
231
102k
  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
232
102k
  uint32_t block_id = buckets_[bucket];
233
234
102k
  if (IsNone(block_id)) {
235
2.17k
    return 0;
236
100k
  } else if (IsBlockId(block_id)) {
237
22.9k
    *blocks = &buckets_[bucket];
238
22.9k
    return 1;
239
77.0k
  } else {
240
77.0k
    uint32_t index = DecodeIndex(block_id);
241
77.0k
    assert(index < num_block_array_buffer_entries_);
242
77.0k
    *blocks = &block_array_buffer_[index+1];
243
77.0k
    uint32_t num_blocks = block_array_buffer_[index];
244
77.0k
    assert(num_blocks > 1);
245
77.0k
    assert(index + num_blocks < num_block_array_buffer_entries_);
246
77.0k
    return num_blocks;
247
77.0k
  }
248
102k
}
249
250
}  // namespace rocksdb