YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_prefix_index.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2
// This source code is licensed under the BSD-style license found in the
3
// LICENSE file in the root directory of this source tree. An additional grant
4
// of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
21
#include "yb/rocksdb/table/block_prefix_index.h"
22
23
#include <vector>
24
25
#include "yb/util/slice.h"
26
#include "yb/rocksdb/slice_transform.h"
27
#include "yb/rocksdb/util/arena.h"
28
#include "yb/rocksdb/util/coding.h"
29
#include "yb/rocksdb/util/hash.h"
30
31
namespace rocksdb {
32
33
531k
inline uint32_t Hash(const Slice& s) {
34
531k
  return rocksdb::Hash(s.data(), s.size(), 0);
35
531k
}
36
37
530k
inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
38
530k
  return Hash(prefix) % num_buckets;
39
530k
}
40
41
// The prefix block index is simply a bucket array, with each entry pointing to
42
// the blocks that span the prefixes hashed to this bucket.
43
//
44
// To reduce memory footprint, if there is only one block per bucket, the entry
45
// stores the block id directly. If there are more than one blocks per bucket,
46
// because of hash collision or a single prefix spanning multiple blocks,
47
// the entry points to an array of block ids. The block array is an array of
48
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
49
// by the block ids.
50
//
51
// To differentiate the two cases, the high order bit of the entry indicates
52
// whether it is a 'pointer' into a separate block array.
53
// 0x7FFFFFFF is reserved for empty bucket.
54
55
const uint32_t kNoneBlock = 0x7FFFFFFF;
56
const uint32_t kBlockArrayMask = 0x80000000;
57
58
101k
inline bool IsNone(uint32_t block_id) {
59
101k
  return block_id == kNoneBlock;
60
101k
}
61
62
99.5k
inline bool IsBlockId(uint32_t block_id) {
63
99.5k
  return (block_id & kBlockArrayMask) == 0;
64
99.5k
}
65
66
76.5k
inline uint32_t DecodeIndex(uint32_t block_id) {
67
76.5k
  uint32_t index = block_id ^ kBlockArrayMask;
68
76.5k
  assert(index < kBlockArrayMask);
69
0
  return index;
70
76.5k
}
71
72
101k
inline uint32_t EncodeIndex(uint32_t index) {
73
101k
  assert(index < kBlockArrayMask);
74
0
  return index | kBlockArrayMask;
75
101k
}
76
77
// temporary storage for prefix information during index building
78
struct PrefixRecord {
79
  Slice prefix;
80
  uint32_t start_block;
81
  uint32_t end_block;
82
  uint32_t num_blocks;
83
  PrefixRecord* next;
84
};
85
86
class BlockPrefixIndex::Builder {
87
 public:
88
  explicit Builder(const SliceTransform* internal_prefix_extractor)
89
1.66k
      : internal_prefix_extractor_(internal_prefix_extractor) {}
90
91
  void Add(const Slice& key_prefix, uint32_t start_block,
92
430k
           uint32_t num_blocks) {
93
430k
    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
94
430k
      arena_.AllocateAligned(sizeof(PrefixRecord)));
95
430k
    record->prefix = key_prefix;
96
430k
    record->start_block = start_block;
97
430k
    record->end_block = start_block + num_blocks - 1;
98
430k
    record->num_blocks = num_blocks;
99
430k
    prefixes_.push_back(record);
100
430k
  }
101
102
1.66k
  BlockPrefixIndex* Finish() {
103
    // For now, use roughly 1:1 prefix to bucket ratio.
104
1.66k
    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
105
106
    // Collect prefix records that hash to the same bucket, into a single
107
    // linklist.
108
1.66k
    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
109
1.66k
    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
110
429k
    for (PrefixRecord* current : prefixes_) {
111
429k
      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
112
      // merge the prefix block span if the first block of this prefix is
113
      // connected to the last block of the previous prefix.
114
429k
      PrefixRecord* prev = prefixes_per_bucket[bucket];
115
429k
      if (prev) {
116
158k
        assert(current->start_block >= prev->end_block);
117
0
        auto distance = current->start_block - prev->end_block;
118
158k
        if (distance <= 1) {
119
46.3k
          prev->end_block = current->end_block;
120
46.3k
          prev->num_blocks = prev->end_block - prev->start_block + 1;
121
46.3k
          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
122
46.3k
          continue;
123
46.3k
        }
124
158k
      }
125
383k
      current->next = prev;
126
383k
      prefixes_per_bucket[bucket] = current;
127
383k
      num_blocks_per_bucket[bucket] += current->num_blocks;
128
383k
    }
129
130
    // Calculate the block array buffer size
131
1.66k
    uint32_t total_block_array_entries = 0;
132
439k
    for (uint32_t i = 0; i < num_buckets; 
i++437k
) {
133
437k
      uint32_t num_blocks = num_blocks_per_bucket[i];
134
437k
      if (num_blocks > 1) {
135
101k
        total_block_array_entries += (num_blocks + 1);
136
101k
      }
137
437k
    }
138
139
    // Populate the final prefix block index
140
1.66k
    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
141
1.66k
    uint32_t* buckets = new uint32_t[num_buckets];
142
1.66k
    uint32_t offset = 0;
143
438k
    for (uint32_t i = 0; i < num_buckets; 
i++436k
) {
144
436k
      uint32_t num_blocks = num_blocks_per_bucket[i];
145
436k
      if (num_blocks == 0) {
146
160k
        assert(prefixes_per_bucket[i] == nullptr);
147
0
        buckets[i] = kNoneBlock;
148
276k
      } else if (num_blocks == 1) {
149
174k
        assert(prefixes_per_bucket[i] != nullptr);
150
0
        assert(prefixes_per_bucket[i]->next == nullptr);
151
0
        buckets[i] = prefixes_per_bucket[i]->start_block;
152
174k
      } else {
153
101k
        assert(prefixes_per_bucket[i] != nullptr);
154
0
        buckets[i] = EncodeIndex(offset);
155
101k
        block_array_buffer[offset] = num_blocks;
156
101k
        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
157
101k
        auto current = prefixes_per_bucket[i];
158
        // populate block ids from largest to smallest
159
314k
        while (current != nullptr) {
160
459k
          for (uint32_t iter = 0; iter < current->num_blocks; 
iter++245k
) {
161
245k
            *last_block = current->end_block - iter;
162
245k
            last_block--;
163
245k
          }
164
213k
          current = current->next;
165
213k
        }
166
101k
        assert(last_block == &block_array_buffer[offset]);
167
0
        offset += (num_blocks + 1);
168
101k
      }
169
436k
    }
170
171
1.66k
    assert(offset == total_block_array_entries);
172
173
0
    return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
174
1.66k
                                buckets, total_block_array_entries,
175
1.66k
                                block_array_buffer);
176
1.66k
  }
177
178
 private:
179
  const SliceTransform* internal_prefix_extractor_;
180
181
  std::vector<PrefixRecord*> prefixes_;
182
  Arena arena_;
183
};
184
185
186
Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
187
                                const Slice& prefixes, const Slice& prefix_meta,
188
1.66k
                                BlockPrefixIndex** prefix_index) {
189
1.66k
  uint64_t pos = 0;
190
1.66k
  auto meta_pos = prefix_meta;
191
1.66k
  Status s;
192
1.66k
  Builder builder(internal_prefix_extractor);
193
194
434k
  while (!meta_pos.empty()) {
195
432k
    uint32_t prefix_size = 0;
196
432k
    uint32_t entry_index = 0;
197
432k
    uint32_t num_blocks = 0;
198
432k
    if (!GetVarint32(&meta_pos, &prefix_size) ||
199
432k
        
!GetVarint32(&meta_pos, &entry_index)429k
||
200
432k
        
!GetVarint32(&meta_pos, &num_blocks)428k
) {
201
0
      s = STATUS(Corruption,
202
0
          "Corrupted prefix meta block: unable to read from it.");
203
0
      break;
204
0
    }
205
432k
    if (pos + prefix_size > prefixes.size()) {
206
0
      s = STATUS(Corruption,
207
0
        "Corrupted prefix meta block: size inconsistency.");
208
0
      break;
209
0
    }
210
432k
    Slice prefix(prefixes.data() + pos, prefix_size);
211
432k
    builder.Add(prefix, entry_index, num_blocks);
212
213
432k
    pos += prefix_size;
214
432k
  }
215
216
1.66k
  if (s.ok() && 
pos != prefixes.size()1.66k
) {
217
0
    s = STATUS(Corruption, "Corrupted prefix meta block");
218
0
  }
219
220
1.66k
  if (s.ok()) {
221
1.66k
    *prefix_index = builder.Finish();
222
1.66k
  }
223
224
1.66k
  return s;
225
1.66k
}
226
227
uint32_t BlockPrefixIndex::GetBlocks(const Slice& key,
228
101k
                                     uint32_t** blocks) {
229
101k
  Slice prefix = internal_prefix_extractor_->Transform(key);
230
231
101k
  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
232
101k
  uint32_t block_id = buckets_[bucket];
233
234
101k
  if (IsNone(block_id)) {
235
2.17k
    return 0;
236
99.5k
  } else if (IsBlockId(block_id)) {
237
23.0k
    *blocks = &buckets_[bucket];
238
23.0k
    return 1;
239
76.4k
  } else {
240
76.4k
    uint32_t index = DecodeIndex(block_id);
241
76.4k
    assert(index < num_block_array_buffer_entries_);
242
0
    *blocks = &block_array_buffer_[index+1];
243
76.4k
    uint32_t num_blocks = block_array_buffer_[index];
244
76.4k
    assert(num_blocks > 1);
245
0
    assert(index + num_blocks < num_block_array_buffer_entries_);
246
0
    return num_blocks;
247
76.4k
  }
248
101k
}
249
250
}  // namespace rocksdb