YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_index.h
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under the BSD-style license found in the
3
//  LICENSE file in the root directory of this source tree. An additional grant
4
//  of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
21
#ifndef YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H
22
#define YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H
23
24
#pragma once
25
26
#ifndef ROCKSDB_LITE
27
28
#include <string>
29
#include <vector>
30
31
#include "yb/rocksdb/db/dbformat.h"
32
#include "yb/rocksdb/immutable_options.h"
33
#include "yb/rocksdb/options.h"
34
#include "yb/rocksdb/util/murmurhash.h"
35
#include "yb/rocksdb/util/hash.h"
36
#include "yb/rocksdb/util/arena.h"
37
#include "yb/rocksdb/util/histogram.h"
38
39
#include "yb/util/status_log.h"
40
41
namespace rocksdb {
42
43
// PlainTableIndex contains buckets size of index_size_, each is a
44
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
45
// and the first bit of the integer indicates type of the offset.
46
//
47
// +--------------+------------------------------------------------------+
48
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
49
// +--------------+------------------------------------------------------+
50
//
51
// Explanation for the "flag bit":
52
//
53
// 0 indicates that the bucket contains only one prefix (no conflict when
54
//   hashing this prefix), whose first row starts from this offset of the
55
// file.
56
// 1 indicates that the bucket contains more than one prefixes, or there
57
//   are too many rows for one prefix so we need a binary search for it. In
58
//   this case, the offset indicates the offset of sub_index_ holding the
59
//   binary search indexes of keys for those rows. Those binary search indexes
60
//   are organized in this way:
61
//
62
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
63
// it, there are N 32-bit integers, each points of an offset of the file,
64
// which
65
// points to starting of a row. Those offsets need to be guaranteed to be in
66
// ascending order so the keys they are pointing to are also in ascending
67
// order
68
// to make sure we can use them to do binary searches. Below is visual
69
// presentation of a bucket.
70
//
71
// <begin>
72
//   number_of_records:  varint32
73
//   record 1 file offset:  fixedint32
74
//   record 2 file offset:  fixedint32
75
//    ....
76
//   record N file offset:  fixedint32
77
// <end>
78
class PlainTableIndex {
79
 public:
80
  enum IndexSearchResult {
81
    kNoPrefixForBucket = 0,
82
    kDirectToFile = 1,
83
    kSubindex = 2
84
  };
85
86
0
  explicit PlainTableIndex(Slice data) { CHECK_OK(InitFromRawData(data)); }
87
88
  PlainTableIndex()
89
      : index_size_(0),
90
        sub_index_size_(0),
91
        num_prefixes_(0),
92
        index_(nullptr),
93
3.10k
        sub_index_(nullptr) {}
94
95
  IndexSearchResult GetOffset(uint32_t prefix_hash,
96
                              uint32_t* bucket_value) const;
97
98
  Status InitFromRawData(Slice data);
99
100
  const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
101
87.0k
                                              uint32_t* upper_bound) const {
102
87.0k
    const char* index_ptr = &sub_index_[offset];
103
87.0k
    return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
104
87.0k
  }
105
106
20.0k
  uint32_t GetIndexSize() const { return index_size_; }
107
108
3.03k
  uint32_t GetSubIndexSize() const { return sub_index_size_; }
109
110
3.03k
  uint32_t GetNumPrefixes() const { return num_prefixes_; }
111
112
  static const uint64_t kMaxFileSize = (1u << 31) - 1;
113
  static const uint32_t kSubIndexMask = 0x80000000;
114
  static const size_t kOffsetLen = sizeof(uint32_t);
115
116
 private:
117
  uint32_t index_size_;
118
  uint32_t sub_index_size_;
119
  uint32_t num_prefixes_;
120
121
  uint32_t* index_;
122
  char* sub_index_;
123
};
124
125
// PlainTableIndexBuilder is used to create plain table index.
126
// After calling Finish(), it returns Slice, which is usually
127
// used either to initialize PlainTableIndex or
128
// to save index to sst file.
129
// For more details about the  index, please refer to:
130
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
131
// #wiki-in-memory-index-format
132
class PlainTableIndexBuilder {
133
 public:
134
  PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
135
                         size_t index_sparseness, double hash_table_ratio,
136
                         size_t huge_page_tlb_size)
137
      : arena_(arena),
138
        ioptions_(ioptions),
139
        record_list_(kRecordsPerGroup),
140
        is_first_record_(true),
141
        due_index_(false),
142
        num_prefixes_(0),
143
        num_keys_per_prefix_(0),
144
        prev_key_prefix_hash_(0),
145
        index_sparseness_(index_sparseness),
146
        prefix_extractor_(ioptions.prefix_extractor),
147
        hash_table_ratio_(hash_table_ratio),
148
3.16k
        huge_page_tlb_size_(huge_page_tlb_size) {}
149
150
  void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
151
152
  Slice Finish();
153
154
6.20k
  uint32_t GetTotalSize() const {
155
6.20k
    return VarintLength(index_size_) + VarintLength(num_prefixes_) +
156
6.20k
           PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
157
6.20k
  }
158
159
  static const std::string kPlainTableIndexBlock;
160
161
 private:
162
  struct IndexRecord {
163
    uint32_t hash;    // hash of the prefix
164
    uint32_t offset;  // offset of a row
165
    IndexRecord* next;
166
  };
167
168
  // Helper class to track all the index records
169
  class IndexRecordList {
170
   public:
171
    explicit IndexRecordList(size_t num_records_per_group)
172
        : kNumRecordsPerGroup(num_records_per_group),
173
          current_group_(nullptr),
174
3.16k
          num_records_in_current_group_(num_records_per_group) {}
175
176
3.16k
    ~IndexRecordList() {
177
8.35k
      for (size_t i = 0; i < groups_.size(); 
i++5.18k
) {
178
5.18k
        delete[] groups_[i];
179
5.18k
      }
180
3.16k
    }
181
182
    void AddRecord(uint32_t hash, uint32_t offset);
183
184
3.10k
    size_t GetNumRecords() const {
185
3.10k
      return (groups_.size() - 1) * kNumRecordsPerGroup +
186
3.10k
             num_records_in_current_group_;
187
3.10k
    }
188
690k
    IndexRecord* At(size_t index) {
189
690k
      return &(groups_[index / kNumRecordsPerGroup]
190
690k
                      [index % kNumRecordsPerGroup]);
191
690k
    }
192
193
   private:
194
5.18k
    IndexRecord* AllocateNewGroup() {
195
5.18k
      IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
196
5.18k
      groups_.push_back(result);
197
5.18k
      return result;
198
5.18k
    }
199
200
    // Each group in `groups_` contains fix-sized records (determined by
201
    // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
202
    // occurs.
203
    const size_t kNumRecordsPerGroup;
204
    IndexRecord* current_group_;
205
    // List of arrays allocated
206
    std::vector<IndexRecord*> groups_;
207
    size_t num_records_in_current_group_;
208
  };
209
210
  void AllocateIndex();
211
212
  // Internal helper function to bucket index record list to hash buckets.
213
  void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
214
                        std::vector<uint32_t>* entries_per_bucket);
215
216
  // Internal helper class to fill the indexes and bloom filters to internal
217
  // data structures.
218
  Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
219
                    const std::vector<uint32_t>& entries_per_bucket);
220
221
  Arena* arena_;
222
  const ImmutableCFOptions ioptions_;
223
  HistogramImpl keys_per_prefix_hist_;
224
  IndexRecordList record_list_;
225
  bool is_first_record_;
226
  bool due_index_;
227
  uint32_t num_prefixes_;
228
  uint32_t num_keys_per_prefix_;
229
230
  uint32_t prev_key_prefix_hash_;
231
  size_t index_sparseness_;
232
  uint32_t index_size_;
233
  uint32_t sub_index_size_;
234
235
  const SliceTransform* prefix_extractor_;
236
  double hash_table_ratio_;
237
  size_t huge_page_tlb_size_;
238
239
  std::string prev_key_prefix_;
240
241
  static const size_t kRecordsPerGroup = 256;
242
};
243
244
};  // namespace rocksdb
245
246
#endif  // ROCKSDB_LITE
247
248
#endif // YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H