/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_index.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | |
21 | | #ifndef YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H |
22 | | #define YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H |
23 | | |
24 | | #pragma once |
25 | | |
26 | | #ifndef ROCKSDB_LITE |
27 | | |
28 | | #include <string> |
29 | | #include <vector> |
30 | | |
31 | | #include "yb/rocksdb/db/dbformat.h" |
32 | | #include "yb/rocksdb/immutable_options.h" |
33 | | #include "yb/rocksdb/options.h" |
34 | | #include "yb/rocksdb/util/murmurhash.h" |
35 | | #include "yb/rocksdb/util/hash.h" |
36 | | #include "yb/rocksdb/util/arena.h" |
37 | | #include "yb/rocksdb/util/histogram.h" |
38 | | |
39 | | #include "yb/util/status_log.h" |
40 | | |
41 | | namespace rocksdb { |
42 | | |
43 | | // PlainTableIndex contains buckets size of index_size_, each is a |
44 | | // 32-bit integer. The lower 31 bits contain an offset value (explained below) |
45 | | // and the first bit of the integer indicates type of the offset. |
46 | | // |
47 | | // +--------------+------------------------------------------------------+ |
48 | | // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + |
49 | | // +--------------+------------------------------------------------------+ |
50 | | // |
51 | | // Explanation for the "flag bit": |
52 | | // |
53 | | // 0 indicates that the bucket contains only one prefix (no conflict when |
54 | | // hashing this prefix), whose first row starts from this offset of the |
55 | | // file. |
56 | | // 1 indicates that the bucket contains more than one prefixes, or there |
57 | | // are too many rows for one prefix so we need a binary search for it. In |
58 | | // this case, the offset indicates the offset of sub_index_ holding the |
59 | | // binary search indexes of keys for those rows. Those binary search indexes |
60 | | // are organized in this way: |
61 | | // |
62 | | // The first 4 bytes, indicate how many indexes (N) are stored after it. After |
63 | | // it, there are N 32-bit integers, each points of an offset of the file, |
64 | | // which |
65 | | // points to starting of a row. Those offsets need to be guaranteed to be in |
66 | | // ascending order so the keys they are pointing to are also in ascending |
67 | | // order |
68 | | // to make sure we can use them to do binary searches. Below is visual |
69 | | // presentation of a bucket. |
70 | | // |
71 | | // <begin> |
72 | | // number_of_records: varint32 |
73 | | // record 1 file offset: fixedint32 |
74 | | // record 2 file offset: fixedint32 |
75 | | // .... |
76 | | // record N file offset: fixedint32 |
77 | | // <end> |
78 | | class PlainTableIndex { |
79 | | public: |
80 | | enum IndexSearchResult { |
81 | | kNoPrefixForBucket = 0, |
82 | | kDirectToFile = 1, |
83 | | kSubindex = 2 |
84 | | }; |
85 | | |
86 | 0 | explicit PlainTableIndex(Slice data) { CHECK_OK(InitFromRawData(data)); } |
87 | | |
88 | | PlainTableIndex() |
89 | | : index_size_(0), |
90 | | sub_index_size_(0), |
91 | | num_prefixes_(0), |
92 | | index_(nullptr), |
93 | 3.10k | sub_index_(nullptr) {} |
94 | | |
95 | | IndexSearchResult GetOffset(uint32_t prefix_hash, |
96 | | uint32_t* bucket_value) const; |
97 | | |
98 | | Status InitFromRawData(Slice data); |
99 | | |
100 | | const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, |
101 | 87.0k | uint32_t* upper_bound) const { |
102 | 87.0k | const char* index_ptr = &sub_index_[offset]; |
103 | 87.0k | return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); |
104 | 87.0k | } |
105 | | |
106 | 20.0k | uint32_t GetIndexSize() const { return index_size_; } |
107 | | |
108 | 3.03k | uint32_t GetSubIndexSize() const { return sub_index_size_; } |
109 | | |
110 | 3.03k | uint32_t GetNumPrefixes() const { return num_prefixes_; } |
111 | | |
112 | | static const uint64_t kMaxFileSize = (1u << 31) - 1; |
113 | | static const uint32_t kSubIndexMask = 0x80000000; |
114 | | static const size_t kOffsetLen = sizeof(uint32_t); |
115 | | |
116 | | private: |
117 | | uint32_t index_size_; |
118 | | uint32_t sub_index_size_; |
119 | | uint32_t num_prefixes_; |
120 | | |
121 | | uint32_t* index_; |
122 | | char* sub_index_; |
123 | | }; |
124 | | |
125 | | // PlainTableIndexBuilder is used to create plain table index. |
126 | | // After calling Finish(), it returns Slice, which is usually |
127 | | // used either to initialize PlainTableIndex or |
128 | | // to save index to sst file. |
129 | | // For more details about the index, please refer to: |
130 | | // https://github.com/facebook/rocksdb/wiki/PlainTable-Format |
131 | | // #wiki-in-memory-index-format |
132 | | class PlainTableIndexBuilder { |
133 | | public: |
134 | | PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, |
135 | | size_t index_sparseness, double hash_table_ratio, |
136 | | size_t huge_page_tlb_size) |
137 | | : arena_(arena), |
138 | | ioptions_(ioptions), |
139 | | record_list_(kRecordsPerGroup), |
140 | | is_first_record_(true), |
141 | | due_index_(false), |
142 | | num_prefixes_(0), |
143 | | num_keys_per_prefix_(0), |
144 | | prev_key_prefix_hash_(0), |
145 | | index_sparseness_(index_sparseness), |
146 | | prefix_extractor_(ioptions.prefix_extractor), |
147 | | hash_table_ratio_(hash_table_ratio), |
148 | 3.16k | huge_page_tlb_size_(huge_page_tlb_size) {} |
149 | | |
150 | | void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); |
151 | | |
152 | | Slice Finish(); |
153 | | |
154 | 6.20k | uint32_t GetTotalSize() const { |
155 | 6.20k | return VarintLength(index_size_) + VarintLength(num_prefixes_) + |
156 | 6.20k | PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; |
157 | 6.20k | } |
158 | | |
159 | | static const std::string kPlainTableIndexBlock; |
160 | | |
161 | | private: |
162 | | struct IndexRecord { |
163 | | uint32_t hash; // hash of the prefix |
164 | | uint32_t offset; // offset of a row |
165 | | IndexRecord* next; |
166 | | }; |
167 | | |
168 | | // Helper class to track all the index records |
169 | | class IndexRecordList { |
170 | | public: |
171 | | explicit IndexRecordList(size_t num_records_per_group) |
172 | | : kNumRecordsPerGroup(num_records_per_group), |
173 | | current_group_(nullptr), |
174 | 3.16k | num_records_in_current_group_(num_records_per_group) {} |
175 | | |
176 | 3.16k | ~IndexRecordList() { |
177 | 8.35k | for (size_t i = 0; i < groups_.size(); i++5.18k ) { |
178 | 5.18k | delete[] groups_[i]; |
179 | 5.18k | } |
180 | 3.16k | } |
181 | | |
182 | | void AddRecord(uint32_t hash, uint32_t offset); |
183 | | |
184 | 3.10k | size_t GetNumRecords() const { |
185 | 3.10k | return (groups_.size() - 1) * kNumRecordsPerGroup + |
186 | 3.10k | num_records_in_current_group_; |
187 | 3.10k | } |
188 | 690k | IndexRecord* At(size_t index) { |
189 | 690k | return &(groups_[index / kNumRecordsPerGroup] |
190 | 690k | [index % kNumRecordsPerGroup]); |
191 | 690k | } |
192 | | |
193 | | private: |
194 | 5.18k | IndexRecord* AllocateNewGroup() { |
195 | 5.18k | IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; |
196 | 5.18k | groups_.push_back(result); |
197 | 5.18k | return result; |
198 | 5.18k | } |
199 | | |
200 | | // Each group in `groups_` contains fix-sized records (determined by |
201 | | // kNumRecordsPerGroup). Which can help us minimize the cost if resizing |
202 | | // occurs. |
203 | | const size_t kNumRecordsPerGroup; |
204 | | IndexRecord* current_group_; |
205 | | // List of arrays allocated |
206 | | std::vector<IndexRecord*> groups_; |
207 | | size_t num_records_in_current_group_; |
208 | | }; |
209 | | |
210 | | void AllocateIndex(); |
211 | | |
212 | | // Internal helper function to bucket index record list to hash buckets. |
213 | | void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, |
214 | | std::vector<uint32_t>* entries_per_bucket); |
215 | | |
216 | | // Internal helper class to fill the indexes and bloom filters to internal |
217 | | // data structures. |
218 | | Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, |
219 | | const std::vector<uint32_t>& entries_per_bucket); |
220 | | |
221 | | Arena* arena_; |
222 | | const ImmutableCFOptions ioptions_; |
223 | | HistogramImpl keys_per_prefix_hist_; |
224 | | IndexRecordList record_list_; |
225 | | bool is_first_record_; |
226 | | bool due_index_; |
227 | | uint32_t num_prefixes_; |
228 | | uint32_t num_keys_per_prefix_; |
229 | | |
230 | | uint32_t prev_key_prefix_hash_; |
231 | | size_t index_sparseness_; |
232 | | uint32_t index_size_; |
233 | | uint32_t sub_index_size_; |
234 | | |
235 | | const SliceTransform* prefix_extractor_; |
236 | | double hash_table_ratio_; |
237 | | size_t huge_page_tlb_size_; |
238 | | |
239 | | std::string prev_key_prefix_; |
240 | | |
241 | | static const size_t kRecordsPerGroup = 256; |
242 | | }; |
243 | | |
244 | | }; // namespace rocksdb |
245 | | |
246 | | #endif // ROCKSDB_LITE |
247 | | |
248 | | #endif // YB_ROCKSDB_TABLE_PLAIN_TABLE_INDEX_H |