/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/index_builder.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_ROCKSDB_TABLE_INDEX_BUILDER_H |
15 | | #define YB_ROCKSDB_TABLE_INDEX_BUILDER_H |
16 | | |
17 | | #include "yb/rocksdb/flush_block_policy.h" |
18 | | #include "yb/rocksdb/table.h" |
19 | | #include "yb/rocksdb/table/block_builder.h" |
20 | | #include "yb/rocksdb/table/format.h" |
21 | | |
22 | | #include "yb/util/strongly_typed_bool.h" |
23 | | |
24 | | namespace rocksdb { |
25 | | |
26 | | using yb::Result; |
27 | | |
28 | | // The interface for building index. |
29 | | // Instruction for adding a new concrete IndexBuilder: |
30 | | // 1. Create a subclass instantiated from IndexBuilder. |
31 | | // 2. Add a new entry associated with that subclass in IndexType. |
32 | | // 3. Add a create function for the new subclass in CreateIndexBuilder. |
33 | | // Note: we can devise more advanced design to simplify the process for adding |
34 | | // new subclass, which will, on the other hand, increase the code complexity and |
35 | | // catch unwanted attention from readers. Given that we won't add/change |
36 | | // indexes frequently, it makes sense to just embrace a more straightforward |
37 | | // design that just works. |
38 | | class IndexBuilder { |
39 | | public: |
40 | | // Create a index builder based on its type. |
41 | | static IndexBuilder* CreateIndexBuilder( |
42 | | IndexType index_type, |
43 | | const Comparator* comparator, |
44 | | const SliceTransform* prefix_extractor, |
45 | | const BlockBasedTableOptions& table_opt); |
46 | | |
47 | | // Index builder will construct a set of blocks which contain: |
48 | | // 1. One primary index block. |
49 | | // 2. (Optional) a set of metablocks that contains the metadata of the |
50 | | // primary index. |
51 | | struct IndexBlocks { |
52 | | Slice index_block_contents; |
53 | | std::unordered_map<std::string, Slice> meta_blocks; |
54 | | }; |
55 | | explicit IndexBuilder(const Comparator* comparator) |
56 | 204k | : comparator_(comparator) {} |
57 | | |
58 | 204k | virtual ~IndexBuilder() {} |
59 | | |
60 | | // Add a new index entry to index block. |
61 | | // To allow further optimization, we provide `last_key_in_current_block` and |
62 | | // `first_key_in_next_block`, based on which the specific implementation can |
63 | | // determine the best index key to be used for the index block. |
64 | | // @last_key_in_current_block: this parameter maybe overridden with the value |
65 | | // "substitute key". |
66 | | // @first_key_in_next_block: it will be nullptr if the entry being added is |
67 | | // the last one in the table |
68 | | // |
69 | | // REQUIRES: Finish() has not yet been called. |
70 | | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
71 | | const Slice* first_key_in_next_block, |
72 | | const BlockHandle& block_handle) = 0; |
73 | | |
74 | | // This method will be called whenever a key is added. The subclasses may |
75 | | // override OnKeyAdded() if they need to collect additional information. |
76 | 91.2M | virtual void OnKeyAdded(const Slice& key) {} |
77 | | |
78 | | // Inform the index builder that all entries has been written. Block builder |
79 | | // may therefore perform any operation required for block finalization. |
80 | | // |
81 | | // REQUIRES: Finish() has not yet been called. |
82 | | virtual CHECKED_STATUS Finish(IndexBlocks* index_blocks) = 0; |
83 | | |
84 | | // Whether it is time to flush the current index block. Overridden in MultiLevelIndexBuilder. |
85 | | // While true is returned the caller should keep calling FlushNextBlock(IndexBlocks*, |
86 | | // const BlockHandle&) with handle of the block written with the result of the last call to |
87 | | // FlushNextBlock and keep writing flushed blocks. |
88 | 19.4k | virtual bool ShouldFlush() const { return false; } |
89 | | |
90 | | // This method can be overridden to build the n-th level index in |
91 | | // MultiLevelIndexBuilder. |
92 | | // |
93 | | // Returns true when it actually flushed entries into index_blocks. |
94 | | // Returns false when everything was already flushed by previous call to FlushNextBlock. |
95 | | virtual Result<bool> FlushNextBlock( |
96 | | IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle); |
97 | | |
98 | | // Get the estimated size for index block. |
99 | | virtual size_t EstimatedSize() const = 0; |
100 | | |
101 | | // Number of levels for index. |
102 | | virtual int NumLevels() const = 0; |
103 | | |
104 | | protected: |
105 | | const Comparator* comparator_; |
106 | | }; |
107 | | |
108 | | YB_STRONGLY_TYPED_BOOL(ShortenKeys); |
109 | | |
110 | | // This index builder builds space-efficient index block. |
111 | | // |
112 | | // Optimizations: |
113 | | // 1. Made block's `block_restart_interval` to be 1, which will avoid linear |
114 | | // search when doing index lookup (can be disabled by setting |
115 | | // index_block_restart_interval). |
116 | | // 2. Shorten the key length for index block. Other than honestly using the |
117 | | // last key in the data block as the index key, we instead find a shortest |
118 | | // substitute key that serves the same function. |
119 | | class ShortenedIndexBuilder : public IndexBuilder { |
120 | | public: |
121 | | explicit ShortenedIndexBuilder(const Comparator* comparator, |
122 | | int index_block_restart_interval) |
123 | | : IndexBuilder(comparator), |
124 | 138k | index_block_builder_(index_block_restart_interval, kIndexBlockKeyValueEncodingFormat) {} |
125 | | |
126 | | void AddIndexEntry( |
127 | | std::string* last_key_in_current_block, |
128 | | const Slice* first_key_in_next_block, |
129 | | const BlockHandle& block_handle) override; |
130 | | |
131 | | void AddIndexEntry( |
132 | | std::string* last_key_in_current_block, |
133 | | const Slice* first_key_in_next_block, |
134 | | const std::string& block_handle_encoded, |
135 | | ShortenKeys shorten_keys); |
136 | | |
137 | | CHECKED_STATUS Finish(IndexBlocks* index_blocks) override; |
138 | | |
139 | 4.56k | size_t EstimatedSize() const override { |
140 | 4.56k | return index_block_builder_.CurrentSizeEstimate(); |
141 | 4.56k | } |
142 | | |
143 | 8 | int NumLevels() const override { return 1; } |
144 | | |
145 | 0 | void Reset() { index_block_builder_.Reset(); } |
146 | | |
147 | 75.6k | const BlockBuilder& GetIndexBlockBuilder() { return index_block_builder_; } |
148 | | |
149 | | private: |
150 | | BlockBuilder index_block_builder_; |
151 | | }; |
152 | | |
153 | | // HashIndexBuilder contains a binary-searchable primary index and the |
154 | | // metadata for secondary hash index construction. |
155 | | // The metadata for hash index consists two parts: |
156 | | // - a metablock that compactly contains a sequence of prefixes. All prefixes |
157 | | // are stored consecutively without any metadata (like, prefix sizes) being |
158 | | // stored, which is kept in the other metablock. |
159 | | // - a metablock contains the metadata of the prefixes, including prefix size, |
160 | | // restart index and number of block it spans. The format looks like: |
161 | | // |
162 | | // +-----------------+---------------------------+---------------------+ <=prefix 1 |
163 | | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
164 | | // +-----------------+---------------------------+---------------------+ <=prefix 2 |
165 | | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
166 | | // +-----------------+---------------------------+---------------------+ |
167 | | // | | |
168 | | // | .... | |
169 | | // | | |
170 | | // +-----------------+---------------------------+---------------------+ <=prefix n |
171 | | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
172 | | // +-----------------+---------------------------+---------------------+ |
173 | | // |
174 | | // The reason of separating these two metablocks is to enable the efficiently |
175 | | // reuse the first metablock during hash index construction without unnecessary |
176 | | // data copy or small heap allocations for prefixes. |
177 | | class HashIndexBuilder : public IndexBuilder { |
178 | | public: |
179 | | explicit HashIndexBuilder(const Comparator* comparator, |
180 | | const SliceTransform* hash_key_extractor, |
181 | | int index_block_restart_interval) |
182 | | : IndexBuilder(comparator), |
183 | | primary_index_builder_(comparator, index_block_restart_interval), |
184 | 1.45k | hash_key_extractor_(hash_key_extractor) {} |
185 | | |
186 | | void AddIndexEntry( |
187 | | std::string* last_key_in_current_block, |
188 | | const Slice* first_key_in_next_block, |
189 | | const BlockHandle& block_handle) override; |
190 | | |
191 | | void OnKeyAdded(const Slice& key) override; |
192 | | |
193 | | CHECKED_STATUS Finish(IndexBlocks* index_blocks) override; |
194 | | |
195 | 1.44k | size_t EstimatedSize() const override { |
196 | 1.44k | return primary_index_builder_.EstimatedSize() + prefix_block_.size() + |
197 | 1.44k | prefix_meta_block_.size(); |
198 | 1.44k | } |
199 | | |
200 | 2.06k | int NumLevels() const override { return 1; } |
201 | | |
202 | | private: |
203 | | void FlushPendingPrefix(); |
204 | | |
205 | | ShortenedIndexBuilder primary_index_builder_; |
206 | | const SliceTransform* hash_key_extractor_; |
207 | | |
208 | | // stores a sequence of prefixes |
209 | | std::string prefix_block_; |
210 | | // stores the metadata of prefixes |
211 | | std::string prefix_meta_block_; |
212 | | |
213 | | // The following 3 variables keeps unflushed prefix and its metadata. |
214 | | // The details of block_num and entry_index can be found in |
215 | | // "block_hash_index.{h,cc}" |
216 | | uint32_t pending_block_num_ = 0; |
217 | | uint32_t pending_entry_index_ = 0; |
218 | | std::string pending_entry_prefix_; |
219 | | |
220 | | uint64_t current_restart_index_ = 0; |
221 | | }; |
222 | | |
223 | | class MultiLevelIndexBuilder : public IndexBuilder { |
224 | | public: |
225 | | MultiLevelIndexBuilder(const Comparator* comparator, const BlockBasedTableOptions& table_opt); |
226 | | |
227 | | void AddIndexEntry( |
228 | | std::string* last_key_in_current_block, |
229 | | const Slice* first_key_in_next_block, |
230 | 2.51M | const BlockHandle& block_handle) override { |
231 | 2.51M | AddIndexEntry( |
232 | 2.51M | last_key_in_current_block, first_key_in_next_block, block_handle, ShortenKeys::kTrue); |
233 | 2.51M | } |
234 | | |
235 | | void AddIndexEntry( |
236 | | std::string* last_key_in_current_block, |
237 | | const Slice* first_key_in_next_block, |
238 | | const BlockHandle& block_handle, |
239 | | ShortenKeys shorten_keys); |
240 | | |
241 | 0 | CHECKED_STATUS Finish(IndexBlocks* index_blocks) override { |
242 | 0 | return STATUS(NotSupported, "Finishing as single block is not supported by multi-level index."); |
243 | 0 | } |
244 | | |
245 | | bool ShouldFlush() const override; |
246 | | |
247 | | Result<bool> FlushNextBlock( |
248 | | IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; |
249 | | |
250 | | size_t EstimatedSize() const override; |
251 | | |
252 | | int NumLevels() const override; |
253 | | |
254 | | private: |
255 | | struct IndexBlockInfo; |
256 | | |
257 | | void EnsureCurrentLevelIndexBuilderCreated(); |
258 | | CHECKED_STATUS FlushCurrentLevel(IndexBlocks* index_blocks); |
259 | | void AddToNextLevelIfReady( |
260 | | IndexBlockInfo* block_info, const BlockHandle& block_handle); |
261 | | |
262 | | const BlockBasedTableOptions& table_opt_; |
263 | | |
264 | | // Builder for an index block at the current level. |
265 | | std::unique_ptr<ShortenedIndexBuilder> current_level_index_block_builder_; |
266 | | |
267 | | // Already flushed index block builder, which can only be deleted after the corresponding block is |
268 | | // written, because it holds flushed data referenced by a slice returned from FlushNextBlock. |
269 | | std::unique_ptr<ShortenedIndexBuilder> finished_index_block_builder_; |
270 | | |
271 | | // Builder for index blocks at higher levels. |
272 | | std::unique_ptr<MultiLevelIndexBuilder> next_level_index_builder_; |
273 | | std::unique_ptr<FlushBlockPolicy> flush_policy_; |
274 | | |
275 | | // Whether we've started flushing index blocks, i.e. the FlushNextBlock method has been already |
276 | | // called. |
277 | | bool flushing_indexes_ = false; |
278 | | |
279 | | // Size of index blocks at the current level. |
280 | | size_t current_level_size_ = 0; |
281 | | |
282 | | struct IndexBlockInfo { |
283 | | // Whether the index block is ready for processing, other fields are only correct when |
284 | | // is_ready is true. |
285 | | bool is_ready = false; |
286 | | // Last key in the index block. |
287 | | std::string last_key; |
288 | | // Whether there are keys for the next block at the same level. |
289 | | bool has_next_block = false; |
290 | | // First key for the next index block at the same level (only correct if has_next_block is |
291 | | // true). |
292 | | std::string next_block_first_key; |
293 | | |
294 | 3.90M | bool IsReadyAndLast() const { return is_ready && !has_next_block; } |
295 | | }; |
296 | | |
297 | | // A complete current level index block once it is ready for flush. |
298 | | IndexBlockInfo current_level_block_; |
299 | | struct { |
300 | | // A complete current level index block once it is ready to add to next level index. |
301 | | IndexBlockInfo block_to_add; |
302 | | // Handle of the last index block flushed at a higher level. |
303 | | BlockHandle last_flushed_block_; |
304 | | // If an index block from a higher level was flushed during the previous call to |
305 | | // FlushNextBlock(). |
306 | | bool just_flushed = false; |
307 | | } next_level_; |
308 | | // Buffer for block handle encoding. |
309 | | std::string block_handle_encoding_; |
310 | | }; |
311 | | |
312 | | } // namespace rocksdb |
313 | | |
314 | | #endif // YB_ROCKSDB_TABLE_INDEX_BUILDER_H |