/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/index_builder.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include "yb/rocksdb/table/index_builder.h" |
15 | | |
16 | | #include "yb/rocksdb/comparator.h" |
17 | | #include "yb/rocksdb/port/likely.h" |
18 | | #include "yb/rocksdb/slice_transform.h" |
19 | | #include "yb/rocksdb/table/block_based_table_factory.h" |
20 | | #include "yb/rocksdb/util/coding.h" |
21 | | |
22 | | #include "yb/util/logging.h" |
23 | | |
24 | | namespace rocksdb { |
25 | | |
26 | | Result<bool> IndexBuilder::FlushNextBlock( |
27 | 1.45k | IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { |
28 | 1.45k | RETURN_NOT_OK(Finish(index_blocks)); |
29 | 1.45k | return true; |
30 | 1.45k | } |
31 | | |
32 | | IndexBuilder* IndexBuilder::CreateIndexBuilder( |
33 | | IndexType type, |
34 | | const Comparator* comparator, |
35 | | const SliceTransform* prefix_extractor, |
36 | 131k | const BlockBasedTableOptions& table_opt) { |
37 | 131k | switch (type) { |
38 | 65.6k | case IndexType::kBinarySearch: { |
39 | 65.6k | return new ShortenedIndexBuilder(comparator, |
40 | 65.6k | table_opt.index_block_restart_interval); |
41 | 0 | } |
42 | 1.45k | case IndexType::kHashSearch: { |
43 | 1.45k | return new HashIndexBuilder(comparator, prefix_extractor, |
44 | 1.45k | table_opt.index_block_restart_interval); |
45 | 0 | } |
46 | 64.1k | case IndexType::kMultiLevelBinarySearch: { |
47 | 64.1k | return new MultiLevelIndexBuilder(comparator, table_opt); |
48 | 0 | } |
49 | 0 | default: { |
50 | 0 | assert(!"Do not recognize the index type "); |
51 | 0 | return nullptr; |
52 | 0 | } |
53 | 131k | } |
54 | | // impossible. |
55 | 0 | assert(false); |
56 | 0 | return nullptr; |
57 | 131k | } |
58 | | |
59 | | void ShortenedIndexBuilder::AddIndexEntry( |
60 | | std::string* last_key_in_current_block, |
61 | | const Slice* first_key_in_next_block, |
62 | 33.8k | const BlockHandle& block_handle) { |
63 | 33.8k | std::string handle_encoding; |
64 | 33.8k | block_handle.AppendEncodedTo(&handle_encoding); |
65 | 33.8k | AddIndexEntry( |
66 | 33.8k | last_key_in_current_block, first_key_in_next_block, handle_encoding, ShortenKeys::kTrue); |
67 | 33.8k | } |
68 | | |
69 | | void ShortenedIndexBuilder::AddIndexEntry( |
70 | | std::string* last_key_in_current_block, |
71 | | const Slice* first_key_in_next_block, |
72 | | const std::string& block_handle_encoded, |
73 | 2.89M | const ShortenKeys shorten_keys) { |
74 | 2.89M | if (shorten_keys) { |
75 | 2.87M | if (UNLIKELY(first_key_in_next_block == nullptr)) { |
76 | 76.9k | comparator_->FindShortSuccessor(last_key_in_current_block); |
77 | 2.79M | } else { |
78 | 2.79M | comparator_->FindShortestSeparator(last_key_in_current_block, *first_key_in_next_block); |
79 | 2.79M | } |
80 | 2.87M | } |
81 | | |
82 | 2.89M | index_block_builder_.Add(*last_key_in_current_block, block_handle_encoded); |
83 | 2.89M | } |
84 | | |
85 | 90.5k | Status ShortenedIndexBuilder::Finish(IndexBlocks* index_blocks) { |
86 | 90.5k | index_blocks->index_block_contents = index_block_builder_.Finish(); |
87 | 90.5k | return Status::OK(); |
88 | 90.5k | } |
89 | | |
90 | | void HashIndexBuilder::AddIndexEntry( |
91 | | std::string* last_key_in_current_block, |
92 | | const Slice* first_key_in_next_block, |
93 | 19.4k | const BlockHandle& block_handle) { |
94 | 19.4k | ++current_restart_index_; |
95 | 19.4k | primary_index_builder_.AddIndexEntry( |
96 | 19.4k | last_key_in_current_block, first_key_in_next_block, block_handle); |
97 | 19.4k | } |
98 | | |
99 | 554k | void HashIndexBuilder::OnKeyAdded(const Slice& key) { |
100 | 554k | auto key_prefix = hash_key_extractor_->Transform(key); |
101 | 554k | bool is_first_entry = pending_block_num_ == 0; |
102 | | |
103 | | // Keys may share the prefix |
104 | 554k | if (is_first_entry || pending_entry_prefix_ != key_prefix553k ) { |
105 | 268k | if (!is_first_entry) { |
106 | 266k | FlushPendingPrefix(); |
107 | 266k | } |
108 | | |
109 | | // need a hard copy otherwise the underlying data changes all the time. |
110 | | // TODO(kailiu) ToString() is expensive. We may speed up can avoid data |
111 | | // copy. |
112 | 268k | pending_entry_prefix_ = key_prefix.ToString(); |
113 | 268k | pending_block_num_ = 1; |
114 | 268k | pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); |
115 | 286k | } else { |
116 | | // entry number increments when keys share the prefix reside in |
117 | | // different data blocks. |
118 | 286k | auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; |
119 | 286k | assert(last_restart_index <= current_restart_index_); |
120 | 286k | if (last_restart_index != current_restart_index_) { |
121 | 9.06k | ++pending_block_num_; |
122 | 9.06k | } |
123 | 286k | } |
124 | 554k | } |
125 | | |
126 | 1.45k | Status HashIndexBuilder::Finish(IndexBlocks* index_blocks) { |
127 | 1.45k | FlushPendingPrefix(); |
128 | 1.45k | RETURN_NOT_OK(primary_index_builder_.Finish(index_blocks)); |
129 | 1.45k | index_blocks->meta_blocks.emplace(kHashIndexPrefixesBlock, prefix_block_); |
130 | 1.45k | index_blocks->meta_blocks.emplace(kHashIndexPrefixesMetadataBlock, prefix_meta_block_); |
131 | 1.45k | return Status::OK(); |
132 | 1.45k | } |
133 | | |
134 | 268k | void HashIndexBuilder::FlushPendingPrefix() { |
135 | 268k | prefix_block_.append(pending_entry_prefix_.data(), pending_entry_prefix_.size()); |
136 | 268k | PutVarint32(&prefix_meta_block_, static_cast<uint32_t>(pending_entry_prefix_.size())); |
137 | 268k | PutVarint32(&prefix_meta_block_, pending_entry_index_); |
138 | 268k | PutVarint32(&prefix_meta_block_, pending_block_num_); |
139 | 268k | } |
140 | | |
141 | | MultiLevelIndexBuilder::MultiLevelIndexBuilder( |
142 | | const Comparator* comparator, const BlockBasedTableOptions& table_opt) |
143 | | : IndexBuilder(comparator), |
144 | 68.4k | table_opt_(table_opt) {} |
145 | | |
146 | 2.86M | void MultiLevelIndexBuilder::EnsureCurrentLevelIndexBuilderCreated() { |
147 | 2.86M | if (!current_level_index_block_builder_) { |
148 | 82.0k | DCHECK(!flush_policy_); |
149 | 82.0k | current_level_index_block_builder_.reset( |
150 | 82.0k | new ShortenedIndexBuilder(comparator_, table_opt_.index_block_restart_interval)); |
151 | 82.0k | flush_policy_ = FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( |
152 | 82.0k | table_opt_.index_block_size, table_opt_.block_size_deviation, |
153 | 82.0k | table_opt_.min_keys_per_index_block, |
154 | 82.0k | current_level_index_block_builder_->GetIndexBlockBuilder()); |
155 | 82.0k | } |
156 | 2.86M | } |
157 | | |
158 | | void MultiLevelIndexBuilder::AddIndexEntry( |
159 | | std::string* last_key_in_current_block, const Slice* first_key_in_next_block, |
160 | 2.86M | const BlockHandle& block_handle, const ShortenKeys shorten_keys) { |
161 | 2.86M | DCHECK(!current_level_block_.is_ready) |
162 | 39 | << "Expected to first flush already complete index block"; |
163 | 2.86M | EnsureCurrentLevelIndexBuilderCreated(); |
164 | | |
165 | 2.86M | block_handle_encoding_.clear(); |
166 | 2.86M | block_handle.AppendEncodedTo(&block_handle_encoding_); |
167 | | |
168 | 2.86M | current_level_index_block_builder_->AddIndexEntry( |
169 | 2.86M | last_key_in_current_block, first_key_in_next_block, block_handle_encoding_, shorten_keys); |
170 | | |
171 | 2.86M | if (flush_policy_->Update(*last_key_in_current_block, block_handle_encoding_) |
172 | 2.86M | || first_key_in_next_block == nullptr2.84M ) { |
173 | 81.9k | current_level_block_.is_ready = true; |
174 | 81.9k | yb::CopyToBuffer(*last_key_in_current_block, ¤t_level_block_.last_key); |
175 | 81.9k | current_level_block_.has_next_block = first_key_in_next_block != nullptr; |
176 | 81.9k | if (current_level_block_.has_next_block) { |
177 | 13.5k | first_key_in_next_block->CopyToBuffer(¤t_level_block_.next_block_first_key); |
178 | 13.5k | } |
179 | 81.9k | } |
180 | 2.86M | } |
181 | | |
182 | 4.52M | bool MultiLevelIndexBuilder::ShouldFlush() const { |
183 | 4.52M | return current_level_block_.is_ready || |
184 | 4.52M | next_level_.block_to_add.IsReadyAndLast()4.43M || |
185 | 4.52M | (4.43M next_level_index_builder_4.43M && next_level_index_builder_->ShouldFlush()1.57M ); |
186 | 4.52M | } |
187 | | |
188 | 81.9k | Status MultiLevelIndexBuilder::FlushCurrentLevel(IndexBlocks* index_blocks) { |
189 | 81.9k | RETURN_NOT_OK(current_level_index_block_builder_->Finish(index_blocks)); |
190 | 81.9k | finished_index_block_builder_ = std::move(current_level_index_block_builder_); |
191 | 81.9k | flush_policy_.reset(); |
192 | 81.9k | current_level_size_ += index_blocks->index_block_contents.size() + kBlockTrailerSize; |
193 | 81.9k | return Status::OK(); |
194 | 81.9k | } |
195 | | |
196 | | void MultiLevelIndexBuilder::AddToNextLevelIfReady( |
197 | 81.9k | IndexBlockInfo* block_info, const BlockHandle& block_handle) { |
198 | 81.9k | DCHECK_ONLY_NOTNULL(block_info); |
199 | 81.9k | if (block_info->is_ready) { |
200 | 17.8k | DCHECK_ONLY_NOTNULL(next_level_index_builder_.get()); |
201 | | // We don't need to shorten keys in next level index builder, since we already did that when |
202 | | // added this entry at previous levels. |
203 | 17.8k | if (block_info->has_next_block) { |
204 | 13.5k | Slice key_slice(block_info->next_block_first_key); |
205 | 13.5k | next_level_index_builder_->AddIndexEntry( |
206 | 13.5k | &block_info->last_key, &key_slice, block_handle, ShortenKeys::kFalse); |
207 | 13.5k | } else { |
208 | 4.31k | next_level_index_builder_->AddIndexEntry( |
209 | 4.31k | &block_info->last_key, nullptr, block_handle, ShortenKeys::kFalse); |
210 | 4.31k | DCHECK(next_level_index_builder_->ShouldFlush()) |
211 | 0 | << "Expected to request flushing of last block at next index level"; |
212 | 4.31k | } |
213 | 17.8k | block_info->is_ready = false; |
214 | 17.8k | } |
215 | 81.9k | } |
216 | | |
217 | | Result<bool> MultiLevelIndexBuilder::FlushNextBlock( |
218 | 150k | IndexBlocks* index_blocks, const BlockHandle& last_index_block_handle) { |
219 | 150k | if (next_level_.just_flushed) { |
220 | | // We should only have current level index block to add for next level if we have just |
221 | | // flushed it, but not next level index block. |
222 | 4.32k | if (next_level_.block_to_add.is_ready) { |
223 | 0 | return STATUS(IllegalState, "MultiLevelIndexBuilder just flushed next level index block, but" |
224 | 0 | " current level index block is ready to add for next level."); |
225 | 0 | } |
226 | 4.32k | next_level_.last_flushed_block_ = last_index_block_handle; |
227 | 4.32k | next_level_.just_flushed = false; |
228 | 4.32k | } |
229 | 150k | if (flushing_indexes_) { |
230 | | // If ready - add last flushed current level index block to next level index. |
231 | 81.9k | AddToNextLevelIfReady(&next_level_.block_to_add, last_index_block_handle); |
232 | 81.9k | if (next_level_index_builder_ && next_level_index_builder_->ShouldFlush()22.1k ) { |
233 | | // First flush next level index block for all previously flushed index blocks at this |
234 | | // level and only after that we will flush pending current level index block on subsequent |
235 | | // FlushNextBlock() call at this level. |
236 | 4.33k | auto result = next_level_index_builder_->FlushNextBlock( |
237 | 4.33k | index_blocks, next_level_.last_flushed_block_); |
238 | 4.33k | RETURN_NOT_OK(result); |
239 | 4.33k | next_level_.just_flushed = true; |
240 | 4.33k | return result; |
241 | 4.33k | } |
242 | 81.9k | } |
243 | 145k | flushing_indexes_ = true; |
244 | 145k | if (current_level_block_.is_ready) { |
245 | 81.9k | RETURN_NOT_OK(FlushCurrentLevel(index_blocks)); |
246 | 81.9k | if (!next_level_index_builder_ && current_level_block_.has_next_block68.3k ) { |
247 | | // We only need next level index builder if we plan to have more than one index entry at next |
248 | | // level, i.e. if we have next block at current level. Otherwise current level will be the top |
249 | | // level with single index block. |
250 | 4.33k | next_level_index_builder_ = std::make_unique<MultiLevelIndexBuilder>( |
251 | 4.33k | comparator_, table_opt_); |
252 | 4.33k | } |
253 | 81.9k | if (next_level_index_builder_) { |
254 | | // Postpone adding index entry for just flushed block for next FlushNextBlock() call, since |
255 | | // we need to have block handle to add it to next level index. |
256 | 17.8k | next_level_.block_to_add = std::move(current_level_block_); |
257 | 17.8k | } |
258 | 81.9k | current_level_block_.is_ready = false; |
259 | 81.9k | return true; |
260 | 81.9k | } else if (64.0k !last_index_block_handle.IsSet()64.0k ) { |
261 | | // It means this is the first call of FlushNextBlock() and no keys have been added to index, |
262 | | // which can only happen when we call FlushNextBlock() unconditionally during finishing of empty |
263 | | // SST file, so we need to produce an empty index. |
264 | 54 | EnsureCurrentLevelIndexBuilderCreated(); |
265 | 54 | RETURN_NOT_OK(FlushCurrentLevel(index_blocks)); |
266 | 54 | return true; |
267 | 64.0k | } else { |
268 | 64.0k | return false; |
269 | 64.0k | } |
270 | 145k | } |
271 | | |
272 | 68.3k | size_t MultiLevelIndexBuilder::EstimatedSize() const { |
273 | | // We subtract kBlockTrailerSize at the top level, because it will be added by |
274 | | // BlockBasedTableBuilder. |
275 | 68.3k | return current_level_size_ + |
276 | 68.3k | (next_level_index_builder_ ? next_level_index_builder_->EstimatedSize()4.31k : -kBlockTrailerSize64.0k ); |
277 | 68.3k | } |
278 | | |
279 | 127k | int MultiLevelIndexBuilder::NumLevels() const { |
280 | 127k | return 1 + (next_level_index_builder_ ? next_level_index_builder_->NumLevels()8.62k : 0119k ); |
281 | 127k | } |
282 | | |
283 | | } // namespace rocksdb |