/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/plain_table_reader.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | | // Use of this source code is governed by a BSD-style license that can be |
3 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
4 | | // |
5 | | // The following only applies to changes made to this file as part of YugaByte development. |
6 | | // |
7 | | // Portions Copyright (c) YugaByte, Inc. |
8 | | // |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
10 | | // in compliance with the License. You may obtain a copy of the License at |
11 | | // |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // |
14 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
15 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
16 | | // or implied. See the License for the specific language governing permissions and limitations |
17 | | // under the License. |
18 | | // |
19 | | |
20 | | #ifndef ROCKSDB_LITE |
21 | | |
22 | | #include "yb/rocksdb/table/plain_table_reader.h" |
23 | | |
24 | | #include <string> |
25 | | #include <vector> |
26 | | |
27 | | #include "yb/rocksdb/cache.h" |
28 | | #include "yb/rocksdb/comparator.h" |
29 | | #include "yb/rocksdb/env.h" |
30 | | #include "yb/rocksdb/options.h" |
31 | | #include "yb/rocksdb/statistics.h" |
32 | | #include "yb/rocksdb/table/bloom_block.h" |
33 | | #include "yb/rocksdb/table/format.h" |
34 | | #include "yb/rocksdb/table/get_context.h" |
35 | | #include "yb/rocksdb/table/internal_iterator.h" |
36 | | #include "yb/rocksdb/table/iterator_wrapper.h" |
37 | | #include "yb/rocksdb/table/meta_blocks.h" |
38 | | #include "yb/rocksdb/table/plain_table_factory.h" |
39 | | #include "yb/rocksdb/table/plain_table_key_coding.h" |
40 | | #include "yb/rocksdb/util/arena.h" |
41 | | #include "yb/rocksdb/util/coding.h" |
42 | | #include "yb/rocksdb/util/dynamic_bloom.h" |
43 | | #include "yb/rocksdb/util/hash.h" |
44 | | #include "yb/rocksdb/util/histogram.h" |
45 | | #include "yb/rocksdb/util/murmurhash.h" |
46 | | #include "yb/rocksdb/util/perf_context_imp.h" |
47 | | |
48 | | #include "yb/util/mem_tracker.h" |
49 | | #include "yb/util/string_util.h" |
50 | | |
51 | | namespace rocksdb { |
52 | | |
53 | | namespace { |
54 | | |
55 | | // Safely getting a uint32_t element from a char array, where, starting from |
56 | | // `base`, every 4 bytes are considered as an fixed 32 bit integer. |
57 | 490k | inline uint32_t GetFixed32Element(const char* base, size_t offset) { |
58 | 490k | return DecodeFixed32(base + offset * sizeof(uint32_t)); |
59 | 490k | } |
60 | | } // namespace |
61 | | |
62 | | // Iterator to iterate IndexedTable |
63 | | class PlainTableIterator : public InternalIterator { |
64 | | public: |
65 | | explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); |
66 | | ~PlainTableIterator(); |
67 | | |
68 | | bool Valid() const override; |
69 | | |
70 | | void SeekToFirst() override; |
71 | | |
72 | | void SeekToLast() override; |
73 | | |
74 | | void Seek(const Slice& target) override; |
75 | | |
76 | | void Next() override; |
77 | | |
78 | | void Prev() override; |
79 | | |
80 | | Slice key() const override; |
81 | | |
82 | | Slice value() const override; |
83 | | |
84 | | Status status() const override; |
85 | | |
86 | | private: |
87 | | PlainTableReader* table_; |
88 | | PlainTableKeyDecoder decoder_; |
89 | | bool use_prefix_seek_; |
90 | | uint32_t offset_; |
91 | | uint32_t next_offset_; |
92 | | Slice key_; |
93 | | Slice value_; |
94 | | Status status_; |
95 | | // No copying allowed |
96 | | PlainTableIterator(const PlainTableIterator&) = delete; |
97 | | void operator=(const Iterator&) = delete; |
98 | | }; |
99 | | |
100 | | extern const uint64_t kPlainTableMagicNumber; |
101 | | PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, |
102 | | unique_ptr<RandomAccessFileReader>&& file, |
103 | | const EnvOptions& storage_options, |
104 | | const InternalKeyComparatorPtr& icomparator, |
105 | | EncodingType encoding_type, |
106 | | uint64_t file_size, |
107 | | const TableProperties* table_properties) |
108 | | : internal_comparator_(icomparator), |
109 | | encoding_type_(encoding_type), |
110 | | full_scan_mode_(false), |
111 | | user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)), |
112 | | prefix_extractor_(ioptions.prefix_extractor), |
113 | | enable_bloom_(false), |
114 | | bloom_(6, nullptr), |
115 | | file_info_(std::move(file), storage_options, |
116 | | static_cast<uint32_t>(table_properties->data_size)), |
117 | | ioptions_(ioptions), |
118 | | file_size_(file_size), |
119 | 3.11k | table_properties_(nullptr) { |
120 | 3.11k | if (ioptions.mem_tracker) { |
121 | 0 | mem_tracker_ = yb::MemTracker::FindOrCreateTracker("PlainTableReader", ioptions.mem_tracker); |
122 | 0 | } |
123 | 3.11k | } |
124 | | |
125 | 3.11k | PlainTableReader::~PlainTableReader() { |
126 | 3.11k | if (mem_tracker_ && tracked_consumption_) { |
127 | 0 | mem_tracker_->Release(tracked_consumption_); |
128 | 0 | } |
129 | 3.11k | } |
130 | | |
131 | | Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, |
132 | | const EnvOptions& env_options, |
133 | | const InternalKeyComparatorPtr& internal_comparator, |
134 | | unique_ptr<RandomAccessFileReader>&& file, |
135 | | uint64_t file_size, |
136 | | unique_ptr<TableReader>* table_reader, |
137 | | const int bloom_bits_per_key, |
138 | | double hash_table_ratio, size_t index_sparseness, |
139 | 2.89k | size_t huge_page_tlb_size, bool full_scan_mode) { |
140 | 2.89k | if (file_size > PlainTableIndex::kMaxFileSize) { |
141 | 0 | return STATUS(NotSupported, "File is too large for PlainTableReader!"); |
142 | 0 | } |
143 | | |
144 | 2.89k | TableProperties* props = nullptr; |
145 | 2.89k | auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, |
146 | 2.89k | ioptions.env, ioptions.info_log, &props); |
147 | 2.89k | if (!s.ok()) { |
148 | 2 | return s; |
149 | 2 | } |
150 | | |
151 | 2.89k | assert(hash_table_ratio >= 0.0); |
152 | 2.89k | auto& user_props = props->user_collected_properties; |
153 | 2.89k | auto prefix_extractor_in_file = |
154 | 2.89k | user_props.find(PlainTablePropertyNames::kPrefixExtractorName); |
155 | | |
156 | 2.89k | if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { |
157 | 2.60k | if (!ioptions.prefix_extractor) { |
158 | 0 | return STATUS(InvalidArgument, |
159 | 0 | "Prefix extractor is missing when opening a PlainTable built " |
160 | 0 | "using a prefix extractor"); |
161 | 2.60k | } else if (prefix_extractor_in_file->second.compare( |
162 | 0 | ioptions.prefix_extractor->Name()) != 0) { |
163 | 0 | return STATUS(InvalidArgument, |
164 | 0 | "Prefix extractor given doesn't match the one used to build " |
165 | 0 | "PlainTable"); |
166 | 0 | } |
167 | 2.89k | } |
168 | | |
169 | 2.89k | EncodingType encoding_type = kPlain; |
170 | 2.89k | auto encoding_type_prop = |
171 | 2.89k | user_props.find(PlainTablePropertyNames::kEncodingType); |
172 | 2.89k | if (encoding_type_prop != user_props.end()) { |
173 | 2.89k | encoding_type = static_cast<EncodingType>( |
174 | 2.89k | DecodeFixed32(encoding_type_prop->second.c_str())); |
175 | 2.89k | } |
176 | | |
177 | 2.89k | std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader( |
178 | 2.89k | ioptions, std::move(file), env_options, internal_comparator, |
179 | 2.89k | encoding_type, file_size, props)); |
180 | | |
181 | 2.89k | s = new_reader->MmapDataIfNeeded(); |
182 | 2.89k | if (!s.ok()) { |
183 | 0 | return s; |
184 | 0 | } |
185 | | |
186 | 2.89k | if (!full_scan_mode) { |
187 | 2.89k | s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, |
188 | 2.89k | index_sparseness, huge_page_tlb_size); |
189 | 2.89k | if (!s.ok()) { |
190 | 0 | return s; |
191 | 0 | } |
192 | 0 | } else { |
193 | | // Flag to indicate it is a full scan mode so that none of the indexes |
194 | | // can be used. |
195 | 0 | new_reader->full_scan_mode_ = true; |
196 | 0 | } |
197 | | |
198 | 2.89k | *table_reader = std::move(new_reader); |
199 | 2.89k | return s; |
200 | 2.89k | } |
201 | | |
202 | 826 | void PlainTableReader::SetupForCompaction() { |
203 | 826 | } |
204 | | |
205 | | InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, |
206 | | Arena* arena, |
207 | 7.12k | bool skip_filters) { |
208 | 7.12k | if (options.total_order_seek && !IsTotalOrderMode()) { |
209 | 0 | return NewErrorInternalIterator( |
210 | 0 | STATUS(InvalidArgument, "total_order_seek not supported"), arena); |
211 | 0 | } |
212 | 7.12k | if (arena == nullptr) { |
213 | 4.85k | return new PlainTableIterator(this, prefix_extractor_ != nullptr); |
214 | 2.27k | } else { |
215 | 2.27k | auto mem = arena->AllocateAligned(sizeof(PlainTableIterator)); |
216 | 2.27k | return new (mem) PlainTableIterator(this, prefix_extractor_ != nullptr); |
217 | 2.27k | } |
218 | 7.12k | } |
219 | | |
220 | | Status PlainTableReader::PopulateIndexRecordList( |
221 | 3.04k | PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) { |
222 | 3.04k | Slice prev_key_prefix_slice; |
223 | 3.04k | std::string prev_key_prefix_buf; |
224 | 3.04k | uint32_t pos = data_start_offset_; |
225 | | |
226 | 3.04k | bool is_first_record = true; |
227 | 3.04k | Slice key_prefix_slice; |
228 | 3.04k | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, |
229 | 3.04k | ioptions_.prefix_extractor); |
230 | 992k | while (pos < file_info_.data_end_offset) { |
231 | 989k | uint32_t key_offset = pos; |
232 | 989k | ParsedInternalKey key; |
233 | 989k | Slice value_slice; |
234 | 989k | bool seekable = false; |
235 | 989k | Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable); |
236 | 989k | if (!s.ok()) { |
237 | 0 | return s; |
238 | 0 | } |
239 | | |
240 | 989k | key_prefix_slice = GetPrefix(key); |
241 | 989k | if (enable_bloom_) { |
242 | 2.10k | bloom_.AddHash(GetSliceHash(key.user_key)); |
243 | 987k | } else { |
244 | 987k | if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { |
245 | 676k | if (!is_first_record) { |
246 | 673k | prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); |
247 | 673k | } |
248 | 676k | if (file_info_.is_mmap_mode) { |
249 | 438k | prev_key_prefix_slice = key_prefix_slice; |
250 | 237k | } else { |
251 | 237k | prev_key_prefix_buf = key_prefix_slice.ToString(); |
252 | 237k | prev_key_prefix_slice = prev_key_prefix_buf; |
253 | 237k | } |
254 | 676k | } |
255 | 987k | } |
256 | | |
257 | 989k | index_builder->AddKeyPrefix(GetPrefix(key), key_offset); |
258 | | |
259 | 989k | if (!seekable && is_first_record) { |
260 | 0 | return STATUS(Corruption, "Key for a prefix is not seekable"); |
261 | 0 | } |
262 | | |
263 | 989k | is_first_record = false; |
264 | 989k | } |
265 | | |
266 | 3.04k | prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); |
267 | 3.04k | auto s = index_.InitFromRawData(index_builder->Finish()); |
268 | 3.04k | return s; |
269 | 3.04k | } |
270 | | |
271 | | void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, |
272 | | int num_prefixes, |
273 | | size_t huge_page_tlb_size, |
274 | 3.04k | vector<uint32_t>* prefix_hashes) { |
275 | 3.04k | if (!IsTotalOrderMode()) { |
276 | 2.71k | uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; |
277 | 2.71k | if (bloom_total_bits > 0) { |
278 | 2.61k | enable_bloom_ = true; |
279 | 2.61k | bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, |
280 | 2.61k | huge_page_tlb_size, ioptions_.info_log); |
281 | 2.61k | FillBloom(prefix_hashes); |
282 | 2.61k | } |
283 | 2.71k | } |
284 | 3.04k | } |
285 | | |
286 | 2.61k | void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) { |
287 | 2.61k | assert(bloom_.IsInitialized()); |
288 | 675k | for (auto prefix_hash : *prefix_hashes) { |
289 | 675k | bloom_.AddHash(prefix_hash); |
290 | 675k | } |
291 | 2.61k | } |
292 | | |
293 | 3.11k | Status PlainTableReader::MmapDataIfNeeded() { |
294 | 3.11k | if (file_info_.is_mmap_mode) { |
295 | | // Get mmapped memory. |
296 | 2.04k | return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr); |
297 | 2.04k | } |
298 | 1.06k | return Status::OK(); |
299 | 1.06k | } |
300 | | |
301 | | Status PlainTableReader::PopulateIndex(TableProperties* props, |
302 | | int bloom_bits_per_key, |
303 | | double hash_table_ratio, |
304 | | size_t index_sparseness, |
305 | 3.11k | size_t huge_page_tlb_size) { |
306 | 3.11k | assert(props != nullptr); |
307 | 3.11k | table_properties_.reset(props); |
308 | | |
309 | 3.11k | BlockContents bloom_block_contents; |
310 | 3.11k | auto s = ReadMetaBlock(file_info_.file.get(), file_size_, |
311 | 3.11k | kPlainTableMagicNumber, ioptions_.env, |
312 | 3.11k | BloomBlockBuilder::kBloomBlock, mem_tracker_, &bloom_block_contents); |
313 | 3.11k | bool index_in_file = s.ok(); |
314 | | |
315 | 3.11k | BlockContents index_block_contents; |
316 | 3.11k | s = ReadMetaBlock( |
317 | 3.11k | file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_.env, |
318 | 3.11k | PlainTableIndexBuilder::kPlainTableIndexBlock, mem_tracker_, &index_block_contents); |
319 | | |
320 | 3.11k | index_in_file &= s.ok(); |
321 | | |
322 | 3.11k | Slice* bloom_block; |
323 | 3.11k | if (index_in_file) { |
324 | | // If bloom_block_contents.allocation is not empty (which will be the case |
325 | | // for non-mmap mode), it holds the alloated memory for the bloom block. |
326 | | // It needs to be kept alive to keep `bloom_block` valid. |
327 | 64 | bloom_block_alloc_ = std::move(bloom_block_contents.allocation); |
328 | 64 | bloom_block = &bloom_block_contents.data; |
329 | 64 | tracked_consumption_ += bloom_block->size(); |
330 | 3.04k | } else { |
331 | 3.04k | bloom_block = nullptr; |
332 | 3.04k | } |
333 | | |
334 | | // index_in_file == true only if there are kBloomBlock and |
335 | | // kPlainTableIndexBlock in file |
336 | 3.11k | Slice* index_block; |
337 | 3.11k | if (index_in_file) { |
338 | | // If index_block_contents.allocation is not empty (which will be the case |
339 | | // for non-mmap mode), it holds the alloated memory for the index block. |
340 | | // It needs to be kept alive to keep `index_block` valid. |
341 | 64 | index_block_alloc_ = std::move(index_block_contents.allocation); |
342 | 64 | index_block = &index_block_contents.data; |
343 | 3.04k | } else { |
344 | 3.04k | index_block = nullptr; |
345 | 3.04k | } |
346 | | |
347 | 3.11k | if ((ioptions_.prefix_extractor == nullptr) && |
348 | 345 | (hash_table_ratio != 0)) { |
349 | | // ioptions.prefix_extractor is requried for a hash-based look-up. |
350 | 0 | return STATUS(NotSupported, |
351 | 0 | "PlainTable requires a prefix extractor enable prefix hash mode."); |
352 | 0 | } |
353 | | |
354 | | // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows |
355 | | // for a prefix (starting from the first one), generate a record of (hash, |
356 | | // offset) and append it to IndexRecordList, which is a data structure created |
357 | | // to store them. |
358 | | |
359 | 3.11k | if (!index_in_file) { |
360 | | // Allocate bloom filter here for total order mode. |
361 | 3.04k | if (IsTotalOrderMode()) { |
362 | 337 | uint32_t num_bloom_bits = |
363 | 337 | static_cast<uint32_t>(table_properties_->num_entries) * |
364 | 337 | bloom_bits_per_key; |
365 | 337 | if (num_bloom_bits > 0) { |
366 | 43 | enable_bloom_ = true; |
367 | 43 | bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, |
368 | 43 | huge_page_tlb_size, ioptions_.info_log); |
369 | 43 | } |
370 | 337 | } |
371 | 64 | } else { |
372 | 64 | enable_bloom_ = true; |
373 | 64 | auto num_blocks_property = props->user_collected_properties.find( |
374 | 64 | PlainTablePropertyNames::kNumBloomBlocks); |
375 | | |
376 | 64 | uint32_t num_blocks = 0; |
377 | 64 | if (num_blocks_property != props->user_collected_properties.end()) { |
378 | 64 | Slice temp_slice(num_blocks_property->second); |
379 | 64 | if (!GetVarint32(&temp_slice, &num_blocks)) { |
380 | 0 | num_blocks = 0; |
381 | 0 | } |
382 | 64 | } |
383 | | // cast away const qualifier, because bloom_ won't be changed |
384 | 64 | bloom_.SetRawData( |
385 | 64 | const_cast<unsigned char*>( |
386 | 64 | reinterpret_cast<const unsigned char*>(bloom_block->data())), |
387 | 64 | static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks); |
388 | 64 | } |
389 | | |
390 | 3.11k | PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, |
391 | 3.11k | hash_table_ratio, huge_page_tlb_size); |
392 | | |
393 | 3.11k | std::vector<uint32_t> prefix_hashes; |
394 | 3.11k | if (!index_in_file) { |
395 | 3.04k | s = PopulateIndexRecordList(&index_builder, &prefix_hashes); |
396 | 3.04k | if (!s.ok()) { |
397 | 0 | return s; |
398 | 0 | } |
399 | 64 | } else { |
400 | 64 | s = index_.InitFromRawData(*index_block); |
401 | 64 | if (!s.ok()) { |
402 | 0 | return s; |
403 | 0 | } |
404 | 3.11k | } |
405 | | |
406 | 3.11k | if (!index_in_file) { |
407 | | // Calculated bloom filter size and allocate memory for |
408 | | // bloom filter based on the number of prefixes, then fill it. |
409 | 3.04k | AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), |
410 | 3.04k | huge_page_tlb_size, &prefix_hashes); |
411 | 3.04k | } |
412 | | |
413 | | // Fill two table properties. |
414 | 3.11k | if (!index_in_file) { |
415 | 3.04k | props->user_collected_properties["plain_table_hash_table_size"] = |
416 | 3.04k | ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); |
417 | 3.04k | props->user_collected_properties["plain_table_sub_index_size"] = |
418 | 3.04k | ToString(index_.GetSubIndexSize()); |
419 | 64 | } else { |
420 | 64 | props->user_collected_properties["plain_table_hash_table_size"] = |
421 | 64 | ToString(0); |
422 | 64 | props->user_collected_properties["plain_table_sub_index_size"] = |
423 | 64 | ToString(0); |
424 | 64 | } |
425 | | |
426 | 3.11k | return Status::OK(); |
427 | 3.11k | } |
428 | | |
429 | | Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, |
430 | | const Slice& target, const Slice& prefix, |
431 | | uint32_t prefix_hash, bool* prefix_matched, |
432 | 130k | uint32_t* offset) const { |
433 | 130k | *prefix_matched = false; |
434 | 130k | uint32_t prefix_index_offset; |
435 | 130k | auto res = index_.GetOffset(prefix_hash, &prefix_index_offset); |
436 | 130k | if (res == PlainTableIndex::kNoPrefixForBucket) { |
437 | 876 | *offset = file_info_.data_end_offset; |
438 | 876 | return Status::OK(); |
439 | 129k | } else if (res == PlainTableIndex::kDirectToFile) { |
440 | 35.3k | *offset = prefix_index_offset; |
441 | 35.3k | return Status::OK(); |
442 | 35.3k | } |
443 | | |
444 | | // point to sub-index, need to do a binary search |
445 | 94.1k | uint32_t upper_bound; |
446 | 94.1k | const char* base_ptr = |
447 | 94.1k | index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound); |
448 | 94.1k | uint32_t low = 0; |
449 | 94.1k | uint32_t high = upper_bound; |
450 | 94.1k | ParsedInternalKey mid_key; |
451 | 94.1k | ParsedInternalKey parsed_target; |
452 | 94.1k | if (!ParseInternalKey(target, &parsed_target)) { |
453 | 0 | return STATUS(Corruption, Slice()); |
454 | 0 | } |
455 | | |
456 | | // The key is between [low, high). Do a binary search between it. |
457 | 484k | while (high - low > 1) { |
458 | 399k | uint32_t mid = (high + low) / 2; |
459 | 399k | uint32_t file_offset = GetFixed32Element(base_ptr, mid); |
460 | 399k | uint32_t tmp; |
461 | 399k | Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp); |
462 | 399k | if (!s.ok()) { |
463 | 0 | return s; |
464 | 0 | } |
465 | 399k | int cmp_result = internal_comparator_->Compare(mid_key, parsed_target); |
466 | 399k | if (cmp_result < 0) { |
467 | 186k | low = mid; |
468 | 212k | } else { |
469 | 212k | if (cmp_result == 0) { |
470 | | // Happen to have found the exact key or target is smaller than the |
471 | | // first key after base_offset. |
472 | 8.84k | *prefix_matched = true; |
473 | 8.84k | *offset = file_offset; |
474 | 8.84k | return Status::OK(); |
475 | 203k | } else { |
476 | 203k | high = mid; |
477 | 203k | } |
478 | 212k | } |
479 | 399k | } |
480 | | // Both of the key at the position low or low+1 could share the same |
481 | | // prefix as target. We need to rule out one of them to avoid to go |
482 | | // to the wrong prefix. |
483 | 85.3k | ParsedInternalKey low_key; |
484 | 85.3k | uint32_t tmp; |
485 | 85.3k | uint32_t low_key_offset = GetFixed32Element(base_ptr, low); |
486 | 85.3k | Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp); |
487 | 85.3k | if (!s.ok()) { |
488 | 0 | return s; |
489 | 0 | } |
490 | | |
491 | 85.3k | if (GetPrefix(low_key) == prefix) { |
492 | 79.1k | *prefix_matched = true; |
493 | 79.1k | *offset = low_key_offset; |
494 | 6.17k | } else if (low + 1 < upper_bound) { |
495 | | // There is possible a next prefix, return it |
496 | 6.16k | *prefix_matched = false; |
497 | 6.16k | *offset = GetFixed32Element(base_ptr, low + 1); |
498 | 2 | } else { |
499 | | // target is larger than a key of the last prefix in this bucket |
500 | | // but with a different prefix. Key does not exist. |
501 | 2 | *offset = file_info_.data_end_offset; |
502 | 2 | } |
503 | 85.3k | return Status::OK(); |
504 | 85.3k | } |
505 | | |
506 | 121k | bool PlainTableReader::MatchBloom(uint32_t hash) const { |
507 | 121k | if (!enable_bloom_) { |
508 | 1.20k | return true; |
509 | 1.20k | } |
510 | | |
511 | 120k | if (bloom_.MayContainHash(hash)) { |
512 | 112k | PERF_COUNTER_ADD(bloom_sst_hit_count, 1); |
513 | 112k | return true; |
514 | 8.12k | } else { |
515 | 8.12k | PERF_COUNTER_ADD(bloom_sst_miss_count, 1); |
516 | 8.12k | return false; |
517 | 8.12k | } |
518 | 120k | } |
519 | | |
520 | | Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, |
521 | | ParsedInternalKey* parsed_key, |
522 | | Slice* internal_key, Slice* value, |
523 | 3.82M | bool* seekable) const { |
524 | 3.82M | if (*offset == file_info_.data_end_offset) { |
525 | 0 | *offset = file_info_.data_end_offset; |
526 | 0 | return Status::OK(); |
527 | 0 | } |
528 | | |
529 | 3.82M | if (*offset > file_info_.data_end_offset) { |
530 | 0 | return STATUS(Corruption, "Offset is out of file size"); |
531 | 0 | } |
532 | | |
533 | 3.82M | uint32_t bytes_read; |
534 | 3.82M | Status s = decoder->NextKey(*offset, parsed_key, internal_key, value, |
535 | 3.82M | &bytes_read, seekable); |
536 | 3.82M | if (!s.ok()) { |
537 | 0 | return s; |
538 | 0 | } |
539 | 3.82M | *offset = *offset + bytes_read; |
540 | 3.82M | return Status::OK(); |
541 | 3.82M | } |
542 | | |
543 | 0 | void PlainTableReader::Prepare(const Slice& target) { |
544 | 0 | if (enable_bloom_) { |
545 | 0 | uint32_t prefix_hash = GetSliceHash(GetPrefix(target)); |
546 | 0 | bloom_.Prefetch(prefix_hash); |
547 | 0 | } |
548 | 0 | } |
549 | | |
550 | | Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, |
551 | 87.1k | GetContext* get_context, bool skip_filters) { |
552 | | // Check bloom filter first. |
553 | 87.1k | Slice prefix_slice; |
554 | 87.1k | uint32_t prefix_hash; |
555 | 87.1k | if (IsTotalOrderMode()) { |
556 | 126 | if (full_scan_mode_) { |
557 | 0 | status_ = |
558 | 0 | STATUS(InvalidArgument, "Get() is not allowed in full scan mode."); |
559 | 0 | } |
560 | | // Match whole user key for bloom filter check. |
561 | 126 | if (!MatchBloom(GetSliceHash(GetUserKey(target)))) { |
562 | 21 | return Status::OK(); |
563 | 21 | } |
564 | | // in total order mode, there is only one bucket 0, and we always use empty |
565 | | // prefix. |
566 | 105 | prefix_slice = Slice(); |
567 | 105 | prefix_hash = 0; |
568 | 86.9k | } else { |
569 | 86.9k | prefix_slice = GetPrefix(target); |
570 | 86.9k | prefix_hash = GetSliceHash(prefix_slice); |
571 | 86.9k | if (!MatchBloom(prefix_hash)) { |
572 | 8.05k | return Status::OK(); |
573 | 8.05k | } |
574 | 79.0k | } |
575 | 79.0k | uint32_t offset; |
576 | 79.0k | bool prefix_match; |
577 | 79.0k | PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, |
578 | 79.0k | ioptions_.prefix_extractor); |
579 | 79.0k | Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, |
580 | 79.0k | &prefix_match, &offset); |
581 | | |
582 | 79.0k | if (!s.ok()) { |
583 | 0 | return s; |
584 | 0 | } |
585 | 79.0k | ParsedInternalKey found_key; |
586 | 79.0k | ParsedInternalKey parsed_target; |
587 | 79.0k | if (!ParseInternalKey(target, &parsed_target)) { |
588 | 0 | return STATUS(Corruption, Slice()); |
589 | 0 | } |
590 | 79.0k | Slice found_value; |
591 | 582k | while (offset < file_info_.data_end_offset) { |
592 | 582k | s = Next(&decoder, &offset, &found_key, nullptr, &found_value); |
593 | 582k | if (!s.ok()) { |
594 | 0 | return s; |
595 | 0 | } |
596 | 582k | if (!prefix_match) { |
597 | | // Need to verify prefix for the first key found if it is not yet |
598 | | // checked. |
599 | 15.6k | if (GetPrefix(found_key) != prefix_slice) { |
600 | 4 | return Status::OK(); |
601 | 4 | } |
602 | 15.6k | prefix_match = true; |
603 | 15.6k | } |
604 | | // TODO(ljin): since we know the key comparison result here, |
605 | | // can we enable the fast path? |
606 | 582k | if (internal_comparator_->Compare(found_key, parsed_target) >= 0) { |
607 | 78.9k | if (!get_context->SaveValue(found_key, found_value)) { |
608 | 78.9k | break; |
609 | 78.9k | } |
610 | 78.9k | } |
611 | 582k | } |
612 | 79.0k | return Status::OK(); |
613 | 79.0k | } |
614 | | |
615 | 0 | uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { |
616 | 0 | return 0; |
617 | 0 | } |
618 | | |
619 | | PlainTableIterator::PlainTableIterator(PlainTableReader* table, |
620 | | bool use_prefix_seek) |
621 | | : table_(table), |
622 | | decoder_(&table_->file_info_, table_->encoding_type_, |
623 | | table_->user_key_len_, table_->prefix_extractor_), |
624 | 7.12k | use_prefix_seek_(use_prefix_seek) { |
625 | 7.12k | next_offset_ = offset_ = table_->file_info_.data_end_offset; |
626 | 7.12k | } |
627 | | |
628 | 7.12k | PlainTableIterator::~PlainTableIterator() { |
629 | 7.12k | } |
630 | | |
631 | 6.96M | bool PlainTableIterator::Valid() const { |
632 | 6.96M | return offset_ < table_->file_info_.data_end_offset && |
633 | 6.94M | offset_ >= table_->data_start_offset_; |
634 | 6.96M | } |
635 | | |
636 | 56.1k | void PlainTableIterator::SeekToFirst() { |
637 | 56.1k | next_offset_ = table_->data_start_offset_; |
638 | 56.1k | if (next_offset_ >= table_->file_info_.data_end_offset) { |
639 | 732 | next_offset_ = offset_ = table_->file_info_.data_end_offset; |
640 | 55.3k | } else { |
641 | 55.3k | Next(); |
642 | 55.3k | } |
643 | 56.1k | } |
644 | | |
645 | 0 | void PlainTableIterator::SeekToLast() { |
646 | 0 | assert(false); |
647 | 0 | status_ = STATUS(NotSupported, "SeekToLast() is not supported in PlainTable"); |
648 | 0 | } |
649 | | |
650 | 51.4k | void PlainTableIterator::Seek(const Slice& target) { |
651 | | // If the user doesn't set prefix seek option and we are not able to do a |
652 | | // total Seek(). assert failure. |
653 | 51.4k | if (!use_prefix_seek_) { |
654 | 17.0k | if (table_->full_scan_mode_) { |
655 | 0 | status_ = |
656 | 0 | STATUS(InvalidArgument, "Seek() is not allowed in full scan mode."); |
657 | 0 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
658 | 0 | return; |
659 | 17.0k | } else if (table_->GetIndexSize() > 1) { |
660 | 0 | assert(false); |
661 | 0 | status_ = STATUS(NotSupported, |
662 | 0 | "PlainTable cannot issue non-prefix seek unless in total order " |
663 | 0 | "mode."); |
664 | 0 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
665 | 0 | return; |
666 | 0 | } |
667 | 51.4k | } |
668 | | |
669 | 51.4k | Slice prefix_slice = table_->GetPrefix(target); |
670 | 51.4k | uint32_t prefix_hash = 0; |
671 | | // Bloom filter is ignored in total-order mode. |
672 | 51.4k | if (!table_->IsTotalOrderMode()) { |
673 | 34.4k | prefix_hash = GetSliceHash(prefix_slice); |
674 | 34.4k | if (!table_->MatchBloom(prefix_hash)) { |
675 | 69 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
676 | 69 | return; |
677 | 69 | } |
678 | 51.4k | } |
679 | 51.4k | bool prefix_match; |
680 | 51.4k | status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash, |
681 | 51.4k | &prefix_match, &next_offset_); |
682 | 51.4k | if (!status_.ok()) { |
683 | 0 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
684 | 0 | return; |
685 | 0 | } |
686 | | |
687 | 51.4k | if (next_offset_ < table_->file_info_.data_end_offset) { |
688 | 138k | for (Next(); status_.ok() && Valid(); Next()) { |
689 | 138k | if (!prefix_match) { |
690 | | // Need to verify the first key's prefix |
691 | 25.9k | if (table_->GetPrefix(key()) != prefix_slice) { |
692 | 52 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
693 | 52 | break; |
694 | 52 | } |
695 | 25.9k | prefix_match = true; |
696 | 25.9k | } |
697 | 138k | if (table_->internal_comparator_->Compare(key(), target) >= 0) { |
698 | 50.4k | break; |
699 | 50.4k | } |
700 | 138k | } |
701 | 900 | } else { |
702 | 900 | offset_ = table_->file_info_.data_end_offset; |
703 | 900 | } |
704 | 51.4k | } |
705 | | |
706 | 2.26M | void PlainTableIterator::Next() { |
707 | 2.26M | offset_ = next_offset_; |
708 | 2.26M | if (offset_ < table_->file_info_.data_end_offset) { |
709 | 2.25M | Slice tmp_slice; |
710 | 2.25M | ParsedInternalKey parsed_key; |
711 | 2.25M | status_ = |
712 | 2.25M | table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_); |
713 | 2.25M | if (!status_.ok()) { |
714 | 0 | offset_ = next_offset_ = table_->file_info_.data_end_offset; |
715 | 0 | } |
716 | 2.25M | } |
717 | 2.26M | } |
718 | | |
719 | 0 | void PlainTableIterator::Prev() { |
720 | 0 | assert(false); |
721 | 0 | } |
722 | | |
723 | 2.33M | Slice PlainTableIterator::key() const { |
724 | 2.33M | assert(Valid()); |
725 | 2.33M | return key_; |
726 | 2.33M | } |
727 | | |
728 | 2.06M | Slice PlainTableIterator::value() const { |
729 | 2.06M | assert(Valid()); |
730 | 2.06M | return value_; |
731 | 2.06M | } |
732 | | |
733 | 5.37k | Status PlainTableIterator::status() const { |
734 | 5.37k | return status_; |
735 | 5.37k | } |
736 | | |
737 | | } // namespace rocksdb |
738 | | #endif // ROCKSDB_LITE |