/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_based_table_reader.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | |
24 | | #include "yb/rocksdb/table/block_based_table_reader.h" |
25 | | |
26 | | #include <string> |
27 | | #include <utility> |
28 | | |
29 | | #include "yb/gutil/macros.h" |
30 | | |
31 | | #include "yb/rocksdb/cache.h" |
32 | | #include "yb/rocksdb/comparator.h" |
33 | | #include "yb/rocksdb/db/dbformat.h" |
34 | | #include "yb/rocksdb/env.h" |
35 | | #include "yb/rocksdb/filter_policy.h" |
36 | | #include "yb/rocksdb/iterator.h" |
37 | | #include "yb/rocksdb/options.h" |
38 | | #include "yb/rocksdb/statistics.h" |
39 | | #include "yb/rocksdb/table.h" |
40 | | #include "yb/rocksdb/table/block.h" |
41 | | #include "yb/rocksdb/table/block_based_filter_block.h" |
42 | | #include "yb/rocksdb/table/block_based_table_factory.h" |
43 | | #include "yb/rocksdb/table/block_based_table_internal.h" |
44 | | #include "yb/rocksdb/table/block_hash_index.h" |
45 | | #include "yb/rocksdb/table/block_prefix_index.h" |
46 | | #include "yb/rocksdb/table/filter_block.h" |
47 | | #include "yb/rocksdb/table/fixed_size_filter_block.h" |
48 | | #include "yb/rocksdb/table/format.h" |
49 | | #include "yb/rocksdb/table/full_filter_block.h" |
50 | | #include "yb/rocksdb/table/get_context.h" |
51 | | #include "yb/rocksdb/table/index_reader.h" |
52 | | #include "yb/rocksdb/table/internal_iterator.h" |
53 | | #include "yb/rocksdb/table/meta_blocks.h" |
54 | | #include "yb/rocksdb/table/table_properties_internal.h" |
55 | | #include "yb/rocksdb/table/two_level_iterator.h" |
56 | | #include "yb/rocksdb/table_properties.h" |
57 | | #include "yb/rocksdb/util/coding.h" |
58 | | #include "yb/rocksdb/util/file_reader_writer.h" |
59 | | #include "yb/rocksdb/util/perf_context_imp.h" |
60 | | #include "yb/rocksdb/util/statistics.h" |
61 | | #include "yb/rocksdb/util/stop_watch.h" |
62 | | |
63 | | #include "yb/util/atomic.h" |
64 | | #include "yb/util/logging.h" |
65 | | #include "yb/util/mem_tracker.h" |
66 | | #include "yb/util/scope_exit.h" |
67 | | #include "yb/util/stats/perf_step_timer.h" |
68 | | #include "yb/util/status_format.h" |
69 | | #include "yb/util/string_util.h" |
70 | | |
71 | | namespace rocksdb { |
72 | | |
73 | | extern const uint64_t kBlockBasedTableMagicNumber; |
74 | | extern const char kHashIndexPrefixesBlock[]; |
75 | | extern const char kHashIndexPrefixesMetadataBlock[]; |
76 | | using std::unique_ptr; |
77 | | |
78 | | typedef FilterPolicy::FilterType FilterType; |
79 | | |
80 | | namespace { |
81 | | |
82 | | // Delete the resource that is held by the iterator. |
83 | | template <class ResourceType> |
84 | 1.45M | void DeleteHeldResource(void* arg, void* ignored) { |
85 | 1.45M | delete reinterpret_cast<ResourceType*>(arg); |
86 | 1.45M | } |
87 | | |
88 | | // Delete the entry resided in the cache. |
89 | | template <class Entry> |
90 | 1.99M | void DeleteCachedEntry(const Slice& key, void* value) { |
91 | 1.99M | auto entry = reinterpret_cast<Entry*>(value); |
92 | 1.99M | delete entry; |
93 | 1.99M | } block_based_table_reader.cc:_ZN7rocksdb12_GLOBAL__N_117DeleteCachedEntryINS_5BlockEEEvRKN2yb5SliceEPv Line | Count | Source | 90 | 1.99M | void DeleteCachedEntry(const Slice& key, void* value) { | 91 | 1.99M | auto entry = reinterpret_cast<Entry*>(value); | 92 | 1.99M | delete entry; | 93 | 1.99M | } |
block_based_table_reader.cc:_ZN7rocksdb12_GLOBAL__N_117DeleteCachedEntryINS_17FilterBlockReaderEEEvRKN2yb5SliceEPv Line | Count | Source | 90 | 319 | void DeleteCachedEntry(const Slice& key, void* value) { | 91 | 319 | auto entry = reinterpret_cast<Entry*>(value); | 92 | 319 | delete entry; | 93 | 319 | } |
block_based_table_reader.cc:_ZN7rocksdb12_GLOBAL__N_117DeleteCachedEntryINS_11IndexReaderEEEvRKN2yb5SliceEPv Line | Count | Source | 90 | 500 | void DeleteCachedEntry(const Slice& key, void* value) { | 91 | 500 | auto entry = reinterpret_cast<Entry*>(value); | 92 | 500 | delete entry; | 93 | 500 | } |
|
94 | | |
95 | | // Release the cached entry and decrement its ref count. |
96 | 20.8M | void ReleaseCachedEntry(void* arg, void* h) { |
97 | 20.8M | Cache* cache = reinterpret_cast<Cache*>(arg); |
98 | 20.8M | Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); |
99 | 20.8M | cache->Release(handle); |
100 | 20.8M | } |
101 | | |
102 | | Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, |
103 | | Tickers block_cache_miss_ticker, |
104 | | Tickers block_cache_hit_ticker, |
105 | | Statistics* statistics, |
106 | 27.1M | const QueryId query_id) { |
107 | 27.1M | auto cache_handle = block_cache->Lookup(key, query_id, statistics); |
108 | 27.1M | if (cache_handle != nullptr) { |
109 | 23.6M | PERF_COUNTER_ADD(block_cache_hit_count, 1); |
110 | | // block-type specific cache hit |
111 | 23.6M | RecordTick(statistics, block_cache_hit_ticker); |
112 | 3.52M | } else { |
113 | | // block-type specific cache miss |
114 | 3.52M | RecordTick(statistics, block_cache_miss_ticker); |
115 | 3.52M | } |
116 | | |
117 | 27.1M | return cache_handle; |
118 | 27.1M | } |
119 | | |
120 | | class NotMatchingFilterBlockReader : public FilterBlockReader { |
121 | | public: |
122 | 72.6k | NotMatchingFilterBlockReader() {} |
123 | | NotMatchingFilterBlockReader(const NotMatchingFilterBlockReader&) = delete; |
124 | | void operator=(const NotMatchingFilterBlockReader&) = delete; |
125 | 10 | virtual bool KeyMayMatch(const Slice& key, uint64_t block_offset = 0) override { |
126 | 10 | return false; } |
127 | 0 | virtual bool PrefixMayMatch(const Slice& prefix, uint64_t block_offset = 0) override { |
128 | 0 | return false; } |
129 | 0 | virtual size_t ApproximateMemoryUsage() const override { return 0; } |
130 | | }; |
131 | | |
132 | | } // namespace |
133 | | |
134 | | // Originally following data was stored in BlockBasedTable::Rep and related to a single SST file. |
135 | | // Since SST file is now split into two files - data file and metadata file, all file-related data |
136 | | // was moved into dedicated structure for each file. |
137 | | struct BlockBasedTable::FileReaderWithCachePrefix { |
138 | | // Pointer to file reader. |
139 | | unique_ptr<RandomAccessFileReader> reader; |
140 | | |
141 | | // BlockBasedTableReader uses the block cache passed to BlockBasedTableReader::Open inside |
142 | | // a BlockBasedTableOptions instance to reduce the number of file read requests. If block cache |
143 | | // pointer in options is nullptr, cache is not used. File blocks are referred in cache by keys, |
144 | | // which are composed from the following data (see GetCacheKey helper function): |
145 | | // - cache key prefix (unique for each file), generated by BlockBasedTable::GenerateCachePrefix |
146 | | // - block offset within a file. |
147 | | block_based_table::CacheKeyPrefixBuffer cache_key_prefix; |
148 | | |
149 | | // Similar prefix, but for compressed blocks cache: |
150 | | block_based_table::CacheKeyPrefixBuffer compressed_cache_key_prefix; |
151 | | |
152 | | explicit FileReaderWithCachePrefix(unique_ptr<RandomAccessFileReader>&& _reader) : |
153 | 142k | reader(std::move(_reader)) {} |
154 | | }; |
155 | | |
156 | | // CachableEntry represents the entries that *may* be fetched from block cache. |
157 | | // field `value` is the item we want to get. |
158 | | // field `cache_handle` is the cache handle to the block cache. If the value |
159 | | // was not read from cache, `cache_handle` will be nullptr. |
160 | | template <class TValue> |
161 | | struct BlockBasedTable::CachableEntry { |
162 | | CachableEntry(TValue* _value, Cache::Handle* _cache_handle) |
163 | 50.5M | : value(_value), cache_handle(_cache_handle) {} _ZN7rocksdb15BlockBasedTable13CachableEntryINS_17FilterBlockReaderEEC2EPS2_PNS_5Cache6HandleE Line | Count | Source | 163 | 20.3M | : value(_value), cache_handle(_cache_handle) {} |
_ZN7rocksdb15BlockBasedTable13CachableEntryINS_11IndexReaderEEC2EPS2_PNS_5Cache6HandleE Line | Count | Source | 163 | 11.7M | : value(_value), cache_handle(_cache_handle) {} |
_ZN7rocksdb15BlockBasedTable13CachableEntryINS_5BlockEEC2EPS2_PNS_5Cache6HandleE Line | Count | Source | 163 | 18.3M | : value(_value), cache_handle(_cache_handle) {} |
|
164 | 26.5M | CachableEntry() : CachableEntry(nullptr, nullptr) {} _ZN7rocksdb15BlockBasedTable13CachableEntryINS_17FilterBlockReaderEEC1Ev Line | Count | Source | 164 | 8.20M | CachableEntry() : CachableEntry(nullptr, nullptr) {} |
_ZN7rocksdb15BlockBasedTable13CachableEntryINS_5BlockEEC1Ev Line | Count | Source | 164 | 18.3M | CachableEntry() : CachableEntry(nullptr, nullptr) {} |
|
165 | 12.5M | void Release(Cache* cache) { |
166 | 12.5M | if (cache_handle) { |
167 | 4.87M | cache->Release(cache_handle); |
168 | 4.87M | value = nullptr; |
169 | 4.87M | cache_handle = nullptr; |
170 | 4.87M | } |
171 | 12.5M | } _ZN7rocksdb15BlockBasedTable13CachableEntryINS_17FilterBlockReaderEE7ReleaseEPNS_5CacheE Line | Count | Source | 165 | 12.5M | void Release(Cache* cache) { | 166 | 12.5M | if (cache_handle) { | 167 | 4.87M | cache->Release(cache_handle); | 168 | 4.87M | value = nullptr; | 169 | 4.87M | cache_handle = nullptr; | 170 | 4.87M | } | 171 | 12.5M | } |
_ZN7rocksdb15BlockBasedTable13CachableEntryINS_11IndexReaderEE7ReleaseEPNS_5CacheE Line | Count | Source | 165 | 46 | void Release(Cache* cache) { | 166 | 46 | if (cache_handle) { | 167 | 46 | cache->Release(cache_handle); | 168 | 46 | value = nullptr; | 169 | 46 | cache_handle = nullptr; | 170 | 46 | } | 171 | 46 | } |
|
172 | | |
173 | | TValue* value = nullptr; |
174 | | // if the entry is from the cache, cache_handle will be populated. |
175 | | Cache::Handle* cache_handle = nullptr; |
176 | | }; |
177 | | |
178 | | struct BlockBasedTable::Rep { |
179 | | struct NotMatchingFilterEntry : public CachableEntry<FilterBlockReader> { |
180 | 72.6k | NotMatchingFilterEntry() : CachableEntry(&filter, nullptr) {} |
181 | | NotMatchingFilterBlockReader filter; |
182 | | }; |
183 | | |
184 | | Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, |
185 | | const BlockBasedTableOptions& _table_opt, |
186 | | const InternalKeyComparatorPtr& _internal_comparator, bool skip_filters, |
187 | | const DataIndexLoadMode data_index_load_mode_) |
188 | | : ioptions(_ioptions), |
189 | | env_options(_env_options), |
190 | | table_options(_table_opt), |
191 | | filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()), |
192 | | filter_key_transformer(filter_policy ? filter_policy->GetKeyTransformer() : nullptr), |
193 | | comparator(_internal_comparator), |
194 | | filter_type(FilterType::kNoFilter), |
195 | | whole_key_filtering(_table_opt.whole_key_filtering), |
196 | | prefix_filtering(true), |
197 | 72.6k | data_index_load_mode(data_index_load_mode_) { |
198 | 72.6k | if (ioptions.block_based_table_mem_tracker) { |
199 | 4.48k | mem_tracker = ioptions.block_based_table_mem_tracker; |
200 | 68.1k | } else if (ioptions.mem_tracker) { |
201 | 0 | mem_tracker = yb::MemTracker::FindOrCreateTracker("BlockBasedTable", ioptions.mem_tracker); |
202 | 0 | } |
203 | 72.6k | } |
204 | | |
205 | | const ImmutableCFOptions& ioptions; |
206 | | const EnvOptions& env_options; |
207 | | const BlockBasedTableOptions& table_options; |
208 | | const FilterPolicy* filter_policy; |
209 | | const FilterPolicy::KeyTransformer* filter_key_transformer; |
210 | | InternalKeyComparatorPtr comparator; |
211 | | const NotMatchingFilterEntry not_matching_filter_entry; |
212 | | Status status; |
213 | | std::shared_ptr<FileReaderWithCachePrefix> base_reader_with_cache_prefix; |
214 | | std::shared_ptr<FileReaderWithCachePrefix> data_reader_with_cache_prefix; |
215 | | |
216 | | // Footer contains the fixed table information |
217 | | Footer footer; |
218 | | std::mutex data_index_reader_mutex; |
219 | | yb::AtomicUniquePtr<IndexReader> data_index_reader; |
220 | | unique_ptr<BlockEntryIteratorState> data_index_iterator_state; |
221 | | unique_ptr<IndexReader> filter_index_reader; |
222 | | unique_ptr<FilterBlockReader> filter; |
223 | | |
224 | | FilterType filter_type; |
225 | | |
226 | | // Handle of fixed-size bloom filter index block or simply filter block for filters of other |
227 | | // types. |
228 | | BlockHandle filter_handle; |
229 | | |
230 | | std::shared_ptr<const TableProperties> table_properties; |
231 | | IndexType index_type = IndexType::kBinarySearch; |
232 | | bool hash_index_allow_collision = false; |
233 | | bool whole_key_filtering = false; |
234 | | bool prefix_filtering = false; |
235 | | KeyValueEncodingFormat data_block_key_value_encoding_format = |
236 | | KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix; |
237 | | // TODO(kailiu) It is very ugly to use internal key in table, since table |
238 | | // module should not be relying on db module. However to make things easier |
239 | | // and compatible with existing code, we introduce a wrapper that allows |
240 | | // block to extract prefix without knowing if a key is internal or not. |
241 | | unique_ptr<SliceTransform> internal_prefix_transform; |
242 | | |
243 | | DataIndexLoadMode data_index_load_mode = static_cast<DataIndexLoadMode>(0); |
244 | | yb::MemTrackerPtr mem_tracker; |
245 | | }; |
246 | | |
247 | | // BlockEntryIteratorState doesn't actually store any iterator state and is only used as an adapter |
248 | | // to BlockBasedTable. It is used by TwoLevelIterator and MultiLevelIterator to call BlockBasedTable |
249 | | // functions in order to check if prefix may match or to create a secondary iterator. |
250 | | class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { |
251 | | public: |
252 | | BlockEntryIteratorState( |
253 | | BlockBasedTable* table, const ReadOptions& read_options, bool skip_filters, |
254 | | BlockType block_type) |
255 | | : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr), |
256 | | table_(table), |
257 | | read_options_(read_options), |
258 | | skip_filters_(skip_filters), |
259 | 3.84M | block_type_(block_type) {} |
260 | | |
261 | 11.1M | InternalIterator* NewSecondaryIterator(const Slice& index_value) override { |
262 | 11.1M | return table_->NewDataBlockIterator(read_options_, index_value, block_type_); |
263 | 11.1M | } |
264 | | |
265 | 98.2k | bool PrefixMayMatch(const Slice& internal_key) override { |
266 | 98.2k | if (read_options_.total_order_seek || skip_filters_) { |
267 | 97.9k | return true; |
268 | 97.9k | } |
269 | 324 | return table_->PrefixMayMatch(internal_key); |
270 | 324 | } |
271 | | |
272 | | private: |
273 | | // Don't own table_. BlockEntryIteratorState should only be stored in iterators or in |
274 | | // corresponding BlockBasedTable. TableReader (superclass of BlockBasedTable) is only destroyed |
275 | | // after iterator is deleted. |
276 | | BlockBasedTable* const table_; |
277 | | const ReadOptions read_options_; |
278 | | const bool skip_filters_; |
279 | | const BlockType block_type_; |
280 | | }; |
281 | | |
282 | | |
283 | | class BlockBasedTable::IndexIteratorHolder { |
284 | | public: |
285 | | IndexIteratorHolder(BlockBasedTable* table_reader, ReadOptions read_options) |
286 | | : iter_holder_(table_reader->NewIndexIterator(read_options, &iter_)), |
287 | 7.94M | iter_ptr_(iter_holder_ ? iter_holder_.get() : implicit_cast<InternalIterator*>(&iter_)) {} |
288 | | |
289 | 7.94M | InternalIterator* iter() const { return iter_ptr_; } |
290 | | |
291 | | private: |
292 | | BlockIter iter_; |
293 | | std::unique_ptr<InternalIterator> iter_holder_; |
294 | | InternalIterator* iter_ptr_; |
295 | | }; |
296 | | |
297 | 71.4k | BlockBasedTable::~BlockBasedTable() { |
298 | 71.4k | delete rep_; |
299 | 71.4k | } |
300 | | |
301 | | void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, |
302 | 142k | FileReaderWithCachePrefix* reader_with_cache_prefix) { |
303 | 142k | reader_with_cache_prefix->cache_key_prefix.size = 0; |
304 | 142k | reader_with_cache_prefix->compressed_cache_key_prefix.size = 0; |
305 | 142k | if (rep->table_options.block_cache != nullptr) { |
306 | 140k | GenerateCachePrefix(rep->table_options.block_cache.get(), |
307 | 140k | reader_with_cache_prefix->reader->file(), |
308 | 140k | &reader_with_cache_prefix->cache_key_prefix); |
309 | 140k | } |
310 | 142k | if (rep->table_options.block_cache_compressed != nullptr) { |
311 | 1.66k | GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), |
312 | 1.66k | reader_with_cache_prefix->reader->file(), |
313 | 1.66k | &reader_with_cache_prefix->compressed_cache_key_prefix); |
314 | 1.66k | } |
315 | 142k | } |
316 | | |
317 | 18.3M | KeyValueEncodingFormat BlockBasedTable::GetKeyValueEncodingFormat(const BlockType block_type) { |
318 | 18.3M | switch (block_type) { |
319 | 14.4M | case BlockType::kData: |
320 | 14.4M | return rep_->data_block_key_value_encoding_format; |
321 | 3.89M | case BlockType::kIndex: |
322 | 3.89M | return kIndexBlockKeyValueEncodingFormat; |
323 | 0 | } |
324 | 0 | FATAL_INVALID_ENUM_VALUE(BlockType, block_type); |
325 | 0 | } |
326 | | |
327 | 18.3M | BlockBasedTable::FileReaderWithCachePrefix* BlockBasedTable::GetBlockReader(BlockType block_type) { |
328 | 18.3M | switch (block_type) { |
329 | 14.4M | case BlockType::kData: |
330 | 14.4M | return rep_->data_reader_with_cache_prefix.get(); |
331 | 3.89M | case BlockType::kIndex: |
332 | 3.89M | return rep_->base_reader_with_cache_prefix.get(); |
333 | 0 | } |
334 | 0 | FATAL_INVALID_ENUM_VALUE(BlockType, block_type); |
335 | 0 | } |
336 | | |
337 | | BloomFilterAwareFileFilter::BloomFilterAwareFileFilter( |
338 | | const ReadOptions& read_options, const Slice& user_key) |
339 | 8.65M | : read_options_(read_options), user_key_(user_key.ToBuffer()) {} |
340 | | |
341 | 4.34M | bool BloomFilterAwareFileFilter::Filter(TableReader* reader) const { |
342 | 4.34M | auto table = down_cast<BlockBasedTable*>(reader); |
343 | 4.34M | if (table->rep_->filter_type == FilterType::kFixedSizeFilter) { |
344 | 4.33M | const auto filter_key = table->GetFilterKeyFromUserKey(user_key_); |
345 | 4.33M | if (filter_key.empty()) { |
346 | 0 | return true; |
347 | 0 | } |
348 | 4.33M | auto filter_entry = table->GetFilter(read_options_.query_id, |
349 | 4.33M | read_options_.read_tier == kBlockCacheTier /* no_io */, &filter_key); |
350 | 4.33M | FilterBlockReader* filter = filter_entry.value; |
351 | | // If bloom filter was not useful, then take this file into account. |
352 | 4.33M | const bool use_file = table->NonBlockBasedFilterKeyMayMatch(filter, filter_key); |
353 | 4.33M | if (!use_file) { |
354 | | // Record that the bloom filter was useful. |
355 | 2.68M | RecordTick(table->rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); |
356 | 2.68M | } |
357 | 4.33M | filter_entry.Release(table->rep_->table_options.block_cache.get()); |
358 | 4.33M | return use_file; |
359 | 2.28k | } else { |
360 | | // For non fixed-size filters - take file into account. We are only using fixed-size bloom |
361 | | // filters for DocDB, so not need to support others. |
362 | 2.28k | return true; |
363 | 2.28k | } |
364 | 4.34M | } |
365 | | |
366 | | namespace { |
367 | | // Return True if table_properties has `user_prop_name` has a `true` value |
368 | | // or it doesn't contain this property (for backward compatible). |
369 | | bool IsFeatureSupported(const TableProperties& table_properties, |
370 | 145k | const std::string& user_prop_name, Logger* info_log) { |
371 | 145k | auto& props = table_properties.user_collected_properties; |
372 | 145k | auto pos = props.find(user_prop_name); |
373 | | // Older version doesn't have this value set. Skip this check. |
374 | 145k | if (pos != props.end()) { |
375 | 145k | if (pos->second == kPropFalse) { |
376 | 70.1k | return false; |
377 | 75.1k | } else if (pos->second != kPropTrue) { |
378 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, info_log, |
379 | 0 | "Property %s has invalidate value %s", user_prop_name.c_str(), |
380 | 0 | pos->second.c_str()); |
381 | 0 | } |
382 | 145k | } |
383 | 75.1k | return true; |
384 | 145k | } |
385 | | } // namespace |
386 | | |
387 | | Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, |
388 | | const EnvOptions& env_options, |
389 | | const BlockBasedTableOptions& table_options, |
390 | | const InternalKeyComparatorPtr& internal_comparator, |
391 | | unique_ptr<RandomAccessFileReader>&& base_file, |
392 | | uint64_t base_file_size, |
393 | | unique_ptr<TableReader>* table_reader, |
394 | | DataIndexLoadMode data_index_load_mode, |
395 | | PrefetchFilter prefetch_filter, |
396 | 72.6k | const bool skip_filters) { |
397 | 72.6k | table_reader->reset(); |
398 | | |
399 | 72.6k | Footer footer; |
400 | 72.6k | RETURN_NOT_OK(ReadFooterFromFile( |
401 | 72.6k | base_file.get(), base_file_size, &footer, kBlockBasedTableMagicNumber)); |
402 | 72.6k | if (!BlockBasedTableSupportedVersion(footer.version())) { |
403 | 0 | return STATUS(Corruption, |
404 | 0 | "Unknown Footer version. Maybe this file was created with newer " |
405 | 0 | "version of RocksDB?"); |
406 | 0 | } |
407 | | |
408 | | // We've successfully read the footer and the index block: we're |
409 | | // ready to serve requests. |
410 | 72.6k | Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, |
411 | 72.6k | internal_comparator, skip_filters, data_index_load_mode); |
412 | 72.6k | rep->base_reader_with_cache_prefix = |
413 | 72.6k | std::make_shared<FileReaderWithCachePrefix>(std::move(base_file)); |
414 | 72.6k | rep->data_reader_with_cache_prefix = rep->base_reader_with_cache_prefix; |
415 | 72.6k | rep->footer = footer; |
416 | 72.6k | rep->index_type = table_options.index_type; |
417 | 72.6k | rep->hash_index_allow_collision = table_options.hash_index_allow_collision; |
418 | 72.6k | SetupCacheKeyPrefix(rep, rep->base_reader_with_cache_prefix.get()); |
419 | 72.6k | unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep)); |
420 | | |
421 | | // Read meta index |
422 | 72.6k | std::unique_ptr<Block> meta; |
423 | 72.6k | std::unique_ptr<InternalIterator> meta_iter; |
424 | 72.6k | RETURN_NOT_OK(ReadMetaBlock(rep, &meta, &meta_iter)); |
425 | | |
426 | 72.6k | RETURN_NOT_OK(new_table->ReadPropertiesBlock(meta_iter.get())); |
427 | | |
428 | 72.6k | RETURN_NOT_OK(new_table->SetupFilter(meta_iter.get())); |
429 | | |
430 | 72.6k | if (data_index_load_mode == DataIndexLoadMode::PRELOAD_ON_OPEN) { |
431 | | // Will use block cache for data index access? |
432 | 4 | if (table_options.cache_index_and_filter_blocks) { |
433 | 0 | DCHECK_ONLY_NOTNULL(table_options.block_cache.get()); |
434 | | // Hack: Call NewIndexIterator() to implicitly add index to the |
435 | | // block_cache |
436 | 0 | unique_ptr<InternalIterator> iter(new_table->NewIndexIterator(ReadOptions::kDefault)); |
437 | 0 | RETURN_NOT_OK(iter->status()); |
438 | 4 | } else { |
439 | | // If we don't use block cache for data index access, we'll pre-load it, which will kept in |
440 | | // member variables in Rep and with a same life-time as this table object. |
441 | | // NOTE: Table reader objects are cached in table cache (table_cache.cc). |
442 | 4 | std::unique_ptr<IndexReader> index_reader; |
443 | 4 | RETURN_NOT_OK(new_table->CreateDataBlockIndexReader(&index_reader, meta_iter.get())); |
444 | 4 | rep->data_index_reader.reset(index_reader.release()); |
445 | 4 | } |
446 | 4 | } |
447 | | |
448 | 72.6k | if (prefetch_filter == PrefetchFilter::YES) { |
449 | | // pre-fetching of blocks is turned on |
450 | | // NOTE: Table reader objects are cached in table cache (table_cache.cc). |
451 | 72.6k | if (rep->filter_policy && rep->filter_type == FilterType::kFixedSizeFilter) { |
452 | | // TODO: may be put it in block cache instead of table reader in case |
453 | | // table_options.cache_index_and_filter_blocks is set? |
454 | 3.70k | RETURN_NOT_OK(new_table->CreateFilterIndexReader(&rep->filter_index_reader)); |
455 | 3.70k | } |
456 | | |
457 | | // Will use block cache for filter blocks access? |
458 | 72.6k | if (table_options.cache_index_and_filter_blocks) { |
459 | 4.26k | assert(table_options.block_cache != nullptr); |
460 | 4.26k | bool corrupted_filter_type = true; |
461 | 4.26k | switch (rep->filter_type) { |
462 | 25 | case FilterType::kFullFilter: |
463 | 25 | FALLTHROUGH_INTENDED; |
464 | 184 | case FilterType::kBlockBasedFilter: { |
465 | | // Hack: Call GetFilter() to implicitly add filter to the block_cache |
466 | 184 | auto filter_entry = new_table->GetFilter(kDefaultQueryId); |
467 | 184 | filter_entry.Release(table_options.block_cache.get()); |
468 | 184 | corrupted_filter_type = false; |
469 | 184 | break; |
470 | 25 | } |
471 | 3.06k | case FilterType::kFixedSizeFilter: |
472 | | // We never pre-cache fixed-size bloom filters. |
473 | 3.06k | FALLTHROUGH_INTENDED; |
474 | 4.08k | case FilterType::kNoFilter: |
475 | 4.08k | corrupted_filter_type = false; |
476 | 4.08k | break; |
477 | 4.26k | } |
478 | 4.26k | if (corrupted_filter_type) { |
479 | 0 | RLOG(InfoLogLevel::FATAL_LEVEL, rep->ioptions.info_log, "Corrupted bloom filter type: %d", |
480 | 0 | rep->filter_type); |
481 | 0 | assert(false); |
482 | 0 | return STATUS_SUBSTITUTE(Corruption, "Corrupted bloom filter type: $0", rep->filter_type); |
483 | 0 | } |
484 | 68.3k | } else { |
485 | | // If we don't use block cache for filter access, we'll pre-load these blocks, which will |
486 | | // kept in member variables in Rep and with a same life-time as this table object. |
487 | 68.3k | bool corrupted_filter_type = true; |
488 | 68.3k | switch (rep->filter_type) { |
489 | 3.97k | case FilterType::kFullFilter: |
490 | 3.97k | FALLTHROUGH_INTENDED; |
491 | 4.90k | case FilterType::kBlockBasedFilter: |
492 | 4.90k | rep->filter.reset(ReadFilterBlock(rep->filter_handle, rep, nullptr)); |
493 | 4.90k | corrupted_filter_type = false; |
494 | 4.90k | break; |
495 | 644 | case FilterType::kFixedSizeFilter: |
496 | | // We never pre-load fixed-size bloom filters. |
497 | 644 | FALLTHROUGH_INTENDED; |
498 | 63.4k | case FilterType::kNoFilter: |
499 | 63.4k | corrupted_filter_type = false; |
500 | 63.4k | break; |
501 | 68.3k | } |
502 | 68.3k | if (corrupted_filter_type) { |
503 | 0 | RLOG(InfoLogLevel::FATAL_LEVEL, rep->ioptions.info_log, "Corrupted bloom filter type: %d", |
504 | 0 | rep->filter_type); |
505 | 0 | assert(false); |
506 | 0 | return STATUS_SUBSTITUTE(Corruption, "Corrupted bloom filter type: $0", rep->filter_type); |
507 | 0 | } |
508 | 72.6k | } |
509 | 72.6k | } |
510 | | |
511 | | // Filters are checked before seeking the index. |
512 | 72.6k | const bool skip_filters_for_index = true; |
513 | 72.6k | rep->data_index_iterator_state = std::make_unique<BlockEntryIteratorState>( |
514 | 72.6k | new_table.get(), ReadOptions::kDefault, skip_filters_for_index, BlockType::kIndex); |
515 | | |
516 | 72.6k | *table_reader = std::move(new_table); |
517 | | |
518 | 72.6k | return Status::OK(); |
519 | 72.6k | } |
520 | | |
521 | 72.6k | Status BlockBasedTable::ReadPropertiesBlock(InternalIterator* meta_iter) { |
522 | | // Read the properties |
523 | 72.6k | bool found_properties_block = true; |
524 | 72.6k | auto s = SeekToPropertiesBlock(meta_iter, &found_properties_block); |
525 | | |
526 | 72.6k | if (!s.ok()) { |
527 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, |
528 | 0 | "Cannot seek to properties block from file: %s", |
529 | 0 | s.ToString().c_str()); |
530 | 0 | return s; |
531 | 0 | } |
532 | | |
533 | 72.6k | if (found_properties_block) { |
534 | 72.6k | s = meta_iter->status(); |
535 | 72.6k | TableProperties* table_properties = nullptr; |
536 | 72.6k | if (s.ok()) { |
537 | 72.6k | s = ReadProperties( |
538 | 72.6k | meta_iter->value(), rep_->base_reader_with_cache_prefix->reader.get(), |
539 | 72.6k | rep_->footer, rep_->ioptions.env, rep_->ioptions.info_log, &table_properties); |
540 | 72.6k | } |
541 | | |
542 | 72.6k | if (!s.ok()) { |
543 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, |
544 | 0 | "Encountered error while reading data from properties " |
545 | 0 | "block %s", s.ToString().c_str()); |
546 | 0 | return s; |
547 | 0 | } |
548 | 72.6k | rep_->table_properties.reset(table_properties); |
549 | 18.4E | } else { |
550 | 18.4E | RLOG(InfoLogLevel::ERROR_LEVEL, rep_->ioptions.info_log, |
551 | 18.4E | "Cannot find Properties block from file."); |
552 | 18.4E | } |
553 | | |
554 | | // Determine whether whole key filtering is supported. |
555 | 72.6k | if (rep_->table_properties) { |
556 | 72.6k | rep_->whole_key_filtering &= |
557 | 72.6k | IsFeatureSupported(*(rep_->table_properties), |
558 | 72.6k | BlockBasedTablePropertyNames::kWholeKeyFiltering, |
559 | 72.6k | rep_->ioptions.info_log); |
560 | 72.6k | rep_->prefix_filtering &= IsFeatureSupported( |
561 | 72.6k | *(rep_->table_properties), |
562 | 72.6k | BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.info_log); |
563 | | |
564 | 72.6k | auto& props = rep_->table_properties->user_collected_properties; |
565 | 72.6k | auto it = props.find(BlockBasedTablePropertyNames::kDataBlockKeyValueEncodingFormat); |
566 | 72.6k | if (it != props.end()) { |
567 | 72.6k | rep_->data_block_key_value_encoding_format = |
568 | 72.6k | static_cast<KeyValueEncodingFormat>(DecodeFixed8(it->second.c_str())); |
569 | 72.6k | } |
570 | 72.6k | } |
571 | | |
572 | 72.6k | return Status::OK(); |
573 | 72.6k | } |
574 | | |
575 | 72.6k | Status BlockBasedTable::SetupFilter(InternalIterator* meta_iter) { |
576 | | // Find filter handle and filter type. |
577 | 72.6k | if (!rep_->filter_policy) { |
578 | 63.6k | return Status::OK(); |
579 | 63.6k | } |
580 | 9.00k | const auto& table_filter_policy_name = rep_->table_properties->filter_policy_name; |
581 | 9.00k | if (rep_->filter_policy->Name() != table_filter_policy_name && |
582 | 5 | !table_filter_policy_name.empty()) { |
583 | | // SST file has been written using another filter policy - use it for reading if it is still |
584 | | // supported. |
585 | 4 | const FilterPolicy* table_filter_policy = nullptr; |
586 | 4 | const auto& policies = rep_->table_options.supported_filter_policies; |
587 | 4 | if (policies) { |
588 | 4 | const auto it = policies->find(table_filter_policy_name); |
589 | 4 | if (it != policies->end()) { |
590 | 4 | table_filter_policy = it->second.get(); |
591 | 4 | } |
592 | 4 | } |
593 | 4 | if (!table_filter_policy) { |
594 | 0 | rep_->filter_policy = nullptr; |
595 | 0 | rep_->filter_key_transformer = nullptr; |
596 | 0 | const auto error_message = yb::Format( |
597 | 0 | "Filter policy '$0' is not supported, not using use bloom filters for reading '$1'", |
598 | 0 | table_filter_policy_name, |
599 | 0 | rep_->base_reader_with_cache_prefix->reader->file()->filename()); |
600 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, rep_->ioptions.info_log, error_message.c_str()); |
601 | | // For testing in debug build we want to fail in case some filter policy is not supported, but |
602 | | // for production we prefer to continue operation with lower performance due to lack of |
603 | | // supported bloom filters for this file. And eventually during compaction this file will |
604 | | // be replaced and latest version of filter policy will be used. |
605 | 0 | #ifndef NDEBUG |
606 | 0 | return STATUS(IllegalState, error_message); |
607 | | #else |
608 | | return Status::OK(); |
609 | | #endif |
610 | 0 | } |
611 | 4 | rep_->filter_policy = table_filter_policy; |
612 | 4 | rep_->filter_key_transformer = table_filter_policy->GetKeyTransformer(); |
613 | 4 | } |
614 | | |
615 | 9.00k | for (const auto& prefix : {block_based_table::kFullFilterBlockPrefix, |
616 | 9.00k | block_based_table::kFilterBlockPrefix, |
617 | 17.9k | block_based_table::kFixedSizeFilterBlockPrefix}) { |
618 | | // Unsuccessful read implies we should not use filter. |
619 | 17.9k | std::string filter_block_key = prefix; |
620 | 17.9k | filter_block_key.append(rep_->filter_policy->Name()); |
621 | 17.9k | if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle).ok()) { |
622 | 8.79k | if (prefix == block_based_table::kFullFilterBlockPrefix) { |
623 | 4.00k | rep_->filter_type = FilterType::kFullFilter; |
624 | 4.79k | } else if (prefix == block_based_table::kFilterBlockPrefix) { |
625 | 1.08k | rep_->filter_type = FilterType::kBlockBasedFilter; |
626 | 3.70k | } else if (prefix == block_based_table::kFixedSizeFilterBlockPrefix) { |
627 | 3.70k | rep_->filter_type = FilterType::kFixedSizeFilter; |
628 | 1 | } else { |
629 | | // That means we have memory corruption, so we should fail. |
630 | 1 | RLOG( |
631 | 1 | InfoLogLevel::FATAL_LEVEL, rep_->ioptions.info_log, "Invalid filter block prefix: %s", |
632 | 1 | prefix); |
633 | 1 | assert(false); |
634 | 1 | return STATUS(Corruption, "Invalid filter block prefix", prefix); |
635 | 1 | } |
636 | 8.79k | break; |
637 | 8.79k | } |
638 | 17.9k | } |
639 | | |
640 | 9.00k | return Status::OK(); |
641 | 9.00k | } |
642 | | |
643 | 69.4k | void BlockBasedTable::SetDataFileReader(unique_ptr<RandomAccessFileReader> &&data_file) { |
644 | 69.4k | rep_->data_reader_with_cache_prefix = |
645 | 69.4k | std::make_shared<FileReaderWithCachePrefix>(std::move(data_file)); |
646 | 69.4k | SetupCacheKeyPrefix(rep_, rep_->data_reader_with_cache_prefix.get()); |
647 | 69.4k | } |
648 | | |
649 | | namespace { |
650 | | void SetupFileReaderForCompaction(const Options::AccessHint &access_hint, |
651 | 89.3k | RandomAccessFileReader *reader) { |
652 | 89.3k | if (reader != nullptr) { |
653 | 89.3k | switch (access_hint) { |
654 | 0 | case Options::NONE: |
655 | 0 | break; |
656 | 89.3k | case Options::NORMAL: |
657 | 89.3k | reader->file()->Hint(RandomAccessFile::NORMAL); |
658 | 89.3k | break; |
659 | 0 | case Options::SEQUENTIAL: |
660 | 0 | reader->file()->Hint(RandomAccessFile::SEQUENTIAL); |
661 | 0 | break; |
662 | 0 | case Options::WILLNEED: |
663 | 0 | reader->file()->Hint(RandomAccessFile::WILLNEED); |
664 | 0 | break; |
665 | 0 | default: |
666 | 0 | assert(false); |
667 | 89.3k | } |
668 | 89.3k | } |
669 | 89.3k | } |
670 | | } // anonymous namespace |
671 | | |
672 | 44.6k | void BlockBasedTable::SetupForCompaction() { |
673 | 44.6k | auto access_hint = rep_->ioptions.access_hint_on_compaction_start; |
674 | 44.6k | ::rocksdb::SetupFileReaderForCompaction(access_hint, |
675 | 44.6k | rep_->base_reader_with_cache_prefix->reader.get()); |
676 | 44.6k | ::rocksdb::SetupFileReaderForCompaction(access_hint, |
677 | 44.6k | rep_->data_reader_with_cache_prefix->reader.get()); |
678 | 44.6k | } |
679 | | |
680 | | std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties() |
681 | 70.2k | const { |
682 | 70.2k | return rep_->table_properties; |
683 | 70.2k | } |
684 | | |
685 | 29 | size_t BlockBasedTable::ApproximateMemoryUsage() const { |
686 | 29 | size_t usage = 0; |
687 | 29 | if (rep_->filter) { |
688 | 0 | usage += rep_->filter->ApproximateMemoryUsage(); |
689 | 0 | } |
690 | 29 | if (rep_->filter_index_reader) { |
691 | 0 | usage += rep_->filter_index_reader->ApproximateMemoryUsage(); |
692 | 0 | } |
693 | 29 | IndexReader* data_index_reader = rep_->data_index_reader.get(std::memory_order_relaxed); |
694 | 29 | if (data_index_reader) { |
695 | 29 | usage += data_index_reader->ApproximateMemoryUsage(); |
696 | 29 | } |
697 | 29 | return usage; |
698 | 29 | } |
699 | | |
700 | | // Load the meta-block from the file. On success, return the loaded meta block |
701 | | // and its iterator. |
702 | | Status BlockBasedTable::ReadMetaBlock(Rep* rep, |
703 | | std::unique_ptr<Block>* meta_block, |
704 | 74.3k | std::unique_ptr<InternalIterator>* iter) { |
705 | | // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates |
706 | | // it is an empty block. |
707 | | // TODO: we never really verify check sum for meta index block |
708 | 74.3k | std::unique_ptr<Block> meta; |
709 | 74.3k | Status s = block_based_table::ReadBlockFromFile( |
710 | 74.3k | rep->base_reader_with_cache_prefix->reader.get(), |
711 | 74.3k | rep->footer, |
712 | 74.3k | ReadOptions::kDefault, |
713 | 74.3k | rep->footer.metaindex_handle(), |
714 | 74.3k | &meta, |
715 | 74.3k | rep->ioptions.env, |
716 | 74.3k | rep->mem_tracker); |
717 | | |
718 | 74.3k | if (!s.ok()) { |
719 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, |
720 | 0 | "Encountered error while reading data from properties" |
721 | 0 | " block %s", s.ToString().c_str()); |
722 | 0 | return s; |
723 | 0 | } |
724 | | |
725 | 74.3k | *meta_block = std::move(meta); |
726 | | // meta block uses bytewise comparator. |
727 | 74.3k | iter->reset( |
728 | 74.3k | meta_block->get()->NewIterator(BytewiseComparator(), kMetaIndexBlockKeyValueEncodingFormat)); |
729 | 74.3k | return Status::OK(); |
730 | 74.3k | } |
731 | | |
732 | | namespace { |
733 | | |
734 | 18.3M | Tickers GetBlockCacheMissTicker(BlockType block_type) { |
735 | 18.3M | switch (block_type) { |
736 | 14.4M | case BlockType::kData: |
737 | 14.4M | return BLOCK_CACHE_DATA_MISS; |
738 | 3.89M | case BlockType::kIndex: |
739 | 3.89M | return BLOCK_CACHE_INDEX_MISS; |
740 | 0 | } |
741 | 0 | FATAL_INVALID_ENUM_VALUE(BlockType, block_type); |
742 | 0 | } |
743 | | |
744 | 18.3M | Tickers GetBlockCacheHitTicker(BlockType block_type) { |
745 | 18.3M | switch (block_type) { |
746 | 14.4M | case BlockType::kData: |
747 | 14.4M | return BLOCK_CACHE_DATA_HIT; |
748 | 3.89M | case BlockType::kIndex: |
749 | 3.89M | return BLOCK_CACHE_INDEX_HIT; |
750 | 0 | } |
751 | 0 | FATAL_INVALID_ENUM_VALUE(BlockType, block_type); |
752 | 0 | } |
753 | | |
754 | | } // namespace |
755 | | |
// Tries to fetch a block from the two-tier block cache: first the uncompressed cache, then the
// compressed cache. On an uncompressed hit, *block holds the cached Block plus the cache handle
// pinning it (the caller must release the handle). On a compressed-only hit, the block is
// uncompressed into a fresh heap Block and, if read_options.fill_cache allows, inserted into
// the uncompressed cache. On a total miss, *block is left empty and OK is returned — the caller
// decides whether to read from the file.
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
    const ReadOptions& read_options, BlockBasedTable::CachableEntry<Block>* block,
    uint32_t format_version, BlockType block_type,
    const std::shared_ptr<yb::MemTracker>& mem_tracker) {
  Status s;
  Block* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    block->cache_handle =
        GetEntryFromCache(
            block_cache, block_cache_key, GetBlockCacheMissTicker(block_type),
            GetBlockCacheHitTicker(block_type), statistics, read_options.query_id);
    if (block->cache_handle != nullptr) {
      // Hit: the returned handle keeps the entry pinned until the caller releases it.
      block->value =
          static_cast<Block*>(block_cache->Value(block->cache_handle));
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->cache_handle == nullptr && block->value == nullptr);

  if (block_cache_compressed == nullptr) {
    // Total miss — OK status with an empty entry tells the caller to go to the file.
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key, read_options.query_id);
  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = static_cast<Block*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  assert(compressed_block->compression_type() != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  s = UncompressBlockContents(compressed_block->data(), compressed_block->size(), &contents,
                              format_version, mem_tracker);

  // Insert uncompressed block into block cache
  if (s.ok()) {
    block->value = new Block(std::move(contents));  // uncompressed block
    assert(block->value->compression_type() == kNoCompression);
    if (block_cache != nullptr && block->value->cachable() &&
        read_options.fill_cache) {
      s = block_cache->Insert(block_cache_key, read_options.query_id, block->value,
                              block->value->usable_size(), &DeleteCachedEntry<Block>,
                              &block->cache_handle, statistics);
      if (!s.ok()) {
        // Insert failed — the cache did not take ownership, so free the block ourselves.
        delete block->value;
        block->value = nullptr;
      }
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}
827 | | |
// Inserts a block that was just read from the file into the block cache(s) and populates *block
// with the uncompressed Block plus its cache handle.
//
// Ownership: this function takes ownership of raw_block. If raw_block is compressed it is
// handed to the compressed cache (or deleted), and a newly uncompressed Block becomes
// block->value; if it is already uncompressed, raw_block itself becomes block->value.
// Precondition (asserted): a compressed raw_block is only passed when a compressed cache exists.
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, Statistics* statistics,
    CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
    const std::shared_ptr<yb::MemTracker>& mem_tracker) {
  assert(raw_block->compression_type() == kNoCompression ||
         block_cache_compressed != nullptr);

  Status s;
  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  if (raw_block->compression_type() != kNoCompression) {
    s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents,
                                format_version, mem_tracker);
  }
  if (!s.ok()) {
    // Decompression failed; we still own raw_block, so free it before reporting the error.
    delete raw_block;
    return s;
  }

  if (raw_block->compression_type() != kNoCompression) {
    block->value = new Block(std::move(contents));  // uncompressed block
  } else {
    // Already uncompressed: transfer ownership of raw_block to the output entry. Clearing the
    // pointer keeps the compressed-cache insert and the delete below from touching it.
    block->value = raw_block;
    raw_block = nullptr;
  }

  // Insert compressed block into compressed block cache.
  // Release the hold on the compressed cache entry immediately.
  if (block_cache_compressed != nullptr && raw_block != nullptr &&
      raw_block->cachable()) {
    s = block_cache_compressed->Insert(compressed_block_cache_key, read_options.query_id, raw_block,
                                       raw_block->usable_size(), &DeleteCachedEntry<Block>);
    if (s.ok()) {
      // Avoid the following code to delete this cached block.
      raw_block = nullptr;
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
    } else {
      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
    }
  }
  // raw_block is nullptr here unless it was compressed and the compressed-cache insert failed
  // (or there is no compressed cache); delete of nullptr is a no-op.
  delete raw_block;

  // insert into uncompressed block cache
  assert((block->value->compression_type() == kNoCompression));
  if (block_cache != nullptr && block->value->cachable()) {
    s = block_cache->Insert(block_cache_key, read_options.query_id, block->value,
                            block->value->usable_size(),
                            &DeleteCachedEntry<Block>, &block->cache_handle, statistics);
    if (!s.ok()) {
      // The cache did not take ownership; free the block and leave the entry empty.
      delete block->value;
      block->value = nullptr;
    }
  }

  return s;
}
886 | | |
887 | 3.70k | Status BlockBasedTable::CreateFilterIndexReader(std::unique_ptr<IndexReader>* filter_index_reader) { |
888 | 3.70k | auto base_file_reader = rep_->base_reader_with_cache_prefix->reader.get(); |
889 | 3.70k | auto env = rep_->ioptions.env; |
890 | 3.70k | auto footer = rep_->footer; |
891 | 3.70k | return BinarySearchIndexReader::Create(base_file_reader, footer, rep_->filter_handle, env, |
892 | 3.70k | SharedBytewiseComparator(), filter_index_reader, rep_->mem_tracker); |
893 | 3.70k | } |
894 | | |
// Reads the filter block at filter_handle from the base file and wraps it in the
// FilterBlockReader subclass matching rep->filter_type. Returns nullptr (without surfacing an
// error) when there is no filter, the read fails, or filter_type is corrupted — callers then
// simply proceed without a filter. If filter_size is non-null it receives the raw block size
// (used by callers to charge the block cache).
FilterBlockReader* BlockBasedTable::ReadFilterBlock(const BlockHandle& filter_handle, Rep* rep,
    size_t* filter_size) {
  // TODO: We might want to unify with ReadBlockFromFile() if we start
  // requiring checksum verification in Table::Open.
  if (rep->filter_type == FilterType::kNoFilter) {
    return nullptr;
  }
  BlockContents block;
  if (!ReadBlockContents(
          rep->base_reader_with_cache_prefix->reader.get(), rep->footer, ReadOptions::kDefault,
          filter_handle, &block, rep->ioptions.env, rep->mem_tracker, false).ok()) {
    // Error reading the block
    return nullptr;
  }

  if (filter_size) {
    *filter_size = block.data.size();
  }

  assert(rep->filter_policy);

  switch (rep->filter_type) {
    case FilterType::kNoFilter:
      // Shouldn't happen, since we already checked for that above. In case of memory corruption
      // will be caught after switch statement.
      break;
    case FilterType::kBlockBasedFilter:
      return new BlockBasedFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block));
    case FilterType::kFullFilter: {
      auto filter_bits_reader = rep->filter_policy->GetFilterBitsReader(block.data);
      assert(filter_bits_reader);
      return new FullFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->whole_key_filtering, std::move(block), filter_bits_reader);
    }
    case FilterType::kFixedSizeFilter:
      return new FixedSizeFilterBlockReader(
          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
          rep->table_options, rep->whole_key_filtering, std::move(block));
      break;
  }
  // Reached only with a corrupted filter_type value (none of the enum cases matched).
  RLOG(InfoLogLevel::FATAL_LEVEL, rep->ioptions.info_log, "Corrupted filter_type: %d",
      rep->filter_type);
  return nullptr;
}
942 | | |
943 | | Status BlockBasedTable::GetFixedSizeFilterBlockHandle(const Slice& filter_key, |
944 | 4.55M | BlockHandle* filter_block_handle) const { |
945 | | // Determine block of fixed-size bloom filter using filter index. |
946 | 4.55M | BlockIter fiter; |
947 | 4.55M | rep_->filter_index_reader->NewIterator(&fiter, |
948 | | // Following parameters are ignored by BinarySearchIndexReader which we use as |
949 | | // filter_index_reader. |
950 | 4.55M | nullptr /* index_iterator_state */, true /* total_order_seek */); |
951 | 4.55M | fiter.Seek(filter_key); |
952 | 4.55M | if (fiter.Valid()) { |
953 | 4.55M | Slice filter_block_handle_encoded = fiter.value(); |
954 | 4.55M | return filter_block_handle->DecodeFrom(&filter_block_handle_encoded); |
955 | 4.20k | } else { |
956 | | // We are beyond the index, that means key is absent in filter, we use null block handle |
957 | | // stub to indicate that. |
958 | 4.20k | filter_block_handle->set_offset(0); |
959 | 4.20k | filter_block_handle->set_size(0); |
960 | 4.20k | return Status::OK(); |
961 | 4.20k | } |
962 | 4.55M | } |
963 | | |
964 | 7.97M | Slice BlockBasedTable::GetFilterKeyFromInternalKey(const Slice &internal_key) const { |
965 | 7.97M | return GetFilterKeyFromUserKey(ExtractUserKey(internal_key)); |
966 | 7.97M | } |
967 | | |
968 | 12.3M | Slice BlockBasedTable::GetFilterKeyFromUserKey(const Slice &user_key) const { |
969 | 12.3M | return rep_->filter_key_transformer ? |
970 | 7.57M | rep_->filter_key_transformer->Transform(user_key) : user_key; |
971 | 12.3M | } |
972 | | |
// Returns the filter block reader for this table together with the block cache handle pinning
// it (nullptr handle when the reader is owned by rep_ or is a stub). A non-null cache handle
// must be released by the caller when done.
//
// @param query_id   attributed to cache lookups/inserts for statistics.
// @param no_io      if true, avoid file reads; ignored for fixed-size filters, which are always
//                   loaded on demand through the block cache.
// @param filter_key required for fixed-size filters — used to pick the specific filter block
//                   covering the key via the filter index.
BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
    const QueryId query_id,
    bool no_io,
    const Slice* filter_key) const {
  const bool is_fixed_size_filter = rep_->filter_type == FilterType::kFixedSizeFilter;

  // Key is required for fixed size filter.
  assert(!is_fixed_size_filter || filter_key != nullptr);

  // If cache_index_and_filter_blocks is false, filter (except fixed-size filter) should be
  // pre-populated.
  // We will return rep_->filter anyway. rep_->filter can be nullptr if filter
  // read fails at Open() time. We don't want to reload again since it will
  // most probably fail again.
  // Note: rep_->filter can be nullptr also if Open was called with
  // prefetch_index_and_filter == false. That means bloom filters are not be used if
  // both prefetch_index_and_filter and table_options.cache_index_and_filter_blocks are false.
  if (!rep_->table_options.cache_index_and_filter_blocks && !is_fixed_size_filter) {
    return {rep_->filter.get(), nullptr /* cache handle */};
  }

  PERF_TIMER_GUARD(read_filter_block_nanos);

  Cache* block_cache = rep_->table_options.block_cache.get();
  if (rep_->filter_policy == nullptr /* do not use filter */ ||
      block_cache == nullptr /* no block cache at all */) {
    // If we get here, we have:
    // table_options.cache_index_and_filter_blocks || is_fixed_size_filter
    // table_options.block_cache == nullptr
    return {nullptr /* filter */, nullptr /* cache handle */};
  }

  const BlockHandle* filter_block_handle;
  // Determine filter block handle
  BlockHandle fixed_size_filter_block_handle;
  if (is_fixed_size_filter) {
    Status s = GetFixedSizeFilterBlockHandle(*filter_key, &fixed_size_filter_block_handle);
    if (s.ok()) {
      if (fixed_size_filter_block_handle.IsNull()) {
        // Key is beyond filter index - return stub filter.
        return rep_->not_matching_filter_entry;
      }
      filter_block_handle = &fixed_size_filter_block_handle;
    } else {
      // If we failed to decode filter block handle from filter index we will just log error in
      // production to continue operation in case of just filter corruption,
      // but we should fail in debug and under tests to be able to catch possible bugs.
      RLOG(InfoLogLevel::ERROR_LEVEL, rep_->ioptions.info_log,
          "Failed to decode fixed-size filter block handle from filter index.");
      FAIL_IF_NOT_PRODUCTION();
      return {nullptr /* filter */, nullptr /* cache handle */};
    }
  } else {
    // Non-fixed-size filters have a single filter block whose handle was found at open time.
    filter_block_handle = &rep_->filter_handle;
  }

  // Fetching from the cache
  char cache_key_buffer[block_based_table::kCacheKeyBufferSize];
  auto filter_block_cache_key = GetCacheKey(rep_->base_reader_with_cache_prefix->cache_key_prefix,
                                            *filter_block_handle, cache_key_buffer);

  Statistics* statistics = rep_->ioptions.statistics;
  auto cache_handle = GetEntryFromCache(block_cache, filter_block_cache_key,
      BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics, query_id);

  FilterBlockReader* filter = nullptr;
  if (cache_handle != nullptr) {
    // Cache hit: the handle keeps the reader pinned until released by the caller.
    filter = static_cast<FilterBlockReader*>(block_cache->Value(cache_handle));
  } else if (no_io && rep_->filter_type != FilterType::kFixedSizeFilter) {
    // Do not invoke any io.
    return CachableEntry<FilterBlockReader>();
  } else {
    // For fixed-size filter we don't prefetch all filter blocks and ignore no_io parameter always
    // loading necessary filter block through block cache.
    size_t filter_size = 0;
    filter = ReadFilterBlock(*filter_block_handle, rep_, &filter_size);
    if (filter != nullptr) {
      assert(filter_size > 0);
      Status s = block_cache->Insert(filter_block_cache_key, query_id,
                                     filter, filter_size,
                                     &DeleteCachedEntry<FilterBlockReader>, &cache_handle,
                                     statistics);
      if (!s.ok()) {
        // Insert failed, so the cache does not own the reader; free it and return no filter.
        delete filter;
        return CachableEntry<FilterBlockReader>();
      }
    }
  }

  return { filter, cache_handle };
}
1064 | | |
1065 | | namespace { |
1066 | | |
1067 | 1 | InternalIterator* ReturnErrorIterator(const Status& status, BlockIter* input_iter) { |
1068 | 1 | if (input_iter != nullptr) { |
1069 | 0 | input_iter->SetStatus(status); |
1070 | 0 | return input_iter; |
1071 | 1 | } else { |
1072 | 1 | return NewErrorInternalIterator(status); |
1073 | 1 | } |
1074 | 1 | } |
1075 | | |
1076 | 88 | Status ReturnNoIOError() { |
1077 | 88 | return STATUS(Incomplete, "no blocking io"); |
1078 | 88 | } |
1079 | | |
1080 | | } // namespace |
1081 | | |
// Returns the data index reader, loading it on demand. Three sources, in order:
//   1. rep_->data_index_reader, when pre-populated (fast path, no cache handle);
//   2. the block cache, when index caching is enabled — the returned cache handle pins the
//      reader and must be released by the caller;
//   3. a freshly created reader, published into rep_->data_index_reader under a mutex
//      (double-checked: racing threads re-check after acquiring the lock).
// Returns an Incomplete status if a load would need I/O but read_tier forbids it.
yb::Result<BlockBasedTable::CachableEntry<IndexReader>> BlockBasedTable::GetIndexReader(
    const ReadOptions& read_options) {
  auto* index_reader = rep_->data_index_reader.get(std::memory_order_acquire);
  if (index_reader) {
    // Index reader has already been pre-populated.
    return BlockBasedTable::CachableEntry<IndexReader>{index_reader, /* cache_handle =*/ nullptr};
  }
  PERF_TIMER_GUARD(read_index_block_nanos);

  const bool no_io = read_options.read_tier == kBlockCacheTier;
  Cache* const block_cache = rep_->table_options.block_cache.get();

  if (block_cache && (rep_->data_index_load_mode == DataIndexLoadMode::USE_CACHE ||
                      rep_->table_options.cache_index_and_filter_blocks)) {
    // Cache-managed path: the index reader lives in the block cache keyed by the index handle.
    char cache_key[block_based_table::kCacheKeyBufferSize];
    auto key = GetCacheKey(rep_->base_reader_with_cache_prefix->cache_key_prefix,
                           rep_->footer.index_handle(), cache_key);
    Statistics* statistics = rep_->ioptions.statistics;
    auto cache_handle =
        GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
                          BLOCK_CACHE_INDEX_HIT, statistics, read_options.query_id);

    if (cache_handle == nullptr && no_io) {
      return ReturnNoIOError();
    }

    if (cache_handle != nullptr) {
      index_reader = static_cast<IndexReader*>(block_cache->Value(cache_handle));
    } else {
      // Create index reader and put it in the cache.
      std::unique_ptr<IndexReader> index_reader_unique;
      RETURN_NOT_OK(CreateDataBlockIndexReader(&index_reader_unique));
      RETURN_NOT_OK(block_cache->Insert(
          key, read_options.query_id, index_reader_unique.get(), index_reader_unique->usable_size(),
          &DeleteCachedEntry<IndexReader>, &cache_handle, statistics));
      assert(cache_handle);
      // The cache now owns the reader (freed via DeleteCachedEntry on eviction).
      index_reader = index_reader_unique.release();
    }

    return BlockBasedTable::CachableEntry<IndexReader>{index_reader, cache_handle};
  } else {
    if (no_io) {
      return ReturnNoIOError();
    }
    // Note that we've already performed first check at the beginning of method.
    std::lock_guard<std::mutex> lock(rep_->data_index_reader_mutex);
    index_reader = rep_->data_index_reader.get(std::memory_order_relaxed);
    if (!index_reader) {
      // preloaded_meta_index_iter is not needed for kBinarySearch data index which DocDB uses,
      // for kHashSearch data index it will do one more access to file to load it.
      // TODO: if we need to optimize kHashSearch data index load, we can preload and store in
      // rep_ meta index with iterator during Open.
      std::unique_ptr<IndexReader> index_reader_holder;
      RETURN_NOT_OK(CreateDataBlockIndexReader(
          &index_reader_holder, /* preloaded_meta_index_iter =*/ nullptr));
      index_reader = index_reader_holder.release();
      // Publish with release semantics so the acquire load at the top sees a fully built reader.
      rep_->data_index_reader.reset(index_reader, std::memory_order_acq_rel);
    }
    return BlockBasedTable::CachableEntry<IndexReader>{index_reader, /* cache_handle =*/ nullptr};
  }
}
1143 | | |
1144 | | InternalIterator* BlockBasedTable::NewIndexIterator( |
1145 | 11.7M | const ReadOptions& read_options, BlockIter* input_iter) { |
1146 | 11.7M | const auto index_reader_result = GetIndexReader(read_options); |
1147 | 11.7M | if (!index_reader_result.ok()) { |
1148 | 1 | return ReturnErrorIterator(index_reader_result.status(), input_iter); |
1149 | 1 | } |
1150 | | |
1151 | 11.7M | auto* new_iter = index_reader_result->value->NewIterator( |
1152 | 11.7M | input_iter, rep_->data_index_iterator_state.get(), read_options.total_order_seek); |
1153 | | |
1154 | 11.7M | if (index_reader_result->cache_handle) { |
1155 | 18.4E | auto iter = new_iter ? new_iter : input_iter; |
1156 | 3.93M | iter->RegisterCleanup( |
1157 | 3.93M | &ReleaseCachedEntry, rep_->table_options.block_cache.get(), |
1158 | 3.93M | index_reader_result->cache_handle); |
1159 | 3.93M | } |
1160 | | |
1161 | 11.7M | return new_iter; |
1162 | 11.7M | } |
1163 | | |
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// If input_iter is null, new a iterator
// If input_iter is not null, update this iter and return it
//
// Flow: decode the handle from index_value, try the block cache(s), fall back to reading from
// the file (populating the cache when fill_cache allows), then wrap the block in an iterator
// whose cleanup either releases the cache handle or deletes the heap-owned block.
InternalIterator* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
    const Slice& index_value, BlockType block_type, BlockIter* input_iter) {
  PERF_TIMER_GUARD(new_table_block_iter_nanos);

  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
  Cache* block_cache_compressed =
      rep_->table_options.block_cache_compressed.get();
  CachableEntry<Block> block;

  BlockHandle handle;
  Slice input = index_value;
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.
  Status s = handle.DecodeFrom(&input);

  if (!s.ok()) {
    // Malformed handle: surface the error through the supplied or a new iterator.
    if (input_iter != nullptr) {
      input_iter->SetStatus(s);
      return input_iter;
    } else {
      return NewErrorInternalIterator(s);
    }
  }

  // Data and index blocks may live in different files (base vs data SST).
  FileReaderWithCachePrefix* reader = GetBlockReader(block_type);

  // If either block cache is enabled, we'll try to read from it.
  if (block_cache != nullptr || block_cache_compressed != nullptr) {
    Statistics* statistics = rep_->ioptions.statistics;
    char cache_key[block_based_table::kCacheKeyBufferSize];
    char compressed_cache_key[block_based_table::kCacheKeyBufferSize];
    Slice key, /* key to the block cache */
        ckey /* key to the compressed block cache */;

    // create key for block cache
    if (block_cache != nullptr) {
      key = GetCacheKey(reader->cache_key_prefix, handle, cache_key);
    }

    if (block_cache_compressed != nullptr) {
      ckey = GetCacheKey(reader->compressed_cache_key_prefix, handle, compressed_cache_key);
    }

    s = GetDataBlockFromCache(
        key, ckey, block_cache, block_cache_compressed, statistics, ro, &block,
        rep_->table_options.format_version, block_type, rep_->mem_tracker);

    // Cache miss: read from the file and populate the cache (only when I/O and caching are
    // permitted by the read options).
    if (block.value == nullptr && !no_io && ro.fill_cache) {
      std::unique_ptr<Block> raw_block;
      {
        StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
        s = block_based_table::ReadBlockFromFile(
            reader->reader.get(), rep_->footer, ro, handle, &raw_block, rep_->ioptions.env,
            rep_->mem_tracker, block_cache_compressed == nullptr);
      }

      if (s.ok()) {
        // PutDataBlockToCache takes ownership of the released raw block.
        s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
                                ro, statistics, &block, raw_block.release(),
                                rep_->table_options.format_version, rep_->mem_tracker);
      }
    }
  }

  // Didn't get any data from block caches.
  if (s.ok() && block.value == nullptr) {
    if (no_io) {
      // Could not read from block_cache and can't do IO
      if (input_iter != nullptr) {
        input_iter->SetStatus(ReturnNoIOError());
        return input_iter;
      } else {
        return NewErrorInternalIterator(ReturnNoIOError());
      }
    }
    // fill_cache is off (or no cache configured): read the block for this caller only.
    std::unique_ptr<Block> block_value;
    s = block_based_table::ReadBlockFromFile(
        reader->reader.get(), rep_->footer, ro, handle, &block_value, rep_->ioptions.env,
        rep_->mem_tracker);
    if (s.ok()) {
      block.value = block_value.release();
    }
  }

  InternalIterator* iter;
  if (s.ok() && block.value != nullptr) {
    iter = block.value->NewIterator(
        rep_->comparator.get(), GetKeyValueEncodingFormat(block_type), input_iter);
    if (block.cache_handle != nullptr) {
      // Cached block: release the pinning cache handle when the iterator is destroyed.
      iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
                            block.cache_handle);
    } else {
      // Uncached block: the iterator owns it and deletes it on destruction.
      iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
    }
  } else {
    if (input_iter != nullptr) {
      input_iter->SetStatus(s);
      iter = input_iter;
    } else {
      iter = NewErrorInternalIterator(s);
    }
  }
  return iter;
}
1273 | | |
// This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual
// definition of prefixes in BlockBasedTableOptions.filter_policy.
// In particular, we require the following three properties:
//
// 1) key.starts_with(prefix(key))
// 2) Compare(prefix(key), key) <= 0.
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
//
// Otherwise, this method guarantees no I/O will be incurred.
//
// REQUIRES: this method shouldn't be called while the DB lock is held.
bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
  if (!rep_->filter_policy) {
    // No filter policy configured - we cannot rule anything out.
    return true;
  }

  assert(rep_->ioptions.prefix_extractor != nullptr);
  auto user_key = ExtractUserKey(internal_key);
  auto filter_key = GetFilterKeyFromUserKey(user_key);
  if (filter_key.empty() ||
      !rep_->ioptions.prefix_extractor->InDomain(filter_key) ||
      !rep_->ioptions.prefix_extractor->InDomain(user_key)) {
    // Keys outside the prefix extractor's domain have no prefix to test;
    // answer conservatively.
    return true;
  }
  auto user_key_prefix = rep_->ioptions.prefix_extractor->Transform(user_key);
  auto filter_key_prefix = rep_->ioptions.prefix_extractor->Transform(filter_key);
  // kMaxSequenceNumber makes this synthetic internal key sort before every
  // real entry sharing the same user-key prefix, so Seek() below lands on the
  // first candidate entry for the prefix.
  InternalKey internal_key_prefix(user_key_prefix, kMaxSequenceNumber, kTypeValue);
  auto internal_prefix = internal_key_prefix.Encode();

  bool may_match = true;
  Status s;

  // To prevent any io operation in this method, we set `read_tier` to make
  // sure we always read index or filter only when they have already been
  // loaded to memory.
  ReadOptions no_io_read_options;
  no_io_read_options.read_tier = kBlockCacheTier;

  // First check non block-based filter.
  auto filter_entry = GetFilter(no_io_read_options.query_id, true /* no io */, &filter_key);
  FilterBlockReader* filter = filter_entry.value;
  const bool is_block_based_filter = rep_->filter_type == FilterType::kBlockBasedFilter;
  if (filter != nullptr && !is_block_based_filter) {
    may_match = filter->PrefixMayMatch(filter_key_prefix);
  }

  // If filter is block-based or checking filter was not successful we need to get data block
  // offset. For block-based filter we need to know offset of data block to get and check
  // corresponding filter block. For non block-based filter we just need offset to try to get data
  // for the key.
  if (may_match) {
    unique_ptr<InternalIterator> iiter(NewIndexIterator(no_io_read_options));
    iiter->Seek(internal_prefix);

    if (!iiter->Valid()) {
      // we're past end of file
      // if it's incomplete, it means that we avoided I/O
      // and we're not really sure that we're past the end
      // of the file
      may_match = iiter->status().IsIncomplete();
    } else if (ExtractUserKey(iiter->key()).starts_with(
                ExtractUserKey(internal_prefix))) {
      // we need to check for this subtle case because our only
      // guarantee is that "the key is a string >= last key in that data
      // block" according to the doc/table_format.txt spec.
      //
      // Suppose iiter.key() starts with the desired prefix; it is not
      // necessarily the case that the corresponding data block will
      // contain the prefix, since iiter.key() need not be in the
      // block. However, the next data block may contain the prefix, so
      // we return true to play it safe.
      may_match = true;
    } else if (filter != nullptr && is_block_based_filter) {
      // iiter.key() does NOT start with the desired prefix. Because
      // Seek() finds the first key that is >= the seek target, this
      // means that iiter.key() > prefix. Thus, any data blocks coming
      // after the data block corresponding to iiter.key() cannot
      // possibly contain the key. Thus, the corresponding data block
      // is the only on could potentially contain the prefix.
      Slice handle_value = iiter->value();
      BlockHandle handle;
      s = handle.DecodeFrom(&handle_value);
      assert(s.ok());
      may_match = filter->PrefixMayMatch(filter_key_prefix, handle.offset());
    }
  }

  Statistics* statistics = rep_->ioptions.statistics;
  RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
  if (!may_match) {
    // The filter definitively excluded the prefix - record that it was useful.
    RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
  }

  filter_entry.Release(rep_->table_options.block_cache.get());
  return may_match;
}
1371 | | |
1372 | | InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, |
1373 | | Arena* arena, |
1374 | 3.77M | bool skip_filters) { |
1375 | 3.77M | auto state = std::make_unique<BlockEntryIteratorState>( |
1376 | 3.77M | this, read_options, skip_filters, BlockType::kData); |
1377 | | // TODO: unify the semantics across NewIterator callsites, so that we can pass an arena across |
1378 | | // them, and decide the free / no free based on that. This callsite, for example, allows us to |
1379 | | // put the top level iterator on the arena and potentially even the State object, however, not |
1380 | | // the IndexIterator, as that does not expose arena allocation semantics... |
1381 | 3.77M | return NewTwoLevelIterator( |
1382 | 3.77M | state.release(), NewIndexIterator(read_options), arena, true /* need_free_iter_and_state */ |
1383 | 3.77M | ); |
1384 | 3.77M | } |
1385 | | |
1386 | | bool BlockBasedTable::NonBlockBasedFilterKeyMayMatch(FilterBlockReader* filter, |
1387 | 11.1M | const Slice& filter_key) const { |
1388 | 11.1M | assert(rep_->filter_type != FilterType::kBlockBasedFilter); |
1389 | 11.1M | if (filter == nullptr) { |
1390 | 6.35M | return true; |
1391 | 6.35M | } |
1392 | 4.83M | RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_CHECKED); |
1393 | 4.83M | if (!filter->KeyMayMatch(filter_key)) { |
1394 | 2.95M | return false; |
1395 | 2.95M | } |
1396 | 1.88M | if (rep_->ioptions.prefix_extractor && |
1397 | 37 | rep_->ioptions.prefix_extractor->InDomain(filter_key) && |
1398 | 36 | !filter->PrefixMayMatch( |
1399 | 5 | rep_->ioptions.prefix_extractor->Transform(filter_key))) { |
1400 | 5 | return false; |
1401 | 5 | } |
1402 | 1.88M | return true; |
1403 | 1.88M | } |
1404 | | |
1405 | | Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& internal_key, |
1406 | 8.20M | GetContext* get_context, bool skip_filters) { |
1407 | 8.20M | Status s; |
1408 | 8.20M | CachableEntry<FilterBlockReader> filter_entry; |
1409 | 8.20M | Slice filter_key; |
1410 | 8.20M | if (!skip_filters) { |
1411 | 7.97M | filter_key = GetFilterKeyFromInternalKey(internal_key); |
1412 | 7.97M | if (!filter_key.empty()) { |
1413 | 7.77M | filter_entry = |
1414 | 7.77M | GetFilter(read_options.query_id, read_options.read_tier == kBlockCacheTier, &filter_key); |
1415 | 200k | } else { |
1416 | 200k | skip_filters = true; |
1417 | 200k | } |
1418 | 7.97M | } |
1419 | 8.20M | FilterBlockReader* filter = filter_entry.value; |
1420 | | |
1421 | 8.20M | const bool is_block_based_filter = rep_->filter_type == FilterType::kBlockBasedFilter; |
1422 | | |
1423 | | // First check non block-based filter. |
1424 | 8.20M | if (!is_block_based_filter && !NonBlockBasedFilterKeyMayMatch(filter, filter_key)) { |
1425 | 264k | RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); |
1426 | 7.94M | } else { |
1427 | | // Either filter is block-based or key may match. |
1428 | 7.94M | IndexIteratorHolder iiter_holder(this, read_options); |
1429 | 7.94M | InternalIterator& iiter = *iiter_holder.iter(); |
1430 | | |
1431 | 7.94M | RETURN_NOT_OK(iiter.status()); |
1432 | | |
1433 | 7.94M | bool done = false; |
1434 | 15.2M | for (iiter.Seek(internal_key); iiter.Valid() && !done; iiter.Next()) { |
1435 | 7.94M | { |
1436 | 7.94M | Slice data_block_handle_encoded = iiter.value(); |
1437 | | |
1438 | 7.94M | if (!skip_filters && is_block_based_filter) { |
1439 | 1.36M | RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_CHECKED); |
1440 | 1.36M | BlockHandle data_block_handle; |
1441 | 1.36M | const bool absent_from_filter = |
1442 | 1.36M | data_block_handle.DecodeFrom(&data_block_handle_encoded).ok() |
1443 | 1.36M | && !filter->KeyMayMatch(filter_key, data_block_handle.offset()); |
1444 | | |
1445 | 1.36M | if (absent_from_filter) { |
1446 | | // Not found |
1447 | | // TODO: think about interaction with Merge. If a user key cannot |
1448 | | // cross one data block, we should be fine. |
1449 | 669k | RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); |
1450 | 669k | break; |
1451 | 669k | } |
1452 | 7.27M | } |
1453 | 7.27M | } |
1454 | | |
1455 | 7.27M | BlockIter biter; |
1456 | 7.27M | NewDataBlockIterator(read_options, iiter.value(), BlockType::kData, &biter); |
1457 | | |
1458 | 7.27M | if (read_options.read_tier == kBlockCacheTier && |
1459 | 43 | biter.status().IsIncomplete()) { |
1460 | | // couldn't get block from block_cache |
1461 | | // Update Saver.state to Found because we are only looking for whether |
1462 | | // we can guarantee the key is not there when "no_io" is set |
1463 | 40 | get_context->MarkKeyMayExist(); |
1464 | 40 | break; |
1465 | 40 | } |
1466 | 7.27M | if (!biter.status().ok()) { |
1467 | 0 | s = biter.status(); |
1468 | 0 | break; |
1469 | 0 | } |
1470 | | |
1471 | | // Call the *saver function on each entry/block until it returns false |
1472 | 7.27M | for (biter.Seek(internal_key); biter.Valid(); biter.Next()) { |
1473 | 7.26M | ParsedInternalKey parsed_key; |
1474 | 7.26M | if (!ParseInternalKey(biter.key(), &parsed_key)) { |
1475 | 0 | s = STATUS(Corruption, Slice()); |
1476 | 0 | } |
1477 | | |
1478 | 7.26M | if (!get_context->SaveValue(parsed_key, biter.value())) { |
1479 | 7.26M | done = true; |
1480 | 7.26M | break; |
1481 | 7.26M | } |
1482 | 7.26M | } |
1483 | 7.27M | s = biter.status(); |
1484 | 7.27M | } |
1485 | 7.94M | if (s.ok()) { |
1486 | 7.94M | s = iiter.status(); |
1487 | 7.94M | } |
1488 | 7.94M | } |
1489 | | |
1490 | 8.20M | filter_entry.Release(rep_->table_options.block_cache.get()); |
1491 | 8.20M | return s; |
1492 | 8.20M | } |
1493 | | |
1494 | | Status BlockBasedTable::Prefetch(const Slice* const begin, |
1495 | 10 | const Slice* const end) { |
1496 | 10 | auto& comparator = *rep_->comparator; |
1497 | | // pre-condition |
1498 | 10 | if (begin && end && comparator.Compare(*begin, *end) > 0) { |
1499 | 1 | return STATUS(InvalidArgument, *begin, *end); |
1500 | 1 | } |
1501 | | |
1502 | 9 | IndexIteratorHolder iiter_holder(this, ReadOptions::kDefault); |
1503 | 9 | InternalIterator& iiter = *iiter_holder.iter(); |
1504 | | |
1505 | 9 | RETURN_NOT_OK(iiter.status()); |
1506 | | |
1507 | | // indicates if we are on the last page that need to be pre-fetched |
1508 | 9 | bool prefetching_boundary_page = false; |
1509 | | |
1510 | 36 | for (begin ? iiter.Seek(*begin) : iiter.SeekToFirst(); iiter.Valid(); |
1511 | 31 | iiter.Next()) { |
1512 | 31 | Slice block_handle = iiter.value(); |
1513 | | |
1514 | 31 | if (end && comparator.Compare(iiter.key(), *end) >= 0) { |
1515 | 9 | if (prefetching_boundary_page) { |
1516 | 4 | break; |
1517 | 4 | } |
1518 | | |
1519 | | // The index entry represents the last key in the data block. |
1520 | | // We should load this page into memory as well, but no more |
1521 | 5 | prefetching_boundary_page = true; |
1522 | 5 | } |
1523 | | |
1524 | | // Load the block specified by the block_handle into the block cache |
1525 | 27 | BlockIter biter; |
1526 | 27 | NewDataBlockIterator(ReadOptions::kDefault, block_handle, BlockType::kData, &biter); |
1527 | | |
1528 | 27 | if (!biter.status().ok()) { |
1529 | | // there was an unexpected error while pre-fetching |
1530 | 0 | return biter.status(); |
1531 | 0 | } |
1532 | 27 | } |
1533 | | |
1534 | 9 | return Status::OK(); |
1535 | 9 | } |
1536 | | |
1537 | | bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, |
1538 | 77 | const Slice& key) { |
1539 | 77 | std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options)); |
1540 | 77 | iiter->Seek(key); |
1541 | 77 | assert(iiter->Valid()); |
1542 | 77 | CachableEntry<Block> block; |
1543 | | |
1544 | 77 | BlockHandle handle; |
1545 | 77 | Slice input = iiter->value(); |
1546 | 77 | Status s = handle.DecodeFrom(&input); |
1547 | 77 | assert(s.ok()); |
1548 | 77 | Cache* block_cache = rep_->table_options.block_cache.get(); |
1549 | 77 | assert(block_cache != nullptr); |
1550 | | |
1551 | 77 | char cache_key_storage[block_based_table::kCacheKeyBufferSize]; |
1552 | 77 | Slice cache_key = |
1553 | 77 | GetCacheKey(rep_->data_reader_with_cache_prefix->cache_key_prefix, handle, cache_key_storage); |
1554 | 77 | Slice ckey; |
1555 | | |
1556 | 77 | s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, options, &block, |
1557 | 77 | rep_->table_options.format_version, BlockType::kData, rep_->mem_tracker); |
1558 | 77 | assert(s.ok()); |
1559 | 77 | bool in_cache = block.value != nullptr; |
1560 | 77 | if (in_cache) { |
1561 | 55 | ReleaseCachedEntry(block_cache, block.cache_handle); |
1562 | 55 | } |
1563 | 77 | return in_cache; |
1564 | 77 | } |
1565 | | |
// REQUIRES: The following fields of rep_ should have already been populated:
// 1. file
// 2. index_handle,
// 3. options
// 4. internal_comparator
// 5. index_type
Status BlockBasedTable::CreateDataBlockIndexReader(
    std::unique_ptr<IndexReader>* index_reader, InternalIterator* preloaded_meta_index_iter) {
  // Some old version of block-based tables don't have index type present in
  // table properties. If that's the case we can safely use the kBinarySearch.
  auto index_type_on_file = IndexType::kBinarySearch;
  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      index_type_on_file = static_cast<IndexType>(
          DecodeFixed32(pos->second.c_str()));
    }
  }

  auto file = rep_->base_reader_with_cache_prefix->reader.get();
  auto env = rep_->ioptions.env;
  const auto& comparator = rep_->comparator;
  const Footer& footer = rep_->footer;

  // Hash index cannot work without a prefix extractor; degrade gracefully to
  // binary search rather than failing the open.
  if (index_type_on_file == IndexType::kHashSearch &&
      rep_->ioptions.prefix_extractor == nullptr) {
    RLOG(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
        "IndexType::kHashSearch requires "
        "options.prefix_extractor to be set."
        " Fall back to binary search index.");
    index_type_on_file = IndexType::kBinarySearch;
  }

  switch (index_type_on_file) {
    case IndexType::kBinarySearch: {
      return BinarySearchIndexReader::Create(
          file, footer, footer.index_handle(), env, comparator, index_reader, rep_->mem_tracker);
    }
    case IndexType::kHashSearch: {
      // The hash index needs the metaindex block to locate its prefix blocks.
      // Reuse the caller-provided iterator when available, otherwise read the
      // metaindex here (keeping the block alive via meta_guard).
      std::unique_ptr<Block> meta_guard;
      std::unique_ptr<InternalIterator> meta_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      if (meta_index_iter == nullptr) {
        auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard);
        if (!s.ok()) {
          // we simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          RLOG(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
              "Unable to read the metaindex block."
              " Fall back to binary search index.");
          return BinarySearchIndexReader::Create(
              file, footer, footer.index_handle(), env, comparator, index_reader, rep_->mem_tracker);
        }
        meta_index_iter = meta_iter_guard.get();
      }

      // We need to wrap data with internal_prefix_transform to make sure it can
      // handle prefix correctly.
      rep_->internal_prefix_transform.reset(
          new InternalKeySliceTransform(rep_->ioptions.prefix_extractor));
      return HashIndexReader::Create(
          rep_->internal_prefix_transform.get(), footer, file, env, comparator,
          footer.index_handle(), meta_index_iter, index_reader,
          rep_->hash_index_allow_collision, rep_->mem_tracker);
    }
    case IndexType::kMultiLevelBinarySearch: {
      // The multi-level index records its depth as a table property written
      // together with the index, so a missing property is reported as an error.
      auto& props = DCHECK_NOTNULL(rep_->table_properties.get())->user_collected_properties;
      auto pos = props.find(BlockBasedTablePropertyNames::kNumIndexLevels);
      if (pos == props.end()) {
        return STATUS_FORMAT(
            NotFound, "Missed table property $0 for multi-level binary-search index",
            BlockBasedTablePropertyNames::kNumIndexLevels);
      }
      int num_levels = DecodeFixed32(pos->second.c_str());
      auto result = MultiLevelIndexReader::Create(
          file, footer, num_levels, footer.index_handle(), env, comparator, rep_->mem_tracker);
      RETURN_NOT_OK(result);
      *index_reader = std::move(*result);
      return Status::OK();
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(rep_->index_type);
      return STATUS(InvalidArgument, error_message.c_str());
    }
  }
}
1654 | | |
1655 | 16.9k | uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { |
1656 | 16.9k | unique_ptr<InternalIterator> index_iter(NewIndexIterator(ReadOptions::kDefault)); |
1657 | | |
1658 | 16.9k | index_iter->Seek(key); |
1659 | 16.9k | uint64_t result; |
1660 | 16.9k | if (index_iter->Valid()) { |
1661 | 16.9k | BlockHandle handle; |
1662 | 16.9k | Slice input = index_iter->value(); |
1663 | 16.9k | Status s = handle.DecodeFrom(&input); |
1664 | 16.9k | if (s.ok()) { |
1665 | 16.9k | result = handle.offset(); |
1666 | 0 | } else { |
1667 | | // Strange: we can't decode the block handle in the index block. |
1668 | | // We'll just return the offset of the metaindex block, which is |
1669 | | // close to the whole file size for this case. |
1670 | 0 | result = rep_->footer.metaindex_handle().offset(); |
1671 | 0 | } |
1672 | 16 | } else { |
1673 | | // key is past the last key in the file. If table_properties is not |
1674 | | // available, approximate the offset by returning the offset of the |
1675 | | // metaindex block (which is right near the end of the file). |
1676 | 16 | result = 0; |
1677 | 16 | if (rep_->table_properties) { |
1678 | 16 | result = rep_->table_properties->data_size; |
1679 | 16 | } |
1680 | | // table_properties is not present in the table. |
1681 | 16 | if (result == 0) { |
1682 | 0 | result = rep_->footer.metaindex_handle().offset(); |
1683 | 0 | } |
1684 | 16 | } |
1685 | 16.9k | return result; |
1686 | 16.9k | } |
1687 | | |
1688 | 3 | bool BlockBasedTable::TEST_filter_block_preloaded() const { |
1689 | 3 | return rep_->filter != nullptr; |
1690 | 3 | } |
1691 | | |
1692 | 3 | bool BlockBasedTable::TEST_index_reader_loaded() const { |
1693 | 3 | return rep_->data_index_reader.get() != nullptr; |
1694 | 3 | } |
1695 | | |
1696 | 3 | Status BlockBasedTable::DumpTable(WritableFile* out_file) { |
1697 | | // Output Footer |
1698 | 3 | RETURN_NOT_OK(out_file->Append( |
1699 | 3 | "Footer Details:\n" |
1700 | 3 | "--------------------------------------\n" |
1701 | 3 | " ")); |
1702 | 3 | RETURN_NOT_OK(out_file->Append(rep_->footer.ToString().c_str())); |
1703 | 3 | RETURN_NOT_OK(out_file->Append("\n")); |
1704 | | |
1705 | | // Output MetaIndex |
1706 | 3 | RETURN_NOT_OK(out_file->Append( |
1707 | 3 | "Metaindex Details:\n" |
1708 | 3 | "--------------------------------------\n")); |
1709 | 3 | std::unique_ptr<Block> meta; |
1710 | 3 | std::unique_ptr<InternalIterator> meta_iter; |
1711 | 3 | Status s = ReadMetaBlock(rep_, &meta, &meta_iter); |
1712 | 3 | if (s.ok()) { |
1713 | 8 | for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { |
1714 | 5 | s = meta_iter->status(); |
1715 | 5 | if (!s.ok()) { |
1716 | 0 | return s; |
1717 | 0 | } |
1718 | 5 | if (meta_iter->key() == rocksdb::kPropertiesBlock) { |
1719 | 3 | RETURN_NOT_OK(out_file->Append(" Properties block handle: ")); |
1720 | 3 | RETURN_NOT_OK(out_file->Append(meta_iter->value().ToString(true).c_str())); |
1721 | 3 | RETURN_NOT_OK(out_file->Append("\n")); |
1722 | 2 | } else if (strstr(meta_iter->key().ToString().c_str(), |
1723 | 2 | "filter.rocksdb.") != nullptr) { |
1724 | 2 | RETURN_NOT_OK(out_file->Append(" Filter block handle: ")); |
1725 | 2 | RETURN_NOT_OK(out_file->Append(meta_iter->value().ToString(true).c_str())); |
1726 | 2 | RETURN_NOT_OK(out_file->Append("\n")); |
1727 | 2 | } |
1728 | 5 | } |
1729 | 3 | RETURN_NOT_OK(out_file->Append("\n")); |
1730 | 0 | } else { |
1731 | 0 | return s; |
1732 | 0 | } |
1733 | | |
1734 | | // Output TableProperties |
1735 | 3 | const rocksdb::TableProperties* table_properties; |
1736 | 3 | table_properties = rep_->table_properties.get(); |
1737 | | |
1738 | 3 | if (table_properties != nullptr) { |
1739 | 3 | RETURN_NOT_OK(out_file->Append( |
1740 | 3 | "Table Properties:\n" |
1741 | 3 | "--------------------------------------\n" |
1742 | 3 | " ")); |
1743 | 3 | RETURN_NOT_OK(out_file->Append(table_properties->ToString("\n ", ": ").c_str())); |
1744 | 3 | RETURN_NOT_OK(out_file->Append("\n")); |
1745 | 3 | } |
1746 | | |
1747 | | // Output Filter blocks |
1748 | 3 | if (!rep_->filter && !table_properties->filter_policy_name.empty()) { |
1749 | | // Support only BloomFilter as off now |
1750 | 2 | rocksdb::BlockBasedTableOptions table_options; |
1751 | 2 | table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); |
1752 | 2 | if (table_properties->filter_policy_name.compare( |
1753 | 2 | table_options.filter_policy->Name()) == 0) { |
1754 | 2 | std::string filter_block_key = block_based_table::kFilterBlockPrefix; |
1755 | 2 | filter_block_key.append(table_properties->filter_policy_name); |
1756 | 2 | BlockHandle handle; |
1757 | 2 | if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { |
1758 | 1 | BlockContents block; |
1759 | 1 | if (ReadBlockContents( |
1760 | 1 | rep_->base_reader_with_cache_prefix->reader.get(), rep_->footer, |
1761 | 1 | ReadOptions::kDefault, handle, &block, rep_->ioptions.env, rep_->mem_tracker, |
1762 | 1 | false).ok()) { |
1763 | 1 | rep_->filter.reset(new BlockBasedFilterBlockReader( |
1764 | 1 | rep_->ioptions.prefix_extractor, table_options, |
1765 | 1 | table_options.whole_key_filtering, std::move(block))); |
1766 | 1 | } |
1767 | 1 | } |
1768 | 2 | } |
1769 | 2 | } |
1770 | 3 | if (rep_->filter) { |
1771 | 1 | RETURN_NOT_OK(out_file->Append( |
1772 | 1 | "Filter Details:\n" |
1773 | 1 | "--------------------------------------\n" |
1774 | 1 | " ")); |
1775 | 1 | RETURN_NOT_OK(out_file->Append(rep_->filter->ToString().c_str())); |
1776 | 1 | RETURN_NOT_OK(out_file->Append("\n")); |
1777 | 1 | } |
1778 | | |
1779 | | // Output Index block |
1780 | 3 | s = DumpIndexBlock(out_file); |
1781 | 3 | if (!s.ok()) { |
1782 | 0 | return s; |
1783 | 0 | } |
1784 | | // Output Data blocks |
1785 | 3 | s = DumpDataBlocks(out_file); |
1786 | | |
1787 | 3 | return s; |
1788 | 3 | } |
1789 | | |
1790 | 3 | Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { |
1791 | 3 | RETURN_NOT_OK(out_file->Append( |
1792 | 3 | "Index Details:\n" |
1793 | 3 | "--------------------------------------\n")); |
1794 | | |
1795 | 3 | std::unique_ptr<InternalIterator> blockhandles_iter( |
1796 | 3 | NewIndexIterator(ReadOptions::kDefault)); |
1797 | 3 | Status s = blockhandles_iter->status(); |
1798 | 3 | if (!s.ok()) { |
1799 | 0 | RETURN_NOT_OK(out_file->Append("Can not read Index Block \n\n")); |
1800 | 0 | return s; |
1801 | 3 | } |
1802 | | |
1803 | 3 | RETURN_NOT_OK(out_file->Append(" Block key hex dump: Data block handle\n")); |
1804 | 3 | RETURN_NOT_OK(out_file->Append(" Block key ascii\n\n")); |
1805 | 24 | for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); |
1806 | 21 | blockhandles_iter->Next()) { |
1807 | 21 | s = blockhandles_iter->status(); |
1808 | 21 | if (!s.ok()) { |
1809 | 0 | break; |
1810 | 0 | } |
1811 | 21 | Slice key = blockhandles_iter->key(); |
1812 | 21 | InternalKey ikey = InternalKey::DecodeFrom(key); |
1813 | | |
1814 | 21 | RETURN_NOT_OK(out_file->Append(" HEX ")); |
1815 | 21 | RETURN_NOT_OK(out_file->Append(ikey.user_key().ToString(true).c_str())); |
1816 | 21 | RETURN_NOT_OK(out_file->Append(": ")); |
1817 | 21 | RETURN_NOT_OK(out_file->Append(blockhandles_iter->value().ToString(true).c_str())); |
1818 | 21 | RETURN_NOT_OK(out_file->Append("\n")); |
1819 | | |
1820 | 21 | std::string str_key = ikey.user_key().ToString(); |
1821 | 21 | std::string res_key(""); |
1822 | 21 | char cspace = ' '; |
1823 | 132 | for (size_t i = 0; i < str_key.size(); i++) { |
1824 | 111 | res_key.append(&str_key[i], 1); |
1825 | 111 | res_key.append(1, cspace); |
1826 | 111 | } |
1827 | 21 | RETURN_NOT_OK(out_file->Append(" ASCII ")); |
1828 | 21 | RETURN_NOT_OK(out_file->Append(res_key.c_str())); |
1829 | 21 | RETURN_NOT_OK(out_file->Append("\n ------\n")); |
1830 | 21 | } |
1831 | 3 | RETURN_NOT_OK(out_file->Append("\n")); |
1832 | 3 | return Status::OK(); |
1833 | 3 | } |
1834 | | |
// Writes a human-readable dump of every data block to out_file. For each
// block: its ordinal and handle, then each entry's user key and value in
// both hex and space-separated ASCII.
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIterator> blockhandles_iter(
      NewIndexIterator(ReadOptions::kDefault));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    RETURN_NOT_OK(out_file->Append("Can not read Index Block \n\n"));
    return s;
  }

  // Walk the index; each entry points at one data block.
  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    RETURN_NOT_OK(out_file->Append("Data Block # "));
    RETURN_NOT_OK(out_file->Append(rocksdb::ToString(block_id)));
    RETURN_NOT_OK(out_file->Append(" @ "));
    RETURN_NOT_OK(out_file->Append(blockhandles_iter->value().ToString(true).c_str()));
    RETURN_NOT_OK(out_file->Append("\n"));
    RETURN_NOT_OK(out_file->Append("--------------------------------------\n"));

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(
        NewDataBlockIterator(
            ReadOptions::kDefault, blockhandles_iter->value(), BlockType::kData));
    s = datablock_iter->status();

    if (!s.ok()) {
      // A block that fails to load is reported and skipped; the dump continues.
      RETURN_NOT_OK(out_file->Append("Error reading the block - Skipped \n\n"));
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        RETURN_NOT_OK(out_file->Append("Error reading the block - Skipped \n"));
        break;
      }
      Slice key = datablock_iter->key();
      Slice value = datablock_iter->value();
      InternalKey ikey = InternalKey::DecodeFrom(key);

      RETURN_NOT_OK(out_file->Append(" HEX "));
      RETURN_NOT_OK(out_file->Append(ikey.user_key().ToString(true).c_str()));
      RETURN_NOT_OK(out_file->Append(": "));
      RETURN_NOT_OK(out_file->Append(value.ToString(true).c_str()));
      RETURN_NOT_OK(out_file->Append("\n"));

      // Build the ASCII renditions with a space after every character.
      std::string str_key = ikey.user_key().ToString();
      std::string str_value = value.ToString();
      std::string res_key(""), res_value("");
      char cspace = ' ';
      for (size_t i = 0; i < str_key.size(); i++) {
        res_key.append(&str_key[i], 1);
        res_key.append(1, cspace);
      }
      for (size_t i = 0; i < str_value.size(); i++) {
        res_value.append(&str_value[i], 1);
        res_value.append(1, cspace);
      }

      RETURN_NOT_OK(out_file->Append(" ASCII "));
      RETURN_NOT_OK(out_file->Append(res_key.c_str()));
      RETURN_NOT_OK(out_file->Append(": "));
      RETURN_NOT_OK(out_file->Append(res_value.c_str()));
      RETURN_NOT_OK(out_file->Append("\n ------\n"));
    }
    RETURN_NOT_OK(out_file->Append("\n"));
  }
  return Status::OK();
}
1910 | | |
// Accessor for the immutable column-family options this table was opened with.
const ImmutableCFOptions& BlockBasedTable::ioptions() {
  return rep_->ioptions;
}
1914 | | |
// Returns an approximate middle key of the SST: takes the middle key of the
// index and seeks a full-table iterator to the first data entry at or after
// it. Returns Incomplete for files too small to have a meaningful middle
// (used by tablet splitting to pick a split point).
yb::Result<std::string> BlockBasedTable::GetMiddleKey() {
  auto index_reader = VERIFY_RESULT(GetIndexReader(ReadOptions::kDefault));

  // TODO: remove this trick after https://github.com/yugabyte/yugabyte-db/issues/4720 is resolved.
  auto se = yb::ScopeExit([this, &index_reader] {
    // Release the index reader's cache handle on every exit path.
    index_reader.Release(rep_->table_options.block_cache.get());
  });

  const auto index_middle_key = VERIFY_RESULT(index_reader.value->GetMiddleKey());
  // Filters are irrelevant here - we only need to position an iterator.
  std::unique_ptr<InternalIterator> iter(
      NewIterator(ReadOptions::kDefault, nullptr, /* skip_filters =*/ true));
  iter->Seek(index_middle_key);
  if (!iter->Valid()) {
    // There are no keys in SST that are >= index_middle_key. That means SST is empty or just have
    // the single data block.
    // For tablet splitting we don't need to handle such small files, but if needed for other cases
    // we can update this function to return the middle key of the data block in case there is data
    // in the SST.
    return STATUS(Incomplete, "Empty or too small SST");
  }
  return iter->key().ToBuffer();
}
1937 | | |
1938 | | } // namespace rocksdb |