/Users/deen/code/yugabyte-db/src/yb/rocksdb/table/block_based_table_reader.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | |
24 | | #ifndef YB_ROCKSDB_TABLE_BLOCK_BASED_TABLE_READER_H |
25 | | #define YB_ROCKSDB_TABLE_BLOCK_BASED_TABLE_READER_H |
26 | | |
27 | | #include <stdint.h> |
28 | | |
29 | | #include <memory> |
30 | | #include <string> |
31 | | #include <utility> |
32 | | |
33 | | #include "yb/rocksdb/immutable_options.h" |
34 | | #include "yb/rocksdb/options.h" |
35 | | #include "yb/rocksdb/statistics.h" |
36 | | #include "yb/rocksdb/status.h" |
37 | | #include "yb/rocksdb/table/table_reader.h" |
38 | | |
39 | | #include "yb/util/strongly_typed_bool.h" |
40 | | |
41 | | namespace rocksdb { |
42 | | |
43 | | class Block; |
44 | | class BlockIter; |
45 | | class BlockHandle; |
46 | | class Cache; |
47 | | class FilterBlockReader; |
48 | | class BlockBasedFilterBlockReader; |
49 | | class FullFilterBlockReader; |
50 | | class Footer; |
51 | | class InternalKeyComparator; |
52 | | class Iterator; |
53 | | class TableCache; |
54 | | class TableReader; |
55 | | class WritableFile; |
56 | | struct BlockBasedTableOptions; |
57 | | struct EnvOptions; |
58 | | struct ReadOptions; |
59 | | class GetContext; |
60 | | class InternalIterator; |
61 | | class IndexReader; |
62 | | |
63 | | using std::unique_ptr; |
64 | | |
65 | | enum class DataIndexLoadMode { |
66 | | // Preload on Open, store in block cache or in table reader depending on |
67 | | // BlockBasedTableOptions::cache_index_and_filter_blocks. |
68 | | PRELOAD_ON_OPEN, |
69 | | // Load on first data index access, store in block cache or in table reader depending on |
70 | | // BlockBasedTableOptions::cache_index_and_filter_blocks. |
71 | | LAZY, |
72 | | // Don't preload data index, access as needed, use block cache if available. |
73 | | USE_CACHE |
74 | | }; |
75 | | |
76 | | enum class PrefetchFilter { |
77 | | YES, |
78 | | NO |
79 | | }; |
80 | | |
81 | | YB_DEFINE_ENUM(BlockType, (kData)(kIndex)); |
82 | | |
83 | | // BloomFilterAwareFileFilter should only be used when scanning within the same hashed components of |
84 | | // the key and it should be used together with DocDbAwareFilterPolicy which only takes into account |
85 | | // hashed components of key for filtering. |
86 | | // BloomFilterAwareFileFilter ignores an SST file completely if there are no keys with the same |
87 | | // hashed components as the key specified in constructor. |
88 | | class BloomFilterAwareFileFilter : public TableAwareReadFileFilter { |
89 | | public: |
90 | | BloomFilterAwareFileFilter(const ReadOptions& read_options, const Slice& user_key); |
91 | | |
92 | | bool Filter(TableReader* reader) const override; |
93 | | |
94 | | private: |
95 | | const ReadOptions read_options_; |
96 | | const std::string user_key_; |
97 | | }; |
98 | | |
99 | | // A Table is a sorted map from strings to strings. Tables are |
100 | | // immutable and persistent. A Table may be safely accessed from |
101 | | // multiple threads without external synchronization. |
102 | | class BlockBasedTable : public TableReader { |
103 | | public: |
104 | | // No copying allowed |
105 | | explicit BlockBasedTable(const TableReader&) = delete; |
106 | | void operator=(const TableReader&) = delete; |
107 | | |
108 | | // Attempt to open the table that is stored in bytes [0..base_file_size) of "base_file" (may be |
109 | | // only metadata and data will be read from separate file passed via SetDataFileReader), and read |
110 | | // the metadata entries necessary to allow retrieving data from the table. |
111 | | // |
112 | | // If successful, returns ok and sets "*table_reader" to the newly opened table. The client |
113 | | // should delete "*table_reader" when no longer needed. If there was an error while initializing |
114 | | // the table, sets "*table_reader" to nullptr and returns a non-ok status. |
115 | | // |
116 | | // base_file must remain live while this Table is in use. |
117 | | // data_index_load_mode can be used to control loading of data index (see DataIndexLoadMode |
118 | | // description). |
119 | | // prefetch_filter can be used to disable prefetching of filter blocks at startup. For fixed-size |
120 | | // bloom filter only filter index could be prefetched. |
121 | | // skip_filters Disables loading/accessing the filter block. Overrides prefetch_filter, so filter |
122 | | // will be skipped if both are set. |
123 | | static CHECKED_STATUS Open( |
124 | | const ImmutableCFOptions& ioptions, |
125 | | const EnvOptions& env_options, |
126 | | const BlockBasedTableOptions& table_options, |
127 | | const InternalKeyComparatorPtr& internal_key_comparator, |
128 | | unique_ptr<RandomAccessFileReader>&& base_file, |
129 | | uint64_t base_file_size, |
130 | | unique_ptr<TableReader>* table_reader, |
131 | | DataIndexLoadMode data_index_load_mode = DataIndexLoadMode::LAZY, |
132 | | PrefetchFilter prefetch_filter = PrefetchFilter::YES, |
133 | | bool skip_filters = false); |
134 | | |
135 | 69.4k | bool IsSplitSst() const override { return true; } |
136 | | |
137 | | void SetDataFileReader(unique_ptr<RandomAccessFileReader>&& data_file) override; |
138 | | |
139 | | bool PrefixMayMatch(const Slice& internal_key); |
140 | | |
141 | | // Returns a new iterator over the table contents. |
142 | | // The result of NewIterator() is initially invalid (caller must |
143 | | // call one of the Seek methods on the iterator before using it). |
144 | | // @param skip_filters Disables loading/accessing the filter block |
145 | | InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, |
146 | | bool skip_filters = false) override; |
147 | | |
148 | | // @param skip_filters Disables loading/accessing the filter block. |
149 | | // key should be internal key in case bloom filters are used. |
150 | | CHECKED_STATUS Get( |
151 | | const ReadOptions& readOptions, const Slice& key, GetContext* get_context, |
152 | | bool skip_filters = false) override; |
153 | | |
154 | | // Pre-fetch the disk blocks that correspond to the key range specified by |
155 | | // (kbegin, kend). The call will return return error status in the event of |
156 | | // IO or iteration error. |
157 | | CHECKED_STATUS Prefetch(const Slice* begin, const Slice* end) override; |
158 | | |
159 | | // Given a key, return an approximate byte offset in the file where |
160 | | // the data for that key begins (or would begin if the key were |
161 | | // present in the file). The returned value is in terms of file |
162 | | // bytes, and so includes effects like compression of the underlying data. |
163 | | // E.g., the approximate offset of the last key in the table will |
164 | | // be close to the file length. |
165 | | uint64_t ApproximateOffsetOf(const Slice& key) override; |
166 | | |
167 | | // Returns true if the block for the specified key is in cache. |
168 | | // REQUIRES: key is in this table && block cache enabled |
169 | | bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); |
170 | | |
171 | | // Set up the table for Compaction. Might change some parameters with |
172 | | // posix_fadvise |
173 | | void SetupForCompaction() override; |
174 | | |
175 | | std::shared_ptr<const TableProperties> GetTableProperties() const override; |
176 | | |
177 | | size_t ApproximateMemoryUsage() const override; |
178 | | |
179 | | // convert SST file to a human readable form |
180 | | CHECKED_STATUS DumpTable(WritableFile* out_file) override; |
181 | | |
182 | | // input_iter: if it is not null, update this one and return it as Iterator |
183 | | InternalIterator* NewDataBlockIterator( |
184 | | const ReadOptions& ro, const Slice& index_value, BlockType block_type, |
185 | | BlockIter* input_iter = nullptr); |
186 | | |
187 | | const ImmutableCFOptions& ioptions(); |
188 | | |
189 | | yb::Result<std::string> GetMiddleKey() override; |
190 | | |
191 | | ~BlockBasedTable(); |
192 | | |
193 | | bool TEST_filter_block_preloaded() const; |
194 | | bool TEST_index_reader_loaded() const; |
195 | | |
196 | | private: |
197 | | template <class TValue> |
198 | | struct CachableEntry; |
199 | | |
200 | | struct FileReaderWithCachePrefix; |
201 | | |
202 | | struct Rep; |
203 | | Rep* rep_; |
204 | | |
205 | | class BlockEntryIteratorState; |
206 | | class IndexIteratorHolder; |
207 | | |
208 | | // Returns filter block handle for fixed-size bloom filter using filter index and filter key. |
209 | | CHECKED_STATUS GetFixedSizeFilterBlockHandle(const Slice& filter_key, |
210 | | BlockHandle* filter_block_handle) const; |
211 | | |
212 | | // Returns key to be added to filter or verified against filter based on internal_key. |
213 | | Slice GetFilterKeyFromInternalKey(const Slice &internal_key) const; |
214 | | |
215 | | // Returns key to be added to filter or verified against filter based on user_key. |
216 | | Slice GetFilterKeyFromUserKey(const Slice& user_key) const; |
217 | | |
218 | | // If `no_io == true`, we will not try to read filter/index from sst file (except fixed-size |
219 | | // filter blocks) were they not present in cache yet. |
220 | | // filter_key is only required when using fixed-size bloom filter in order to use the filter index |
221 | | // to get the correct filter block. |
222 | | // Note: even if we check prefix match we still need to get filter based on filter_key, not its |
223 | | // prefix, because prefix for the key goes to the same filter block as key itself. |
224 | | CachableEntry<FilterBlockReader> GetFilter(const QueryId query_id, |
225 | | bool no_io = false, |
226 | | const Slice* filter_key = nullptr) const; |
227 | | |
228 | | // Returns index reader. |
229 | | // If index reader is not stored in either block or internal cache: |
230 | | // - If read_options.read_tier == kBlockCacheTier: Status::Incomplete error will be returned. |
231 | | // - If read_options.read_tier != kBlockCacheTier: new index reader will be created and cached. |
232 | | yb::Result<CachableEntry<IndexReader>> GetIndexReader(const ReadOptions& read_options); |
233 | | |
234 | | // Get the iterator from the index reader. |
235 | | // If input_iter is not set, return new Iterator |
236 | | // If input_iter is set, update it and return: |
237 | | // - newly created data index iterator in case it was created (if we use multi-level data index, |
238 | | // input_iter is an iterator of the top level index, but not the whole index iterator). |
239 | | // - nullptr if input_iter is a data index iterator and no new iterators were created. |
240 | | // |
241 | | // Note: ErrorIterator with error will be returned if GetIndexReader returned an error. |
242 | | InternalIterator* NewIndexIterator(const ReadOptions& read_options, |
243 | | BlockIter* input_iter = nullptr); |
244 | | |
245 | | // Read block cache from block caches (if set): block_cache and |
246 | | // block_cache_compressed. |
247 | | // On success, Status::OK with be returned and @block will be populated with |
248 | | // pointer to the block as well as its block handle. |
249 | | static CHECKED_STATUS GetDataBlockFromCache( |
250 | | const Slice& block_cache_key, const Slice& compressed_block_cache_key, |
251 | | Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, |
252 | | const ReadOptions& read_options, BlockBasedTable::CachableEntry<Block>* block, |
253 | | uint32_t format_version, BlockType block_type, |
254 | | const std::shared_ptr<yb::MemTracker>& mem_tracker); |
255 | | |
256 | | // Put a raw block (maybe compressed) to the corresponding block caches. |
257 | | // This method will perform decompression against raw_block if needed and then |
258 | | // populate the block caches. |
259 | | // On success, Status::OK will be returned; also @block will be populated with |
260 | | // uncompressed block and its cache handle. |
261 | | // |
262 | | // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be |
263 | | // responsible for releasing its memory if error occurs. |
264 | | static CHECKED_STATUS PutDataBlockToCache( |
265 | | const Slice& block_cache_key, const Slice& compressed_block_cache_key, |
266 | | Cache* block_cache, Cache* block_cache_compressed, |
267 | | const ReadOptions& read_options, Statistics* statistics, |
268 | | CachableEntry<Block>* block, Block* raw_block, uint32_t format_version, |
269 | | const std::shared_ptr<yb::MemTracker>& mem_tracker); |
270 | | |
271 | | // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found |
272 | | // after a call to Seek(key), until handle_result returns false. |
273 | | // May not make such a call if filter policy says that key is not present. |
274 | | friend class TableCache; |
275 | | friend class BlockBasedTableBuilder; |
276 | | friend class BloomFilterAwareFileFilter; |
277 | | |
278 | | void ReadMeta(const Footer& footer); |
279 | | |
280 | | // Create a index reader based on the index type stored in the table. |
281 | | // Optionally, user can pass a preloaded meta_index_iter for the index that |
282 | | // need to access extra meta blocks for index construction. This parameter |
283 | | // helps avoid re-reading meta index block if caller already created one. |
284 | | CHECKED_STATUS CreateDataBlockIndexReader( |
285 | | std::unique_ptr<IndexReader>* index_reader, |
286 | | InternalIterator* preloaded_meta_index_iter = nullptr); |
287 | | |
288 | | bool NonBlockBasedFilterKeyMayMatch(FilterBlockReader* filter, const Slice& filter_key) const; |
289 | | |
290 | | CHECKED_STATUS ReadPropertiesBlock(InternalIterator* meta_iter); |
291 | | |
292 | | CHECKED_STATUS SetupFilter(InternalIterator* meta_iter); |
293 | | |
294 | | // Read the meta block from sst. |
295 | | static CHECKED_STATUS ReadMetaBlock( |
296 | | Rep* rep, std::unique_ptr<Block>* meta_block, std::unique_ptr<InternalIterator>* iter); |
297 | | |
298 | | // Create the filter from the filter block. |
299 | | static FilterBlockReader* ReadFilterBlock(const BlockHandle& filter_block, Rep* rep, |
300 | | size_t* filter_size = nullptr); |
301 | | |
302 | | // CreateFilterIndexReader from sst |
303 | | CHECKED_STATUS CreateFilterIndexReader(std::unique_ptr<IndexReader>* filter_index_reader); |
304 | | |
305 | | // Helper function to setup the cache key's prefix for block of file passed within a reader |
306 | | // instance. Used for both data and metadata files. |
307 | | static void SetupCacheKeyPrefix(Rep* rep, FileReaderWithCachePrefix* reader_with_cache_prefix); |
308 | | |
309 | | FileReaderWithCachePrefix* GetBlockReader(BlockType block_type); |
310 | | KeyValueEncodingFormat GetKeyValueEncodingFormat(BlockType block_type); |
311 | | |
312 | 72.6k | explicit BlockBasedTable(Rep* rep) : rep_(rep) {} |
313 | | |
314 | | // Helper functions for DumpTable() |
315 | | CHECKED_STATUS DumpIndexBlock(WritableFile* out_file); |
316 | | CHECKED_STATUS DumpDataBlocks(WritableFile* out_file); |
317 | | }; |
318 | | |
319 | | } // namespace rocksdb |
320 | | |
321 | | #endif // YB_ROCKSDB_TABLE_BLOCK_BASED_TABLE_READER_H |