/Users/deen/code/yugabyte-db/src/yb/rocksdb/table.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | | // Use of this source code is governed by a BSD-style license that can be |
3 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
4 | | // |
5 | | // The following only applies to changes made to this file as part of YugaByte development. |
6 | | // |
7 | | // Portions Copyright (c) YugaByte, Inc. |
8 | | // |
9 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
10 | | // in compliance with the License. You may obtain a copy of the License at |
11 | | // |
12 | | // http://www.apache.org/licenses/LICENSE-2.0 |
13 | | // |
14 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
15 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
16 | | // or implied. See the License for the specific language governing permissions and limitations |
17 | | // under the License. |
18 | | // |
19 | | // Currently we support two types of tables: plain table and block-based table. |
20 | | // 1. Block-based table: this is the default table type that we inherited from |
21 | | // LevelDB, which was designed for storing data in hard disk or flash |
22 | | // device. |
23 | | // 2. Plain table: it is one of RocksDB's SST file format optimized |
24 | | // for low query latency on pure-memory or really low-latency media. |
25 | | // |
26 | | // A tutorial of rocksdb table formats is available here: |
27 | | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats |
28 | | // |
29 | | // Example code is also available |
30 | | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples |
31 | | |
32 | | #ifndef YB_ROCKSDB_TABLE_H |
33 | | #define YB_ROCKSDB_TABLE_H |
34 | | |
35 | | #include <memory> |
36 | | #include <string> |
37 | | #include <unordered_map> |
38 | | |
39 | | #include "yb/rocksdb/options.h" |
40 | | #include "yb/rocksdb/status.h" |
41 | | #include "yb/rocksdb/types.h" |
42 | | |
43 | | #include "yb/util/size_literals.h" |
44 | | |
45 | | namespace rocksdb { |
46 | | |
47 | | // -- Block-based Table |
48 | | class FlushBlockPolicyFactory; |
49 | | struct TableReaderOptions; |
50 | | struct TableBuilderOptions; |
51 | | class TableBuilder; |
52 | | class TableReader; |
53 | | class WritableFileWriter; |
54 | | struct EnvOptions; |
55 | | struct Options; |
56 | | |
57 | | using std::unique_ptr; |
58 | | using namespace yb::size_literals; |
59 | | |
// Checksum algorithm used to protect each block of an SST file. The chosen
// type is recorded in the file, so existing files remain readable after the
// option is changed.
enum ChecksumType : char {
  kNoChecksum = 0x0,  // not yet supported. Will fail
  kCRC32c = 0x1,
  kxxHash = 0x2,
};
65 | | |
// Layout of the index block(s) in a block-based table, controlling how data
// blocks are located for a given key.
YB_DEFINE_ENUM(IndexType,
    // A space efficient index block that is optimized for binary-search-based index.
    (kBinarySearch)

    // The hash index, if enabled, will do the hash lookup when `Options.prefix_extractor` is
    // provided.
    (kHashSearch)

    // Index is partitioned into blocks based on size according to TableOptions::index_block_size
    // and TableOptions::min_keys_per_index_block. To guarantee that index blocks fit required size,
    // index could have multiple levels. All levels are binary-search-based indexes.
    (kMultiLevelBinarySearch)
);
79 | | |
// Configuration for block-based SST files (the default table format).
// For advanced users only.
// NOTE(review): options_helper reportedly uses offsets into this struct (see the
// comment at supported_filter_policies), so member order should not be changed.
struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy.
  // which provides a configurable way to determine when to flush a block in
  // the block based tables. If not set, table builder will use the default
  // block flush policy, which cuts blocks by block size (please refer to
  // `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false.
  //
  // Indicating if we'd put index/filter blocks to the block cache.
  // If not specified, each "table reader" object will pre-load index/filter
  // block during table initialization.
  // Note: Fixed-size bloom filter data blocks are never pre-loaded.
  bool cache_index_and_filter_blocks = false;

  // Which index layout to build for new files; see IndexType above.
  IndexType index_type = IndexType::kMultiLevelBinarySearch;

  // Influence the behavior when kHashSearch is used.
  // if false, stores a precise prefix to block range mapping
  // if true, does not store prefix and allows prefix hash collision
  // (less memory consumption)
  bool hash_index_allow_collision = true;

  // Use the specified checksum type. Newly created table files will be
  // protected with this checksum type. Old table files will still be readable,
  // even though they have different checksum type.
  ChecksumType checksum = kCRC32c;

  // Disable block cache. If this is set to true,
  // then no block cache should be used, and the block_cache should
  // point to a nullptr object.
  bool no_block_cache = false;

  // If non-NULL use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  std::shared_ptr<Cache> block_cache = nullptr;

  // If non-NULL use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  std::shared_ptr<Cache> block_cache_compressed = nullptr;

  // Approximate size of user data packed per block, in bytes. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  // It is preferred to set large block size as it reduces the memory requirement
  // for indexing.
  size_t block_size = 4 * 1024;

  // Size of each filter block, in bytes. Only applicable for fixed size filter block.
  size_t filter_block_size = 64 * 1024;

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is less
  // than this specified number and adding a new record to the block will
  // exceed the configured block size, then this block will be closed and the
  // new record will be written to the next block.
  int block_size_deviation = 10;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone. The minimum value allowed is 1. Any smaller
  // value will be silently overwritten with 1.
  int block_restart_interval = 16;

  // Same as block_restart_interval but used for the index block.
  int index_block_restart_interval = 1;

  // Index block size for sharded index. Applied to data index when kMultiLevelBinarySearch is used.
  size_t index_block_size = 4_KB;

  // For kMultiLevelBinarySearch: minimum number of keys to put in index block. This constraint is
  // used to avoid too many index levels in case we have large keys.
  size_t min_keys_per_index_block = 64;

  // Use delta encoding to compress keys in data blocks.
  // Iterator::PinData() requires this option to be disabled.
  //
  // Default: true
  bool use_delta_encoding = true;

  // Specifies format for encoding entries in data blocks.
  KeyValueEncodingFormat data_block_key_value_encoding_format =
      KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix;

  // If non-nullptr, use the specified filter policy for new SST files to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  typedef std::shared_ptr<const FilterPolicy> FilterPolicyPtr;
  FilterPolicyPtr filter_policy = nullptr;

  // Other filter policies we support for backward compatibility to be able to use bloom filters
  // for existing SST files.
  // Note: wrapped into std::shared_ptr for options_helper to be compilable. Without wrapping there
  // is an error: offset of on non-standard-layout type 'struct BlockBasedTableOptions'.
  typedef std::unordered_map<std::string, FilterPolicyPtr> FilterPoliciesMap;
  std::shared_ptr<FilterPoliciesMap> supported_filter_policies;

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  bool whole_key_filtering = true;

  // If true, block will not be explicitly flushed to disk during building
  // a SstTable. Instead, buffer in WritableFileWriter will take
  // care of the flushing when it is full.
  //
  // On Windows, this option helps a lot when unbuffered I/O
  // (allow_os_buffer = false) is used, since it avoids small
  // unbuffered disk write.
  //
  // User may also adjust writable_file_max_buffer_size to optimize disk I/O
  // size.
  //
  // Default: false
  bool skip_table_builder_flush = false;

  // We currently have three versions:
  // 0 -- This version is currently written out by all RocksDB's versions by
  // default. Can be read by really old RocksDB's. Doesn't support changing
  // checksum (default is CRC32).
  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
  // checksum, like xxHash. It is written by RocksDB when
  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
  // 0 is silently upconverted)
  // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
  // don't plan to run RocksDB before version 3.10, you should probably use
  // this.
  // This option only affects newly written tables. When reading existing tables,
  // the information about version is read from the footer.
  uint32_t format_version = 2;
};
215 | | |
// Names of table properties that are specific to block-based tables. These
// keys index into the user-collected-properties map stored in each SST file.
struct BlockBasedTablePropertyNames {
  // value of this property is a fixed int32 number.
  static const char kIndexType[];
  // number of index levels for multi-level index, int32.
  static const char kNumIndexLevels[];
  // value is "1" for true and "0" for false.
  static const char kWholeKeyFiltering[];
  // value is "1" for true and "0" for false.
  static const char kPrefixFiltering[];
  // value is a uint8_t.
  static const char kDataBlockKeyValueEncodingFormat[];
};
229 | | |
// Create default block based table factory.
// The caller takes ownership of the returned pointer.
extern TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
233 | | |
234 | | #ifndef ROCKSDB_LITE |
235 | | |
// Key encoding scheme for plain-table SST files. The chosen value is stored
// in the file, so files with different encodings can coexist in one DB.
enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunity to write the same prefix once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same key.
  // The Name() value of the prefix extractor will be stored in the file. When
  // reopening the file, the name of the options.prefix_extractor given will be
  // bitwise compared to the prefix extractors stored in the file. An error
  // will be returned if the two don't match.
  kPrefix,
};
252 | | |
// Names of table properties that are specific to plain tables. These keys
// index into the user-collected-properties map stored in each SST file.
struct PlainTablePropertyNames {
  static const char kPrefixExtractorName[];
  static const char kEncodingType[];
  static const char kBloomVersion[];
  static const char kNumBloomBlocks[];
};
260 | | |
// Sentinel for PlainTableOptions::user_key_len meaning keys have variable length.
const uint32_t kPlainTableVariableLength = 0;

// Configuration for plain-table SST files (optimized for low-latency,
// memory-resident workloads).
struct PlainTableOptions {
  // @user_key_len: plain table has optimization for fix-sized keys, which can
  //                be specified via user_key_len.  Alternatively, you can pass
  //                `kPlainTableVariableLength` if your keys have variable
  //                lengths.
  uint32_t user_key_len = kPlainTableVariableLength;

  // @bloom_bits_per_key: the number of bits used for bloom filter per prefix.
  //                      You may disable it by passing a zero.
  int bloom_bits_per_key = 10;

  // @hash_table_ratio: the desired utilization of the hash table used for
  //                    prefix hashing.
  //                    hash_table_ratio = number of prefixes / #buckets in the
  //                    hash table
  double hash_table_ratio = 0.75;

  // @index_sparseness: inside each prefix, need to build one index record for
  //                    how many keys for binary search inside each hash bucket.
  //                    For encoding type kPrefix, the value will be used when
  //                    writing to determine an interval to rewrite the full
  //                    key. It will also be used as a suggestion and satisfied
  //                    when possible.
  size_t index_sparseness = 16;

  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
  //                      Otherwise from huge page TLB. The user needs to
  //                      reserve huge pages for it to be allocated, like:
  //                          sysctl -w vm.nr_hugepages=20
  //                      See linux doc Documentation/vm/hugetlbpage.txt
  size_t huge_page_tlb_size = 0;

  // @encoding_type: how to encode the keys. See enum EncodingType above for
  //                 the choices. The value will determine how to encode keys
  //                 when writing to a new SST file. This value will be stored
  //                 inside the SST file which will be used when reading from
  //                 the file, which makes it possible for users to choose
  //                 different encoding type when reopening a DB. Files with
  //                 different encoding types can co-exist in the same DB and
  //                 can be read.
  EncodingType encoding_type = kPlain;

  // @full_scan_mode: mode for reading the whole file one record by one without
  //                  using the index.
  bool full_scan_mode = false;

  // @store_index_in_file: compute plain table index and bloom filter during
  //                       file building and store it in file. When reading
  //                       file, index will be mmaped instead of recomputation.
  bool store_index_in_file = false;
};
314 | | |
// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to make it
// work. Look-up starts with prefix hash lookup for key prefix. Inside the
// hash bucket found, a binary search is executed for hash conflicts. Finally,
// a linear search is used.
// The caller takes ownership of the returned pointer.

extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
                                              PlainTableOptions());
323 | | |
324 | | #endif // ROCKSDB_LITE |
325 | | |
326 | | class RandomAccessFileReader; |
327 | | |
// A base class for table factories.
// A table factory knows how to create readers and builders for one particular
// on-disk table format (block-based, plain, or adaptive).
class TableFactory {
 public:
  virtual ~TableFactory() {}

  // The type of the table.
  //
  // The client of this package should switch to a new name whenever
  // the table format implementation changes.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Returns a Table object table that can fetch data from file specified
  // in parameter file. It's the caller's responsibility to make sure
  // file is in the correct format.
  //
  // NewTableReader() is called in three places:
  // (1) TableCache::FindTable() calls the function when table cache miss
  // and cache the table object returned.
  // (2) SstFileReader (for SST Dump) opens the table and dump the table
  // contents using the iterator of the table.
  // (3) DBImpl::AddFile() calls this function to read the contents of
  // the sst file it's attempting to add
  //
  // table_reader_options is a TableReaderOptions which contain all the
  //    needed parameters and configuration to open the table.
  // base_file is a file handler to handle the base file for the table.
  // base_file_size is the physical file size of the base file.
  // table_reader is the output table reader.
  virtual Status NewTableReader(
      const TableReaderOptions& table_reader_options,
      unique_ptr<RandomAccessFileReader>&& base_file, uint64_t base_file_size,
      unique_ptr<TableReader>* table_reader) const = 0;

  // Whether SST split into metadata and data file(s) is supported for writing.
  // There is an AdaptiveTableFactory inheriting common TableFactory interface. AdaptiveTableFactory
  // can delegate reads to one type of table factory and writes to another type of table factory
  // and in that case it can only support split SST either for read or write.
  virtual bool IsSplitSstForWriteSupported() const = 0;

  // Return a table builder to write to file(s) for this table type.
  // Particular table factory implementations can support either writing whole SST to a single file
  // (passed in base_file parameter, data_file should be nullptr in that case) or support separate
  // files for data (data_file) and metadata (base_file).
  //
  // It is called in several places:
  // (1) When flushing memtable to a level-0 output file, it creates a table
  // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  // files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
  // by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  // SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // ImmutableCFOptions is a subset of Options that can not be altered.
  // Multiple configurations can be accessed from there, including and not limited
  // to compression options. file is a handle of a writable file.
  // It is the caller's responsibility to keep the file open and close the file
  // after closing the table builder. compression_type is the compression type
  // to use in this table.
  virtual TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      uint32_t column_family_id, WritableFileWriter* base_file,
      WritableFileWriter* data_file = nullptr) const = 0;

  // Sanitizes the specified DB Options and ColumnFamilyOptions.
  //
  // If the function cannot find a way to sanitize the input DB Options,
  // a non-ok Status will be returned.
  virtual Status SanitizeOptions(
      const DBOptions& db_opts,
      const ColumnFamilyOptions& cf_opts) const = 0;

  // Return a string that contains printable format of table configurations.
  // RocksDB prints configurations at DB Open().
  virtual std::string GetPrintableTableOptions() const = 0;

  // Returns the raw pointer of the table options that is used by this
  // TableFactory, or nullptr if this function is not supported.
  // Since the return value is a raw pointer, the TableFactory owns the
  // pointer and the caller should not delete the pointer.
  //
  // In certain cases, it is desirable to alter the underlying options when the
  // TableFactory is not used by any open DB by casting the returned pointer
  // to the right class. For instance, if BlockBasedTableFactory is used,
  // then the pointer can be cast to BlockBasedTableOptions.
  //
  // Note that changing the underlying TableFactory options while the
  // TableFactory is currently used by any open DB is undefined behavior.
  // Developers should use DB::SetOption() instead to dynamically change
  // options while the DB is open.
  virtual void* GetOptions() { return nullptr; }

  // Returns SST file filter for pruning out files which doesn't contain some part of user_key.
  // It should be in sync with FilterPolicy used for bloom filter construction. For example,
  // file filter should only consider hashed components of the key when using with
  // DocDbAwareFilterPolicy and HashedComponentsExtractor.
  // Default implementation returns no filter (nullptr), i.e. no files are pruned.
  virtual std::shared_ptr<TableAwareReadFileFilter> NewTableAwareReadFileFilter(
      const ReadOptions &read_options, const Slice &user_key) const { return nullptr; }
};
432 | | |
433 | | #ifndef ROCKSDB_LITE |
// Create a special table factory that can open either of the supported
// table formats, based on setting inside the SST files. It should be used to
// convert a DB from one table format to another.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory:  block based table factory to use. If NULL, use
//                              a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default one.
// The caller takes ownership of the returned pointer.
extern TableFactory* NewAdaptiveTableFactory(
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
    std::shared_ptr<TableFactory> plain_table_factory = nullptr);
445 | | |
446 | | #endif // ROCKSDB_LITE |
447 | | |
448 | | } // namespace rocksdb |
449 | | |
450 | | #endif // YB_ROCKSDB_TABLE_H |