YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/rocksdb/table.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
//
5
// The following only applies to changes made to this file as part of YugaByte development.
6
//
7
// Portions Copyright (c) YugaByte, Inc.
8
//
9
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
10
// in compliance with the License.  You may obtain a copy of the License at
11
//
12
// http://www.apache.org/licenses/LICENSE-2.0
13
//
14
// Unless required by applicable law or agreed to in writing, software distributed under the License
15
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16
// or implied.  See the License for the specific language governing permissions and limitations
17
// under the License.
18
//
19
// Currently we support two types of tables: plain table and block-based table.
20
//   1. Block-based table: this is the default table type that we inherited from
21
//      LevelDB, which was designed for storing data in hard disk or flash
22
//      device.
23
//   2. Plain table: it is one of RocksDB's SST file format optimized
24
//      for low query latency on pure-memory or really low-latency media.
25
//
26
// A tutorial of rocksdb table formats is available here:
27
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
28
//
29
// Example code is also available
30
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples
31
32
#ifndef YB_ROCKSDB_TABLE_H
33
#define YB_ROCKSDB_TABLE_H
34
35
#include <memory>
36
#include <string>
37
#include <unordered_map>
38
39
#include "yb/rocksdb/options.h"
40
#include "yb/rocksdb/status.h"
41
#include "yb/rocksdb/types.h"
42
43
#include "yb/util/size_literals.h"
44
45
namespace rocksdb {
46
47
// -- Block-based Table
48
class FlushBlockPolicyFactory;
49
struct TableReaderOptions;
50
struct TableBuilderOptions;
51
class TableBuilder;
52
class TableReader;
53
class WritableFileWriter;
54
struct EnvOptions;
55
struct Options;
56
57
using std::unique_ptr;
58
using namespace yb::size_literals;
59
60
enum ChecksumType : char {
61
  kNoChecksum = 0x0,  // not yet supported. Will fail
62
  kCRC32c = 0x1,
63
  kxxHash = 0x2,
64
};
65
66
YB_DEFINE_ENUM(IndexType,
67
  // A space efficient index block that is optimized for binary-search-based index.
68
  (kBinarySearch)
69
70
  // The hash index, if enabled, will do the hash lookup when `Options.prefix_extractor` is
71
  // provided.
72
  (kHashSearch)
73
74
  // Index is partitioned into blocks based on size according to TableOptions::index_block_size
75
  // and TableOptions::min_keys_per_index_block. To guarantee that index blocks fit required size,
76
  // index could have multiple levels. All levels are binary-search-based indexes.
77
  (kMultiLevelBinarySearch)
78
);
79
80
// For advanced user only
81
struct BlockBasedTableOptions {
82
  // @flush_block_policy_factory creates the instances of flush block policy.
83
  // which provides a configurable way to determine when to flush a block in
84
  // the block based tables.  If not set, table builder will use the default
85
  // block flush policy, which cut blocks by block size (please refer to
86
  // `FlushBlockBySizePolicy`).
87
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
88
89
  // TODO(kailiu) Temporarily disable this feature by making the default value
90
  // to be false.
91
  //
92
  // Indicating if we'd put index/filter blocks to the block cache.
93
  // If not specified, each "table reader" object will pre-load index/filter
94
  // block during table initialization.
95
  // Note: Fixed-size bloom filter data blocks are never pre-loaded.
96
  bool cache_index_and_filter_blocks = false;
97
98
  IndexType index_type = IndexType::kMultiLevelBinarySearch;
99
100
  // Influence the behavior when kHashSearch is used.
101
  // if false, stores a precise prefix to block range mapping
102
  // if true, does not store prefix and allows prefix hash collision
103
  // (less memory consumption)
104
  bool hash_index_allow_collision = true;
105
106
  // Use the specified checksum type. Newly created table files will be
107
  // protected with this checksum type. Old table files will still be readable,
108
  // even though they have different checksum type.
109
  ChecksumType checksum = kCRC32c;
110
111
  // Disable block cache. If this is set to true,
112
  // then no block cache should be used, and the block_cache should
113
  // point to a nullptr object.
114
  bool no_block_cache = false;
115
116
  // If non-NULL use the specified cache for blocks.
117
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
118
  std::shared_ptr<Cache> block_cache = nullptr;
119
120
  // If non-NULL use the specified cache for compressed blocks.
121
  // If NULL, rocksdb will not use a compressed block cache.
122
  std::shared_ptr<Cache> block_cache_compressed = nullptr;
123
124
  // Approximate size of user data packed per block, in bytes. Note that the
125
  // block size specified here corresponds to uncompressed data.  The
126
  // actual size of the unit read from disk may be smaller if
127
  // compression is enabled.  This parameter can be changed dynamically.
128
  // It is preferred to set large block size as it reduces the memory requirement
129
  // for indexing.
130
  size_t block_size = 4 * 1024;
131
132
  // Size of each filter block, in bytes. Only applicable for fixed size filter block.
133
  size_t filter_block_size = 64 * 1024;
134
135
  // This is used to close a block before it reaches the configured
136
  // 'block_size'. If the percentage of free space in the current block is less
137
  // than this specified number and adding a new record to the block will
138
  // exceed the configured block size, then this block will be closed and the
139
  // new record will be written to the next block.
140
  int block_size_deviation = 10;
141
142
  // Number of keys between restart points for delta encoding of keys.
143
  // This parameter can be changed dynamically.  Most clients should
144
  // leave this parameter alone.  The minimum value allowed is 1.  Any smaller
145
  // value will be silently overwritten with 1.
146
  int block_restart_interval = 16;
147
148
  // Same as block_restart_interval but used for the index block.
149
  int index_block_restart_interval = 1;
150
151
  // Index block size for sharded index. Applied to data index when kMultiLevelBinarySearch is used.
152
  size_t index_block_size = 4_KB;
153
154
  // For kMultiLevelBinarySearch: minimum number of keys to put in index block. This constraint is
155
  // used to avoid too many index levels in case we have large keys.
156
  size_t min_keys_per_index_block = 64;
157
158
  // Use delta encoding to compress keys in data blocks.
159
  // Iterator::PinData() requires this option to be disabled.
160
  //
161
  // Default: true
162
  bool use_delta_encoding = true;
163
164
  // Specifies format for encoding entries in data blocks.
165
  KeyValueEncodingFormat data_block_key_value_encoding_format =
166
      KeyValueEncodingFormat::kKeyDeltaEncodingSharedPrefix;
167
168
  // If non-nullptr, use the specified filter policy for new SST files to reduce disk reads.
169
  // Many applications will benefit from passing the result of
170
  // NewBloomFilterPolicy() here.
171
  typedef std::shared_ptr<const FilterPolicy> FilterPolicyPtr;
172
  FilterPolicyPtr filter_policy = nullptr;
173
174
  // Other filter policies we support for backward compatibility to be able to use bloom filters
175
  // for existing SST files.
176
  // Note: wrapped into std::shared_ptr for options_helper to be compilable. Without wrapping there
177
  // is an error: offset of on non-standard-layout type 'struct BlockBasedTableOptions'.
178
  typedef std::unordered_map<std::string, FilterPolicyPtr> FilterPoliciesMap;
179
  std::shared_ptr<FilterPoliciesMap> supported_filter_policies;
180
181
  // If true, place whole keys in the filter (not just prefixes).
182
  // This must generally be true for gets to be efficient.
183
  bool whole_key_filtering = true;
184
185
  // If true, block will not be explicitly flushed to disk during building
186
  // a SstTable. Instead, buffer in WritableFileWriter will take
187
  // care of the flushing when it is full.
188
  //
189
  // On Windows, this option helps a lot when unbuffered I/O
190
  // (allow_os_buffer = false) is used, since it avoids small
191
  // unbuffered disk write.
192
  //
193
  // User may also adjust writable_file_max_buffer_size to optimize disk I/O
194
  // size.
195
  //
196
  // Default: false
197
  bool skip_table_builder_flush = false;
198
199
  // We currently have three versions:
200
  // 0 -- This version is currently written out by all RocksDB's versions by
201
  // default.  Can be read by really old RocksDB's. Doesn't support changing
202
  // checksum (default is CRC32).
203
  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
204
  // checksum, like xxHash. It is written by RocksDB when
205
  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
206
  // 0 is silently upconverted)
207
  // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
208
  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
209
  // don't plan to run RocksDB before version 3.10, you should probably use
210
  // this.
211
  // This option only affects newly written tables. When reading exising tables,
212
  // the information about version is read from the footer.
213
  uint32_t format_version = 2;
214
};
215
216
// Table Properties that are specific to block-based table properties.
217
struct BlockBasedTablePropertyNames {
218
  // value of this property is a fixed int32 number.
219
  static const char kIndexType[];
220
  // number of index levels for multi-level index, int32.
221
  static const char kNumIndexLevels[];
222
  // value is "1" for true and "0" for false.
223
  static const char kWholeKeyFiltering[];
224
  // value is "1" for true and "0" for false.
225
  static const char kPrefixFiltering[];
226
  // value is a uint8_t.
227
  static const char kDataBlockKeyValueEncodingFormat[];
228
};
229
230
// Create default block based table factory.
231
extern TableFactory* NewBlockBasedTableFactory(
232
    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
233
234
#ifndef ROCKSDB_LITE
235
236
enum EncodingType : char {
237
  // Always write full keys without any special encoding.
238
  kPlain,
239
  // Find opportunity to write the same prefix once for multiple rows.
240
  // In some cases, when a key follows a previous key with the same prefix,
241
  // instead of writing out the full key, it just writes out the size of the
242
  // shared prefix, as well as other bytes, to save some bytes.
243
  //
244
  // When using this option, the user is required to use the same prefix
245
  // extractor to make sure the same prefix will be extracted from the same key.
246
  // The Name() value of the prefix extractor will be stored in the file. When
247
  // reopening the file, the name of the options.prefix_extractor given will be
248
  // bitwise compared to the prefix extractors stored in the file. An error
249
  // will be returned if the two don't match.
250
  kPrefix,
251
};
252
253
// Table Properties that are specific to plain table properties.
254
struct PlainTablePropertyNames {
255
  static const char kPrefixExtractorName[];
256
  static const char kEncodingType[];
257
  static const char kBloomVersion[];
258
  static const char kNumBloomBlocks[];
259
};
260
261
const uint32_t kPlainTableVariableLength = 0;
262
263
struct PlainTableOptions {
264
  // @user_key_len: plain table has optimization for fix-sized keys, which can
265
  //                be specified via user_key_len.  Alternatively, you can pass
266
  //                `kPlainTableVariableLength` if your keys have variable
267
  //                lengths.
268
  uint32_t user_key_len = kPlainTableVariableLength;
269
270
  // @bloom_bits_per_key: the number of bits used for bloom filer per prefix.
271
  //                      You may disable it by passing a zero.
272
  int bloom_bits_per_key = 10;
273
274
  // @hash_table_ratio: the desired utilization of the hash table used for
275
  //                    prefix hashing.
276
  //                    hash_table_ratio = number of prefixes / #buckets in the
277
  //                    hash table
278
  double hash_table_ratio = 0.75;
279
280
  // @index_sparseness: inside each prefix, need to build one index record for
281
  //                    how many keys for binary search inside each hash bucket.
282
  //                    For encoding type kPrefix, the value will be used when
283
  //                    writing to determine an interval to rewrite the full
284
  //                    key. It will also be used as a suggestion and satisfied
285
  //                    when possible.
286
  size_t index_sparseness = 16;
287
288
  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
289
  //                      Otherwise from huge page TLB. The user needs to
290
  //                      reserve huge pages for it to be allocated, like:
291
  //                          sysctl -w vm.nr_hugepages=20
292
  //                      See linux doc Documentation/vm/hugetlbpage.txt
293
  size_t huge_page_tlb_size = 0;
294
295
  // @encoding_type: how to encode the keys. See enum EncodingType above for
296
  //                 the choices. The value will determine how to encode keys
297
  //                 when writing to a new SST file. This value will be stored
298
  //                 inside the SST file which will be used when reading from
299
  //                 the file, which makes it possible for users to choose
300
  //                 different encoding type when reopening a DB. Files with
301
  //                 different encoding types can co-exist in the same DB and
302
  //                 can be read.
303
  EncodingType encoding_type = kPlain;
304
305
  // @full_scan_mode: mode for reading the whole file one record by one without
306
  //                  using the index.
307
  bool full_scan_mode = false;
308
309
  // @store_index_in_file: compute plain table index and bloom filter during
310
  //                       file building and store it in file. When reading
311
  //                       file, index will be mmaped instead of recomputation.
312
  bool store_index_in_file = false;
313
};
314
315
// -- Plain Table with prefix-only seek
316
// For this factory, you need to set Options.prefix_extrator properly to make it
317
// work. Look-up will starts with prefix hash lookup for key prefix. Inside the
318
// hash bucket found, a binary search is executed for hash conflicts. Finally,
319
// a linear search is used.
320
321
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
322
                                              PlainTableOptions());
323
324
#endif  // ROCKSDB_LITE
325
326
class RandomAccessFileReader;
327
328
// A base class for table factories.
329
class TableFactory {
330
 public:
331
5.26M
  virtual ~TableFactory() {}
332
333
  // The type of the table.
334
  //
335
  // The client of this package should switch to a new name whenever
336
  // the table format implementation changes.
337
  //
338
  // Names starting with "rocksdb." are reserved and should not be used
339
  // by any clients of this package.
340
  virtual const char* Name() const = 0;
341
342
  // Returns a Table object table that can fetch data from file specified
343
  // in parameter file. It's the caller's responsibility to make sure
344
  // file is in the correct format.
345
  //
346
  // NewTableReader() is called in three places:
347
  // (1) TableCache::FindTable() calls the function when table cache miss
348
  //     and cache the table object returned.
349
  // (2) SstFileReader (for SST Dump) opens the table and dump the table
350
  //     contents using the interator of the table.
351
  // (3) DBImpl::AddFile() calls this function to read the contents of
352
  //     the sst file it's attempting to add
353
  //
354
  // table_reader_options is a TableReaderOptions which contain all the
355
  //    needed parameters and configuration to open the table.
356
  // base_file is a file handler to handle the base file for the table.
357
  // base_file_size is the physical file size of the base file.
358
  // table_reader is the output table reader.
359
  virtual Status NewTableReader(
360
      const TableReaderOptions& table_reader_options,
361
      unique_ptr<RandomAccessFileReader>&& base_file, uint64_t base_file_size,
362
      unique_ptr<TableReader>* table_reader) const = 0;
363
364
  // Whether SST split into metadata and data file(s) is supported for writing.
365
  // There is a AdaptiveTableFactory inheriting common TableFactory interface. AdaptiveTableFactory
366
  // can delegate reads to one type of table factory and writes to another type of table factory
367
  // and in that case it can only support split SST either for read or write.
368
  virtual bool IsSplitSstForWriteSupported() const = 0;
369
370
  // Return a table builder to write to file(s) for this table type.
371
  // Particular table factory implementations can support either writing whole SST to a single file
372
  // (passed in base_file parameter, data_file should be nullptr in that case) or support separate
373
  // files for data (data_file) and metadata (base_file).
374
  //
375
  // It is called in several places:
376
  // (1) When flushing memtable to a level-0 output file, it creates a table
377
  //     builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
378
  // (2) During compaction, it gets the builder for writing compaction output
379
  //     files in DBImpl::OpenCompactionOutputFile().
380
  // (3) When recovering from transaction logs, it creates a table builder to
381
  //     write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
382
  //     by calling BuildTable())
383
  // (4) When running Repairer, it creates a table builder to convert logs to
384
  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
385
  //
386
  // ImmutableCFOptions is a subset of Options that can not be altered.
387
  // Multiple configured can be acceseed from there, including and not limited
388
  // to compression options. file is a handle of a writable file.
389
  // It is the caller's responsibility to keep the file open and close the file
390
  // after closing the table builder. compression_type is the compression type
391
  // to use in this table.
392
  virtual TableBuilder* NewTableBuilder(
393
      const TableBuilderOptions& table_builder_options,
394
      uint32_t column_family_id, WritableFileWriter* base_file,
395
      WritableFileWriter* data_file = nullptr) const = 0;
396
397
  // Sanitizes the specified DB Options and ColumnFamilyOptions.
398
  //
399
  // If the function cannot find a way to sanitize the input DB Options,
400
  // a non-ok Status will be returned.
401
  virtual Status SanitizeOptions(
402
      const DBOptions& db_opts,
403
      const ColumnFamilyOptions& cf_opts) const = 0;
404
405
  // Return a string that contains printable format of table configurations.
406
  // RocksDB prints configurations at DB Open().
407
  virtual std::string GetPrintableTableOptions() const = 0;
408
409
  // Returns the raw pointer of the table options that is used by this
410
  // TableFactory, or nullptr if this function is not supported.
411
  // Since the return value is a raw pointer, the TableFactory owns the
412
  // pointer and the caller should not delete the pointer.
413
  //
414
  // In certan case, it is desirable to alter the underlying options when the
415
  // TableFactory is not used by any open DB by casting the returned pointer
416
  // to the right class.   For instance, if BlockBasedTableFactory is used,
417
  // then the pointer can be casted to BlockBasedTableOptions.
418
  //
419
  // Note that changing the underlying TableFactory options while the
420
  // TableFactory is currently used by any open DB is undefined behavior.
421
  // Developers should use DB::SetOption() instead to dynamically change
422
  // options while the DB is open.
423
0
  virtual void* GetOptions() { return nullptr; }
424
425
  // Returns SST file filter for pruning out files which doesn't contain some part of user_key.
426
  // It should be in sync with FilterPolicy used for bloom filter construction. For example,
427
  // file filter should only consider hashed components of the key when using with
428
  // DocDbAwareFilterPolicy and HashedComponentsExtractor.
429
  virtual std::shared_ptr<TableAwareReadFileFilter> NewTableAwareReadFileFilter(
430
0
      const ReadOptions &read_options, const Slice &user_key) const { return nullptr; }
431
};
432
433
#ifndef ROCKSDB_LITE
434
// Create a special table factory that can open either of the supported
435
// table formats, based on setting inside the SST files. It should be used to
436
// convert a DB from one table format to another.
437
// @table_factory_to_write: the table factory used when writing to new files.
438
// @block_based_table_factory:  block based table factory to use. If NULL, use
439
//                              a default one.
440
// @plain_table_factory: plain table factory to use. If NULL, use a default one.
441
extern TableFactory* NewAdaptiveTableFactory(
442
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
443
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
444
    std::shared_ptr<TableFactory> plain_table_factory = nullptr);
445
446
#endif  // ROCKSDB_LITE
447
448
}  // namespace rocksdb
449
450
#endif  // YB_ROCKSDB_TABLE_H