YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/rocksdb/options.h
Line
Count
Source
1
// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
// This source code is licensed under the BSD-style license found in the
3
// LICENSE file in the root directory of this source tree. An additional grant
4
// of patent rights can be found in the PATENTS file in the same directory.
5
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6
// Use of this source code is governed by a BSD-style license that can be
7
// found in the LICENSE file. See the AUTHORS file for names of contributors.
8
//
9
// The following only applies to changes made to this file as part of YugaByte development.
10
//
11
// Portions Copyright (c) YugaByte, Inc.
12
//
13
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
14
// in compliance with the License.  You may obtain a copy of the License at
15
//
16
// http://www.apache.org/licenses/LICENSE-2.0
17
//
18
// Unless required by applicable law or agreed to in writing, software distributed under the License
19
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
20
// or implied.  See the License for the specific language governing permissions and limitations
21
// under the License.
22
//
23
24
#ifndef YB_ROCKSDB_OPTIONS_H
25
#define YB_ROCKSDB_OPTIONS_H
26
27
#include <stddef.h>
28
#include <stdint.h>
29
#include <string>
30
#include <memory>
31
#include <vector>
32
#include <limits>
33
#include <unordered_map>
34
35
#include "yb/rocksdb/cache.h"
36
#include "yb/rocksdb/listener.h"
37
#include "yb/util/slice.h"
38
#include "yb/rocksdb/universal_compaction.h"
39
40
#ifdef max
41
#undef max
42
#endif
43
44
namespace yb {
45
46
class MemTracker;
47
class PriorityThreadPool;
48
49
}
50
51
namespace rocksdb {
52
53
class Arena;
54
class BoundaryValuesExtractor;
55
class Cache;
56
class CompactionFilter;
57
class CompactionFilterFactory;
58
class Comparator;
59
class Env;
60
class CompactionFileFilterFactory;
61
enum InfoLogLevel : unsigned char;
62
class SstFileManager;
63
class FilterPolicy;
64
class Logger;
65
class MemTable;
66
class MergeOperator;
67
class Snapshot;
68
class TableFactory;
69
class MemTableRepFactory;
70
class TablePropertiesCollectorFactory;
71
class RateLimiter;
72
class SliceTransform;
73
class Statistics;
74
class InternalIterator;
75
class InternalKeyComparator;
76
class WalFilter;
77
class MemoryMonitor;
78
79
struct FileMetaData;
80
81
typedef std::shared_ptr<const InternalKeyComparator> InternalKeyComparatorPtr;
82
83
// DB contents are stored in a set of blocks, each of which holds a
84
// sequence of key,value pairs.  Each block may be compressed before
85
// being stored in a file.  The following enum describes which
86
// compression method (if any) is used to compress a block.
87
enum CompressionType : char {
88
  // NOTE: do not change the values of existing entries, as these are
89
  // part of the persistent format on disk.
90
  kNoCompression = 0x0,
91
  kSnappyCompression = 0x1,
92
  kZlibCompression = 0x2,
93
  kBZip2Compression = 0x3,
94
  kLZ4Compression = 0x4,
95
  kLZ4HCCompression = 0x5,
96
  // zstd format is not finalized yet, so it's subject to change.
97
  kZSTDNotFinalCompression = 0x40,
98
};
99
100
enum CompactionStyle : char {
101
  // level based compaction style
102
  kCompactionStyleLevel = 0x0,
103
  // Universal compaction style
104
  // Not supported in ROCKSDB_LITE.
105
  kCompactionStyleUniversal = 0x1,
106
  // FIFO compaction style
107
  // Not supported in ROCKSDB_LITE
108
  kCompactionStyleFIFO = 0x2,
109
  // Disable background compaction. Compaction jobs are submitted
110
  // via CompactFiles().
111
  // Not supported in ROCKSDB_LITE
112
  kCompactionStyleNone = 0x3,
113
};
114
115
// In level-based compaction, this determines which file from a level is
116
// picked to merge into the next level. We suggest trying
117
// kMinOverlappingRatio first when tuning your database.
118
enum CompactionPri : char {
119
  // Slightly prioritize larger files by size compensated by #deletes
120
  kByCompensatedSize = 0x0,
121
  // First compact files whose data's latest update time is oldest.
122
  // Try this if you only update some hot keys in small ranges.
123
  kOldestLargestSeqFirst = 0x1,
124
  // First compact files whose range hasn't been compacted to the next level
125
  // for the longest. If your updates are random across the key space,
126
  // write amplification is slightly better with this option.
127
  kOldestSmallestSeqFirst = 0x2,
128
  // First compact files whose ratio between overlapping size in next level
129
  // and its size is the smallest. It in many cases can optimize write
130
  // amplification.
131
  kMinOverlappingRatio = 0x3,
132
};
133
134
enum class WALRecoveryMode : char {
135
  // Original LevelDB recovery
136
  // We tolerate incomplete records in trailing data on all logs
137
  // Use case : This is legacy behavior (default)
138
  kTolerateCorruptedTailRecords = 0x00,
139
  // Recover from clean shutdown
140
  // We don't expect to find any corruption in the WAL
141
  // Use case : This is ideal for unit tests and rare applications that
142
  // require a high consistency guarantee
143
  kAbsoluteConsistency = 0x01,
144
  // Recover to point-in-time consistency
145
  // We stop the WAL playback on discovering WAL inconsistency
146
  // Use case : Ideal for systems that have disk controller cache like
147
  // hard disk, SSD without super capacitor that store related data
148
  kPointInTimeRecovery = 0x02,
149
  // Recovery after a disaster
150
  // We ignore any corruption in the WAL and try to salvage as much data as
151
  // possible
152
  // Use case : Ideal for last ditch effort to recover data or systems that
153
  // operate with low grade unrelated data
154
  kSkipAnyCorruptedRecords = 0x03,
155
};
156
157
struct CompactionOptionsFIFO {
158
  // Once the total size of table files reaches this, we will delete the oldest
159
  // table file
160
  // Default: 1GB
161
  uint64_t max_table_files_size;
162
163
3.64M
  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
164
};
165
166
// Compression options for different compression algorithms like Zlib
167
struct CompressionOptions {
168
  int window_bits;
169
  int level;
170
  int strategy;
171
3.62M
  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
172
  CompressionOptions(int wbits, int _lev, int _strategy)
173
0
      : window_bits(wbits), level(_lev), strategy(_strategy) {}
174
};
175
176
enum UpdateStatus {    // Return status for inplace update callback
177
  UPDATE_FAILED   = 0, // Nothing to update
178
  UPDATED_INPLACE = 1, // Value updated inplace
179
  UPDATED         = 2, // No inplace update. Merged value set
180
};
181
182
struct DbPath {
183
  std::string path;
184
  uint64_t target_size;  // Target size of total files under the path, in bytes.
185
186
0
  DbPath() : target_size(0) {}
187
859k
  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
188
};
189
190
struct Options;
191
192
struct ColumnFamilyOptions {
193
  // Some functions that make it easier to optimize RocksDB
194
195
  // Use this if you don't need to keep the data sorted, i.e. you'll never use
196
  // an iterator, only Put() and Get() API calls
197
  //
198
  // Not supported in ROCKSDB_LITE
199
  ColumnFamilyOptions* OptimizeForPointLookup(
200
      uint64_t block_cache_size_mb);
201
202
  // Default values for some parameters in ColumnFamilyOptions are not
203
  // optimized for heavy workloads and big datasets, which means you might
204
  // observe write stalls under some conditions. As a starting point for tuning
205
  // RocksDB options, use the following two functions:
206
  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
207
  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
208
  // Universal style compaction is focused on reducing Write Amplification
209
  // Factor for big data sets, but increases Space Amplification. You can learn
210
  // more about the different styles here:
211
  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
212
  // Make sure to also call IncreaseParallelism(), which will provide the
213
  // biggest performance gains.
214
  // Note: we might use more memory than memtable_memory_budget during high
215
  // write rate periods
216
  //
217
  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
218
  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
219
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
220
  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
221
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
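  // Illustrative sketch (added as a comment; not part of options.h): the tuning
  // helpers above are typically paired with DBOptions::IncreaseParallelism(), as
  // recommended in the comment. rocksdb::Options inherits from both DBOptions and
  // ColumnFamilyOptions, so all three calls are available on it:
  //
  //   rocksdb::Options options;
  //   options.IncreaseParallelism(16);                          // sizes the thread pools
  //   options.OptimizeLevelStyleCompaction(512 * 1024 * 1024);  // 512MB memtable budget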
222
223
  // -------------------
224
  // Parameters that affect behavior
225
226
  // Comparator used to define the order of keys in the table.
227
  // Default: a comparator that uses lexicographic byte-wise ordering
228
  //
229
  // REQUIRES: The client must ensure that the comparator supplied
230
  // here has the same name and orders keys *exactly* the same as the
231
  // comparator provided to previous open calls on the same DB.
232
  const Comparator* comparator;
233
234
  // REQUIRES: The client must provide a merge operator if Merge operation
235
  // needs to be accessed. Calling Merge on a DB without a merge operator
236
  // would result in Status::NotSupported. The client must ensure that the
237
  // merge operator supplied here has the same name and *exactly* the same
238
  // semantics as the merge operator provided to previous open calls on
239
  // the same DB. The only exception is reserved for upgrade, where a DB
240
  // previously without a merge operator is introduced to Merge operation
241
  // for the first time. It's necessary to specify a merge operator when
242
  // opening the DB in this case.
243
  // Default: nullptr
244
  std::shared_ptr<MergeOperator> merge_operator;
245
246
  // A single CompactionFilter instance to call into during compaction.
247
  // Allows an application to modify/delete a key-value during background
248
  // compaction.
249
  //
250
  // If the client requires a new compaction filter to be used for different
251
  // compaction runs, it can specify compaction_filter_factory instead of this
252
  // option.  The client should specify only one of the two.
253
  // compaction_filter takes precedence over compaction_filter_factory if
254
  // client specifies both.
255
  //
256
  // If multithreaded compaction is being used, the supplied CompactionFilter
257
  // instance may be used from different threads concurrently and so should be
258
  // thread-safe.
259
  //
260
  // Default: nullptr
261
  CompactionFilter* compaction_filter;
262
263
  // This is a factory that provides compaction filter objects which allow
264
  // an application to modify/delete a key-value during background compaction.
265
  //
266
  // A new filter will be created on each compaction run.  If multithreaded
267
  // compaction is being used, each created CompactionFilter will only be used
268
  // from a single thread and so does not need to be thread-safe.
269
  //
270
  // Default: nullptr
271
  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
272
273
  // -------------------
274
  // Parameters that affect performance
275
276
  // Amount of data to build up in memory (backed by an unsorted log
277
  // on disk) before converting to a sorted on-disk file.
278
  //
279
  // Larger values increase performance, especially during bulk loads.
280
  // Up to max_write_buffer_number write buffers may be held in memory
281
  // at the same time,
282
  // so you may wish to adjust this parameter to control memory usage.
283
  // Also, a larger write buffer will result in a longer recovery time
284
  // the next time the database is opened.
285
  //
286
  // Note that write_buffer_size is enforced per column family.
287
  // See db_write_buffer_size for sharing memory across column families.
288
  //
289
  // Default: 4MB
290
  //
291
  // Dynamically changeable through SetOptions() API
292
  size_t write_buffer_size;
293
294
  // The maximum number of write buffers that are built up in memory.
295
  // The default and the minimum number is 2, so that when 1 write buffer
296
  // is being flushed to storage, new writes can continue to the other
297
  // write buffer.
298
  // If max_write_buffer_number > 3, writing will be slowed down to
299
  // options.delayed_write_rate if we are writing to the last write buffer
300
  // allowed.
301
  //
302
  // Default: 2
303
  //
304
  // Dynamically changeable through SetOptions() API
305
  int max_write_buffer_number;
306
307
  // The minimum number of write buffers that will be merged together
308
  // before writing to storage.  If set to 1, then
309
  // all write buffers are flushed to L0 as individual files and this increases
310
  // read amplification because a get request has to check in all of these
311
  // files. Also, an in-memory merge may result in writing less
312
  // data to storage if there are duplicate records in each of these
313
  // individual write buffers.  Default: 1
314
  int min_write_buffer_number_to_merge;
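  // Illustrative sizing sketch (added as a comment; not part of options.h),
  // using the three memtable options above. The values are assumptions for the
  // example, not recommendations:
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.write_buffer_size = 64 * 1024 * 1024;   // 64MB per memtable
  //   cf_opts.max_write_buffer_number = 4;            // up to 4 memtables in memory
  //   cf_opts.min_write_buffer_number_to_merge = 2;   // merge 2 before flushing to L0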
315
316
  // The total maximum number of write buffers to maintain in memory including
317
  // copies of buffers that have already been flushed.  Unlike
318
  // max_write_buffer_number, this parameter does not affect flushing.
319
  // This controls the minimum amount of write history that will be available
320
  // in memory for conflict checking when Transactions are used.
321
  //
322
  // When using an OptimisticTransactionDB:
323
  // If this value is too low, some transactions may fail at commit time due
324
  // to not being able to determine whether there were any write conflicts.
325
  //
326
  // When using a TransactionDB:
327
  // If Transaction::SetSnapshot is used, TransactionDB will read either
328
  // in-memory write buffers or SST files to do write-conflict checking.
329
  // Increasing this value can reduce the number of reads to SST files
330
  // done for conflict detection.
331
  //
332
  // Setting this value to 0 will cause write buffers to be freed immediately
333
  // after they are flushed.
334
  // If this value is set to -1, 'max_write_buffer_number' will be used.
335
  //
336
  // Default:
337
  // If using a TransactionDB/OptimisticTransactionDB, the default value will
338
  // be set to the value of 'max_write_buffer_number' if it is not explicitly
339
  // set by the user.  Otherwise, the default is 0.
340
  int max_write_buffer_number_to_maintain;
341
342
  // Compress blocks using the specified compression algorithm.  This
343
  // parameter can be changed dynamically.
344
  //
345
  // Default: kSnappyCompression, if it's supported. If snappy is not linked
346
  // with the library, the default is kNoCompression.
347
  //
348
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
349
  //    ~200-500MB/s compression
350
  //    ~400-800MB/s decompression
351
  // Note that these speeds are significantly faster than most
352
  // persistent storage speeds, and therefore it is typically never
353
  // worth switching to kNoCompression.  Even if the input data is
354
  // incompressible, the kSnappyCompression implementation will
355
  // efficiently detect that and will switch to uncompressed mode.
356
  CompressionType compression;
357
358
  // Different levels can have different compression policies. There
359
  // are cases where most lower levels would like to use quick compression
360
  // algorithms while the higher levels (which have more data) use
361
  // compression algorithms that have better compression but could
362
  // be slower. This array, if non-empty, should have an entry for
363
  // each level of the database; these override the value specified in
364
  // the previous field 'compression'.
365
  //
366
  // NOTICE if level_compaction_dynamic_level_bytes=true,
367
  // compression_per_level[0] still determines L0, but other elements
368
  // of the array are based on base level (the level L0 files are merged
369
  // to), and may not match the level users see from info log for metadata.
370
  // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
371
  // determines compaction type for level n+i-1.
372
  // For example, if we have 5 levels, and we decide to merge L0
373
  // data to L4 (which means L1..L3 will be empty), then the new files in
374
  // L4 use compression type compression_per_level[1].
375
  // If L0 is now merged to L2, data going to L2 will be compressed
376
  // according to compression_per_level[1], L3 using compression_per_level[2]
377
  // and L4 using compression_per_level[3]. Compression for each level can
378
  // change when data grows.
379
  std::vector<CompressionType> compression_per_level;
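  // Illustrative sketch (added as a comment; not part of options.h): skip
  // compression for the first two levels and use Snappy for the rest, following
  // the per-level override semantics described above:
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.compression_per_level = {
  //       rocksdb::kNoCompression, rocksdb::kNoCompression,
  //       rocksdb::kSnappyCompression, rocksdb::kSnappyCompression,
  //       rocksdb::kSnappyCompression};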
380
381
  // different options for compression algorithms
382
  CompressionOptions compression_opts;
383
384
  // If non-nullptr, use the specified function to determine the
385
  // prefixes for keys.  These prefixes will be placed in the filter.
386
  // Depending on the workload, this can reduce the read-IOP
387
  // cost for scans when a prefix is passed via ReadOptions to
388
  // db.NewIterator().  For prefix filtering to work properly,
389
  // "prefix_extractor" and "comparator" must be such that the following
390
  // properties hold:
391
  //
392
  // 1) key.starts_with(prefix(key))
393
  // 2) Compare(prefix(key), key) <= 0.
394
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
395
  // 4) prefix(prefix(key)) == prefix(key)
396
  //
397
  // Default: nullptr
398
  std::shared_ptr<const SliceTransform> prefix_extractor;
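  // Illustrative sketch (added as a comment; not part of options.h), assuming
  // the stock NewFixedPrefixTransform() helper is available: treat the first
  // 8 bytes of each key as its prefix, which satisfies properties 1-4 above
  // when used with the default byte-wise comparator:
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));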
399
400
  // Number of levels for this database
401
  int num_levels;
402
403
  // Number of files to trigger level-0 compaction. A value <0 means that
404
  // level-0 compaction will not be triggered by number of files at all.
405
  //
406
  // Default: 4
407
  //
408
  // Dynamically changeable through SetOptions() API
409
  int level0_file_num_compaction_trigger;
410
411
  // Soft limit on number of level-0 files. We start slowing down writes at this
412
  // point. A value <0 means that no writing slow down will be triggered by
413
  // number of files in level-0.
414
  //
415
  // Dynamically changeable through SetOptions() API
416
  int level0_slowdown_writes_trigger;
417
418
  // Maximum number of level-0 files.  We stop writes at this point.
419
  //
420
  // Dynamically changeable through SetOptions() API
421
  int level0_stop_writes_trigger;
422
423
  // This does not do anything anymore. Deprecated.
424
  int max_mem_compaction_level;
425
426
  // Target file size for compaction.
427
  // target_file_size_base is per-file size for level-1.
428
  // Target file size for level L can be calculated by
429
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
430
  // For example, if target_file_size_base is 2MB and
431
  // target_file_size_multiplier is 10, then each file on level-1 will
432
  // be 2MB, and each file on level 2 will be 20MB,
433
  // and each file on level-3 will be 200MB.
434
  //
435
  // Default: 2MB.
436
  //
437
  // Dynamically changeable through SetOptions() API
438
  uint64_t target_file_size_base;
439
440
  // By default target_file_size_multiplier is 1, which means
441
  // by default files in different levels will have similar size.
442
  //
443
  // Dynamically changeable through SetOptions() API
444
  int target_file_size_multiplier;
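  // Illustrative helper (added as a comment; not part of options.h) computing
  // the per-level target file size from the two options above, following the
  // formula target_file_size_base * (target_file_size_multiplier ^ (L-1)):
  //
  //   uint64_t TargetFileSizeForLevel(const rocksdb::ColumnFamilyOptions& opts, int level) {
  //     uint64_t size = opts.target_file_size_base;
  //     for (int l = 1; l < level; ++l) {
  //       size *= opts.target_file_size_multiplier;
  //     }
  //     return size;  // e.g. 2MB base, multiplier 10: L1 = 2MB, L2 = 20MB, L3 = 200MB
  //   }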
445
446
  // Control maximum total data size for a level.
447
  // max_bytes_for_level_base is the max total for level-1.
448
  // Maximum number of bytes for level L can be calculated as
449
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
450
  // For example, if max_bytes_for_level_base is 20MB, and if
451
  // max_bytes_for_level_multiplier is 10, total data size for level-1
452
  // will be 20MB, total file size for level-2 will be 200MB,
453
  // and total file size for level-3 will be 2GB.
454
  //
455
  // Default: 10MB.
456
  //
457
  // Dynamically changeable through SetOptions() API
458
  uint64_t max_bytes_for_level_base;
459
460
  // If true, RocksDB will pick target size of each level dynamically.
461
  // We will pick a base level b >= 1. L0 will be directly merged into level b,
462
  // instead of always into level 1. Level 1 to b-1 need to be empty.
463
  // We try to pick b and its target size so that
464
  // 1. target size is in the range of
465
  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
466
  //    max_bytes_for_level_base]
467
  // 2. target size of the last level (level num_levels-1) equals to extra size
468
  //    of the level.
469
  // At the same time max_bytes_for_level_multiplier and
470
  // max_bytes_for_level_multiplier_additional are still satisfied.
471
  //
472
  // With this option on, from an empty DB, we make last level the base level,
473
  // which means merging L0 data into the last level, until it exceeds
474
  // max_bytes_for_level_base. And then we make the second last level to be
475
  // base level, to start to merge L0 data to second last level, with its
476
  // target size to be 1/max_bytes_for_level_multiplier of the last level's
477
  // extra size. After the data accumulates more so that we need to move the
478
  // base level to the third last one, and so on.
479
  //
480
  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
481
  // and max_bytes_for_level_base=10MB.
482
  // Target sizes of levels 1 to 5 start with:
483
  // [- - - - 10MB]
484
  // with the base level being level 5. Target sizes of levels 1 to 4 are not applicable
485
  // because they will not be used.
486
  // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
487
  // level 4 the base level, and now the targets look like:
488
  // [- - - 1.1MB 11MB]
489
  // While data accumulates, size targets are tuned based on the actual data
490
  // of level 5. When level 5 has 50MB of data, the target is like:
491
  // [- - - 5MB 50MB]
492
  // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
493
  // level 4 to be the base level, its target size needs to be 10.1MB, which
494
  // doesn't satisfy the target size range. So now we make level 3 the base
495
  // level, and the target sizes of the levels look like:
496
  // [- - 1.01MB 10.1MB 101MB]
497
  // In the same way, while level 5 further grows, all levels' targets grow,
498
  // like
499
  // [- - 5MB 50MB 500MB]
500
  // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
501
  // base level and make levels' target sizes like this:
502
  // [- 1.001MB 10.01MB 100.1MB 1001MB]
503
  // and go on...
504
  //
505
  // By doing this, we give max_bytes_for_level_multiplier priority over
506
  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
507
  // useful for limiting worst-case space amplification.
508
  //
509
  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
510
  //
511
  // Turning this feature on or off for an existing DB can cause unexpected
512
  // LSM tree structure so it's not recommended.
513
  //
514
  // NOTE: this option is experimental
515
  //
516
  // Default: false
517
  bool level_compaction_dynamic_level_bytes;
518
519
  // Default: 10.
520
  //
521
  // Dynamically changeable through SetOptions() API
522
  int max_bytes_for_level_multiplier;
523
524
  // Different max-size multipliers for different levels.
525
  // These are multiplied by max_bytes_for_level_multiplier to arrive
526
  // at the max-size of each level.
527
  //
528
  // Default: 1
529
  //
530
  // Dynamically changeable through SetOptions() API
531
  std::vector<int> max_bytes_for_level_multiplier_additional;
532
533
  // Maximum number of bytes in all compacted files.  We avoid expanding
534
  // the lower level file set of a compaction if it would make the
535
  // total compaction cover more than
536
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
537
  //
538
  // Dynamically changeable through SetOptions() API
539
  int expanded_compaction_factor;
540
541
  // Maximum number of bytes in all source files to be compacted in a
542
  // single compaction run. We avoid picking too many files in the
543
  // source level, so that the total source bytes for the compaction
544
  // do not exceed
545
  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
546
  // Default: 1, i.e. pick maxfilesize amount of data as the source of
547
  // a compaction.
548
  //
549
  // Dynamically changeable through SetOptions() API
550
  int source_compaction_factor;
551
552
  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
553
  // stop building a single file in a level->level+1 compaction.
554
  //
555
  // Dynamically changeable through SetOptions() API
556
  int max_grandparent_overlap_factor;
557
558
  // DEPRECATED -- this option is no longer used
559
  // Puts are delayed to options.delayed_write_rate when any level has a
560
  // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0.
561
  //
562
  // Default: 0 (disabled)
563
  //
564
  // Dynamically changeable through SetOptions() API
565
  double soft_rate_limit;
566
567
  // DEPRECATED -- this option is no longer used
568
  double hard_rate_limit;
569
570
  // All writes will be slowed down to at least delayed_write_rate if estimated
571
  // bytes needed to be compacted exceed this threshold.
572
  //
573
  // Default: 0 (disabled)
574
  uint64_t soft_pending_compaction_bytes_limit;
575
576
  // All writes are stopped if estimated bytes needed to be compacted exceed
577
  // this threshold.
578
  //
579
  // Default: 0 (disabled)
580
  uint64_t hard_pending_compaction_bytes_limit;
581
582
  // DEPRECATED -- this option is no longer used
583
  unsigned int rate_limit_delay_max_milliseconds;
584
585
  // size of one block in arena memory allocation.
586
  // If <= 0, a proper value is automatically calculated (usually 1/8 of
587
  // write_buffer_size, rounded up to a multiple of 4KB).
588
  //
589
  // There are two additional restrictions on the specified size:
590
  // (1) size should be in the range of [4096, 2 << 30] and
591
  // (2) be a multiple of the CPU word size (which helps with the memory
592
  // alignment).
593
  //
594
  // We'll automatically check and adjust the size number to make sure it
595
  // conforms to the restrictions.
596
  //
597
  // Default: 0
598
  //
599
  // Dynamically changeable through SetOptions() API
600
  size_t arena_block_size;
601
602
  // Disable automatic compactions. Manual compactions can still
603
  // be issued on this column family
604
  //
605
  // Dynamically changeable through SetOptions() API
606
  bool disable_auto_compactions;
607
608
  // DEPRECATED
609
  // Does not have any effect.
610
  bool purge_redundant_kvs_while_flush;
611
612
  // The compaction style. Default: kCompactionStyleLevel
613
  CompactionStyle compaction_style;
614
615
  // If compaction_style == kCompactionStyleLevel, this determines, for each level,
616
  // which files are prioritized to be picked to compact.
617
  // Default: kByCompensatedSize
618
  CompactionPri compaction_pri;
619
620
  // If true, compaction will verify checksum on every read that happens
621
  // as part of compaction
622
  //
623
  // Default: true
624
  //
625
  // Dynamically changeable through SetOptions() API
626
  bool verify_checksums_in_compaction;
627
628
  // The options needed to support Universal Style compactions
629
  CompactionOptionsUniversal compaction_options_universal;
630
631
  // The options for FIFO compaction style
632
  CompactionOptionsFIFO compaction_options_fifo;
633
634
  // Use KeyMayExist API to filter deletes when this is true.
635
  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
636
  // the delete is a noop. KeyMayExist only incurs in-memory look up.
637
  // This optimization avoids writing the delete to storage when appropriate.
638
  //
639
  // Default: false
640
  //
641
  // Dynamically changeable through SetOptions() API
642
  bool filter_deletes;
643
644
  // An iterator's Next() call sequentially skips over keys with the same
645
  // user-key unless this option is set. This number specifies the number
646
  // of keys (with the same userkey) that will be sequentially
647
  // skipped before a reseek is issued.
648
  //
649
  // Default: 8
650
  //
651
  // Dynamically changeable through SetOptions() API
652
  uint64_t max_sequential_skip_in_iterations;
653
654
  // This is a factory that provides MemTableRep objects.
655
  // Default: a factory that provides a skip-list-based implementation of
656
  // MemTableRep.
657
  std::shared_ptr<MemTableRepFactory> memtable_factory;
658
659
  // This is a factory that provides TableFactory objects.
660
  // Default: a block-based table factory that provides a default
661
  // implementation of TableBuilder and TableReader with default
662
  // BlockBasedTableOptions.
663
  std::shared_ptr<TableFactory> table_factory;
664
665
  // Block-based table related options are moved to BlockBasedTableOptions.
666
  // Related options that were originally here but now moved include:
667
  //   no_block_cache
668
  //   block_cache
669
  //   block_cache_compressed
670
  //   block_size
671
  //   block_size_deviation
672
  //   block_restart_interval
673
  //   filter_policy
674
  //   whole_key_filtering
675
  // If you'd like to customize some of these options, you will need to
676
  // use NewBlockBasedTableFactory() to construct a new table factory.
677
678
  // This option allows the user to collect their own statistics of interest for
679
  // the tables.
680
  // Default: empty vector -- no user-defined statistics collection will be
681
  // performed.
682
  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
683
      TablePropertiesCollectorFactories;
684
  TablePropertiesCollectorFactories table_properties_collector_factories;
685
686
  // Allows thread-safe inplace updates. If this is true, there is no way to
687
  // achieve point-in-time consistency using snapshot or iterator (assuming
688
  // concurrent updates). Hence iterator and multi-get will return results
689
  // which are not consistent as of any point-in-time.
690
  // If inplace_callback function is not set,
691
  //   Put(key, new_value) will update inplace the existing_value iff
692
  //   * key exists in current memtable
693
  //   * new sizeof(new_value) <= sizeof(existing_value)
694
  //   * existing_value for that key is a put i.e. kTypeValue
695
  // If inplace_callback function is set, check doc for inplace_callback.
696
  // Default: false.
697
  bool inplace_update_support;
698
699
  // Number of locks used for inplace update
700
  // Default: 10000, if inplace_update_support = true, else 0.
701
  //
702
  // Dynamically changeable through SetOptions() API
703
  size_t inplace_update_num_locks;
704
705
  // existing_value - pointer to previous value (from both memtable and sst).
706
  //                  nullptr if key doesn't exist
707
  // existing_value_size - pointer to the size of existing_value.
708
  //                       nullptr if key doesn't exist
709
  // delta_value - Delta value to be merged with the existing_value.
710
  //               Stored in transaction logs.
711
  // merged_value - Set when delta is applied on the previous value.
712
713
  // Applicable only when inplace_update_support is true,
714
  // this callback function is called at the time of updating the memtable
715
  // as part of a Put operation, let's say Put(key, delta_value). It allows the
716
  // 'delta_value' specified as part of the Put operation to be merged with
717
  // an 'existing_value' of the key in the database.
718
719
  // If the merged value is smaller in size than the 'existing_value',
720
  // then this function can update the 'existing_value' buffer inplace and
721
  // the corresponding 'existing_value'_size pointer, if it wishes to.
722
  // The callback should return UpdateStatus::UPDATED_INPLACE
723
  // in this case. (Note that in this case the snapshot semantics of the RocksDB
724
  // Iterator are no longer atomic.)
725
726
  // If the merged value is larger in size than the 'existing_value' or the
727
  // application does not wish to modify the 'existing_value' buffer inplace,
728
  // then the merged value should be returned via *merge_value. It is set by
729
  // merging the 'existing_value' and the Put 'delta_value'. The callback should
730
  // return UpdateStatus::UPDATED in this case. This merged value will be added
731
  // to the memtable.
732
733
  // If merging fails or the application does not wish to take any action,
734
  // then the callback should return UpdateStatus::UPDATE_FAILED.
735
736
  // Please remember that the original call from the application is Put(key,
737
  // delta_value). So the transaction log (if enabled) will still contain (key,
738
  // delta_value). The 'merged_value' is not stored in the transaction log.
739
  // Hence the inplace_callback function should be consistent across db reopens.
740
741
  // Default: nullptr
742
  UpdateStatus (*inplace_callback)(char* existing_value,
743
                                   uint32_t* existing_value_size,
744
                                   Slice delta_value,
745
                                   std::string* merged_value);
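  // Illustrative callback sketch (added as a comment; not part of options.h)
  // that follows the contract above, using a simple "delta replaces old value"
  // policy: overwrite in place when the delta fits in the existing buffer,
  // otherwise hand back the new value via merged_value.
  //
  //   UpdateStatus ExampleInplaceCallback(char* existing_value,
  //                                       uint32_t* existing_value_size,
  //                                       Slice delta_value,
  //                                       std::string* merged_value) {
  //     if (existing_value == nullptr) {
  //       return UpdateStatus::UPDATE_FAILED;  // key not present, nothing to update
  //     }
  //     if (delta_value.size() <= *existing_value_size) {
  //       memcpy(existing_value, delta_value.data(), delta_value.size());
  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
  //       return UpdateStatus::UPDATED_INPLACE;
  //     }
  //     merged_value->assign(reinterpret_cast<const char*>(delta_value.data()),
  //                          delta_value.size());
  //     return UpdateStatus::UPDATED;
  //   }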
746
747
  // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom
748
  // for memtable
749
  //
750
  // Dynamically changeable through SetOptions() API
751
  uint32_t memtable_prefix_bloom_bits;
752
753
  // number of hash probes per key
754
  //
755
  // Dynamically changeable through SetOptions() API
756
  uint32_t memtable_prefix_bloom_probes;
757
758
  // Page size for huge page TLB for bloom in memtable. If <= 0, allocate
759
  // from malloc rather than from the huge page TLB.
760
  // Need to reserve huge pages for it to be allocated. For example:
761
  //      sysctl -w vm.nr_hugepages=20
762
  // See linux doc Documentation/vm/hugetlbpage.txt
763
  //
764
  // Dynamically changeable through SetOptions() API
765
  size_t memtable_prefix_bloom_huge_page_tlb_size;
766
767
  // Control locality of bloom filter probes to improve cache miss rate.
768
  // This option only applies to memtable prefix bloom and plaintable
769
  // prefix bloom. It essentially limits every bloom checking to one cache line.
770
  // This optimization is turned off when set to 0; set a positive number to turn
771
  // it on.
772
  // Default: 0
773
  uint32_t bloom_locality;
774
775
  // Maximum number of successive merge operations on a key in the memtable.
776
  //
777
  // When a merge operation is added to the memtable and the maximum number of
778
  // successive merges is reached, the value of the key will be calculated and
779
  // inserted into the memtable instead of the merge operation. This will
780
  // ensure that there are never more than max_successive_merges merge
781
  // operations in the memtable.
782
  //
783
  // Default: 0 (disabled)
784
  //
785
  // Dynamically changeable through SetOptions() API
786
  size_t max_successive_merges;
787
788
  // The number of partial merge operands to accumulate before partial
789
  // merge will be performed. Partial merge will not be called
790
  // if the number of values to merge is less than min_partial_merge_operands.
791
  //
792
  // If min_partial_merge_operands < 2, then it will be treated as 2.
793
  //
794
  // Default: 2
795
  uint32_t min_partial_merge_operands;
796
797
  // This flag specifies that the implementation should optimize the filters
798
  // mainly for cases where keys are found rather than also optimize for keys
799
  // missed. This would be used in cases where the application knows that
800
  // there are very few misses or the performance in the case of misses is not
801
  // important.
802
  //
803
  // For now, this flag allows us to not store filters for the last level, i.e.
804
  // the largest level which contains data of the LSM store. For keys which
805
  // are hits, the filters in this level are not useful because we will search
806
  // for the data anyway. NOTE: the filters in other levels are still useful
807
  // even for key hits because they tell us whether to look in that level or go
808
  // to the higher level.
809
  //
810
  // Default: false
811
  bool optimize_filters_for_hits;
812
813
  // After writing every SST file, reopen it and read all the keys.
814
  // Default: false
815
  bool paranoid_file_checks;
816
817
  // Measure IO stats in compactions, if true.
818
  // Default: false
819
  bool compaction_measure_io_stats;
820
821
  // Create ColumnFamilyOptions with default values for all fields
822
  ColumnFamilyOptions();
823
  // Create ColumnFamilyOptions from Options
824
  explicit ColumnFamilyOptions(const Options& options);
825
826
  void Dump(Logger* log) const;
827
};
828
829
typedef std::function<yb::Result<bool>(const MemTable&)> MemTableFilter;
830
831
using IteratorReplacer =
832
    std::function<InternalIterator*(InternalIterator*, Arena*, const Slice&)>;
833
834
struct DBOptions {
835
  // Some functions that make it easier to optimize RocksDB
836
837
#ifndef ROCKSDB_LITE
838
  // By default, RocksDB uses only one background thread for flush and
839
  // compaction. Calling this function will set it up such that total of
840
  // `total_threads` is used. Good value for `total_threads` is the number of
841
  // cores. You almost definitely want to call this function if your system is
842
  // bottlenecked by RocksDB.
843
  DBOptions* IncreaseParallelism(int total_threads = 16);
844
#endif  // ROCKSDB_LITE
845
846
  // If true, the database will be created if it is missing.
847
  // Default: false
848
  bool create_if_missing;
849
850
  // If true, missing column families will be automatically created.
851
  // Default: false
852
  bool create_missing_column_families;
853
854
  // If true, an error is raised if the database already exists.
855
  // Default: false
856
  bool error_if_exists;
857
858
  // If true, RocksDB will aggressively check consistency of the data.
859
  // Also, if any of the writes to the database fails (Put, Delete, Merge,
860
  // Write), the database will switch to read-only mode and fail all other
861
  // Write operations.
862
  // In most cases you want this to be set to true.
863
  // Default: true
864
  bool paranoid_checks;
865
866
  // Use the specified object to interact with the environment,
867
  // e.g. to read/write files, schedule background work, etc.
868
  // Default: Env::Default()
869
  Env* env;
870
871
1.20M
  Env* get_checkpoint_env() const {
872
1.20M
    return checkpoint_env ? checkpoint_env : env;  // branch hit counts: 1.18M / 18.9k
873
1.20M
  }
874
875
  // Env used to create checkpoints. Default: Env::Default()
876
  Env* checkpoint_env;
877
878
  yb::PriorityThreadPool* priority_thread_pool_for_compactions_and_flushes = nullptr;
879
880
  // Use to control write rate of flush and compaction. Flush has higher
881
  // priority than compaction. Rate limiting is disabled if nullptr.
882
  // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
883
  // Default: nullptr
884
  std::shared_ptr<RateLimiter> rate_limiter;
885
886
  // Use to track SST files and control their file deletion rate, can be used
887
  // among multiple RocksDB instances. sst_file_manager only tracks and throttles
888
  // deletes of SST files in first db_path (db_name if db_paths is empty), other
889
  // files and other db_paths won't be tracked or affected by sst_file_manager.
890
  // Default: nullptr
891
  std::shared_ptr<SstFileManager> sst_file_manager;
892
893
  // Any internal progress/error information generated by the db will
894
  // be written to info_log if it is non-nullptr, or to a file stored
895
  // in the same directory as the DB contents if info_log is nullptr.
896
  // Default: nullptr
897
  std::shared_ptr<Logger> info_log;
898
899
  InfoLogLevel info_log_level;
900
901
  // Number of open files that can be used by the DB.  You may need to
902
  // increase this if your database has a large working set. Value -1 means
903
  // files opened are always kept open. You can estimate number of files based
904
  // on target_file_size_base and target_file_size_multiplier for level-based
905
  // compaction. For universal-style compaction, you can usually set it to -1.
906
  // Default: 5000 or ulimit value of max open files (whichever is smaller)
907
  int max_open_files;
908
909
  // If max_open_files is -1, DB will open all files on DB::Open(). You can
910
  // use this option to increase the number of threads used to open the files.
911
  // Default: 1
912
  int max_file_opening_threads;
913
914
  // Once write-ahead logs exceed this size, we will start forcing the flush of
915
  // column families whose memtables are backed by the oldest live WAL file
916
  // (i.e. the ones that are causing all the space amplification). If set to 0
917
  // (default), we will dynamically choose the WAL size limit to be
918
  // [sum of all write_buffer_size * max_write_buffer_number] * 4
919
  // Default: 0
920
  uint64_t max_total_wal_size;
921
922
  // If non-null, then we should collect metrics about database operations.
923
  // Statistics objects should not be shared between DB instances as
924
  // they do not use any locks to prevent concurrent updates.
925
  std::shared_ptr<Statistics> statistics;
926
927
  // If true, then the contents of manifest and data files are not synced
928
  // to stable storage. Their contents remain in the OS buffers till the
929
  // OS decides to flush them. This option is good for bulk-loading
930
  // of data. Once the bulk-loading is complete, please issue a
931
  // sync to the OS to flush all dirty buffers to stable storage.
932
  // Default: false
933
  bool disableDataSync;
934
935
  // If true, then every store to stable storage will issue a fsync.
936
  // If false, then every store to stable storage will issue a fdatasync.
937
  // This parameter should be set to true while storing data to
938
  // filesystems like ext3 that can lose files after a reboot.
939
  // Default: false
940
  bool use_fsync;
941
942
  // A list of paths where SST files can be put into, with its target size.
943
  // Newer data is placed into paths specified earlier in the vector while
944
  // older data gradually moves to paths specified later in the vector.
945
  //
946
  // For example, if you have a flash device with 10GB allocated for the DB,
947
  // as well as a hard drive of 2TB, you should configure it as:
948
  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
949
  //
950
  // The system will try to guarantee data under each path is close to but
951
  // not larger than the target size. But current and future file sizes used
952
  // in determining where to place a file are based on best-effort estimation,
953
  // which means there is a chance that the actual size under the directory
954
  // is slightly more than target size under some workloads. User should give
955
  // some buffer room for those cases.
956
  //
957
  // If none of the paths has sufficient room to place a file, the file will
958
  // be placed in the last path anyway, regardless of the target size.
959
  //
960
  // Placing newer data in earlier paths is also best-effort. Users should
961
  // expect user files to be placed in higher levels in some extreme cases.
962
  //
963
  // If left empty, only one path will be used, which is db_name passed when
964
  // opening the DB.
965
  // Default: empty
966
  std::vector<DbPath> db_paths;
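  // Illustrative sketch (added as a comment; not part of options.h) of the
  // flash + hard-drive layout from the example above; the paths are placeholders:
  //
  //   rocksdb::Options options;
  //   options.db_paths.emplace_back("/flash_path", 10ULL << 30);  // 10GB on flash
  //   options.db_paths.emplace_back("/hard_drive", 2ULL << 40);   // 2TB on hard drive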
967
968
  // This specifies the info LOG dir.
969
  // If it is empty, the log files will be in the same dir as data.
970
  // If it is non empty, the log files will be in the specified dir,
971
  // and the db data dir's absolute path will be used as the log file
972
  // name's prefix.
973
  std::string db_log_dir;
974
975
  // This specifies the absolute dir path for write-ahead logs (WAL).
976
  // If it is empty, the log files will be in the same dir as data,
977
  //   dbname is used as the data dir by default
978
  // If it is non-empty, the log files will be kept in the specified dir.
979
  // When destroying the db,
980
  //   all log files in wal_dir and the dir itself are deleted
981
  std::string wal_dir;
982
983
  // The periodicity when obsolete files get deleted. The default
984
  // value is 6 hours. The files that get out of scope by compaction
985
  // process will still get automatically deleted on every compaction,
986
  // regardless of this setting
987
  uint64_t delete_obsolete_files_period_micros;
988
989
  // Suggested number of concurrent background compaction jobs, submitted to
990
  // the default LOW priority thread pool.
991
  //
992
  // Default: max_background_compactions
993
  int base_background_compactions;
994
995
  // Maximum number of concurrent background compaction jobs, submitted to
996
  // the default LOW priority thread pool.
997
  // We first try to schedule compactions based on
998
  // `base_background_compactions`. If the compaction cannot catch up , we
999
  // will increase number of compaction threads up to
1000
  // `max_background_compactions`.
1001
  //
1002
  // If you're increasing this, also consider increasing number of threads in
1003
  // LOW priority thread pool. For more information, see
1004
  // Env::SetBackgroundThreads
1005
  // Default: 1
1006
  int max_background_compactions;
1007
1008
  // Number of threads reserved for exclusively doing small compactions
1009
  // Default: -1 (later gets set to base_background_compactions - 1)
1010
  int num_reserved_small_compaction_threads;
1011
1012
  // Threshold for input size beyond which compaction is considered large
1013
  // Default: numeric_limits<uint64_t>::max()
1014
  uint64_t compaction_size_threshold_bytes;
1015
1016
  // This value represents the maximum number of threads that will
1017
  // concurrently perform a compaction job by breaking it into multiple,
1018
  // smaller ones that are run simultaneously.
1019
  // Default: 1 (i.e. no subcompactions)
1020
  uint32_t max_subcompactions;
1021
1022
  // Maximum number of concurrent background memtable flush jobs, submitted to
1023
  // the HIGH priority thread pool.
1024
  //
1025
  // By default, all background jobs (major compaction and memtable flush) go
1026
  // to the LOW priority pool. If this option is set to a positive number,
1027
  // memtable flush jobs will be submitted to the HIGH priority pool.
1028
  // It is important when the same Env is shared by multiple db instances.
1029
  // Without a separate pool, long running major compaction jobs could
1030
  // potentially block memtable flush jobs of other db instances, leading to
1031
  // unnecessary Put stalls.
1032
  //
1033
  // If you're increasing this, also consider increasing number of threads in
1034
  // HIGH priority thread pool. For more information, see
1035
  // Env::SetBackgroundThreads
1036
  // Default: 1
1037
  int max_background_flushes;
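  // Illustrative sketch (added as a comment; not part of options.h) pairing the
  // two background-job limits with matching thread-pool sizes, as the comments
  // above suggest; assumes the usual Env::SetBackgroundThreads() interface:
  //
  //   rocksdb::Options options;
  //   options.max_background_compactions = 4;
  //   options.max_background_flushes = 2;
  //   options.env->SetBackgroundThreads(4, rocksdb::Env::Priority::LOW);
  //   options.env->SetBackgroundThreads(2, rocksdb::Env::Priority::HIGH);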
1038
1039
  // Specify the maximal size of the info log file. If the log file
1040
  // is larger than `max_log_file_size`, a new info log file will
1041
  // be created.
1042
  // If max_log_file_size == 0, all logs will be written to one
1043
  // log file.
1044
  size_t max_log_file_size;
1045
1046
  // Time for the info log file to roll (in seconds).
1047
  // If specified with non-zero value, log file will be rolled
1048
  // if it has been active longer than `log_file_time_to_roll`.
1049
  // Default: 0 (disabled)
1050
  size_t log_file_time_to_roll;
1051
1052
  // Maximal info log files to be kept.
1053
  // Default: 1000
1054
  size_t keep_log_file_num;
1055
1056
  // Recycle log files.
1057
  // If non-zero, we will reuse previously written log files for new
1058
  // logs, overwriting the old data.  The value indicates how many
1059
  // such files we will keep around at any point in time for later
1060
  // use.  This is more efficient because the blocks are already
1061
  // allocated and fdatasync does not need to update the inode after
1062
  // each write.
1063
  // Default: 0
1064
  size_t recycle_log_file_num;
1065
1066
  // The manifest file is rolled over on reaching this limit.
1067
  // The older manifest file will be deleted.
1068
  // The default value is MAX_INT so that roll-over does not take place.
1069
  uint64_t max_manifest_file_size;
1070
1071
  // Number of shards used for table cache.
1072
  int table_cache_numshardbits;
1073
1074
  // DEPRECATED
1075
  // int table_cache_remove_scan_count_limit;
1076
1077
  // The following two fields affect how archived logs will be deleted.
1078
  // 1. If both set to 0, logs will be deleted asap and will not get into
1079
  //    the archive.
1080
  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
1081
  //    WAL files will be checked every 10 min and if total size is greater
1082
  //    than WAL_size_limit_MB, they will be deleted starting with the
1083
  //    earliest until size_limit is met. All empty files will be deleted.
1084
  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
1085
  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
1086
  //    are older than WAL_ttl_seconds will be deleted.
1087
  // 4. If both are not 0, WAL files will be checked every 10 min and both
1088
  //    checks will be performed with ttl being first.
1089
  uint64_t WAL_ttl_seconds;
1090
  uint64_t WAL_size_limit_MB;
1091
1092
  // Number of bytes to preallocate (via fallocate) the manifest
1093
  // files.  Default is 4MB, which is reasonable to reduce random IO
1094
  // as well as prevent overallocation for mounts that preallocate
1095
  // large amounts of data (such as xfs's allocsize option).
1096
  size_t manifest_preallocation_size;
1097
1098
  // Data being read from file storage may be buffered in the OS
1099
  // Default: true
1100
  bool allow_os_buffer;
1101
1102
  // Allow the OS to mmap file for reading sst tables. Default: false
1103
  bool allow_mmap_reads;
1104
1105
  // Allow the OS to mmap file for writing.
1106
  // DB::SyncWAL() only works if this is set to false.
1107
  // Default: false
1108
  bool allow_mmap_writes;
1109
1110
  // If false, fallocate() calls are bypassed
1111
  bool allow_fallocate;
1112
1113
  // Disable child process inherit open files. Default: true
1114
  bool is_fd_close_on_exec;
1115
1116
  // DEPRECATED -- this option is no longer used
1117
  bool skip_log_error_on_recovery;
1118
1119
  // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
1120
  // Default: 600 (10 min)
1121
  unsigned int stats_dump_period_sec;
1122
1123
  // If set true, will hint the underlying file system that the file
1124
  // access pattern is random, when a sst file is opened.
1125
  // Default: true
1126
  bool advise_random_on_open;
1127
1128
  // Amount of data to build up in memtables across all column
1129
  // families before writing to disk.
1130
  //
1131
  // This is distinct from write_buffer_size, which enforces a limit
1132
  // for a single memtable.
1133
  //
1134
  // This feature is disabled by default. Specify a non-zero value
1135
  // to enable it.
1136
  //
1137
  // Default: 0 (disabled)
1138
  size_t db_write_buffer_size;
1139
1140
  // Shared MemoryMonitor to keep track of total memory usage.
1141
  //
1142
  // Default: nullptr (disabled)
1143
  std::shared_ptr<MemoryMonitor> memory_monitor;
1144
1145
  // Specify the file access pattern once a compaction is started.
1146
  // It will be applied to all input files of a compaction.
1147
  // Default: NORMAL
1148
  enum AccessHint {
1149
      NONE,
1150
      NORMAL,
1151
      SEQUENTIAL,
1152
      WILLNEED
1153
  };
1154
  AccessHint access_hint_on_compaction_start;
1155
1156
  // If true, always create a new file descriptor and new table reader
1157
  // for compaction inputs. Turn this parameter on may introduce extra
1158
  // memory usage in the table reader, if it allocates extra memory
1159
  // for indexes. This will allow file descriptor prefetch options
1160
  // to be set for compaction input files and not to impact file
1161
  // descriptors for the same file used by user queries.
1162
  // Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
1163
  // for this mode if using block-based table.
1164
  //
1165
  // Default: false
1166
  bool new_table_reader_for_compaction_inputs;
1167
1168
  // If non-zero, we perform bigger reads when doing compaction. If you're
1169
  // running RocksDB on spinning disks, you should set this to at least 2MB.
1170
  // That way RocksDB's compaction is doing sequential instead of random reads.
1171
  //
1172
  // When non-zero, we also force new_table_reader_for_compaction_inputs to
1173
  // true.
1174
  //
1175
  // Default: 0
1176
  size_t compaction_readahead_size;
1177
1178
  // This is a maximum buffer size that is used by WinMmapReadableFile in
1179
  // unbuffered disk I/O mode. We need to maintain an aligned buffer for
1180
  // reads. We allow the buffer to grow until the specified value and then
1181
  // for bigger requests allocate one shot buffers. In unbuffered mode we
1182
  // always bypass read-ahead buffer at ReadaheadRandomAccessFile
1183
  // When read-ahead is required we then make use of compaction_readahead_size
1184
  // value and always try to read ahead. With read-ahead we always
1185
  // pre-allocate buffer to the size instead of growing it up to a limit.
1186
  //
1187
  // This option is currently honored only on Windows
1188
  //
1189
  // Default: 1 Mb
1190
  //
1191
  // Special value: 0 - means do not maintain per instance buffer. Allocate
1192
  //                per request buffer and avoid locking.
1193
  size_t random_access_max_buffer_size;
1194
1195
  // This is the maximum buffer size that is used by WritableFileWriter.
1196
  // On Windows, we need to maintain an aligned buffer for writes.
1197
  // We allow the buffer to grow until its size hits the limit.
1198
  //
1199
  // Default: 1024 * 1024 (1 MB)
1200
  size_t writable_file_max_buffer_size;
1201
1202
1203
  // Use adaptive mutex, which spins in the user space before resorting
1204
  // to kernel. This could reduce context switch when the mutex is not
1205
  // heavily contended. However, if the mutex is hot, we could end up
1206
  // wasting spin time.
1207
  // Default: false
1208
  bool use_adaptive_mutex;
1209
1210
  // Create DBOptions with default values for all fields
1211
  DBOptions();
1212
1213
  void Dump(Logger* log) const;
1214
1215
  // Allows OS to incrementally sync files to disk while they are being
1216
  // written, asynchronously, in the background. This operation can be used
1217
  // to smooth out write I/Os over time. Users shouldn't rely on it for
1218
  // a persistence guarantee.
1219
  // Issue one request for every bytes_per_sync written. 0 turns it off.
1220
  // Default: 0
1221
  //
1222
  // You may consider using rate_limiter to regulate write rate to device.
1223
  // When rate limiter is enabled, it automatically enables bytes_per_sync
1224
  // to 1MB.
1225
  //
1226
  // This option applies to table files
1227
  uint64_t bytes_per_sync;
1228
1229
  // Same as bytes_per_sync, but applies to WAL files
1230
  // Default: 0, turned off
1231
  uint64_t wal_bytes_per_sync;
1232
1233
  // A vector of EventListeners whose callback functions will be called
1234
  // when specific RocksDB events happen.
1235
  std::vector<std::shared_ptr<EventListener>> listeners;
1236
1237
  // If true, then the status of the threads involved in this DB will
1238
  // be tracked and available via GetThreadList() API.
1239
  //
1240
  // Default: false
1241
  bool enable_thread_tracking;
1242
1243
  // The limited write rate to DB if soft_pending_compaction_bytes_limit or
1244
  // level0_slowdown_writes_trigger is triggered, or we are writing to the
1245
  // last mem table allowed and we allow more than 3 mem tables. It is
1246
  // calculated using size of user write requests before compression.
1247
  // RocksDB may decide to slow down more if the compaction still
1248
  // gets behind further.
1249
  // Unit: byte per second.
1250
  //
1251
  // Default: 2MB/s
1252
  uint64_t delayed_write_rate;
1253
1254
  // If true, allow multi-writers to update mem tables in parallel.
1255
  // Only some memtable_factory-s support concurrent writes; currently it
1256
  // is implemented only for SkipListFactory.  Concurrent memtable writes
1257
  // are not compatible with inplace_update_support or filter_deletes.
1258
  // It is strongly recommended to set enable_write_thread_adaptive_yield
1259
  // if you are going to use this feature.
1260
  //
1261
  // THIS FEATURE IS NOT STABLE YET.
1262
  //
1263
  // Default: false
1264
  bool allow_concurrent_memtable_write;
1265
1266
  // If true, threads synchronizing with the write batch group leader will
1267
  // wait for up to write_thread_max_yield_usec before blocking on a mutex.
1268
  // This can substantially improve throughput for concurrent workloads,
1269
  // regardless of whether allow_concurrent_memtable_write is enabled.
1270
  //
1271
  // THIS FEATURE IS NOT STABLE YET.
1272
  //
1273
  // Default: false
1274
  bool enable_write_thread_adaptive_yield;
1275
1276
  // The maximum number of microseconds that a write operation will use
1277
  // a yielding spin loop to coordinate with other write threads before
1278
  // blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
1279
  // set properly) increasing this value is likely to increase RocksDB
1280
  // throughput at the expense of increased CPU usage.
1281
  //
1282
  // Default: 100
1283
  uint64_t write_thread_max_yield_usec;
1284
1285
  // The latency in microseconds after which a std::this_thread::yield
1286
  // call (sched_yield on Linux) is considered to be a signal that
1287
  // other processes or threads would like to use the current core.
1288
  // Increasing this makes writer threads more likely to take CPU
1289
  // by spinning, which will show up as an increase in the number of
1290
  // involuntary context switches.
1291
  //
1292
  // Default: 3
1293
  uint64_t write_thread_slow_yield_usec;
1294
1295
  // If true, then DB::Open() will not update the statistics used to optimize
1296
  // compaction decision by loading table properties from many files.
1297
  // Turning off this feature will improve DBOpen time especially in
1298
  // disk environment.
1299
  //
1300
  // Default: false
1301
  bool skip_stats_update_on_db_open;
1302
1303
  // Recovery mode to control the consistency while replaying WAL
1304
  // Default: kTolerateCorruptedTailRecords
1305
  WALRecoveryMode wal_recovery_mode;
1306
1307
  // A global cache for table-level rows.
1308
  // Default: nullptr (disabled)
1309
  // Not supported in ROCKSDB_LITE mode!
1310
  std::shared_ptr<Cache> row_cache;
1311
1312
#ifndef ROCKSDB_LITE
1313
  // A filter object supplied to be invoked while processing write-ahead-logs
1314
  // (WALs) during recovery. The filter provides a way to inspect log
1315
  // records, ignoring a particular record or skipping replay.
1316
  // The filter is invoked at startup and is currently invoked from a
1317
  // single thread.
1318
  const WalFilter* wal_filter;
1319
#endif  // ROCKSDB_LITE
1320
1321
  // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
1322
  // / SetOptions will fail if options file is not detected or properly
1323
  // persisted.
1324
  //
1325
  // DEFAULT: false
1326
  bool fail_if_options_file_error;
1327
1328
  // Initial value for seqno generator.
1329
  // Used only during creation of new DB.
1330
  SequenceNumber initial_seqno = 0;
1331
1332
  // Boundary extractor is used to retrieve user-defined values for a record.
1333
  // It also decodes those values when the metafile is loaded.
1334
  std::shared_ptr<BoundaryValuesExtractor> boundary_extractor;
1335
1336
  // Function that returns max file size for compaction.
1337
  // Supported only for level0 of universal style compactions.
1338
  std::shared_ptr<std::function<uint64_t()>> max_file_size_for_compaction;
1339
1340
  // Invoked after the memtable is switched.
1341
  std::shared_ptr<std::function<MemTableFilter()>> mem_table_flush_filter_factory;
1342
1343
  // A prefix for log messages, usually containing the tablet id.
1344
  std::string log_prefix;
1345
1346
  // This RocksDB instance root mem tracker.
1347
  std::shared_ptr<yb::MemTracker> mem_tracker;
1348
1349
  // Specific mem tracker for block based tables created by this RocksDB instance.
1350
  std::shared_ptr<yb::MemTracker> block_based_table_mem_tracker;
1351
1352
  // Adds the ability to modify the iterator created for an SST file.
1353
  // For instance, additional filtering could be applied.
1354
  std::shared_ptr<IteratorReplacer> iterator_replacer;
1355
1356
  // Creates file filters that directly exclude files during compaction, resulting
1357
  // in their direct deletion without inspection.
1358
  // The filters are currently used to expire files in time-series DBs that have
1359
  // completely expired based on their table and/or column TTL.
1360
  std::shared_ptr<CompactionFileFilterFactory> compaction_file_filter_factory;
1361
};
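
A minimal sketch (not part of the original header) of tuning a few of the DBOptions fields described above; the values shown are illustrative, not recommendations:

  rocksdb::DBOptions db_opts;
  db_opts.bytes_per_sync = 1024 * 1024;            // sync table files every 1 MB written
  db_opts.wal_bytes_per_sync = 1024 * 1024;        // same policy for WAL files
  db_opts.delayed_write_rate = 4 * 1024 * 1024;    // throttle writes to 4 MB/s when triggered
  db_opts.allow_concurrent_memtable_write = true;  // only supported with a skip-list memtable
  db_opts.enable_write_thread_adaptive_yield = true;
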
1362
1363
// Options to control the behavior of a database (passed to DB::Open)
1364
struct Options : public DBOptions, public ColumnFamilyOptions {
1365
  // Create an Options object with default values for all fields.
1366
616k
  Options() : DBOptions(), ColumnFamilyOptions() {}
1367
1368
  Options(const DBOptions& db_options,
1369
          const ColumnFamilyOptions& column_family_options)
1370
1.30M
      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
1371
1372
  void Dump(Logger* log) const;
1373
1374
  void DumpCFOptions(Logger* log) const;
1375
1376
  // Set appropriate parameters for bulk loading.
1377
  // The reason that this is a function that returns "this" instead of a
1378
  // constructor is to enable chaining of multiple similar calls in the future.
1379
  //
1380
1381
  // All data will be in level 0 without any automatic compaction.
1382
  // It's recommended to manually call CompactRange(NULL, NULL) before reading
1383
  // from the database, because otherwise the read can be very slow.
1384
  Options* PrepareForBulkLoad();
1385
};
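
A bulk-load sketch, assuming the DB interface lives at "yb/rocksdb/db.h" and follows the usual rocksdb::DB API; the path is hypothetical and error handling is elided:

  #include "yb/rocksdb/db.h"

  rocksdb::Options options;          // DBOptions + ColumnFamilyOptions defaults
  options.PrepareForBulkLoad();      // everything goes to level 0, no automatic compaction
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/bulk_db", &db);
  // ... load data with db->Put(...) ...
  // Compact manually before serving reads, as recommended above.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
  delete db;
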
1386
1387
//
1388
// An application can issue a read request (via Get/Iterators) and specify
1389
// if that read should process data that ALREADY resides on a specified cache
1390
// level. For example, if an application specifies kBlockCacheTier then the
1391
// Get call will process data that is already present in the memtable or
1392
// the block cache. It will not page in data from the OS cache or data that
1393
// resides in storage.
1394
enum ReadTier {
1395
  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
1396
  kBlockCacheTier = 0x1,  // data in memtable or block cache
1397
  kPersistedTier = 0x2    // persisted data.  When WAL is disabled, this option
1398
                          // will skip data in memtable.
1399
                          // Note that this ReadTier currently only supports
1400
                          // Get and MultiGet and does not support iterators.
1401
};
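
For example, a cache-only read might look like this (sketch; assumes a previously opened rocksdb::DB* db):

  rocksdb::ReadOptions ropts;
  ropts.read_tier = rocksdb::kBlockCacheTier;  // never touch the OS cache or storage
  std::string value;
  rocksdb::Status s = db->Get(ropts, "some_key", &value);
  if (s.IsIncomplete()) {
    // The data was not already in the memtable or block cache; fall back to a
    // normal read (kReadAllTier) if the value is actually needed.
  }
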
1402
1403
struct FdWithBoundaries;
1404
class ReadFileFilter {
1405
 public:
1406
  virtual bool Filter(const FdWithBoundaries&) const = 0;
1407
1408
 protected:
1409
19.0M
  virtual ~ReadFileFilter() {}
1410
};
1411
1412
class TableReader;
1413
class TableAwareReadFileFilter {
1414
 public:
1415
  virtual bool Filter(TableReader*) const = 0;
1416
1417
 protected:
1418
21.8M
  virtual ~TableAwareReadFileFilter() {}
1419
};
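
A trivial subclass sketch; a real filter would inspect the TableReader (for example its properties) before deciding, and would be installed via ReadOptions::table_aware_file_filter defined below:

  class AcceptAllFilter : public rocksdb::TableAwareReadFileFilter {
   public:
    // Accept every SST file; replace with real pruning logic.
    bool Filter(rocksdb::TableReader*) const override { return true; }
  };
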
1420
1421
// Options that control read operations
1422
struct ReadOptions {
1423
  // If true, all data read from underlying storage will be
1424
  // verified against corresponding checksums.
1425
  // Default: true
1426
  bool verify_checksums;
1427
1428
  // Should the "data block"/"index block"/"filter block" read for this
1429
  // iteration be cached in memory?
1430
  // Callers may wish to set this field to false for bulk scans.
1431
  // Default: true
1432
  bool fill_cache;
1433
1434
  // If this option is set and memtable implementation allows, Seek
1435
  // might only return keys with the same prefix as the seek-key
1436
  //
1437
  // ! DEPRECATED: prefix_seek is on by default when prefix_extractor
1438
  // is configured
1439
  // bool prefix_seek;
1440
1441
  // If "snapshot" is non-nullptr, read as of the supplied snapshot
1442
  // (which must belong to the DB that is being read and which must
1443
  // not have been released).  If "snapshot" is nullptr, use an implicit
1444
  // snapshot of the state at the beginning of this read operation.
1445
  // Default: nullptr
1446
  const Snapshot* snapshot;
1447
1448
  // If "prefix" is non-nullptr, and ReadOptions is being passed to
1449
  // db.NewIterator, only return results when the key begins with this
1450
  // prefix.  This field is ignored by other calls (e.g., Get).
1451
  // Options.prefix_extractor must also be set, and
1452
  // prefix_extractor.InRange(prefix) must be true.  The iterator
1453
  // returned by NewIterator when this option is set will behave just
1454
  // as if the underlying store did not contain any non-matching keys,
1455
  // with two exceptions.  Seek() only accepts keys starting with the
1456
  // prefix, and SeekToLast() is not supported.  prefix filter with this
1457
  // option will sometimes reduce the number of read IOPs.
1458
  // Default: nullptr
1459
  //
1460
  // ! DEPRECATED
1461
  // const Slice* prefix;
1462
1463
  // "iterate_upper_bound" defines the extent upto which the forward iterator
1464
  // can return entries. Once the bound is reached, Valid() will be false.
1465
  // "iterate_upper_bound" is exclusive, i.e. the bound value is
1466
  // not a valid entry.  If iterator_extractor is not null, the Seek target
1467
  // and iterator_upper_bound need to have the same prefix.
1468
  // This is because ordering is not guaranteed outside of prefix domain.
1469
  // There is no lower bound on the iterator. If needed, that can be easily
1470
  // implemented.
1471
  //
1472
  // Default: nullptr
1473
  const Slice* iterate_upper_bound;
1474
1475
  // Specify if this read request should process data that ALREADY
1476
  // resides on a particular cache. If the required data is not
1477
  // found at the specified cache, then Status::Incomplete is returned.
1478
  // Default: kReadAllTier
1479
  ReadTier read_tier;
1480
1481
  // Specify to create a tailing iterator -- a special iterator that has a
1482
  // view of the complete database (i.e. it can also be used to read newly
1483
  // added data) and is optimized for sequential reads. It will return records
1484
  // that were inserted into the database after the creation of the iterator.
1485
  // Default: false
1486
  // Not supported in ROCKSDB_LITE mode!
1487
  bool tailing;
1488
1489
  // Specify to create a managed iterator -- a special iterator that
1490
  // uses less resources by having the ability to free its underlying
1491
  // resources on request.
1492
  // Default: false
1493
  // Not supported in ROCKSDB_LITE mode!
1494
  bool managed;
1495
1496
  // Enable a total order seek regardless of index format (e.g. hash index)
1497
  // used in the table. Some table format (e.g. plain table) may not support
1498
  // this option.
1499
  bool total_order_seek;
1500
1501
  // Enforce that the iterator only iterates over the same prefix as the seek.
1502
  // This option is effective only for prefix seeks, i.e. prefix_extractor is
1503
  // non-null for the column family and total_order_seek is false.  Unlike
1504
  // iterate_upper_bound, prefix_same_as_start only works within a prefix
1505
  // but in both directions.
1506
  // Default: false
1507
  bool prefix_same_as_start;
1508
1509
  // Keep the blocks loaded by the iterator pinned in memory as long as the
1510
  // iterator is not deleted. If used when reading from tables created with
1511
  // BlockBasedTableOptions::use_delta_encoding = false,
1512
  // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
1513
  // return 1.
1514
  // Default: false
1515
  bool pin_data;
1516
1517
  // Query id designated for the read.
1518
  QueryId query_id = kDefaultQueryId;
1519
1520
  // Filter for pruning SST files. A RocksDB user can provide a custom implementation to exclude SST
1521
  // files from being added to the MergeIterator. By default no files are filtered.
1522
  std::shared_ptr<TableAwareReadFileFilter> table_aware_file_filter;
1523
1524
  std::shared_ptr<ReadFileFilter> file_filter;
1525
1526
  static const ReadOptions kDefault;
1527
1528
  ReadOptions();
1529
  ReadOptions(bool cksum, bool cache);
1530
};
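
A bounded-iteration sketch (assumes an open rocksdb::DB* db); note that the upper-bound Slice must outlive the iterator:

  rocksdb::ReadOptions ropts;
  ropts.fill_cache = false;                    // bulk scan: don't pollute the block cache
  rocksdb::Slice upper("key9");
  ropts.iterate_upper_bound = &upper;          // exclusive upper bound
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ropts));
  for (it->Seek("key1"); it->Valid(); it->Next()) {
    // Visits keys in ["key1", "key9").
  }
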
1531
1532
// Options that control write operations
1533
struct WriteOptions {
1534
  // If true, the write will be flushed from the operating system
1535
  // buffer cache (by calling WritableFile::Sync()) before the write
1536
  // is considered complete.  If this flag is true, writes will be
1537
  // slower.
1538
  //
1539
  // If this flag is false, and the machine crashes, some recent
1540
  // writes may be lost.  Note that if it is just the process that
1541
  // crashes (i.e., the machine does not reboot), no writes will be
1542
  // lost even if sync==false.
1543
  //
1544
  // In other words, a DB write with sync==false has similar
1545
  // crash semantics as the "write()" system call.  A DB write
1546
  // with sync==true has similar crash semantics to a "write()"
1547
  // system call followed by "fdatasync()".
1548
  //
1549
  // Default: false
1550
  bool sync;
1551
1552
  // If true, writes will not first go to the write ahead log,
1553
  // and the write may be lost after a crash.
1554
  bool disableWAL;
1555
1556
  // The option is deprecated. It's not used anymore.
1557
  uint64_t timeout_hint_us;
1558
1559
  // If true and if the user is trying to write to column families that don't exist
1560
  // (they were dropped), ignore the write (don't return an error). If there
1561
  // are multiple writes in a WriteBatch, other writes will succeed.
1562
  // Default: false
1563
  bool ignore_missing_column_families;
1564
1565
  WriteOptions()
1566
      : sync(false),
1567
        disableWAL(false),
1568
        timeout_hint_us(0),
1569
27.8M
        ignore_missing_column_families(false) {}
1570
};
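
For instance (sketch; an open rocksdb::DB* db is assumed):

  rocksdb::WriteOptions wopts;
  wopts.sync = true;                     // write() + fdatasync()-like durability
  db->Put(wopts, "k", "v");

  rocksdb::WriteOptions fast;
  fast.disableWAL = true;                // fastest, but the write may be lost on crash
  db->Put(fast, "cache_only_key", "v");
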
1571
1572
// On each call the returned value is incremented by 1.
1573
// It can be used to track whether one action happened before another.
1574
int64_t FlushTick();
1575
1576
// Options that control flush operations
1577
struct FlushOptions {
1578
  // If true, the call will wait until the flush is done.
1579
  // Default: true
1580
  bool wait = true;
1581
1582
  static constexpr int64_t kNeverIgnore = std::numeric_limits<int64_t>::max();
1583
1584
  int64_t ignore_if_flushed_after_tick = kNeverIgnore;
1585
};
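
Sketch of a flush that is skipped when another flush already happened after the recorded tick (assumes an open rocksdb::DB* db and that DB::Flush(const FlushOptions&) is available, as in stock RocksDB):

  const int64_t tick = rocksdb::FlushTick();
  // ... writes ...
  rocksdb::FlushOptions fopts;
  fopts.wait = true;
  fopts.ignore_if_flushed_after_tick = tick;  // no-op if already flushed since `tick`
  db->Flush(fopts);
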
1586
1587
// Get options based on some guidelines. Currently it only tunes parameters based on
1588
// flush/compaction and fills in defaults for the other parameters.
1589
// total_write_buffer_limit: budget for memory spent for mem tables
1590
// read_amplification_threshold: comfortable value of read amplification
1591
// write_amplification_threshold: comfortable value of write amplification.
1592
// target_db_size: estimated total DB size.
1593
extern Options GetOptions(size_t total_write_buffer_limit,
1594
                          int read_amplification_threshold = 8,
1595
                          int write_amplification_threshold = 32,
1596
                          uint64_t target_db_size = 68719476736 /* 64GB */);
1597
1598
// Create a Logger from provided DBOptions
1599
extern Status CreateLoggerFromOptions(const std::string& dbname,
1600
                                      const DBOptions& options,
1601
                                      std::shared_ptr<Logger>* logger);
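
Putting the two helpers together (sketch; the 512 MB budget and the path are arbitrary, and info_log is the standard DBOptions logger field):

  rocksdb::Options opts = rocksdb::GetOptions(512 * 1024 * 1024);
  std::shared_ptr<rocksdb::Logger> logger;
  if (rocksdb::CreateLoggerFromOptions("/tmp/mydb", opts, &logger).ok()) {
    opts.info_log = logger;
  }
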
1602
1603
// CompactionOptions are used in CompactFiles() call.
1604
struct CompactionOptions {
1605
  // Compaction output compression type
1606
  // Default: snappy
1607
  CompressionType compression;
1608
  // Compaction will create files of size `output_file_size_limit`.
1609
  // Default: MAX, which means that compaction will create a single file
1610
  uint64_t output_file_size_limit;
1611
1612
  CompactionOptions()
1613
      : compression(kSnappyCompression),
1614
45
        output_file_size_limit(std::numeric_limits<uint64_t>::max()) {}
1615
};
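
A CompactFiles() sketch, assuming the DB::CompactFiles() overload without an explicit column family; the input file name is hypothetical and would normally come from metadata such as GetLiveFilesMetaData():

  rocksdb::CompactionOptions copts;
  copts.compression = rocksdb::kNoCompression;                  // override the snappy default
  std::vector<std::string> inputs = {"/data/mydb/000123.sst"};  // hypothetical file
  db->CompactFiles(copts, inputs, /* output_level = */ 1);
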
1616
1617
// For level based compaction, we can configure if we want to skip/force
1618
// bottommost level compaction.
1619
enum class BottommostLevelCompaction {
1620
  // Skip bottommost level compaction
1621
  kSkip,
1622
  // Only compact bottommost level if there is a compaction filter
1623
  // This is the default option
1624
  kIfHaveCompactionFilter,
1625
  // Always compact bottommost level
1626
  kForce,
1627
};
1628
1629
// CompactRangeOptions is used by CompactRange() call.
1630
struct CompactRangeOptions {
1631
  // If true, no other compaction will run at the same time as this
1632
  // manual compaction
1633
  bool exclusive_manual_compaction = true;
1634
  // If true, compacted files will be moved to the minimum level capable
1635
  // of holding the data, or to the given level (a non-negative target_level).
1636
  bool change_level = false;
1637
  // If change_level is true and target_level has a non-negative value, compacted
1638
  // files will be moved to target_level.
1639
  int target_level = -1;
1640
  // Compaction outputs will be placed in options.db_paths[target_path_id].
1641
  // Behavior is undefined if target_path_id is out of range.
1642
  uint32_t target_path_id = 0;
1643
  // By default level based compaction will only compact the bottommost level
1644
  // if there is a compaction filter
1645
  BottommostLevelCompaction bottommost_level_compaction =
1646
      BottommostLevelCompaction::kIfHaveCompactionFilter;
1647
};
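
For example, compacting the whole key space and pushing the result down a level (sketch; an open rocksdb::DB* db is assumed):

  rocksdb::CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 1;                              // move compaction output to level 1
  cro.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce;
  db->CompactRange(cro, nullptr, nullptr);           // nullptr bounds = entire key range
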
1648
}  // namespace rocksdb
1649
1650
#endif // YB_ROCKSDB_OPTIONS_H