// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The following only applies to changes made to this file as part of YugaByte development.
//
// Portions Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations
// under the License.
//

#ifndef YB_ROCKSDB_OPTIONS_H
#define YB_ROCKSDB_OPTIONS_H

#include <stddef.h>
#include <stdint.h>
#include <string>
#include <memory>
#include <vector>
#include <limits>
#include <unordered_map>

#include "yb/rocksdb/cache.h"
#include "yb/rocksdb/listener.h"
#include "yb/util/slice.h"
#include "yb/rocksdb/universal_compaction.h"

#ifdef max
#undef max
#endif

namespace yb {

class MemTracker;
class PriorityThreadPool;

}

namespace rocksdb {

class Arena;
class BoundaryValuesExtractor;
class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class Env;
class CompactionFileFilterFactory;
enum InfoLogLevel : unsigned char;
class SstFileManager;
class FilterPolicy;
class Logger;
class MemTable;
class MergeOperator;
class Snapshot;
class TableFactory;
class MemTableRepFactory;
class TablePropertiesCollectorFactory;
class RateLimiter;
class SliceTransform;
class Statistics;
class InternalIterator;
class InternalKeyComparator;
class WalFilter;
class MemoryMonitor;

struct FileMetaData;

typedef std::shared_ptr<const InternalKeyComparator> InternalKeyComparatorPtr;

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs. Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : char {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression = 0x2,
  kBZip2Compression = 0x3,
  kLZ4Compression = 0x4,
  kLZ4HCCompression = 0x5,
  // zstd format is not finalized yet so it's subject to changes.
  kZSTDNotFinalCompression = 0x40,
};

enum CompactionStyle : char {
  // level based compaction style
  kCompactionStyleLevel = 0x0,
  // Universal compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 0x1,
  // FIFO compaction style
  // Not supported in ROCKSDB_LITE
  kCompactionStyleFIFO = 0x2,
  // Disable background compaction. Compaction jobs are submitted
  // via CompactFiles().
  // Not supported in ROCKSDB_LITE
  kCompactionStyleNone = 0x3,
};

// In level-based compaction, this determines which file from a level is
// picked to merge into the next level. We suggest trying
// kMinOverlappingRatio first when you tune your database.
enum CompactionPri : char {
  // Slightly prioritize larger files, with size compensated by #deletes.
  kByCompensatedSize = 0x0,
  // First compact files whose data's latest update time is oldest.
  // Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 0x1,
  // First compact files whose range hasn't been compacted to the next level
  // for the longest. If your updates are random across the key space,
  // write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 0x2,
  // First compact files whose ratio between overlapping size in next level
  // and its size is the smallest. In many cases this can optimize write
  // amplification.
  kMinOverlappingRatio = 0x3,
};
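
// For illustration only (not part of the original header): a minimal sketch
// of selecting a compaction priority on the ColumnFamilyOptions struct
// declared below, assuming level-style compaction:
//
//   rocksdb::ColumnFamilyOptions cf_opts;
//   cf_opts.compaction_style = rocksdb::kCompactionStyleLevel;  // the default
//   cf_opts.compaction_pri = rocksdb::kMinOverlappingRatio;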

enum class WALRecoveryMode : char {
  // Original levelDB recovery
  // We tolerate incomplete records in trailing data on all logs
  // Use case : This is legacy behavior (default)
  kTolerateCorruptedTailRecords = 0x00,
  // Recover from clean shutdown
  // We don't expect to find any corruption in the WAL
  // Use case : This is ideal for unit tests and rare applications that
  // require a high consistency guarantee
  kAbsoluteConsistency = 0x01,
  // Recover to point-in-time consistency
  // We stop the WAL playback on discovering WAL inconsistency
  // Use case : Ideal for systems that have a disk controller cache, like
  // hard disks or SSDs without a super capacitor that stores related data
  kPointInTimeRecovery = 0x02,
  // Recovery after a disaster
  // We ignore any corruption in the WAL and try to salvage as much data as
  // possible
  // Use case : Ideal for a last-ditch effort to recover data, or systems that
  // operate with low grade unrelated data
  kSkipAnyCorruptedRecords = 0x03,
};
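
// For illustration only (not part of the original header): stock RocksDB
// selects this mode through a DBOptions::wal_recovery_mode field; assuming
// this fork exposes the same field further down in DBOptions, a sketch:
//
//   rocksdb::DBOptions db_opts;
//   db_opts.wal_recovery_mode = rocksdb::WALRecoveryMode::kPointInTimeRecovery;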

struct CompactionOptionsFIFO {
  // Once the total sum of table files reaches this, we will delete the oldest
  // table file
  // Default: 1GB
  uint64_t max_table_files_size;

  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
};
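
// For illustration only (not part of the original header): a minimal sketch
// of enabling FIFO compaction with a 2GB cap, using the compaction_style and
// compaction_options_fifo members declared on ColumnFamilyOptions below:
//
//   rocksdb::ColumnFamilyOptions cf_opts;
//   cf_opts.compaction_style = rocksdb::kCompactionStyleFIFO;
//   cf_opts.compaction_options_fifo.max_table_files_size =
//       2ULL * 1024 * 1024 * 1024;  // drop the oldest files past 2GB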

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;
  CompressionOptions() : window_bits(-14), level(-1), strategy(0) {}
  CompressionOptions(int wbits, int _lev, int _strategy)
      : window_bits(wbits), level(_lev), strategy(_strategy) {}
};

enum UpdateStatus {     // Return status for inplace update callback
  UPDATE_FAILED = 0,    // Nothing to update
  UPDATED_INPLACE = 1,  // Value updated inplace
  UPDATED = 2,          // No inplace update. Merged value set
};

struct DbPath {
  std::string path;
  uint64_t target_size;  // Target size of total files under the path, in bytes.

  DbPath() : target_size(0) {}
  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
};

struct Options;

struct ColumnFamilyOptions {
  // Some functions that make it easier to optimize RocksDB

  // Use this if you don't need to keep the data sorted, i.e. you'll never use
  // an iterator, only Put() and Get() API calls
  //
  // Not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeForPointLookup(
      uint64_t block_cache_size_mb);

  // Default values for some parameters in ColumnFamilyOptions are not
  // optimized for heavy workloads and big datasets, which means you might
  // observe write stalls under some conditions. As a starting point for tuning
  // RocksDB options, use the following two functions:
  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
  // Universal style compaction is focused on reducing Write Amplification
  // Factor for big data sets, but increases Space Amplification. You can learn
  // more about the different styles here:
  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
  // Make sure to also call IncreaseParallelism(), which will provide the
  // biggest performance gains; see the sketch below.
  // Note: we might use more memory than memtable_memory_budget during periods
  // of high write rate.
  //
  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);

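  // For illustration only (not part of the original header): a minimal sketch
  // of the tuning helpers, relying on rocksdb::Options inheriting from both
  // DBOptions and ColumnFamilyOptions as in stock RocksDB:
  //
  //   rocksdb::Options options;
  //   options.IncreaseParallelism(16);  // roughly the number of cores
  //   options.OptimizeLevelStyleCompaction(512 * 1024 * 1024);
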
  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator;

  // REQUIRES: The client must provide a merge operator if Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // opening the DB in this case.
  // Default: nullptr
  std::shared_ptr<MergeOperator> merge_operator;

  // A single CompactionFilter instance to call into during compaction.
  // Allows an application to modify/delete a key-value during background
  // compaction.
  //
  // If the client requires a new compaction filter to be used for different
  // compaction runs, it can specify compaction_filter_factory instead of this
  // option. The client should specify only one of the two.
  // compaction_filter takes precedence over compaction_filter_factory if
  // client specifies both.
  //
  // If multithreaded compaction is being used, the supplied CompactionFilter
  // instance may be used from different threads concurrently and so should be
  // thread-safe.
  //
  // Default: nullptr
  CompactionFilter* compaction_filter;

  // This is a factory that provides compaction filter objects which allow
  // an application to modify/delete a key-value during background compaction.
  //
  // A new filter will be created on each compaction run. If multithreaded
  // compaction is being used, each created CompactionFilter will only be used
  // from a single thread and so does not need to be thread-safe.
  //
  // Default: nullptr
  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
  //
  // Larger values increase performance, especially during bulk loads.
  // Up to max_write_buffer_number write buffers may be held in memory
  // at the same time,
  // so you may wish to adjust this parameter to control memory usage.
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
  //
  // Note that write_buffer_size is enforced per column family.
  // See db_write_buffer_size for sharing memory across column families.
  //
  // Default: 4MB
  //
  // Dynamically changeable through SetOptions() API
  size_t write_buffer_size;

  // The maximum number of write buffers that are built up in memory.
  // The default and the minimum number is 2, so that when 1 write buffer
  // is being flushed to storage, new writes can continue to the other
  // write buffer.
  // If max_write_buffer_number > 3, writing will be slowed down to
  // options.delayed_write_rate if we are writing to the last write buffer
  // allowed.
  //
  // Default: 2
  //
  // Dynamically changeable through SetOptions() API
  int max_write_buffer_number;

  // The minimum number of write buffers that will be merged together
  // before writing to storage. If set to 1, then
  // all write buffers are flushed to L0 as individual files, and this
  // increases read amplification because a get request has to check all of
  // these files. Also, an in-memory merge may result in writing less
  // data to storage if there are duplicate records in each of these
  // individual write buffers. Default: 1
  int min_write_buffer_number_to_merge;

  // The total maximum number of write buffers to maintain in memory including
  // copies of buffers that have already been flushed. Unlike
  // max_write_buffer_number, this parameter does not affect flushing.
  // This controls the minimum amount of write history that will be available
  // in memory for conflict checking when Transactions are used.
  //
  // When using an OptimisticTransactionDB:
  // If this value is too low, some transactions may fail at commit time due
  // to not being able to determine whether there were any write conflicts.
  //
  // When using a TransactionDB:
  // If Transaction::SetSnapshot is used, TransactionDB will read either
  // in-memory write buffers or SST files to do write-conflict checking.
  // Increasing this value can reduce the number of reads to SST files
  // done for conflict detection.
  //
  // Setting this value to 0 will cause write buffers to be freed immediately
  // after they are flushed.
  // If this value is set to -1, 'max_write_buffer_number' will be used.
  //
  // Default:
  // If using a TransactionDB/OptimisticTransactionDB, the default value will
  // be set to the value of 'max_write_buffer_number' if it is not explicitly
  // set by the user. Otherwise, the default is 0.
  int max_write_buffer_number_to_maintain;

  // Compress blocks using the specified compression algorithm. This
  // parameter can be changed dynamically.
  //
  // Default: kSnappyCompression, if it's supported. If snappy is not linked
  // with the library, the default is kNoCompression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression. Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;

  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to use quick compression
  // algorithms while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non-empty, should have an entry for
  // each level of the database; these override the value specified in
  // the previous field 'compression'.
  //
  // NOTICE if level_compaction_dynamic_level_bytes=true,
  // compression_per_level[0] still determines L0, but other elements
  // of the array are based on the base level (the level L0 files are merged
  // to), and may not match the level users see from the info log for metadata.
  // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
  // determines the compression type for level n+i-1.
  // For example, if we have 5 levels and decide to merge L0
  // data to L4 (which means L1..L3 will be empty), then the new files written
  // to L4 use compression type compression_per_level[1].
  // If L0 is later merged to L2, data going to L2 will be compressed
  // according to compression_per_level[1], L3 using compression_per_level[2]
  // and L4 using compression_per_level[3]. The compression for each level can
  // change as data grows.
  std::vector<CompressionType> compression_per_level;

  // Different options for compression algorithms.
  CompressionOptions compression_opts;
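
  // For illustration only (not part of the original header): a minimal sketch
  // of per-level compression that keeps the write-hot lower levels
  // uncompressed and compresses the bigger, colder levels:
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.num_levels = 4;
  //   cf_opts.compression_per_level = {rocksdb::kNoCompression,
  //                                    rocksdb::kNoCompression,
  //                                    rocksdb::kSnappyCompression,
  //                                    rocksdb::kSnappyCompression};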

  // If non-nullptr, use the specified function to determine the
  // prefixes for keys. These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the read-IOP
  // cost for scans when a prefix is passed via ReadOptions to
  // db.NewIterator(). For prefix filtering to work properly,
  // "prefix_extractor" and "comparator" must be such that the following
  // properties hold:
  //
  // 1) key.starts_with(prefix(key))
  // 2) Compare(prefix(key), key) <= 0.
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
  // 4) prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  std::shared_ptr<const SliceTransform> prefix_extractor;
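
  // For illustration only (not part of the original header): a minimal
  // sketch using the fixed-prefix transform that upstream RocksDB declares
  // in rocksdb/slice_transform.h (assumed to be available in this fork):
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));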

  // Number of levels for this database
  int num_levels;

  // Number of files to trigger level-0 compaction. A value < 0 means that
  // level-0 compaction will not be triggered by number of files at all.
  //
  // Default: 4
  //
  // Dynamically changeable through SetOptions() API
  int level0_file_num_compaction_trigger;

  // Soft limit on number of level-0 files. We start slowing down writes at this
  // point. A value < 0 means that no writing slow down will be triggered by
  // number of files in level-0.
  //
  // Dynamically changeable through SetOptions() API
  int level0_slowdown_writes_trigger;

  // Maximum number of level-0 files. We stop writes at this point.
  //
  // Dynamically changeable through SetOptions() API
  int level0_stop_writes_trigger;

  // This does not do anything anymore. Deprecated.
  int max_mem_compaction_level;

  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, and each file on level-2 will be 20MB,
  // and each file on level-3 will be 200MB.
  //
  // Default: 2MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t target_file_size_base;

  // By default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  //
  // Dynamically changeable through SetOptions() API
  int target_file_size_multiplier;

  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 20MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 20MB, total file size for level-2 will be 200MB,
  // and total file size for level-3 will be 2GB.
  //
  // Default: 10MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_bytes_for_level_base;

  // If true, RocksDB will pick the target size of each level dynamically.
  // We will pick a base level b >= 1. L0 will be directly merged into level b,
  // instead of always into level 1. Levels 1 to b-1 need to be empty.
  // We try to pick b and its target size so that
  // 1. target size is in the range of
  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
  //    max_bytes_for_level_base]
  // 2. target size of the last level (level num_levels-1) equals the extra
  //    size of the level.
  // At the same time max_bytes_for_level_multiplier and
  // max_bytes_for_level_multiplier_additional are still satisfied.
  //
  // With this option on, from an empty DB, we make the last level the base
  // level, which means merging L0 data into the last level, until it exceeds
  // max_bytes_for_level_base. And then we make the second last level the
  // base level, and start to merge L0 data to the second last level, with its
  // target size being 1/max_bytes_for_level_multiplier of the last level's
  // extra size. As the data accumulates further, we move the base level to
  // the third last one, and so on.
  //
  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
  // and max_bytes_for_level_base=10MB.
  // Target sizes of level 1 to 5 start with:
  // [- - - - 10MB]
  // with the base level being level 5. Target sizes of levels 1 to 4 are not
  // applicable because they will not be used.
  // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
  // level 4 the base level and now the targets look like:
  // [- - - 1.1MB 11MB]
  // While data is accumulated, size targets are tuned based on actual data
  // of level 5. When level 5 has 50MB of data, the target is like:
  // [- - - 5MB 50MB]
  // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
  // level 4 as the base level, its target size needs to be 10.1MB, which
  // doesn't satisfy the target size range. So now we make level 3 the base
  // level and the target sizes of the levels look like:
  // [- - 1.01MB 10.1MB 101MB]
  // In the same way, while level 5 further grows, all levels' targets grow,
  // like
  // [- - 5MB 50MB 500MB]
  // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
  // base level and make the levels' target sizes like this:
  // [- 1.001MB 10.01MB 100.1MB 1001MB]
  // and go on...
  //
  // By doing this, we give max_bytes_for_level_multiplier priority over
  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
  // useful to limit worst-case space amplification.
  //
  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
  //
  // Turning this feature on or off for an existing DB can cause an unexpected
  // LSM tree structure, so it's not recommended.
  //
  // NOTE: this option is experimental
  //
  // Default: false
  bool level_compaction_dynamic_level_bytes;

  // Default: 10.
  //
  // Dynamically changeable through SetOptions() API
  int max_bytes_for_level_multiplier;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  //
  // Default: 1
  //
  // Dynamically changeable through SetOptions() API
  std::vector<int> max_bytes_for_level_multiplier_additional;

  // Maximum number of bytes in all compacted files. We avoid expanding
  // the lower level file set of a compaction if it would make the
  // total compaction cover more than
  // (expanded_compaction_factor * targetFileSizeLevel()) many bytes.
  //
  // Dynamically changeable through SetOptions() API
  int expanded_compaction_factor;

  // Maximum number of bytes in all source files to be compacted in a
  // single compaction run. We avoid picking too many files in the
  // source level so that the total source bytes for the compaction
  // do not exceed
  // (source_compaction_factor * targetFileSizeLevel()) many bytes.
  // Default: 1, i.e. pick one max-file-size worth of data as the source of
  // a compaction.
  //
  // Dynamically changeable through SetOptions() API
  int source_compaction_factor;

  // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
  // stop building a single file in a level->level+1 compaction.
  //
  // Dynamically changeable through SetOptions() API
  int max_grandparent_overlap_factor;

  // DEPRECATED -- this option is no longer used.
  // Puts are delayed to options.delayed_write_rate when any level has a
  // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  double soft_rate_limit;

  // DEPRECATED -- this option is no longer used.
  double hard_rate_limit;

  // All writes will be slowed down to at least delayed_write_rate if the
  // estimated bytes needing compaction exceed this threshold.
  //
  // Default: 0 (disabled)
  uint64_t soft_pending_compaction_bytes_limit;

  // All writes are stopped if the estimated bytes needing compaction exceed
  // this threshold.
  //
  // Default: 0 (disabled)
  uint64_t hard_pending_compaction_bytes_limit;
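
  // For illustration only (not part of the original header): a minimal
  // sketch of a two-stage backpressure policy using the two thresholds
  // above (the byte budgets here are made-up numbers):
  //
  //   rocksdb::ColumnFamilyOptions cf_opts;
  //   cf_opts.soft_pending_compaction_bytes_limit = 64ULL << 30;   // slow at 64GB
  //   cf_opts.hard_pending_compaction_bytes_limit = 256ULL << 30;  // stop at 256GB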

  // DEPRECATED -- this option is no longer used.
  unsigned int rate_limit_delay_max_milliseconds;

  // Size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/8 of
  // write_buffer_size, rounded up to a multiple of 4KB).
  //
  // There are two additional restrictions on the specified size:
  // (1) size should be in the range of [4096, 2 << 30] and
  // (2) be a multiple of the CPU word size (which helps with the memory
  // alignment).
  //
  // We'll automatically check and adjust the size number to make sure it
  // conforms to the restrictions.
  //
  // Default: 0
  //
  // Dynamically changeable through SetOptions() API
  size_t arena_block_size;

  // Disable automatic compactions. Manual compactions can still
  // be issued on this column family
  //
  // Dynamically changeable through SetOptions() API
  bool disable_auto_compactions;

  // DEPRECATED
  // Does not have any effect.
  bool purge_redundant_kvs_while_flush;

  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style;

  // If compaction_style == kCompactionStyleLevel, for each level,
  // which files are prioritized to be picked to compact.
  // Default: kByCompensatedSize
  CompactionPri compaction_pri;

  // If true, compaction will verify checksum on every read that happens
  // as part of compaction
  //
  // Default: true
  //
  // Dynamically changeable through SetOptions() API
  bool verify_checksums_in_compaction;

  // The options needed to support Universal Style compactions
  CompactionOptionsUniversal compaction_options_universal;

  // The options for FIFO compaction style
  CompactionOptionsFIFO compaction_options_fifo;

  // Use KeyMayExist API to filter deletes when this is true.
  // If KeyMayExist returns false, i.e. the key definitely does not exist, then
  // the delete is a noop. KeyMayExist only incurs in-memory look up.
  // This optimization avoids writing the delete to storage when appropriate.
  //
  // Default: false
  //
  // Dynamically changeable through SetOptions() API
  bool filter_deletes;

  // An iterator's Next() call sequentially skips over keys with the same
  // user-key unless this option is set. This number specifies the number
  // of keys (with the same userkey) that will be sequentially
  // skipped before a reseek is issued.
  //
  // Default: 8
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_sequential_skip_in_iterations;

  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory;

  // This is a factory that provides TableFactory objects.
  // Default: a block-based table factory that provides a default
  // implementation of TableBuilder and TableReader with default
  // BlockBasedTableOptions.
  std::shared_ptr<TableFactory> table_factory;

  // Block-based table related options are moved to BlockBasedTableOptions.
  // Related options that were originally here but now moved include:
  //   no_block_cache
  //   block_cache
  //   block_cache_compressed
  //   block_size
  //   block_size_deviation
  //   block_restart_interval
  //   filter_policy
  //   whole_key_filtering
  // If you'd like to customize some of these options, you will need to
  // use NewBlockBasedTableFactory() to construct a new table factory.

  // This option allows the user to collect their own statistics of interest
  // about the tables.
  // Default: empty vector -- no user-defined statistics collection will be
  // performed.
  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
      TablePropertiesCollectorFactories;
  TablePropertiesCollectorFactories table_properties_collector_factories;

  // Allows thread-safe inplace updates. If this is true, there is no way to
  // achieve point-in-time consistency using snapshot or iterator (assuming
  // concurrent updates). Hence iterator and multi-get will return results
  // which are not consistent as of any point-in-time.
  // If the inplace_callback function is not set,
  //   Put(key, new_value) will update the existing_value inplace iff
  //   * key exists in current memtable
  //   * sizeof(new_value) <= sizeof(existing_value)
  //   * existing_value for that key is a put i.e. kTypeValue
  // If the inplace_callback function is set, check the doc for
  // inplace_callback.
  // Default: false.
  bool inplace_update_support;

  // Number of locks used for inplace update
  // Default: 10000, if inplace_update_support = true, else 0.
  //
  // Dynamically changeable through SetOptions() API
  size_t inplace_update_num_locks;

  // existing_value - pointer to previous value (from both memtable and sst).
  //                  nullptr if key doesn't exist
  // existing_value_size - pointer to size of existing_value.
  //                       nullptr if key doesn't exist
  // delta_value - Delta value to be merged with the existing_value.
  //               Stored in transaction logs.
  // merged_value - Set when delta is applied on the previous value.
  //
  // Applicable only when inplace_update_support is true,
  // this callback function is called at the time of updating the memtable
  // as part of a Put operation, say Put(key, delta_value). It allows the
  // 'delta_value' specified as part of the Put operation to be merged with
  // an 'existing_value' of the key in the database.
  //
  // If the merged value is smaller in size than the 'existing_value',
  // then this function can update the 'existing_value' buffer inplace and
  // the corresponding 'existing_value_size' pointer, if it wishes to.
  // The callback should return UpdateStatus::UPDATED_INPLACE in this case.
  // (In this case, the snapshot-semantics of the rocksdb Iterator are not
  // atomic anymore).
  //
  // If the merged value is larger in size than the 'existing_value' or the
  // application does not wish to modify the 'existing_value' buffer inplace,
  // then the merged value should be returned via *merged_value. It is set by
  // merging the 'existing_value' and the Put 'delta_value'. The callback should
  // return UpdateStatus::UPDATED in this case. This merged value will be added
  // to the memtable.
  //
  // If merging fails or the application does not wish to take any action,
  // then the callback should return UpdateStatus::UPDATE_FAILED.
  //
  // Please remember that the original call from the application is Put(key,
  // delta_value). So the transaction log (if enabled) will still contain (key,
  // delta_value). The 'merged_value' is not stored in the transaction log.
  // Hence the inplace_callback function should be consistent across db reopens.
  //
  // Default: nullptr
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value);
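
  // For illustration only (not part of the original header): a minimal
  // sketch of an inplace_callback that overwrites the old value when the
  // delta fits in the existing buffer and asks for a full replacement
  // otherwise. The function name is hypothetical; memcpy needs <cstring>,
  // and cdata() is assumed from yb::Slice (yb/util/slice.h).
  //
  //   rocksdb::UpdateStatus ReplaceValue(char* existing_value,
  //                                      uint32_t* existing_value_size,
  //                                      rocksdb::Slice delta_value,
  //                                      std::string* merged_value) {
  //     if (existing_value != nullptr && existing_value_size != nullptr &&
  //         delta_value.size() <= *existing_value_size) {
  //       // Small enough: overwrite the memtable entry in place.
  //       memcpy(existing_value, delta_value.data(), delta_value.size());
  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
  //       return rocksdb::UPDATED_INPLACE;
  //     }
  //     // Too big (or no previous value): hand back a merged value instead.
  //     merged_value->assign(delta_value.cdata(), delta_value.size());
  //     return rocksdb::UPDATED;
  //   }
  //
  //   cf_opts.inplace_update_support = true;
  //   cf_opts.inplace_callback = &ReplaceValue;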

  // If prefix_extractor is set and bloom_bits is not 0, create a prefix bloom
  // for memtable
  //
  // Dynamically changeable through SetOptions() API
  uint32_t memtable_prefix_bloom_bits;

  // Number of hash probes per key
  //
  // Dynamically changeable through SetOptions() API
  uint32_t memtable_prefix_bloom_probes;

  // Page size for huge page TLB for bloom in memtable. If <= 0, allocate
  // from malloc instead of from huge page TLB.
  // You need to reserve huge pages for it to be allocated. For example:
  //      sysctl -w vm.nr_hugepages=20
  // See the linux doc Documentation/vm/hugetlbpage.txt
  //
  // Dynamically changeable through SetOptions() API
  size_t memtable_prefix_bloom_huge_page_tlb_size;

  // Control locality of bloom filter probes to improve cache miss rate.
  // This option only applies to memtable prefix bloom and plaintable
  // prefix bloom. It essentially limits every bloom checking to one cache line.
  // This optimization is turned off when set to 0; any positive number turns
  // it on.
  // Default: 0
  uint32_t bloom_locality;

  // Maximum number of successive merge operations on a key in the memtable.
  //
  // When a merge operation is added to the memtable and the maximum number of
  // successive merges is reached, the value of the key will be calculated and
  // inserted into the memtable instead of the merge operation. This will
  // ensure that there are never more than max_successive_merges merge
  // operations in the memtable.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  size_t max_successive_merges;

  // The number of partial merge operands to accumulate before partial
  // merge will be performed. Partial merge will not be called
  // if the list of values to merge is less than min_partial_merge_operands.
  //
  // If min_partial_merge_operands < 2, then it will be treated as 2.
  //
  // Default: 2
  uint32_t min_partial_merge_operands;

  // This flag specifies that the implementation should optimize the filters
  // mainly for cases where keys are found rather than also optimizing for keys
  // missed. This would be used in cases where the application knows that
  // there are very few misses or the performance in the case of misses is not
  // important.
  //
  // For now, this flag allows us to not store filters for the last level, i.e.
  // the largest level which contains data of the LSM store. For keys which
  // are hits, the filters in this level are not useful because we will search
  // for the data anyway. NOTE: the filters in other levels are still useful
  // even for key hits because they tell us whether to look in that level or go
  // to the higher level.
  //
  // Default: false
  bool optimize_filters_for_hits;

  // After writing every SST file, reopen it and read all the keys.
  // Default: false
  bool paranoid_file_checks;

  // Measure IO stats in compactions, if true.
  // Default: false
  bool compaction_measure_io_stats;

  // Create ColumnFamilyOptions with default values for all fields
  ColumnFamilyOptions();
  // Create ColumnFamilyOptions from Options
  explicit ColumnFamilyOptions(const Options& options);

  void Dump(Logger* log) const;
};

typedef std::function<yb::Result<bool>(const MemTable&)> MemTableFilter;

using IteratorReplacer =
    std::function<InternalIterator*(InternalIterator*, Arena*, const Slice&)>;

struct DBOptions {
  // Some functions that make it easier to optimize RocksDB

#ifndef ROCKSDB_LITE
  // By default, RocksDB uses only one background thread for flush and
  // compaction. Calling this function will set it up such that a total of
  // `total_threads` is used. A good value for `total_threads` is the number
  // of cores. You almost definitely want to call this function if your system
  // is bottlenecked by RocksDB.
  DBOptions* IncreaseParallelism(int total_threads = 16);
#endif  // ROCKSDB_LITE

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing;

  // If true, missing column families will be automatically created.
  // Default: false
  bool create_missing_column_families;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists;

  // If true, RocksDB will aggressively check consistency of the data.
  // Also, if any of the writes to the database fails (Put, Delete, Merge,
  // Write), the database will switch to read-only mode and fail all other
  // Write operations.
  // In most cases you want this to be set to true.
  // Default: true
  bool paranoid_checks;
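
  // For illustration only (not part of the original header): a minimal
  // sketch of typical open-time settings using the flags above:
  //
  //   rocksdb::DBOptions db_opts;
  //   db_opts.create_if_missing = true;             // create the DB on first open
  //   db_opts.create_missing_column_families = true;
  //   db_opts.paranoid_checks = true;               // the default, shown explicitly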

  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc.
  // Default: Env::Default()
  Env* env;

  Env* get_checkpoint_env() const {
    return checkpoint_env ? checkpoint_env : env;
  }

  // Env used to create checkpoints. Default: Env::Default()
  Env* checkpoint_env;

  yb::PriorityThreadPool* priority_thread_pool_for_compactions_and_flushes = nullptr;

  // Use to control the write rate of flush and compaction. Flush has higher
  // priority than compaction. Rate limiting is disabled if nullptr.
  // If the rate limiter is enabled, bytes_per_sync is set to 1MB by default.
  // Default: nullptr
  std::shared_ptr<RateLimiter> rate_limiter;

  // Use to track SST files and control their file deletion rate. Can be used
  // among multiple RocksDB instances. sst_file_manager only tracks and
  // throttles deletes of SST files in the first db_path (db_name if db_paths
  // is empty); other files and other db_paths won't be tracked or affected
  // by sst_file_manager.
  // Default: nullptr
  std::shared_ptr<SstFileManager> sst_file_manager;

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
  std::shared_ptr<Logger> info_log;

  InfoLogLevel info_log_level;

  // Number of open files that can be used by the DB. You may need to
  // increase this if your database has a large working set. Value -1 means
  // files opened are always kept open. You can estimate number of files based
  // on target_file_size_base and target_file_size_multiplier for level-based
  // compaction. For universal-style compaction, you can usually set it to -1.
  // Default: 5000 or ulimit value of max open files (whichever is smaller)
  int max_open_files;

  // If max_open_files is -1, DB will open all files on DB::Open(). You can
  // use this option to increase the number of threads used to open the files.
  // Default: 1
  int max_file_opening_threads;

  // Once write-ahead logs exceed this size, we will start forcing the flush of
  // column families whose memtables are backed by the oldest live WAL file
  // (i.e. the ones that are causing all the space amplification). If set to 0
  // (default), we will dynamically choose the WAL size limit to be
  // [sum of all write_buffer_size * max_write_buffer_number] * 4
  // Default: 0
  uint64_t max_total_wal_size;

  // If non-null, then we should collect metrics about database operations.
  // Statistics objects should not be shared between DB instances as
  // it does not use any locks to prevent concurrent updates.
  std::shared_ptr<Statistics> statistics;

  // If true, then the contents of manifest and data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
  // sync to the OS to flush all dirty buffers to stable storage.
  // Default: false
  bool disableDataSync;

  // If true, then every store to stable storage will issue a fsync.
  // If false, then every store to stable storage will issue a fdatasync.
  // This parameter should be set to true while storing data to
  // filesystem like ext3 that can lose files after a reboot.
  // Default: false
  bool use_fsync;

  // A list of paths where SST files can be put into, each with its target
  // size. Newer data is placed into paths specified earlier in the vector
  // while older data gradually moves to paths specified later in the vector.
  //
  // For example, if you have a flash device with 10GB allocated for the DB,
  // as well as a hard drive of 2TB, you should configure it to be:
  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
  //
  // The system will try to guarantee data under each path is close to but
  // not larger than the target size. But current and future file sizes used
  // in determining where to place a file are based on best-effort estimation,
  // which means there is a chance that the actual size under the directory
  // is slightly more than target size under some workloads. User should give
  // some buffer room for those cases.
  //
  // If none of the paths has sufficient room to place a file, the file will
  // be placed in the last path anyway, regardless of the target size.
  //
  // Placing newer data in earlier paths is also best-effort. User should
  // expect user files to be placed in higher levels in some extreme cases.
  //
  // If left empty, only one path will be used, which is db_name passed when
  // opening the DB.
  // Default: empty
  std::vector<DbPath> db_paths;
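
  // For illustration only (not part of the original header): a minimal
  // sketch of the two-tier layout from the example above (the paths are
  // hypothetical):
  //
  //   rocksdb::DBOptions db_opts;
  //   db_opts.db_paths = {
  //       rocksdb::DbPath("/flash_path", 10ULL << 30),    // 10GB on flash
  //       rocksdb::DbPath("/hard_drive", 2048ULL << 30),  // 2TB on disk
  //   };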

  // This specifies the info LOG dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir;

  // This specifies the absolute dir path for write-ahead logs (WAL).
  // If it is empty, the log files will be in the same dir as data;
  // dbname is used as the data dir by default.
  // If it is non empty, the log files will be kept in the specified dir.
  // When destroying the db,
  // all log files in wal_dir and the dir itself are deleted.
  std::string wal_dir;

  // The periodicity when obsolete files get deleted. The default
  // value is 6 hours. Files that go out of scope during the compaction
  // process will still get automatically deleted on every compaction,
  // regardless of this setting.
  uint64_t delete_obsolete_files_period_micros;

  // Suggested number of concurrent background compaction jobs, submitted to
  // the default LOW priority thread pool.
  //
  // Default: max_background_compactions
  int base_background_compactions;

  // Maximum number of concurrent background compaction jobs, submitted to
  // the default LOW priority thread pool.
  // We first try to schedule compactions based on
  // `base_background_compactions`. If the compaction cannot catch up, we
  // will increase the number of compaction threads up to
  // `max_background_compactions`.
  //
  // If you're increasing this, also consider increasing the number of threads
  // in the LOW priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  // Default: 1
  int max_background_compactions;

  // Number of threads reserved for exclusively doing small compactions
  // Default: -1 (later gets set to base_background_compactions - 1)
  int num_reserved_small_compaction_threads;

  // Threshold for input size beyond which compaction is considered large
  // Default: numeric_limits<uint64_t>::max()
  uint64_t compaction_size_threshold_bytes;

  // This value represents the maximum number of threads that will
  // concurrently perform a compaction job by breaking it into multiple,
  // smaller ones that are run simultaneously.
  // Default: 1 (i.e. no subcompactions)
  uint32_t max_subcompactions;

  // Maximum number of concurrent background memtable flush jobs, submitted to
  // the HIGH priority thread pool.
  //
  // By default, all background jobs (major compaction and memtable flush) go
  // to the LOW priority pool. If this option is set to a positive number,
  // memtable flush jobs will be submitted to the HIGH priority pool.
  // It is important when the same Env is shared by multiple db instances.
  // Without a separate pool, long running major compaction jobs could
  // potentially block memtable flush jobs of other db instances, leading to
  // unnecessary Put stalls.
  //
  // If you're increasing this, also consider increasing the number of threads
  // in the HIGH priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  // Default: 1
  int max_background_flushes;

  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size;

  // Time for the info log file to roll (in seconds).
  // If specified with non-zero value, log file will be rolled
  // if it has been active longer than `log_file_time_to_roll`.
  // Default: 0 (disabled)
  size_t log_file_time_to_roll;

  // Maximal info log files to be kept.
  // Default: 1000
  size_t keep_log_file_num;

  // Recycle log files.
  // If non-zero, we will reuse previously written log files for new
  // logs, overwriting the old data. The value indicates how many
  // such files we will keep around at any point in time for later
  // use. This is more efficient because the blocks are already
  // allocated and fdatasync does not need to update the inode after
  // each write.
  // Default: 0
  size_t recycle_log_file_num;

  // The manifest file is rolled over on reaching this limit.
  // The older manifest file will be deleted.
  // The default value is MAX_INT so that roll-over does not take place.
  uint64_t max_manifest_file_size;

  // Number of shards used for table cache.
  int table_cache_numshardbits;

  // DEPRECATED
  // int table_cache_remove_scan_count_limit;

  // The following two fields affect how archived logs will be deleted.
  // 1. If both set to 0, logs will be deleted asap and will not get into
  //    the archive.
  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
  //    WAL files will be checked every 10 min and if total size is greater
  //    than WAL_size_limit_MB, they will be deleted starting with the
  //    earliest until size_limit is met. All empty files will be deleted.
  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
  //    are older than WAL_ttl_seconds will be deleted.
  // 4. If both are not 0, WAL files will be checked every 10 min and both
  //    checks will be performed, with the ttl check being performed first.
  uint64_t WAL_ttl_seconds;
  uint64_t WAL_size_limit_MB;
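
  // For illustration only (not part of the original header): a minimal
  // sketch of case 4 above -- keep archived WALs for an hour, and also cap
  // the archive at 1GB:
  //
  //   rocksdb::DBOptions db_opts;
  //   db_opts.WAL_ttl_seconds = 3600;
  //   db_opts.WAL_size_limit_MB = 1024;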

  // Number of bytes to preallocate (via fallocate) the manifest
  // files. Default is 4MB, which is reasonable to reduce random IO
  // as well as prevent overallocation for mounts that preallocate
  // large amounts of data (such as xfs's allocsize option).
  size_t manifest_preallocation_size;

  // Data being read from file storage may be buffered in the OS
  // Default: true
  bool allow_os_buffer;

  // Allow the OS to mmap file for reading sst tables. Default: false
  bool allow_mmap_reads;

  // Allow the OS to mmap file for writing.
  // DB::SyncWAL() only works if this is set to false.
  // Default: false
  bool allow_mmap_writes;

  // If false, fallocate() calls are bypassed
  bool allow_fallocate;

  // Disable child process inherit open files. Default: true
  bool is_fd_close_on_exec;

  // DEPRECATED -- this option is no longer used
  bool skip_log_error_on_recovery;

  // If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
  // Default: 600 (10 min)
  unsigned int stats_dump_period_sec;

  // If set true, will hint the underlying file system that the file
  // access pattern is random, when a sst file is opened.
  // Default: true
  bool advise_random_on_open;

  // Amount of data to build up in memtables across all column
  // families before writing to disk.
  //
  // This is distinct from write_buffer_size, which enforces a limit
  // for a single memtable.
  //
  // This feature is disabled by default. Specify a non-zero value
  // to enable it.
  //
  // Default: 0 (disabled)
  size_t db_write_buffer_size;

  // Shared MemoryMonitor to keep track of total memory usage.
  //
  // Default: nullptr (disabled)
  std::shared_ptr<MemoryMonitor> memory_monitor;

  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
  enum AccessHint {
    NONE,
    NORMAL,
    SEQUENTIAL,
    WILLNEED
  };
  AccessHint access_hint_on_compaction_start;
1155 | | |
1156 | | // If true, always create a new file descriptor and new table reader |
1157 | | // for compaction inputs. Turning this parameter on may introduce extra
1158 | | // memory usage in the table reader, if it allocates extra memory
1159 | | // for indexes. This will allow file descriptor prefetch options
1160 | | // to be set for compaction input files without impacting file
1161 | | // descriptors for the same file used by user queries.
1162 | | // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
1163 | | // for this mode if using a block-based table.
1164 | | // |
1165 | | // Default: false |
1166 | | bool new_table_reader_for_compaction_inputs; |
1167 | | |
1168 | | // If non-zero, we perform bigger reads when doing compaction. If you're |
1169 | | // running RocksDB on spinning disks, you should set this to at least 2MB. |
1170 | | // That way RocksDB's compaction is doing sequential instead of random reads. |
1171 | | // |
1172 | | // When non-zero, we also force new_table_reader_for_compaction_inputs to |
1173 | | // true. |
1174 | | // |
1175 | | // Default: 0 |
1176 | | size_t compaction_readahead_size; |
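 | |
 | | // Illustrative sketch (not part of the original header): configure 2 MB
 | | // compaction readahead for spinning disks; as noted above, this also
 | | // forces new_table_reader_for_compaction_inputs to true:
 | | //
 | | //   rocksdb::DBOptions db_opts;
 | | //   db_opts.compaction_readahead_size = 2 << 20;  // 2 MB sequential reads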
1177 | | |
1178 | | // This is the maximum buffer size that is used by WinMmapReadableFile in
1179 | | // unbuffered disk I/O mode. We need to maintain an aligned buffer for
1180 | | // reads. We allow the buffer to grow until the specified value and then
1181 | | // allocate one-shot buffers for bigger requests. In unbuffered mode we
1182 | | // always bypass the read-ahead buffer at ReadaheadRandomAccessFile.
1183 | | // When read-ahead is required we make use of the compaction_readahead_size
1184 | | // value and always try to read ahead. With read-ahead we always
1185 | | // pre-allocate the buffer to that size instead of growing it up to a limit.
1186 | | //
1187 | | // This option is currently honored only on Windows.
1188 | | //
1189 | | // Default: 1 MB
1190 | | //
1191 | | // Special value: 0 - do not maintain a per-instance buffer. Allocate a
1192 | | // per-request buffer and avoid locking.
1193 | | size_t random_access_max_buffer_size; |
1194 | | |
1195 | | // This is the maximum buffer size that is used by WritableFileWriter. |
1196 | | // On Windows, we need to maintain an aligned buffer for writes. |
1197 | | // We allow the buffer to grow until its size hits the limit.
1198 | | // |
1199 | | // Default: 1024 * 1024 (1 MB) |
1200 | | size_t writable_file_max_buffer_size; |
1201 | | |
1202 | | |
1203 | | // Use adaptive mutex, which spins in the user space before resorting |
1204 | | // to kernel. This could reduce context switch when the mutex is not |
1205 | | // heavily contended. However, if the mutex is hot, we could end up |
1206 | | // wasting spin time. |
1207 | | // Default: false |
1208 | | bool use_adaptive_mutex; |
1209 | | |
1210 | | // Create DBOptions with default values for all fields |
1211 | | DBOptions(); |
1212 | | |
1213 | | void Dump(Logger* log) const; |
1214 | | |
1215 | | // Allows OS to incrementally sync files to disk while they are being |
1216 | | // written, asynchronously, in the background. This operation can be used |
1217 | | // to smooth out write I/Os over time. Users shouldn't rely on it for
1218 | | // persistence guarantees.
1219 | | // Issue one request for every bytes_per_sync written. 0 turns it off.
1220 | | // Default: 0
1221 | | //
1222 | | // You may consider using rate_limiter to regulate the write rate to the
1223 | | // device. When the rate limiter is enabled, it automatically sets
1224 | | // bytes_per_sync to 1MB.
1225 | | // |
1226 | | // This option applies to table files |
1227 | | uint64_t bytes_per_sync; |
1228 | | |
1229 | | // Same as bytes_per_sync, but applies to WAL files |
1230 | | // Default: 0, turned off |
1231 | | uint64_t wal_bytes_per_sync; |
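 | |
 | | // Illustrative sketch (not part of the original header): sync table and
 | | // WAL files in the background after every 1 MB written, to smooth out
 | | // write I/O:
 | | //
 | | //   rocksdb::DBOptions db_opts;
 | | //   db_opts.bytes_per_sync = 1 << 20;      // table files
 | | //   db_opts.wal_bytes_per_sync = 1 << 20;  // WAL files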
1232 | | |
1233 | | // A vector of EventListeners whose callback functions will be called
1234 | | // when specific RocksDB events happen.
1235 | | std::vector<std::shared_ptr<EventListener>> listeners; |
1236 | | |
1237 | | // If true, then the status of the threads involved in this DB will |
1238 | | // be tracked and available via GetThreadList() API. |
1239 | | // |
1240 | | // Default: false |
1241 | | bool enable_thread_tracking; |
1242 | | |
1243 | | // The limited write rate to DB if soft_pending_compaction_bytes_limit or |
1244 | | // level0_slowdown_writes_trigger is triggered, or we are writing to the |
1245 | | // last mem table allowed and we allow more than 3 mem tables. It is |
1246 | | // calculated using size of user write requests before compression. |
1247 | | // RocksDB may decide to slow down writes even more if compaction
1248 | | // falls further behind.
1249 | | // Unit: byte per second. |
1250 | | // |
1251 | | // Default: 2MB/s |
1252 | | uint64_t delayed_write_rate; |
1253 | | |
1254 | | // If true, allow multi-writers to update mem tables in parallel. |
1255 | | // Only some memtable_factory implementations support concurrent writes;
1256 | | // currently it is implemented only for SkipListFactory. Concurrent
1257 | | // memtable writes are not compatible with inplace_update_support or filter_deletes.
1258 | | // It is strongly recommended to set enable_write_thread_adaptive_yield |
1259 | | // if you are going to use this feature. |
1260 | | // |
1261 | | // THIS FEATURE IS NOT STABLE YET. |
1262 | | // |
1263 | | // Default: false |
1264 | | bool allow_concurrent_memtable_write; |
1265 | | |
1266 | | // If true, threads synchronizing with the write batch group leader will |
1267 | | // wait for up to write_thread_max_yield_usec before blocking on a mutex. |
1268 | | // This can substantially improve throughput for concurrent workloads, |
1269 | | // regardless of whether allow_concurrent_memtable_write is enabled. |
1270 | | // |
1271 | | // THIS FEATURE IS NOT STABLE YET. |
1272 | | // |
1273 | | // Default: false |
1274 | | bool enable_write_thread_adaptive_yield; |
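 | |
 | | // Illustrative sketch (not part of the original header): enable parallel
 | | // memtable writes together with adaptive yielding, as recommended above.
 | | // Assumes the SkipListFactory memtable and that inplace_update_support
 | | // and filter_deletes are disabled:
 | | //
 | | //   rocksdb::DBOptions db_opts;
 | | //   db_opts.allow_concurrent_memtable_write = true;
 | | //   db_opts.enable_write_thread_adaptive_yield = true;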
1275 | | |
1276 | | // The maximum number of microseconds that a write operation will use |
1277 | | // a yielding spin loop to coordinate with other write threads before |
1278 | | // blocking on a mutex. (Assuming write_thread_slow_yield_usec is |
1279 | | // set properly) increasing this value is likely to increase RocksDB |
1280 | | // throughput at the expense of increased CPU usage. |
1281 | | // |
1282 | | // Default: 100 |
1283 | | uint64_t write_thread_max_yield_usec; |
1284 | | |
1285 | | // The latency in microseconds after which a std::this_thread::yield |
1286 | | // call (sched_yield on Linux) is considered to be a signal that |
1287 | | // other processes or threads would like to use the current core. |
1288 | | // Increasing this makes writer threads more likely to take CPU |
1289 | | // by spinning, which will show up as an increase in the number of |
1290 | | // involuntary context switches. |
1291 | | // |
1292 | | // Default: 3 |
1293 | | uint64_t write_thread_slow_yield_usec; |
1294 | | |
1295 | | // If true, then DB::Open() will not update the statistics used to optimize |
1296 | | // compaction decision by loading table properties from many files. |
1297 | | // Turning off this stats update (i.e. setting this flag to true) will
1298 | | // improve DB::Open time, especially in disk-bound environments.
1299 | | // |
1300 | | // Default: false |
1301 | | bool skip_stats_update_on_db_open; |
1302 | | |
1303 | | // Recovery mode to control the consistency while replaying WAL |
1304 | | // Default: kTolerateCorruptedTailRecords |
1305 | | WALRecoveryMode wal_recovery_mode; |
1306 | | |
1307 | | // A global cache for table-level rows. |
1308 | | // Default: nullptr (disabled) |
1309 | | // Not supported in ROCKSDB_LITE mode! |
1310 | | std::shared_ptr<Cache> row_cache; |
1311 | | |
1312 | | #ifndef ROCKSDB_LITE |
1313 | | // A filter object supplied to be invoked while processing write-ahead-logs |
1314 | | // (WALs) during recovery. The filter provides a way to inspect log |
1315 | | // records, ignoring a particular record or skipping replay. |
1316 | | // The filter is invoked at startup and is currently invoked from a
1317 | | // single thread.
1318 | | const WalFilter* wal_filter; |
1319 | | #endif // ROCKSDB_LITE |
1320 | | |
1321 | | // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
1322 | | // / SetOptions will fail if the options file is not detected or not
1323 | | // properly persisted.
1324 | | // |
1325 | | // DEFAULT: false |
1326 | | bool fail_if_options_file_error; |
1327 | | |
1328 | | // Initial value for seqno generator. |
1329 | | // Used only during creation of new DB. |
1330 | | SequenceNumber initial_seqno = 0; |
1331 | | |
1332 | | // The boundary extractor retrieves user-defined values for a record.
1333 | | // It also decodes those values when the metafile is loaded.
1334 | | std::shared_ptr<BoundaryValuesExtractor> boundary_extractor; |
1335 | | |
1336 | | // Function that returns the max file size for compaction.
1337 | | // Supported only for level 0 of universal-style compactions.
1338 | | std::shared_ptr<std::function<uint64_t()>> max_file_size_for_compaction; |
1339 | | |
1340 | | // Invoked after the memtable is switched.
1341 | | std::shared_ptr<std::function<MemTableFilter()>> mem_table_flush_filter_factory; |
1342 | | |
1343 | | // A prefix for log messages, usually containing the tablet id. |
1344 | | std::string log_prefix; |
1345 | | |
1346 | | // The root mem tracker for this RocksDB instance.
1347 | | std::shared_ptr<yb::MemTracker> mem_tracker; |
1348 | | |
1349 | | // Specific mem tracker for block based tables created by this RocksDB instance. |
1350 | | std::shared_ptr<yb::MemTracker> block_based_table_mem_tracker; |
1351 | | |
1352 | | // Adds the ability to modify the iterator created for an SST file.
1353 | | // For instance, additional filtering could be added.
1354 | | std::shared_ptr<IteratorReplacer> iterator_replacer; |
1355 | | |
1356 | | // Creates file filters that directly exclude files during compaction, resulting |
1357 | | // in their direct deletion without inspection. |
1358 | | // The filters are currently used to expire files in time-series DBs that have |
1359 | | // completely expired based on their table and/or column TTL. |
1360 | | std::shared_ptr<CompactionFileFilterFactory> compaction_file_filter_factory; |
1361 | | }; |
1362 | | |
1363 | | // Options to control the behavior of a database (passed to DB::Open) |
1364 | | struct Options : public DBOptions, public ColumnFamilyOptions { |
1365 | | // Create an Options object with default values for all fields. |
1366 | 616k | Options() : DBOptions(), ColumnFamilyOptions() {} |
1367 | | |
1368 | | Options(const DBOptions& db_options, |
1369 | | const ColumnFamilyOptions& column_family_options) |
1370 | 1.30M | : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {} |
1371 | | |
1372 | | void Dump(Logger* log) const; |
1373 | | |
1374 | | void DumpCFOptions(Logger* log) const; |
1375 | | |
1376 | | // Set appropriate parameters for bulk loading. |
1377 | | // The reason that this is a function that returns "this" instead of a |
1378 | | // constructor is to enable chaining of multiple similar calls in the future. |
1379 | | //
1380 | | // All data will be in level 0 without any automatic compaction.
1381 | | // It's recommended to manually call CompactRange(NULL, NULL) before
1382 | | // reading from the database, because otherwise reads can be very slow.
1383 | | // See the usage sketch after this struct.
1384 | | Options* PrepareForBulkLoad(); |
1385 | | }; |
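 | |
 | | // Illustrative sketch (not part of the original header; path is
 | | // hypothetical and error handling is omitted): bulk-load a fresh DB,
 | | // then compact the full range before serving reads, per the
 | | // PrepareForBulkLoad() comment above:
 | | //
 | | //   rocksdb::Options options;
 | | //   options.PrepareForBulkLoad();
 | | //   rocksdb::DB* db = nullptr;
 | | //   rocksdb::DB::Open(options, "/path/to/db", &db);
 | | //   // ... issue the bulk of the writes ...
 | | //   db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
 | | //   delete db;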
1386 | | |
1387 | | // |
1388 | | // An application can issue a read request (via Get/Iterators) and specify |
1389 | | // if that read should process data that ALREADY resides on a specified cache |
1390 | | // level. For example, if an application specifies kBlockCacheTier, then
1391 | | // the Get call will only process data that already resides in the
1392 | | // memtable or the block cache. It will not page in data from the OS
1393 | | // cache or data that resides in storage.
1394 | | enum ReadTier { |
1395 | | kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage |
1396 | | kBlockCacheTier = 0x1, // data in memtable or block cache |
1397 | | kPersistedTier = 0x2 // persisted data. When WAL is disabled, this option |
1398 | | // will skip data in memtable. |
1399 | | // Note that this ReadTier currently only supports |
1400 | | // Get and MultiGet and does not support iterators. |
1401 | | }; |
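 | |
 | | // Illustrative sketch (not part of the original header): a Get() that
 | | // only consults the memtable and block cache; if the data is not already
 | | // resident there, the read comes back with an "incomplete" status instead
 | | // of touching the OS cache or storage:
 | | //
 | | //   rocksdb::ReadOptions read_opts;
 | | //   read_opts.read_tier = rocksdb::kBlockCacheTier;
 | | //   std::string value;
 | | //   auto s = db->Get(read_opts, "key", &value);  // db is an open DB*
 | | //   if (s.IsIncomplete()) { /* data was not cached */ }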
1402 | | |
1403 | | struct FdWithBoundaries; |
1404 | | class ReadFileFilter { |
1405 | | public: |
1406 | | virtual bool Filter(const FdWithBoundaries&) const = 0; |
1407 | | |
1408 | | protected: |
1409 | 19.0M | virtual ~ReadFileFilter() {} |
1410 | | }; |
1411 | | |
1412 | | class TableReader; |
1413 | | class TableAwareReadFileFilter { |
1414 | | public: |
1415 | | virtual bool Filter(TableReader*) const = 0; |
1416 | | |
1417 | | protected: |
1418 | 21.8M | virtual ~TableAwareReadFileFilter() {} |
1419 | | }; |
1420 | | |
1421 | | // Options that control read operations |
1422 | | struct ReadOptions { |
1423 | | // If true, all data read from underlying storage will be |
1424 | | // verified against corresponding checksums. |
1425 | | // Default: true |
1426 | | bool verify_checksums; |
1427 | | |
1428 | | // Should the "data block"/"index block"/"filter block" read for this |
1429 | | // iteration be cached in memory? |
1430 | | // Callers may wish to set this field to false for bulk scans. |
1431 | | // Default: true |
1432 | | bool fill_cache; |
1433 | | |
1434 | | // If this option is set and memtable implementation allows, Seek |
1435 | | // might only return keys with the same prefix as the seek-key |
1436 | | // |
1437 | | // ! DEPRECATED: prefix_seek is on by default when prefix_extractor |
1438 | | // is configured |
1439 | | // bool prefix_seek; |
1440 | | |
1441 | | // If "snapshot" is non-nullptr, read as of the supplied snapshot |
1442 | | // (which must belong to the DB that is being read and which must |
1443 | | // not have been released). If "snapshot" is nullptr, use an implicit |
1444 | | // snapshot of the state at the beginning of this read operation. |
1445 | | // Default: nullptr |
1446 | | const Snapshot* snapshot; |
1447 | | |
1448 | | // If "prefix" is non-nullptr, and ReadOptions is being passed to |
1449 | | // db.NewIterator, only return results when the key begins with this |
1450 | | // prefix. This field is ignored by other calls (e.g., Get). |
1451 | | // Options.prefix_extractor must also be set, and |
1452 | | // prefix_extractor.InRange(prefix) must be true. The iterator |
1453 | | // returned by NewIterator when this option is set will behave just |
1454 | | // as if the underlying store did not contain any non-matching keys, |
1455 | | // with two exceptions. Seek() only accepts keys starting with the |
1456 | | // prefix, and SeekToLast() is not supported. prefix filter with this |
1457 | | // option will sometimes reduce the number of read IOPs. |
1458 | | // Default: nullptr |
1459 | | // |
1460 | | // ! DEPRECATED |
1461 | | // const Slice* prefix; |
1462 | | |
1463 | | // "iterate_upper_bound" defines the extent up to which the forward
1464 | | // iterator can return entries. Once the bound is reached, Valid() will
1465 | | // be false. "iterate_upper_bound" is exclusive, i.e. the bound value is
1466 | | // not a valid entry. If prefix_extractor is not null, the Seek target
1467 | | // and iterate_upper_bound need to have the same prefix.
1468 | | // This is because ordering is not guaranteed outside of the prefix
1469 | | // domain. There is no lower bound on the iterator. If needed, that can
1470 | | // be easily implemented.
1471 | | // |
1472 | | // Default: nullptr |
1473 | | const Slice* iterate_upper_bound; |
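 | |
 | | // Illustrative sketch (not part of the original header): iterate over
 | | // keys in ["k1", "k2") only. The Slice passed as the bound must outlive
 | | // the iterator:
 | | //
 | | //   rocksdb::Slice upper("k2");
 | | //   rocksdb::ReadOptions read_opts;
 | | //   read_opts.iterate_upper_bound = &upper;
 | | //   std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_opts));
 | | //   for (it->Seek("k1"); it->Valid(); it->Next()) { /* ... */ }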
1474 | | |
1475 | | // Specify if this read request should process data that ALREADY |
1476 | | // resides on a particular cache. If the required data is not |
1477 | | // found at the specified cache, then Status::Incomplete is returned. |
1478 | | // Default: kReadAllTier |
1479 | | ReadTier read_tier; |
1480 | | |
1481 | | // Specify to create a tailing iterator -- a special iterator that has a |
1482 | | // view of the complete database (i.e. it can also be used to read newly |
1483 | | // added data) and is optimized for sequential reads. It will return records |
1484 | | // that were inserted into the database after the creation of the iterator. |
1485 | | // Default: false |
1486 | | // Not supported in ROCKSDB_LITE mode! |
1487 | | bool tailing; |
1488 | | |
1489 | | // Specify to create a managed iterator -- a special iterator that |
1490 | | // uses less resources by having the ability to free its underlying |
1491 | | // resources on request. |
1492 | | // Default: false |
1493 | | // Not supported in ROCKSDB_LITE mode! |
1494 | | bool managed; |
1495 | | |
1496 | | // Enable a total order seek regardless of index format (e.g. hash index) |
1497 | | // used in the table. Some table formats (e.g. plain table) may not
1498 | | // support this option.
1499 | | bool total_order_seek; |
1500 | | |
1501 | | // Enforce that the iterator only iterates over the same prefix as the seek. |
1502 | | // This option is effective only for prefix seeks, i.e. prefix_extractor is |
1503 | | // non-null for the column family and total_order_seek is false. Unlike |
1504 | | // iterate_upper_bound, prefix_same_as_start only works within a prefix |
1505 | | // but in both directions. |
1506 | | // Default: false |
1507 | | bool prefix_same_as_start; |
1508 | | |
1509 | | // Keep the blocks loaded by the iterator pinned in memory as long as
1510 | | // the iterator is not deleted. If used when reading from tables created
1511 | | // with BlockBasedTableOptions::use_delta_encoding = false, the
1512 | | // iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed
1513 | | // to return 1.
1514 | | // Default: false |
1515 | | bool pin_data; |
1516 | | |
1517 | | // Query id designated for the read. |
1518 | | QueryId query_id = kDefaultQueryId; |
1519 | | |
1520 | | // Filter for pruning SST files. A RocksDB user can provide their own implementation to
1521 | | // exclude SST files from being added to MergeIterator. By default, no files are filtered.
1522 | | std::shared_ptr<TableAwareReadFileFilter> table_aware_file_filter; |
1523 | | |
1524 | | std::shared_ptr<ReadFileFilter> file_filter; |
1525 | | |
1526 | | static const ReadOptions kDefault; |
1527 | | |
1528 | | ReadOptions(); |
1529 | | ReadOptions(bool cksum, bool cache); |
1530 | | }; |
1531 | | |
1532 | | // Options that control write operations |
1533 | | struct WriteOptions { |
1534 | | // If true, the write will be flushed from the operating system |
1535 | | // buffer cache (by calling WritableFile::Sync()) before the write |
1536 | | // is considered complete. If this flag is true, writes will be |
1537 | | // slower. |
1538 | | // |
1539 | | // If this flag is false, and the machine crashes, some recent |
1540 | | // writes may be lost. Note that if it is just the process that |
1541 | | // crashes (i.e., the machine does not reboot), no writes will be |
1542 | | // lost even if sync==false. |
1543 | | // |
1544 | | // In other words, a DB write with sync==false has similar |
1545 | | // crash semantics as the "write()" system call. A DB write |
1546 | | // with sync==true has similar crash semantics to a "write()" |
1547 | | // system call followed by "fdatasync()". |
1548 | | // |
1549 | | // Default: false |
1550 | | bool sync; |
1551 | | |
1552 | | // If true, writes will not first go to the write-ahead log,
1553 | | // and the write may be lost after a crash.
1554 | | bool disableWAL; |
1555 | | |
1556 | | // The option is deprecated. It's not used anymore. |
1557 | | uint64_t timeout_hint_us; |
1558 | | |
1559 | | // If true and if user is trying to write to column families that don't exist |
1560 | | // (they were dropped), ignore the write (don't return an error). If there |
1561 | | // are multiple writes in a WriteBatch, other writes will succeed. |
1562 | | // Default: false |
1563 | | bool ignore_missing_column_families; |
1564 | | |
1565 | | WriteOptions() |
1566 | | : sync(false), |
1567 | | disableWAL(false), |
1568 | | timeout_hint_us(0), |
1569 | 27.8M | ignore_missing_column_families(false) {} |
1570 | | }; |
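 | |
 | | // Illustrative sketch (not part of the original header): contrast the
 | | // default write()-like semantics with a synced write that survives a
 | | // machine crash at the cost of extra latency:
 | | //
 | | //   rocksdb::WriteOptions write_opts;  // sync == false by default
 | | //   db->Put(write_opts, "k1", "v1");   // like write(2)
 | | //   write_opts.sync = true;
 | | //   db->Put(write_opts, "k2", "v2");   // like write(2) + fdatasync(2)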
1571 | | |
1572 | | // On each call, the returned value is incremented by 1.
1573 | | // Could be used to track whether one action happened before another. |
1574 | | int64_t FlushTick(); |
1575 | | |
1576 | | // Options that control flush operations |
1577 | | struct FlushOptions { |
1578 | | // If true, Flush() will wait until the flush is done.
1579 | | // Default: true |
1580 | | bool wait = true; |
1581 | | |
1582 | | static constexpr int64_t kNeverIgnore = std::numeric_limits<int64_t>::max(); |
1583 | | |
1584 | | int64_t ignore_if_flushed_after_tick = kNeverIgnore; |
1585 | | }; |
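 | |
 | | // Illustrative sketch (not part of the original header; the semantics of
 | | // ignore_if_flushed_after_tick are inferred from the field name): skip a
 | | // flush that became redundant because another flush completed after the
 | | // tick was taken:
 | | //
 | | //   const int64_t tick = rocksdb::FlushTick();
 | | //   // ... another code path may flush the memtable here ...
 | | //   rocksdb::FlushOptions flush_opts;
 | | //   flush_opts.ignore_if_flushed_after_tick = tick;
 | | //   db->Flush(flush_opts);  // no-op if a flush already happened after tick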
1586 | | |
1587 | | // Get options based on some guidelines. Currently this only tunes parameters
1588 | | // based on flush/compaction behavior and fills in defaults for the rest.
1589 | | // total_write_buffer_limit: budget for memory spent on mem tables.
1590 | | // read_amplification_threshold: comfortable value of read amplification |
1591 | | // write_amplification_threshold: comfortable value of write amplification. |
1592 | | // target_db_size: estimated total DB size. |
1593 | | extern Options GetOptions(size_t total_write_buffer_limit, |
1594 | | int read_amplification_threshold = 8, |
1595 | | int write_amplification_threshold = 32, |
1596 | | uint64_t target_db_size = 68719476736 /* 64GB */); |
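 | |
 | | // Illustrative sketch (not part of the original header; the budget is
 | | // hypothetical): derive options for a 512 MB memtable budget, keeping
 | | // the default amplification thresholds and target DB size:
 | | //
 | | //   rocksdb::Options options = rocksdb::GetOptions(512 << 20);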
1597 | | |
1598 | | // Create a Logger from provided DBOptions |
1599 | | extern Status CreateLoggerFromOptions(const std::string& dbname, |
1600 | | const DBOptions& options, |
1601 | | std::shared_ptr<Logger>* logger); |
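 | |
 | | // Illustrative sketch (not part of the original header; the path is
 | | // hypothetical, and wiring the logger into DBOptions::info_log assumes
 | | // that field exists as in upstream RocksDB):
 | | //
 | | //   rocksdb::DBOptions db_opts;
 | | //   std::shared_ptr<rocksdb::Logger> logger;
 | | //   auto s = rocksdb::CreateLoggerFromOptions("/path/to/db", db_opts,
 | | //                                             &logger);
 | | //   if (s.ok()) db_opts.info_log = logger;  // reuse for DB::Open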
1602 | | |
1603 | | // CompactionOptions are used in CompactFiles() call. |
1604 | | struct CompactionOptions { |
1605 | | // Compaction output compression type |
1606 | | // Default: snappy |
1607 | | CompressionType compression; |
1608 | | // Compaction will create files of size `output_file_size_limit`. |
1609 | | // Default: MAX, which means that compaction will create a single file |
1610 | | uint64_t output_file_size_limit; |
1611 | | |
1612 | | CompactionOptions() |
1613 | | : compression(kSnappyCompression), |
1614 | 45 | output_file_size_limit(std::numeric_limits<uint64_t>::max()) {} |
1615 | | }; |
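 | |
 | | // Illustrative sketch (not part of the original header; the file name is
 | | // hypothetical and the DB::CompactFiles(options, files, output_level)
 | | // signature is assumed from upstream RocksDB): merge selected files into
 | | // a single uncompressed output at level 1:
 | | //
 | | //   rocksdb::CompactionOptions copts;
 | | //   copts.compression = rocksdb::kNoCompression;
 | | //   db->CompactFiles(copts, {"000123.sst"}, /* output_level = */ 1);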
1616 | | |
1617 | | // For level based compaction, we can configure if we want to skip/force |
1618 | | // bottommost level compaction. |
1619 | | enum class BottommostLevelCompaction { |
1620 | | // Skip bottommost level compaction |
1621 | | kSkip, |
1622 | | // Only compact bottommost level if there is a compaction filter |
1623 | | // This is the default option |
1624 | | kIfHaveCompactionFilter, |
1625 | | // Always compact bottommost level |
1626 | | kForce, |
1627 | | }; |
1628 | | |
1629 | | // CompactRangeOptions is used by CompactRange() call. |
1630 | | struct CompactRangeOptions { |
1631 | | // If true, no other compaction will run at the same time as this |
1632 | | // manual compaction |
1633 | | bool exclusive_manual_compaction = true; |
1634 | | // If true, compacted files will be moved to the minimum level capable
1635 | | // of holding the data, or to the given level when target_level is non-negative.
1636 | | bool change_level = false;
1637 | | // If change_level is true and target_level has a non-negative value,
1638 | | // compacted files will be moved to target_level.
1639 | | int target_level = -1;
1640 | | // Compaction outputs will be placed in options.db_paths[target_path_id]. |
1641 | | // Behavior is undefined if target_path_id is out of range. |
1642 | | uint32_t target_path_id = 0; |
1643 | | // By default level based compaction will only compact the bottommost level |
1644 | | // if there is a compaction filter |
1645 | | BottommostLevelCompaction bottommost_level_compaction = |
1646 | | BottommostLevelCompaction::kIfHaveCompactionFilter; |
1647 | | }; |
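 | |
 | | // Illustrative sketch (not part of the original header): exclusively
 | | // compact the entire key range, force the bottommost level to be
 | | // rewritten, and place the result at level 2:
 | | //
 | | //   rocksdb::CompactRangeOptions cr_opts;
 | | //   cr_opts.change_level = true;
 | | //   cr_opts.target_level = 2;
 | | //   cr_opts.bottommost_level_compaction =
 | | //       rocksdb::BottommostLevelCompaction::kForce;
 | | //   db->CompactRange(cr_opts, nullptr, nullptr);  // nullptr = open-ended
 | |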
1648 | | } // namespace rocksdb |
1649 | | |
1650 | | #endif // YB_ROCKSDB_OPTIONS_H |