/Users/deen/code/yugabyte-db/src/yb/rocksdb/util/options_builder.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | |
21 | | #include <cmath> |
22 | | |
23 | | #include "yb/rocksdb/options.h" |
24 | | |
25 | | namespace rocksdb { |
26 | | |
27 | | namespace { |
28 | | |
// Size ratio between adjacent levels. For now this is always 10.
constexpr int kBytesForLevelMultiplier = 10;
// Number of bytes in one mebibyte; used as the rounding unit for sizes below.
constexpr size_t kBytesForOneMb = 1024 * 1024;
32 | | |
33 | | // Pick compaction style |
34 | | CompactionStyle PickCompactionStyle(size_t write_buffer_size, |
35 | | int read_amp_threshold, |
36 | | int write_amp_threshold, |
37 | 5 | uint64_t target_db_size) { |
38 | 5 | #ifndef ROCKSDB_LITE |
39 | | // Estimate read amplification and write amplification of two compaction |
40 | | // styles. If there is hard limit to force a choice, make the choice. |
41 | | // Otherwise, calculate a score based on threshold and expected value of |
42 | | // two styles, weighing reads 4X important than writes. |
43 | 5 | int expected_levels = static_cast<int>(ceil( |
44 | 5 | std::log(target_db_size / write_buffer_size) / std::log(kBytesForLevelMultiplier))); |
45 | | |
46 | 5 | int expected_max_files_universal = |
47 | 5 | static_cast<int>(ceil(log2(target_db_size / write_buffer_size))); |
48 | | |
49 | 5 | const int kEstimatedLevel0FilesInLevelStyle = 2; |
50 | | // Estimate write amplification: |
51 | | // (1) 1 for every L0 file |
52 | | // (2) 2 for L1 |
53 | | // (3) kBytesForLevelMultiplier for the last level. It's really hard to |
54 | | // predict. |
55 | | // (3) kBytesForLevelMultiplier for other levels. |
56 | 5 | int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2 |
57 | 5 | + (expected_levels - 2) * kBytesForLevelMultiplier |
58 | 5 | + kBytesForLevelMultiplier; |
59 | 5 | int expected_read_amp_level = |
60 | 5 | kEstimatedLevel0FilesInLevelStyle + expected_levels; |
61 | | |
62 | 5 | int max_read_amp_uni = expected_max_files_universal; |
63 | 5 | if (read_amp_threshold <= max_read_amp_uni) { |
64 | 2 | return kCompactionStyleLevel; |
65 | 3 | } else if (write_amp_threshold <= expected_write_amp_level) { |
66 | 1 | return kCompactionStyleUniversal; |
67 | 1 | } |
68 | | |
69 | 2 | const double kReadWriteWeight = 4; |
70 | | |
71 | 2 | double level_ratio = |
72 | 2 | static_cast<double>(read_amp_threshold) / expected_read_amp_level * |
73 | 2 | kReadWriteWeight + |
74 | 2 | static_cast<double>(write_amp_threshold) / expected_write_amp_level; |
75 | | |
76 | 2 | int expected_write_amp_uni = expected_max_files_universal / 2 + 2; |
77 | 2 | int expected_read_amp_uni = expected_max_files_universal / 2 + 1; |
78 | | |
79 | 2 | double uni_ratio = |
80 | 2 | static_cast<double>(read_amp_threshold) / expected_read_amp_uni * |
81 | 2 | kReadWriteWeight + |
82 | 2 | static_cast<double>(write_amp_threshold) / expected_write_amp_uni; |
83 | | |
84 | 2 | if (level_ratio > uni_ratio) { |
85 | 0 | return kCompactionStyleLevel; |
86 | 2 | } else { |
87 | 2 | return kCompactionStyleUniversal; |
88 | 2 | } |
89 | | #else |
90 | | return kCompactionStyleLevel; |
91 | | #endif // !ROCKSDB_LITE |
92 | 2 | } |
93 | | |
94 | | // Pick mem table size |
95 | 5 | void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) { |
96 | 5 | const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb; |
97 | 5 | const size_t kMinWriteBufferSize = 4 * kBytesForOneMb; |
98 | | |
99 | | // Try to pick up a buffer size between 4MB and 128MB. |
100 | | // And try to pick 4 as the total number of write buffers. |
101 | 5 | size_t write_buffer_size = total_write_buffer_limit / 4; |
102 | 5 | if (write_buffer_size > kMaxWriteBufferSize) { |
103 | 1 | write_buffer_size = kMaxWriteBufferSize; |
104 | 4 | } else if (write_buffer_size < kMinWriteBufferSize) { |
105 | 0 | write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize), |
106 | 0 | total_write_buffer_limit / 2); |
107 | 0 | } |
108 | | |
109 | | // Truncate to multiple of 1MB. |
110 | 5 | if (write_buffer_size % kBytesForOneMb != 0) { |
111 | 0 | write_buffer_size = |
112 | 0 | (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb; |
113 | 0 | } |
114 | | |
115 | 5 | options->write_buffer_size = write_buffer_size; |
116 | 5 | options->max_write_buffer_number = |
117 | 5 | static_cast<int>(total_write_buffer_limit / write_buffer_size); |
118 | 5 | options->min_write_buffer_number_to_merge = 1; |
119 | 5 | } |
120 | | |
121 | | #ifndef ROCKSDB_LITE |
122 | 3 | void OptimizeForUniversal(Options* options) { |
123 | 3 | options->level0_file_num_compaction_trigger = 2; |
124 | 3 | options->level0_slowdown_writes_trigger = 30; |
125 | 3 | options->level0_stop_writes_trigger = 40; |
126 | 3 | options->max_open_files = -1; |
127 | 3 | } |
128 | | #endif |
129 | | |
130 | | // Optimize parameters for level-based compaction |
131 | | void OptimizeForLevel(int read_amplification_threshold, |
132 | | int write_amplification_threshold, |
133 | 2 | uint64_t target_db_size, Options* options) { |
134 | 2 | int expected_levels_one_level0_file = |
135 | 2 | static_cast<int>(ceil(std::log(target_db_size / options->write_buffer_size) / |
136 | 2 | std::log(kBytesForLevelMultiplier))); |
137 | | |
138 | 2 | int level0_stop_writes_trigger = |
139 | 2 | read_amplification_threshold - expected_levels_one_level0_file; |
140 | | |
141 | 2 | const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb; |
142 | 2 | const int kMaxFileNumCompactionTrigger = 4; |
143 | 2 | const int kMinLevel0StopTrigger = 3; |
144 | | |
145 | 2 | int file_num_buffer = static_cast<int>( |
146 | 2 | kInitialLevel0TotalSize / options->write_buffer_size + 1); |
147 | | |
148 | 2 | if (level0_stop_writes_trigger > file_num_buffer) { |
149 | | // Have sufficient room for multiple level 0 files |
150 | | // Try enlarge the buffer up to 1GB |
151 | | |
152 | | // Try to enlarge the buffer up to 1GB, if still have sufficient headroom. |
153 | 0 | file_num_buffer *= |
154 | 0 | 1 << std::max(0, std::min(3, level0_stop_writes_trigger - |
155 | 0 | file_num_buffer - 2)); |
156 | |
|
157 | 0 | options->level0_stop_writes_trigger = level0_stop_writes_trigger; |
158 | 0 | options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2; |
159 | 0 | options->level0_file_num_compaction_trigger = |
160 | 0 | std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2); |
161 | 2 | } else { |
162 | 2 | options->level0_stop_writes_trigger = |
163 | 2 | std::max(kMinLevel0StopTrigger, file_num_buffer); |
164 | 2 | options->level0_slowdown_writes_trigger = |
165 | 2 | options->level0_stop_writes_trigger - 1; |
166 | 2 | options->level0_file_num_compaction_trigger = 1; |
167 | 2 | } |
168 | | |
169 | | // This doesn't consider compaction and overheads of mem tables. But usually |
170 | | // it is in the same order of magnitude. |
171 | 2 | size_t expected_level0_compaction_size = |
172 | 2 | options->level0_file_num_compaction_trigger * options->write_buffer_size; |
173 | | // Enlarge level1 target file size if level0 compaction size is larger. |
174 | 2 | uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb; |
175 | 2 | if (expected_level0_compaction_size > max_bytes_for_level_base) { |
176 | 2 | max_bytes_for_level_base = expected_level0_compaction_size; |
177 | 2 | } |
178 | 2 | options->max_bytes_for_level_base = max_bytes_for_level_base; |
179 | | // Now always set level multiplier to be 10 |
180 | 2 | options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier; |
181 | | |
182 | 2 | const uint64_t kMinFileSize = 2 * kBytesForOneMb; |
183 | | // Allow at least 3-way parallelism for compaction between level 1 and 2. |
184 | 2 | uint64_t max_file_size = max_bytes_for_level_base / 3; |
185 | 2 | if (max_file_size < kMinFileSize) { |
186 | 0 | options->target_file_size_base = kMinFileSize; |
187 | 2 | } else { |
188 | 2 | if (max_file_size % kBytesForOneMb != 0) { |
189 | 2 | max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb; |
190 | 2 | } |
191 | 2 | options->target_file_size_base = max_file_size; |
192 | 2 | } |
193 | | |
194 | | // TODO: consider to tune num_levels too. |
195 | 2 | } |
196 | | |
197 | | } // namespace |
198 | | |
199 | | Options GetOptions(size_t total_write_buffer_limit, |
200 | | int read_amplification_threshold, |
201 | 5 | int write_amplification_threshold, uint64_t target_db_size) { |
202 | 5 | Options options; |
203 | 5 | PickWriteBufferSize(total_write_buffer_limit, &options); |
204 | 5 | size_t write_buffer_size = options.write_buffer_size; |
205 | 5 | options.compaction_style = |
206 | 5 | PickCompactionStyle(write_buffer_size, read_amplification_threshold, |
207 | 5 | write_amplification_threshold, target_db_size); |
208 | 5 | #ifndef ROCKSDB_LITE |
209 | 5 | if (options.compaction_style == kCompactionStyleUniversal) { |
210 | 3 | OptimizeForUniversal(&options); |
211 | 3 | } else { |
212 | | #else |
213 | | { |
214 | | #endif // !ROCKSDB_LITE |
215 | 2 | OptimizeForLevel(read_amplification_threshold, |
216 | 2 | write_amplification_threshold, target_db_size, &options); |
217 | 2 | } |
218 | 5 | return options; |
219 | 5 | } |
220 | | |
221 | | } // namespace rocksdb |