/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/version_set.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | // |
24 | | // The representation of a DBImpl consists of a set of Versions. The |
25 | | // newest version is called "current". Older versions may be kept |
26 | | // around to provide a consistent view to live iterators. |
27 | | // |
28 | | // Each Version keeps track of a set of Table files per level. The |
29 | | // entire set of versions is maintained in a VersionSet. |
30 | | // |
31 | | // Version,VersionSet are thread-compatible, but require external |
32 | | // synchronization on all accesses. |
33 | | |
34 | | #ifndef YB_ROCKSDB_DB_VERSION_SET_H |
35 | | #define YB_ROCKSDB_DB_VERSION_SET_H |
36 | | |
37 | | #pragma once |
38 | | |
39 | | #include <atomic> |
40 | | #include <deque> |
41 | | #include <limits> |
42 | | #include <map> |
43 | | #include <memory> |
44 | | #include <set> |
45 | | #include <string> |
46 | | #include <utility> |
47 | | #include <vector> |
48 | | |
49 | | #include "yb/rocksdb/db/dbformat.h" |
50 | | #include "yb/rocksdb/db/version_builder.h" |
51 | | #include "yb/rocksdb/db/version_edit.h" |
52 | | #include "yb/rocksdb/port/port.h" |
53 | | #include "yb/rocksdb/db/table_cache.h" |
54 | | #include "yb/rocksdb/db/compaction_picker.h" |
55 | | #include "yb/rocksdb/db/column_family.h" |
56 | | #include "yb/rocksdb/db/log_reader.h" |
57 | | #include "yb/rocksdb/db/file_indexer.h" |
58 | | #include "yb/rocksdb/db/write_controller.h" |
59 | | #include "yb/rocksdb/env.h" |
60 | | #include "yb/rocksdb/util/instrumented_mutex.h" |
61 | | |
62 | | namespace rocksdb { |
63 | | |
64 | | namespace log { |
65 | | class Writer; |
66 | | } |
67 | | |
68 | | class Compaction; |
69 | | class InternalIterator; |
70 | | class LogBuffer; |
71 | | class LookupKey; |
72 | | class MemTable; |
73 | | class Version; |
74 | | class VersionSet; |
75 | | class WriteBuffer; |
76 | | class MergeContext; |
77 | | class ColumnFamilyData; |
78 | | class ColumnFamilySet; |
79 | | class TableCache; |
80 | | class MergeIteratorBuilder; |
81 | | class FileNumbersProvider; |
82 | | |
83 | | // Return the smallest index i such that file_level.files[i]->largest >= key. |
84 | | // Return file_level.num_files if there is no such file. |
85 | | // REQUIRES: "file_level.files" contains a sorted list of |
86 | | // non-overlapping files. |
87 | | extern int FindFile(const InternalKeyComparator& icmp, |
88 | | const LevelFilesBrief& file_level, const Slice& key); |
89 | | |
90 | | // Returns true iff some file in "files" overlaps the user key range |
91 | | // [*smallest,*largest]. |
92 | | // smallest==nullptr represents a key smaller than all keys in the DB. |
93 | | // largest==nullptr represents a key largest than all keys in the DB. |
94 | | // REQUIRES: If disjoint_sorted_files, file_level.files[] |
95 | | // contains disjoint ranges in sorted order. |
96 | | extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, |
97 | | bool disjoint_sorted_files, |
98 | | const LevelFilesBrief& file_level, |
99 | | const Slice* smallest_user_key, |
100 | | const Slice* largest_user_key); |
101 | | |
102 | | // Generate LevelFilesBrief from vector<FdWithKeyRange*> |
103 | | // Would copy smallest_key and largest_key data to sequential memory |
104 | | // arena: Arena used to allocate the memory |
105 | | extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, |
106 | | const std::vector<FileMetaData*>& files, |
107 | | Arena* arena); |
108 | | |
109 | | class VersionStorageInfo { |
110 | | public: |
111 | | VersionStorageInfo(const InternalKeyComparatorPtr& internal_comparator, |
112 | | const Comparator* user_comparator, int num_levels, |
113 | | CompactionStyle compaction_style, |
114 | | VersionStorageInfo* src_vstorage); |
115 | | ~VersionStorageInfo(); |
116 | | |
117 | 1.19M | void Reserve(int level, size_t size) { files_[level].reserve(size); } |
118 | | |
119 | | void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); |
120 | | |
121 | | void SetFinalized(); |
122 | | |
123 | | // Update num_non_empty_levels_. |
124 | | void UpdateNumNonEmptyLevels(); |
125 | | |
126 | 774k | void GenerateFileIndexer() { |
127 | 774k | file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); |
128 | 774k | } |
129 | | |
130 | | // Update the accumulated stats from a file-meta. |
131 | | void UpdateAccumulatedStats(FileMetaData* file_meta); |
132 | | |
133 | | // Decrease the current stat form a to-be-delected file-meta |
134 | | void RemoveCurrentStats(FileMetaData* file_meta); |
135 | | |
136 | | void ComputeCompensatedSizes(); |
137 | | |
138 | | // Updates internal structures that keep track of compaction scores |
139 | | // We use compaction scores to figure out which compaction to do next |
140 | | // REQUIRES: db_mutex held!! |
141 | | // TODO find a better way to pass compaction_options_fifo. |
142 | | void ComputeCompactionScore( |
143 | | const MutableCFOptions& mutable_cf_options, |
144 | | const CompactionOptionsFIFO& compaction_options_fifo); |
145 | | |
146 | | // Estimate est_comp_needed_bytes_ |
147 | | void EstimateCompactionBytesNeeded( |
148 | | const MutableCFOptions& mutable_cf_options); |
149 | | |
150 | | // This computes files_marked_for_compaction_ and is called by |
151 | | // ComputeCompactionScore() |
152 | | void ComputeFilesMarkedForCompaction(); |
153 | | |
154 | | // Generate level_files_brief_ from files_ |
155 | | void GenerateLevelFilesBrief(); |
156 | | // Sort all files for this version based on their file size and |
157 | | // record results in files_by_compaction_pri_. The largest files are listed |
158 | | // first. |
159 | | void UpdateFilesByCompactionPri(const MutableCFOptions& mutable_cf_options); |
160 | | |
161 | | void GenerateLevel0NonOverlapping(); |
162 | 23.7k | bool level0_non_overlapping() const { |
163 | 23.7k | return level0_non_overlapping_; |
164 | 23.7k | } |
165 | | |
166 | | int MaxInputLevel() const; |
167 | | |
168 | | // Return level number that has idx'th highest score |
169 | 47.7k | int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } |
170 | | |
171 | | // Return idx'th highest score |
172 | 1.42M | double CompactionScore(int idx) const { return compaction_score_[idx]; } |
173 | | |
174 | | void GetOverlappingInputs( |
175 | | int level, |
176 | | const InternalKey* begin, // nullptr means before all keys |
177 | | const InternalKey* end, // nullptr means after all keys |
178 | | std::vector<FileMetaData*>* inputs, |
179 | | int hint_index = -1, // index of overlap file |
180 | | int* file_index = nullptr, // return index of overlap file |
181 | | bool expand_range = true) // if set, returns files which overlap the |
182 | | const; // range and overlap each other. If false, |
183 | | // then just files intersecting the range |
184 | | |
185 | | void GetOverlappingInputsBinarySearch( |
186 | | int level, |
187 | | const Slice& begin, // nullptr means before all keys |
188 | | const Slice& end, // nullptr means after all keys |
189 | | std::vector<FileMetaData*>* inputs, |
190 | | int hint_index, // index of overlap file |
191 | | int* file_index) const; // return index of overlap file |
192 | | |
193 | | void ExtendOverlappingInputs( |
194 | | int level, |
195 | | const Slice& begin, // nullptr means before all keys |
196 | | const Slice& end, // nullptr means after all keys |
197 | | std::vector<FileMetaData*>* inputs, |
198 | | unsigned int index) const; // start extending from this index |
199 | | |
200 | | // Returns true iff some file in the specified level overlaps |
201 | | // some part of [*smallest_user_key,*largest_user_key]. |
202 | | // smallest_user_key==NULL represents a key smaller than all keys in the DB. |
203 | | // largest_user_key==NULL represents a key largest than all keys in the DB. |
204 | | bool OverlapInLevel(int level, const Slice* smallest_user_key, |
205 | | const Slice* largest_user_key); |
206 | | |
207 | | // Returns true iff the first or last file in inputs contains |
208 | | // an overlapping user key to the file "just outside" of it (i.e. |
209 | | // just after the last file, or just before the first file) |
210 | | // REQUIRES: "*inputs" is a sorted list of non-overlapping files |
211 | | bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs, |
212 | | int level); |
213 | | |
214 | 40.7M | int num_levels() const { return num_levels_; } |
215 | | |
216 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
217 | 49.5M | int num_non_empty_levels() const { |
218 | 49.5M | assert(finalized_); |
219 | 0 | return num_non_empty_levels_; |
220 | 49.5M | } |
221 | | |
222 | | // REQUIRES: This version has been finalized. |
223 | | // (CalculateBaseBytes() is called) |
224 | | // This may or may not return number of level files. It is to keep backward |
225 | | // compatible behavior in universal compaction. |
226 | 3.72M | int l0_delay_trigger_count() const { return l0_delay_trigger_count_; } |
227 | | |
228 | 1.21M | void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; } |
229 | | |
230 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
231 | 438k | int NumLevelFiles(int level) const { |
232 | 438k | assert(finalized_); |
233 | 0 | return static_cast<int>(files_[level].size()); |
234 | 438k | } |
235 | | |
236 | | uint64_t NumFiles() const; |
237 | | |
238 | | // Return the combined file size of all files at the specified level. |
239 | | uint64_t NumLevelBytes(int level) const; |
240 | | |
241 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
242 | 55.9M | const std::vector<FileMetaData*>& LevelFiles(int level) const { |
243 | 55.9M | return files_[level]; |
244 | 55.9M | } |
245 | | |
246 | 52.0M | const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { |
247 | 52.0M | assert(level < static_cast<int>(level_files_brief_.size())); |
248 | 0 | return level_files_brief_[level]; |
249 | 52.0M | } |
250 | | |
251 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
252 | 13.9k | const std::vector<int>& FilesByCompactionPri(int level) const { |
253 | 13.9k | assert(finalized_); |
254 | 0 | return files_by_compaction_pri_[level]; |
255 | 13.9k | } |
256 | | |
257 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
258 | | // REQUIRES: DB mutex held during access |
259 | | const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction() |
260 | 93.2k | const { |
261 | 93.2k | assert(finalized_); |
262 | 0 | return files_marked_for_compaction_; |
263 | 93.2k | } |
264 | | |
265 | 745k | int base_level() const { return base_level_; } |
266 | | |
267 | | // REQUIRES: lock is held |
268 | | // Set the index that is used to offset into files_by_compaction_pri_ to find |
269 | | // the next compaction candidate file. |
270 | 13.9k | void SetNextCompactionIndex(int level, int index) { |
271 | 13.9k | next_file_to_compact_by_size_[level] = index; |
272 | 13.9k | } |
273 | | |
274 | | // REQUIRES: lock is held |
275 | 13.9k | int NextCompactionIndex(int level) const { |
276 | 13.9k | return next_file_to_compact_by_size_[level]; |
277 | 13.9k | } |
278 | | |
279 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
280 | 6.07k | const FileIndexer& file_indexer() const { |
281 | 6.07k | assert(finalized_); |
282 | 0 | return file_indexer_; |
283 | 6.07k | } |
284 | | |
285 | | // Only the first few entries of files_by_compaction_pri_ are sorted. |
286 | | // There is no need to sort all the files because it is likely |
287 | | // that on a running system, we need to look at only the first |
288 | | // few largest files because a new version is created every few |
289 | | // seconds/minutes (because of concurrent compactions). |
290 | | static const size_t kNumberFilesToSort = 50; |
291 | | |
292 | | // Return a human-readable short (single-line) summary of the number |
293 | | // of files per level. Uses *scratch as backing store. |
294 | | struct LevelSummaryStorage { |
295 | | char buffer[1000]; |
296 | | }; |
297 | | struct FileSummaryStorage { |
298 | | char buffer[3000]; |
299 | | }; |
300 | | const char* LevelSummary(LevelSummaryStorage* scratch) const; |
301 | | // Return a human-readable short (single-line) summary of files |
302 | | // in a specified level. Uses *scratch as backing store. |
303 | | const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; |
304 | | |
305 | | // Return the maximum overlapping data (in bytes) at next level for any |
306 | | // file at a level >= 1. |
307 | | int64_t MaxNextLevelOverlappingBytes(); |
308 | | |
309 | 773k | uint64_t GetAverageValueSize() const { |
310 | 773k | if (accumulated_num_non_deletions_ == 0) { |
311 | 696k | return 0; |
312 | 696k | } |
313 | 77.5k | assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); |
314 | 0 | assert(accumulated_file_size_ > 0); |
315 | 0 | return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * |
316 | 77.5k | accumulated_file_size_ / |
317 | 77.5k | (accumulated_raw_key_size_ + accumulated_raw_value_size_); |
318 | 773k | } |
319 | | |
320 | | uint64_t GetEstimatedActiveKeys() const; |
321 | | |
322 | | // re-initializes the index that is used to offset into |
323 | | // files_by_compaction_pri_ |
324 | | // to find the next compaction candidate file. |
325 | 69 | void ResetNextCompactionIndex(int level) { |
326 | 69 | next_file_to_compact_by_size_[level] = 0; |
327 | 69 | } |
328 | | |
329 | 2.71M | const InternalKeyComparatorPtr& InternalComparator() { |
330 | 2.71M | return internal_comparator_; |
331 | 2.71M | } |
332 | | |
333 | | // Returns maximum total bytes of data on a given level. |
334 | | uint64_t MaxBytesForLevel(int level) const; |
335 | | |
336 | | // Must be called after any change to MutableCFOptions. |
337 | | void CalculateBaseBytes(const ImmutableCFOptions& ioptions, |
338 | | const MutableCFOptions& options); |
339 | | |
340 | | // Returns an estimate of the amount of live data in bytes. |
341 | | uint64_t EstimateLiveDataSize() const; |
342 | | |
343 | 2.46M | uint64_t estimated_compaction_needed_bytes() const { |
344 | 2.46M | return estimated_compaction_needed_bytes_; |
345 | 2.46M | } |
346 | | |
347 | | void TEST_set_estimated_compaction_needed_bytes(uint64_t v) { |
348 | | estimated_compaction_needed_bytes_ = v; |
349 | | } |
350 | | |
351 | | private: |
352 | | InternalKeyComparatorPtr internal_comparator_; |
353 | | const Comparator* user_comparator_; |
354 | | int num_levels_; // Number of levels |
355 | | int num_non_empty_levels_; // Number of levels. Any level larger than it |
356 | | // is guaranteed to be empty. |
357 | | // Per-level max bytes |
358 | | std::vector<uint64_t> level_max_bytes_; |
359 | | |
360 | | // A short brief metadata of files per level |
361 | | autovector<rocksdb::LevelFilesBrief> level_files_brief_; |
362 | | FileIndexer file_indexer_; |
363 | | Arena arena_; // Used to allocate space for file_levels_ |
364 | | |
365 | | CompactionStyle compaction_style_; |
366 | | |
367 | | // List of files per level, files in each level are arranged |
368 | | // in increasing order of keys |
369 | | std::vector<FileMetaData*>* files_; |
370 | | |
371 | | // Level that L0 data should be compacted to. All levels < base_level_ should |
372 | | // be empty. -1 if it is not level-compaction so it's not applicable. |
373 | | int base_level_; |
374 | | |
375 | | // A list for the same set of files that are stored in files_, |
376 | | // but files in each level are now sorted based on file |
377 | | // size. The file with the largest size is at the front. |
378 | | // This vector stores the index of the file from files_. |
379 | | std::vector<std::vector<int>> files_by_compaction_pri_; |
380 | | |
381 | | // If true, means that files in L0 have keys with non overlapping ranges |
382 | | bool level0_non_overlapping_; |
383 | | |
384 | | // An index into files_by_compaction_pri_ that specifies the first |
385 | | // file that is not yet compacted |
386 | | std::vector<int> next_file_to_compact_by_size_; |
387 | | |
388 | | // Only the first few entries of files_by_compaction_pri_ are sorted. |
389 | | // There is no need to sort all the files because it is likely |
390 | | // that on a running system, we need to look at only the first |
391 | | // few largest files because a new version is created every few |
392 | | // seconds/minutes (because of concurrent compactions). |
393 | | static const size_t number_of_files_to_sort_ = 50; |
394 | | |
395 | | // This vector contains list of files marked for compaction and also not |
396 | | // currently being compacted. It is protected by DB mutex. It is calculated in |
397 | | // ComputeCompactionScore() |
398 | | autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_; |
399 | | |
400 | | // Level that should be compacted next and its compaction score. |
401 | | // Score < 1 means compaction is not strictly needed. These fields |
402 | | // are initialized by Finalize(). |
403 | | // The most critical level to be compacted is listed first |
404 | | // These are used to pick the best compaction level |
405 | | std::vector<double> compaction_score_; |
406 | | std::vector<int> compaction_level_; |
407 | | int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop |
408 | | // for number of L0 files. |
409 | | |
410 | | // the following are the sampled temporary stats. |
411 | | // the current accumulated size of sampled files. |
412 | | uint64_t accumulated_file_size_; |
413 | | // the current accumulated size of all raw keys based on the sampled files. |
414 | | uint64_t accumulated_raw_key_size_; |
415 | | // the current accumulated size of all raw keys based on the sampled files. |
416 | | uint64_t accumulated_raw_value_size_; |
417 | | // total number of non-deletion entries |
418 | | uint64_t accumulated_num_non_deletions_; |
419 | | // total number of deletion entries |
420 | | uint64_t accumulated_num_deletions_; |
421 | | // current number of non_deletion entries |
422 | | uint64_t current_num_non_deletions_; |
423 | | // current number of delection entries |
424 | | uint64_t current_num_deletions_; |
425 | | // current number of file samples |
426 | | uint64_t current_num_samples_; |
427 | | // Estimated bytes needed to be compacted until all levels' size is down to |
428 | | // target sizes. |
429 | | uint64_t estimated_compaction_needed_bytes_; |
430 | | |
431 | | bool finalized_; |
432 | | |
433 | | friend class Version; |
434 | | friend class VersionSet; |
435 | | // No copying allowed |
436 | | VersionStorageInfo(const VersionStorageInfo&) = delete; |
437 | | void operator=(const VersionStorageInfo&) = delete; |
438 | | }; |
439 | | |
440 | | class Version { |
441 | | public: |
442 | | // Append to *iters a sequence of iterators that will |
443 | | // yield the contents of this Version when merged together. |
444 | | // REQUIRES: This version has been saved (see VersionSet::SaveTo) |
445 | | void AddIterators(const ReadOptions&, const EnvOptions& soptions, |
446 | | MergeIteratorBuilder* merger_iter_builder); |
447 | | |
448 | | // Lookup the value for key. If found, store it in *val and |
449 | | // return OK. Else return a non-OK status. |
450 | | // Uses *operands to store merge_operator operations to apply later. |
451 | | // |
452 | | // If the ReadOptions.read_tier is set to do a read-only fetch, then |
453 | | // *value_found will be set to false if it cannot be determined whether |
454 | | // this value exists without doing IO. |
455 | | // |
456 | | // If the key is Deleted, *status will be set to NotFound and |
457 | | // *key_exists will be set to true. |
458 | | // If no key was found, *status will be set to NotFound and |
459 | | // *key_exists will be set to false. |
460 | | // If seq is non-null, *seq will be set to the sequence number found |
461 | | // for the key if a key was found. |
462 | | // |
463 | | // REQUIRES: lock is not held |
464 | | void Get(const ReadOptions&, const LookupKey& key, std::string* val, |
465 | | Status* status, MergeContext* merge_context, |
466 | | bool* value_found = nullptr, bool* key_exists = nullptr, |
467 | | SequenceNumber* seq = nullptr); |
468 | | |
469 | | // Loads some stats information from files. Call without mutex held. It needs |
470 | | // to be called before applying the version to the version set. |
471 | | void PrepareApply(const MutableCFOptions& mutable_cf_options, |
472 | | bool update_stats); |
473 | | |
474 | | // Reference count management (so Versions do not disappear out from |
475 | | // under live iterators) |
476 | | void Ref(); |
477 | | // Decrease reference count. Delete the object if no reference left |
478 | | // and return true. Otherwise, return false. |
479 | | bool Unref(); |
480 | | |
481 | | // Add all files listed in the current version to *live. |
482 | | void AddLiveFiles(std::vector<FileDescriptor>* live); |
483 | | |
484 | | // Return a human readable string that describes this version's contents. |
485 | | std::string DebugString(bool hex = false) const; |
486 | | |
487 | | // Returns the version nuber of this version |
488 | 39.4k | uint64_t GetVersionNumber() const { return version_number_; } |
489 | | |
490 | | // REQUIRES: lock is held |
491 | | // On success, "tp" will contains the table properties of the file |
492 | | // specified in "file_meta". If the file name of "file_meta" is |
493 | | // known ahread, passing it by a non-null "fname" can save a |
494 | | // file-name conversion. |
495 | | Status GetTableProperties(std::shared_ptr<const TableProperties>* tp, |
496 | | const FileMetaData* file_meta, |
497 | | const std::string* fname = nullptr) const; |
498 | | |
499 | | // REQUIRES: lock is held |
500 | | // On success, *props will be populated with all SSTables' table properties. |
501 | | // The keys of `props` are the sst file name, the values of `props` are the |
502 | | // tables' propertis, represented as shared_ptr. |
503 | | Status GetPropertiesOfAllTables(TablePropertiesCollection* props); |
504 | | Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); |
505 | | Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, |
506 | | TablePropertiesCollection* props) const; |
507 | | |
508 | | // REQUIRES: lock is held |
509 | | // On success, "tp" will contains the aggregated table property amoug |
510 | | // the table properties of all sst files in this version. |
511 | | Status GetAggregatedTableProperties( |
512 | | std::shared_ptr<const TableProperties>* tp, int level = -1); |
513 | | |
514 | 0 | uint64_t GetEstimatedActiveKeys() { |
515 | 0 | return storage_info_.GetEstimatedActiveKeys(); |
516 | 0 | } |
517 | | |
518 | | size_t GetMemoryUsageByTableReaders(); |
519 | | |
520 | | // Returns approximate middle key of the largest SST file (see TableReader::GetMiddleKey). |
521 | | // Returns Status(Incomplete) if there are no SST files for this version. |
522 | | Result<std::string> GetMiddleKey(); |
523 | | |
524 | 23.9k | ColumnFamilyData* cfd() const { return cfd_; } |
525 | | |
526 | | // Return the next Version in the linked list. Used for debug only |
527 | 402k | Version* TEST_Next() const { |
528 | 402k | return next_; |
529 | 402k | } |
530 | | |
531 | 78.4M | VersionStorageInfo* storage_info() { return &storage_info_; } |
532 | | |
533 | 797k | VersionSet* version_set() { return vset_; } |
534 | | |
535 | | void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); |
536 | | |
537 | | private: |
538 | | Env* env_; |
539 | | friend class VersionSet; |
540 | | |
541 | 15.5M | const InternalKeyComparatorPtr& internal_comparator() const { |
542 | 15.5M | return storage_info_.internal_comparator_; |
543 | 15.5M | } |
544 | 14.6M | const Comparator* user_comparator() const { |
545 | 14.6M | return storage_info_.user_comparator_; |
546 | 14.6M | } |
547 | | |
548 | | bool PrefixMayMatch(const ReadOptions& read_options, |
549 | | InternalIterator* level_iter, |
550 | | const Slice& internal_prefix) const; |
551 | | |
552 | | // Returns true if the filter blocks in the specified level will not be |
553 | | // checked during read operations. In certain cases (trivial move or preload), |
554 | | // the filter block may already be cached, but we still do not access it such |
555 | | // that it eventually expires from the cache. |
556 | | bool IsFilterSkipped(int level, bool is_file_last_in_level = false); |
557 | | |
558 | | // The helper function of UpdateAccumulatedStats, which may fill the missing |
559 | | // fields of file_mata from its associated TableProperties. |
560 | | // Returns true if it does initialize FileMetaData. |
561 | | bool MaybeInitializeFileMetaData(FileMetaData* file_meta); |
562 | | |
563 | | // Update the accumulated stats associated with the current version. |
564 | | // This accumulated stats will be used in compaction. |
565 | | void UpdateAccumulatedStats(bool update_stats); |
566 | | |
567 | | // Sort all files for this version based on their file size and |
568 | | // record results in files_by_compaction_pri_. The largest files are listed |
569 | | // first. |
570 | | void UpdateFilesByCompactionPri(); |
571 | | |
572 | | ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs |
573 | | Logger* info_log_; |
574 | | Statistics* db_statistics_; |
575 | | TableCache* table_cache_; |
576 | | const MergeOperator* merge_operator_; |
577 | | |
578 | | VersionStorageInfo storage_info_; |
579 | | VersionSet* vset_; // VersionSet to which this Version belongs |
580 | | Version* next_; // Next version in linked list |
581 | | Version* prev_; // Previous version in linked list |
582 | | int refs_; // Number of live refs to this version |
583 | | |
584 | | // A version number that uniquely represents this version. This is |
585 | | // used for debugging and logging purposes only. |
586 | | uint64_t version_number_; |
587 | | |
588 | | Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); |
589 | | |
590 | | ~Version(); |
591 | | |
592 | | // No copying allowed |
593 | | Version(const Version&); |
594 | | void operator=(const Version&); |
595 | | }; |
596 | | |
597 | | typedef boost::intrusive_ptr<Version> VersionPtr; |
598 | | |
599 | 23.6k | inline void intrusive_ptr_add_ref(Version* version) { |
600 | 23.6k | version->Ref(); |
601 | 23.6k | } |
602 | | |
603 | 23.6k | inline void intrusive_ptr_release(Version* version) { |
604 | 23.6k | version->Unref(); |
605 | 23.6k | } |
606 | | |
607 | | class VersionSet { |
608 | | public: |
609 | | static constexpr uint64_t kInitialNextFileNumber = 2; |
610 | | |
611 | | VersionSet(const std::string& dbname, const DBOptions* db_options, |
612 | | const EnvOptions& env_options, Cache* table_cache, |
613 | | WriteBuffer* write_buffer, WriteController* write_controller); |
614 | | ~VersionSet(); |
615 | | |
616 | | // Apply *edit to the current version to form a new descriptor that |
617 | | // is both saved to persistent state and installed as the new |
618 | | // current version. Will release *mu while actually writing to the file. |
619 | | // column_family_options has to be set if edit is column family add |
620 | | // REQUIRES: *mu is held on entry. |
621 | | // REQUIRES: no other thread concurrently calls LogAndApply() |
622 | | Status LogAndApply( |
623 | | ColumnFamilyData* column_family_data, |
624 | | const MutableCFOptions& mutable_cf_options, VersionEdit* edit, |
625 | | InstrumentedMutex* mu, Directory* db_directory = nullptr, |
626 | | bool new_descriptor_log = false, |
627 | | const ColumnFamilyOptions* column_family_options = nullptr); |
628 | | |
629 | | // Recover the last saved descriptor from persistent storage. |
630 | | // If read_only == true, Recover() will not complain if some column families |
631 | | // are not opened |
632 | | Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families, |
633 | | bool read_only = false); |
634 | | |
635 | | // Reads a manifest file and returns a list of column families in |
636 | | // column_families. |
637 | | static Status ListColumnFamilies(std::vector<std::string>* column_families, |
638 | | const std::string& dbname, |
639 | | BoundaryValuesExtractor* extractor, |
640 | | Env* env); |
641 | | |
642 | | #ifndef ROCKSDB_LITE |
643 | | // Try to reduce the number of levels. This call is valid when |
644 | | // only one level from the new max level to the old |
645 | | // max level containing files. |
646 | | // The call is static, since number of levels is immutable during |
647 | | // the lifetime of a RocksDB instance. It reduces number of levels |
648 | | // in a DB by applying changes to manifest. |
649 | | // For example, a db currently has 7 levels [0-6], and a call to |
650 | | // to reduce to 5 [0-4] can only be executed when only one level |
651 | | // among [4-6] contains files. |
652 | | static Status ReduceNumberOfLevels(const std::string& dbname, |
653 | | const Options* options, |
654 | | const EnvOptions& env_options, |
655 | | int new_levels); |
656 | | |
657 | | // printf contents (for debugging) |
658 | | Status DumpManifest(const Options& options, const std::string& manifestFileName, |
659 | | bool verbose, bool hex = false); |
660 | | |
661 | | #endif // ROCKSDB_LITE |
662 | | |
663 | | // Return the current manifest file number |
664 | 1.85M | uint64_t manifest_file_number() const { return manifest_file_number_; } |
665 | | |
666 | 1.85M | uint64_t pending_manifest_file_number() const { |
667 | 1.85M | return pending_manifest_file_number_; |
668 | 1.85M | } |
669 | | |
670 | 0 | uint64_t current_next_file_number() const { return next_file_number_.load(); } |
671 | | |
672 | | // Allocate and return a new file number |
673 | 3.35M | uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } |
674 | | |
675 | | // Return the last sequence number. |
676 | 111M | uint64_t LastSequence() const { |
677 | 111M | return last_sequence_.load(std::memory_order_acquire); |
678 | 111M | } |
679 | | |
680 | 21.4M | UserFrontier* FlushedFrontier() const { |
681 | 21.4M | return flushed_frontier_.get(); |
682 | 21.4M | } |
683 | | |
684 | | // Set the last sequence number to s. |
685 | | void SetLastSequence(SequenceNumber s); |
686 | | |
687 | | // Set last sequence number without verifying that it always keeps increasing. |
688 | | void SetLastSequenceNoSanityChecking(SequenceNumber s); |
689 | | |
690 | | // Attempts to set the last flushed op id / hybrid time / history cutoff to the specified tuple of |
691 | | // values. The current flushed frontier is always updated to the maximum of its current and |
692 | | // supplied values for each dimension. This will DFATAL in case the supplied frontier regresses |
693 | | // relative to the current frontier in any of its dimensions which have non-default (defined) |
694 | | // values. |
695 | | void UpdateFlushedFrontier(UserFrontierPtr values); |
696 | | |
697 | | // Mark the specified file number as used. |
698 | | // REQUIRED: this is only called during single-threaded recovery |
699 | | void MarkFileNumberUsedDuringRecovery(uint64_t number); |
700 | | |
701 | | // Return the log file number for the log file that is currently |
702 | | // being compacted, or zero if there is no such log file. |
703 | 2.32M | uint64_t prev_log_number() const { return prev_log_number_; } |
704 | | |
705 | | // Returns the minimum log number such that all |
706 | | // log numbers less than or equal to it can be deleted |
707 | 4.14M | uint64_t MinLogNumber() const { |
708 | 4.14M | uint64_t min_log_num = std::numeric_limits<uint64_t>::max(); |
709 | 4.74M | for (auto cfd : *column_family_set_) { |
710 | | // It's safe to ignore dropped column families here: |
711 | | // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. |
712 | 4.74M | if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()4.15M ) { |
713 | 4.15M | min_log_num = cfd->GetLogNumber(); |
714 | 4.15M | } |
715 | 4.74M | } |
716 | 4.14M | return min_log_num; |
717 | 4.14M | } |
718 | | |
719 | | // Create an iterator that reads over the compaction inputs for "*c". |
720 | | // The caller should delete the iterator when no longer needed. |
721 | | InternalIterator* MakeInputIterator(Compaction* c); |
722 | | |
723 | | // Add all files listed in any live version to *live. |
724 | | void AddLiveFiles(std::vector<FileDescriptor>* live_list); |
725 | | |
726 | | // Return the approximate size of data to be scanned for range [start, end) |
727 | | // in levels [start_level, end_level). If end_level == 0 it will search |
728 | | // through all non-empty levels |
729 | | uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, |
730 | | int start_level = 0, int end_level = -1); |
731 | | |
732 | | // Return the size of the current manifest file |
733 | 2.09k | uint64_t manifest_file_size() const { return manifest_file_size_; } |
734 | | |
735 | | // verify that the files that we started with for a compaction |
736 | | // still exist in the current version and in the same original level. |
737 | | // This ensures that a concurrent compaction did not erroneously |
738 | | // pick the same files to compact. |
739 | | bool VerifyCompactionFileConsistency(Compaction* c); |
740 | | |
741 | | Status GetMetadataForFile(uint64_t number, int* filelevel, |
742 | | FileMetaData** metadata, ColumnFamilyData** cfd); |
743 | | |
744 | | // This function doesn't support leveldb SST filenames |
745 | | void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata); |
746 | | |
747 | | void GetObsoleteFiles(const FileNumbersProvider& pending_outputs, |
748 | | std::vector<FileMetaData*>* files, |
749 | | std::vector<std::string>* manifest_filenames); |
750 | | |
751 | 42.9M | ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } |
752 | 773k | const EnvOptions& env_options() { return env_options_; } |
753 | | |
754 | | CHECKED_STATUS Import(const std::string& source_dir, SequenceNumber seqno, VersionEdit* edit); |
755 | | |
756 | | void UnrefFile(ColumnFamilyData* cfd, FileMetaData* f); |
757 | | |
758 | | static uint64_t GetNumLiveVersions(Version* dummy_versions); |
759 | | |
760 | | static uint64_t GetTotalSstFilesSize(Version* dummy_versions); |
761 | | |
762 | 295 | bool has_manifest_writers() const { |
763 | 295 | return !manifest_writers_.empty(); |
764 | 295 | } |
765 | | |
766 | | private: |
767 | | struct ManifestWriter; |
768 | | |
769 | | friend class Version; |
770 | | friend class DBImpl; |
771 | | |
772 | | // ApproximateSize helper |
773 | | uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, |
774 | | const Slice& start, const Slice& end); |
775 | | |
776 | | uint64_t ApproximateSize(Version* v, const FdWithBoundaries& f, const Slice& key); |
777 | | |
778 | | // Save current contents to *log |
779 | | Status WriteSnapshot(log::Writer* log, UserFrontierPtr flushed_frontier_override); |
780 | | |
781 | | void AppendVersion(ColumnFamilyData* column_family_data, Version* v); |
782 | | |
783 | | ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
784 | | VersionEdit* edit); |
785 | | |
786 | | static void EnsureNonDecreasingLastSequence( |
787 | | SequenceNumber prev_last_seq, SequenceNumber new_last_seq); |
788 | | static void EnsureNonDecreasingFlushedFrontier( |
789 | | const UserFrontier* prev_value, const UserFrontier& new_value); |
790 | | |
791 | | void UpdateFlushedFrontierNoSanityChecking(UserFrontierPtr values); |
792 | | |
793 | | std::unique_ptr<ColumnFamilySet> column_family_set_; |
794 | | |
795 | | Env* const env_; |
796 | | const std::string dbname_; |
797 | | const DBOptions* const db_options_; |
798 | | std::atomic<uint64_t> next_file_number_ = {kInitialNextFileNumber}; |
799 | | uint64_t manifest_file_number_ = 0; |
800 | | uint64_t pending_manifest_file_number_ = 0; |
801 | | std::atomic<uint64_t> last_sequence_ = {0}; |
802 | | uint64_t prev_log_number_ = 0; // 0 or backing store for memtable being compacted |
803 | | UserFrontierPtr flushed_frontier_; |
804 | | |
805 | | // Opened lazily |
806 | | std::unique_ptr<log::Writer> descriptor_log_; |
807 | | std::string descriptor_log_file_name_; |
808 | | |
809 | | // generates a increasing version number for every new version |
810 | | uint64_t current_version_number_ = 0; |
811 | | |
812 | | // Queue of writers to the manifest file |
813 | | std::deque<ManifestWriter*> manifest_writers_; |
814 | | |
815 | | // Current size of manifest file |
816 | | uint64_t manifest_file_size_ = 0; |
817 | | |
818 | | std::vector<FileMetaData*> obsolete_files_; |
819 | | std::vector<std::string> obsolete_manifests_; |
820 | | |
821 | | // env options for all reads and writes except compactions |
822 | | const EnvOptions& env_options_; |
823 | | |
824 | | // env options used for compactions. This is a copy of |
825 | | // env_options_ but with readaheads set to readahead_compactions_. |
826 | | const EnvOptions env_options_compactions_; |
827 | | |
828 | | // No copying allowed |
829 | | VersionSet(const VersionSet&); |
830 | | void operator=(const VersionSet&); |
831 | | |
832 | | void LogAndApplyCFHelper(VersionEdit* edit); |
833 | | void LogAndApplyHelper( |
834 | | ColumnFamilyData* cfd, |
835 | | VersionBuilder* b, |
836 | | VersionEdit* edit, |
837 | | InstrumentedMutex* mu); |
838 | | }; |
839 | | |
840 | | } // namespace rocksdb |
841 | | |
842 | | #endif // YB_ROCKSDB_DB_VERSION_SET_H |