YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/version_set.h
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under the BSD-style license found in the
3
//  LICENSE file in the root directory of this source tree. An additional grant
4
//  of patent rights can be found in the PATENTS file in the same directory.
5
//
6
// The following only applies to changes made to this file as part of YugaByte development.
7
//
8
// Portions Copyright (c) YugaByte, Inc.
9
//
10
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
11
// in compliance with the License.  You may obtain a copy of the License at
12
//
13
// http://www.apache.org/licenses/LICENSE-2.0
14
//
15
// Unless required by applicable law or agreed to in writing, software distributed under the License
16
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
17
// or implied.  See the License for the specific language governing permissions and limitations
18
// under the License.
19
//
20
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
21
// Use of this source code is governed by a BSD-style license that can be
22
// found in the LICENSE file. See the AUTHORS file for names of contributors.
23
//
24
// The representation of a DBImpl consists of a set of Versions.  The
25
// newest version is called "current".  Older versions may be kept
26
// around to provide a consistent view to live iterators.
27
//
28
// Each Version keeps track of a set of Table files per level.  The
29
// entire set of versions is maintained in a VersionSet.
30
//
31
// Version,VersionSet are thread-compatible, but require external
32
// synchronization on all accesses.
33
34
#ifndef YB_ROCKSDB_DB_VERSION_SET_H
35
#define YB_ROCKSDB_DB_VERSION_SET_H
36
37
#pragma once
38
39
#include <atomic>
40
#include <deque>
41
#include <limits>
42
#include <map>
43
#include <memory>
44
#include <set>
45
#include <string>
46
#include <utility>
47
#include <vector>
48
49
#include "yb/rocksdb/db/dbformat.h"
50
#include "yb/rocksdb/db/version_builder.h"
51
#include "yb/rocksdb/db/version_edit.h"
52
#include "yb/rocksdb/port/port.h"
53
#include "yb/rocksdb/db/table_cache.h"
54
#include "yb/rocksdb/db/compaction_picker.h"
55
#include "yb/rocksdb/db/column_family.h"
56
#include "yb/rocksdb/db/log_reader.h"
57
#include "yb/rocksdb/db/file_indexer.h"
58
#include "yb/rocksdb/db/write_controller.h"
59
#include "yb/rocksdb/env.h"
60
#include "yb/rocksdb/util/instrumented_mutex.h"
61
62
namespace rocksdb {
63
64
namespace log {
65
class Writer;
66
}
67
68
class Compaction;
69
class InternalIterator;
70
class LogBuffer;
71
class LookupKey;
72
class MemTable;
73
class Version;
74
class VersionSet;
75
class WriteBuffer;
76
class MergeContext;
77
class ColumnFamilyData;
78
class ColumnFamilySet;
79
class TableCache;
80
class MergeIteratorBuilder;
81
class FileNumbersProvider;
82
83
// Return the smallest index i such that file_level.files[i]->largest >= key.
84
// Return file_level.num_files if there is no such file.
85
// REQUIRES: "file_level.files" contains a sorted list of
86
// non-overlapping files.
87
extern int FindFile(const InternalKeyComparator& icmp,
88
                    const LevelFilesBrief& file_level, const Slice& key);
89
90
// Returns true iff some file in "files" overlaps the user key range
91
// [*smallest,*largest].
92
// smallest==nullptr represents a key smaller than all keys in the DB.
93
// largest==nullptr represents a key larger than all keys in the DB.
94
// REQUIRES: If disjoint_sorted_files, file_level.files[]
95
// contains disjoint ranges in sorted order.
96
extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
97
                                  bool disjoint_sorted_files,
98
                                  const LevelFilesBrief& file_level,
99
                                  const Slice* smallest_user_key,
100
                                  const Slice* largest_user_key);
101
102
// Generate LevelFilesBrief from vector<FdWithKeyRange*>
103
// Would copy smallest_key and largest_key data to sequential memory
104
// arena: Arena used to allocate the memory
105
extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
106
                                      const std::vector<FileMetaData*>& files,
107
                                      Arena* arena);
108
109
class VersionStorageInfo {
110
 public:
111
  VersionStorageInfo(const InternalKeyComparatorPtr& internal_comparator,
112
                     const Comparator* user_comparator, int num_levels,
113
                     CompactionStyle compaction_style,
114
                     VersionStorageInfo* src_vstorage);
115
  ~VersionStorageInfo();
116
117
1.07M
  void Reserve(int level, size_t size) { files_[level].reserve(size); }
118
119
  void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
120
121
  void SetFinalized();
122
123
  // Update num_non_empty_levels_.
124
  void UpdateNumNonEmptyLevels();
125
126
653k
  void GenerateFileIndexer() {
127
653k
    file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
128
653k
  }
129
130
  // Update the accumulated stats from a file-meta.
131
  void UpdateAccumulatedStats(FileMetaData* file_meta);
132
133
  // Decrease the current stat from a to-be-deleted file-meta
134
  void RemoveCurrentStats(FileMetaData* file_meta);
135
136
  void ComputeCompensatedSizes();
137
138
  // Updates internal structures that keep track of compaction scores
139
  // We use compaction scores to figure out which compaction to do next
140
  // REQUIRES: db_mutex held!!
141
  // TODO find a better way to pass compaction_options_fifo.
142
  void ComputeCompactionScore(
143
      const MutableCFOptions& mutable_cf_options,
144
      const CompactionOptionsFIFO& compaction_options_fifo);
145
146
  // Estimate est_comp_needed_bytes_
147
  void EstimateCompactionBytesNeeded(
148
      const MutableCFOptions& mutable_cf_options);
149
150
  // This computes files_marked_for_compaction_ and is called by
151
  // ComputeCompactionScore()
152
  void ComputeFilesMarkedForCompaction();
153
154
  // Generate level_files_brief_ from files_
155
  void GenerateLevelFilesBrief();
156
  // Sort all files for this version based on their file size and
157
  // record results in files_by_compaction_pri_. The largest files are listed
158
  // first.
159
  void UpdateFilesByCompactionPri(const MutableCFOptions& mutable_cf_options);
160
161
  void GenerateLevel0NonOverlapping();
162
23.0k
  bool level0_non_overlapping() const {
163
23.0k
    return level0_non_overlapping_;
164
23.0k
  }
165
166
  int MaxInputLevel() const;
167
168
  // Return level number that has idx'th highest score
169
49.4k
  int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
170
171
  // Return idx'th highest score
172
1.20M
  double CompactionScore(int idx) const { return compaction_score_[idx]; }
173
174
  void GetOverlappingInputs(
175
      int level,
176
      const InternalKey* begin,   // nullptr means before all keys
177
      const InternalKey* end,     // nullptr means after all keys
178
      std::vector<FileMetaData*>* inputs,
179
      int hint_index = -1,        // index of overlap file
180
      int* file_index = nullptr,  // return index of overlap file
181
      bool expand_range = true)   // if set, returns files which overlap the
182
      const;                      // range and overlap each other. If false,
183
                                  // then just files intersecting the range
184
185
  void GetOverlappingInputsBinarySearch(
186
      int level,
187
      const Slice& begin,  // nullptr means before all keys
188
      const Slice& end,    // nullptr means after all keys
189
      std::vector<FileMetaData*>* inputs,
190
      int hint_index,          // index of overlap file
191
      int* file_index) const;  // return index of overlap file
192
193
  void ExtendOverlappingInputs(
194
      int level,
195
      const Slice& begin,  // nullptr means before all keys
196
      const Slice& end,    // nullptr means after all keys
197
      std::vector<FileMetaData*>* inputs,
198
      unsigned int index) const;  // start extending from this index
199
200
  // Returns true iff some file in the specified level overlaps
201
  // some part of [*smallest_user_key,*largest_user_key].
202
  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
203
  // largest_user_key==NULL represents a key largest than all keys in the DB.
204
  bool OverlapInLevel(int level, const Slice* smallest_user_key,
205
                      const Slice* largest_user_key);
206
207
  // Returns true iff the first or last file in inputs contains
208
  // an overlapping user key to the file "just outside" of it (i.e.
209
  // just after the last file, or just before the first file)
210
  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
211
  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
212
                             int level);
213
214
35.5M
  int num_levels() const { return num_levels_; }
215
216
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
217
19.3M
  int num_non_empty_levels() const {
218
19.3M
    assert(finalized_);
219
19.3M
    return num_non_empty_levels_;
220
19.3M
  }
221
222
  // REQUIRES: This version has been finalized.
223
  // (CalculateBaseBytes() is called)
224
  // This may or may not return number of level files. It is to keep backward
225
  // compatible behavior in universal compaction.
226
3.01M
  int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
227
228
1.00M
  void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
229
230
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
231
431k
  int NumLevelFiles(int level) const {
232
431k
    assert(finalized_);
233
431k
    return static_cast<int>(files_[level].size());
234
431k
  }
235
236
  uint64_t NumFiles() const;
237
238
  // Return the combined file size of all files at the specified level.
239
  uint64_t NumLevelBytes(int level) const;
240
241
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
242
25.5M
  const std::vector<FileMetaData*>& LevelFiles(int level) const {
243
25.5M
    return files_[level];
244
25.5M
  }
245
246
15.8M
  const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const {
247
15.8M
    assert(level < static_cast<int>(level_files_brief_.size()));
248
15.8M
    return level_files_brief_[level];
249
15.8M
  }
250
251
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
252
13.8k
  const std::vector<int>& FilesByCompactionPri(int level) const {
253
13.8k
    assert(finalized_);
254
13.8k
    return files_by_compaction_pri_[level];
255
13.8k
  }
256
257
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
258
  // REQUIRES: DB mutex held during access
259
  const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
260
93.4k
      const {
261
93.4k
    assert(finalized_);
262
93.4k
    return files_marked_for_compaction_;
263
93.4k
  }
264
265
744k
  int base_level() const { return base_level_; }
266
267
  // REQUIRES: lock is held
268
  // Set the index that is used to offset into files_by_compaction_pri_ to find
269
  // the next compaction candidate file.
270
13.8k
  void SetNextCompactionIndex(int level, int index) {
271
13.8k
    next_file_to_compact_by_size_[level] = index;
272
13.8k
  }
273
274
  // REQUIRES: lock is held
275
13.8k
  int NextCompactionIndex(int level) const {
276
13.8k
    return next_file_to_compact_by_size_[level];
277
13.8k
  }
278
279
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
280
6.07k
  const FileIndexer& file_indexer() const {
281
6.07k
    assert(finalized_);
282
6.07k
    return file_indexer_;
283
6.07k
  }
284
285
  // Only the first few entries of files_by_compaction_pri_ are sorted.
286
  // There is no need to sort all the files because it is likely
287
  // that on a running system, we need to look at only the first
288
  // few largest files because a new version is created every few
289
  // seconds/minutes (because of concurrent compactions).
290
  static const size_t kNumberFilesToSort = 50;
291
292
  // Return a human-readable short (single-line) summary of the number
293
  // of files per level.  Uses *scratch as backing store.
294
  struct LevelSummaryStorage {
295
    char buffer[1000];
296
  };
297
  struct FileSummaryStorage {
298
    char buffer[3000];
299
  };
300
  const char* LevelSummary(LevelSummaryStorage* scratch) const;
301
  // Return a human-readable short (single-line) summary of files
302
  // in a specified level.  Uses *scratch as backing store.
303
  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
304
305
  // Return the maximum overlapping data (in bytes) at next level for any
306
  // file at a level >= 1.
307
  int64_t MaxNextLevelOverlappingBytes();
308
309
653k
  uint64_t GetAverageValueSize() const {
310
653k
    if (accumulated_num_non_deletions_ == 0) {
311
585k
      return 0;
312
585k
    }
313
67.7k
    assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
314
67.7k
    assert(accumulated_file_size_ > 0);
315
67.7k
    return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
316
67.7k
           accumulated_file_size_ /
317
67.7k
           (accumulated_raw_key_size_ + accumulated_raw_value_size_);
318
67.7k
  }
319
320
  uint64_t GetEstimatedActiveKeys() const;
321
322
  // re-initializes the index that is used to offset into
323
  // files_by_compaction_pri_
324
  // to find the next compaction candidate file.
325
71
  void ResetNextCompactionIndex(int level) {
326
71
    next_file_to_compact_by_size_[level] = 0;
327
71
  }
328
329
2.58M
  const InternalKeyComparatorPtr& InternalComparator() {
330
2.58M
    return internal_comparator_;
331
2.58M
  }
332
333
  // Returns maximum total bytes of data on a given level.
334
  uint64_t MaxBytesForLevel(int level) const;
335
336
  // Must be called after any change to MutableCFOptions.
337
  void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
338
                          const MutableCFOptions& options);
339
340
  // Returns an estimate of the amount of live data in bytes.
341
  uint64_t EstimateLiveDataSize() const;
342
343
1.99M
  uint64_t estimated_compaction_needed_bytes() const {
344
1.99M
    return estimated_compaction_needed_bytes_;
345
1.99M
  }
346
347
40
  void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
348
40
    estimated_compaction_needed_bytes_ = v;
349
40
  }
350
351
 private:
352
  InternalKeyComparatorPtr internal_comparator_;
353
  const Comparator* user_comparator_;
354
  int num_levels_;            // Number of levels
355
  int num_non_empty_levels_;  // Number of levels. Any level larger than it
356
                              // is guaranteed to be empty.
357
  // Per-level max bytes
358
  std::vector<uint64_t> level_max_bytes_;
359
360
  // A short brief metadata of files per level
361
  autovector<rocksdb::LevelFilesBrief> level_files_brief_;
362
  FileIndexer file_indexer_;
363
  Arena arena_;  // Used to allocate space for file_levels_
364
365
  CompactionStyle compaction_style_;
366
367
  // List of files per level, files in each level are arranged
368
  // in increasing order of keys
369
  std::vector<FileMetaData*>* files_;
370
371
  // Level that L0 data should be compacted to. All levels < base_level_ should
372
  // be empty. -1 if it is not level-compaction so it's not applicable.
373
  int base_level_;
374
375
  // A list for the same set of files that are stored in files_,
376
  // but files in each level are now sorted based on file
377
  // size. The file with the largest size is at the front.
378
  // This vector stores the index of the file from files_.
379
  std::vector<std::vector<int>> files_by_compaction_pri_;
380
381
  // If true, means that files in L0 have keys with non overlapping ranges
382
  bool level0_non_overlapping_;
383
384
  // An index into files_by_compaction_pri_ that specifies the first
385
  // file that is not yet compacted
386
  std::vector<int> next_file_to_compact_by_size_;
387
388
  // Only the first few entries of files_by_compaction_pri_ are sorted.
389
  // There is no need to sort all the files because it is likely
390
  // that on a running system, we need to look at only the first
391
  // few largest files because a new version is created every few
392
  // seconds/minutes (because of concurrent compactions).
393
  static const size_t number_of_files_to_sort_ = 50;
394
395
  // This vector contains list of files marked for compaction and also not
396
  // currently being compacted. It is protected by DB mutex. It is calculated in
397
  // ComputeCompactionScore()
398
  autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
399
400
  // Level that should be compacted next and its compaction score.
401
  // Score < 1 means compaction is not strictly needed.  These fields
402
  // are initialized by Finalize().
403
  // The most critical level to be compacted is listed first
404
  // These are used to pick the best compaction level
405
  std::vector<double> compaction_score_;
406
  std::vector<int> compaction_level_;
407
  int l0_delay_trigger_count_ = 0;  // Count used to trigger slow down and stop
408
                                    // for number of L0 files.
409
410
  // the following are the sampled temporary stats.
411
  // the current accumulated size of sampled files.
412
  uint64_t accumulated_file_size_;
413
  // the current accumulated size of all raw keys based on the sampled files.
414
  uint64_t accumulated_raw_key_size_;
415
  // the current accumulated size of all raw values based on the sampled files.
416
  uint64_t accumulated_raw_value_size_;
417
  // total number of non-deletion entries
418
  uint64_t accumulated_num_non_deletions_;
419
  // total number of deletion entries
420
  uint64_t accumulated_num_deletions_;
421
  // current number of non_deletion entries
422
  uint64_t current_num_non_deletions_;
423
  // current number of deletion entries
424
  uint64_t current_num_deletions_;
425
  // current number of file samples
426
  uint64_t current_num_samples_;
427
  // Estimated bytes needed to be compacted until all levels' size is down to
428
  // target sizes.
429
  uint64_t estimated_compaction_needed_bytes_;
430
431
  bool finalized_;
432
433
  friend class Version;
434
  friend class VersionSet;
435
  // No copying allowed
436
  VersionStorageInfo(const VersionStorageInfo&) = delete;
437
  void operator=(const VersionStorageInfo&) = delete;
438
};
439
440
class Version {
441
 public:
442
  // Append to *iters a sequence of iterators that will
443
  // yield the contents of this Version when merged together.
444
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
445
  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
446
                    MergeIteratorBuilder* merger_iter_builder);
447
448
  // Lookup the value for key.  If found, store it in *val and
449
  // return OK.  Else return a non-OK status.
450
  // Uses *operands to store merge_operator operations to apply later.
451
  //
452
  // If the ReadOptions.read_tier is set to do a read-only fetch, then
453
  // *value_found will be set to false if it cannot be determined whether
454
  // this value exists without doing IO.
455
  //
456
  // If the key is Deleted, *status will be set to NotFound and
457
  //                        *key_exists will be set to true.
458
  // If no key was found, *status will be set to NotFound and
459
  //                      *key_exists will be set to false.
460
  // If seq is non-null, *seq will be set to the sequence number found
461
  // for the key if a key was found.
462
  //
463
  // REQUIRES: lock is not held
464
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
465
           Status* status, MergeContext* merge_context,
466
           bool* value_found = nullptr, bool* key_exists = nullptr,
467
           SequenceNumber* seq = nullptr);
468
469
  // Loads some stats information from files. Call without mutex held. It needs
470
  // to be called before applying the version to the version set.
471
  void PrepareApply(const MutableCFOptions& mutable_cf_options,
472
                    bool update_stats);
473
474
  // Reference count management (so Versions do not disappear out from
475
  // under live iterators)
476
  void Ref();
477
  // Decrease reference count. Delete the object if no reference left
478
  // and return true. Otherwise, return false.
479
  bool Unref();
480
481
  // Add all files listed in the current version to *live.
482
  void AddLiveFiles(std::vector<FileDescriptor>* live);
483
484
  // Return a human readable string that describes this version's contents.
485
  std::string DebugString(bool hex = false) const;
486
487
  // Returns the version number of this version
488
38.0k
  uint64_t GetVersionNumber() const { return version_number_; }
489
490
  // REQUIRES: lock is held
491
  // On success, "tp" will contain the table properties of the file
492
  // specified in "file_meta".  If the file name of "file_meta" is
493
  // known ahead, passing it by a non-null "fname" can save a
494
  // file-name conversion.
495
  Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
496
                            const FileMetaData* file_meta,
497
                            const std::string* fname = nullptr) const;
498
499
  // REQUIRES: lock is held
500
  // On success, *props will be populated with all SSTables' table properties.
501
  // The keys of `props` are the sst file name, the values of `props` are the
502
  //   tables' properties, represented as shared_ptr.
503
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
504
  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
505
  Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
506
                                      TablePropertiesCollection* props) const;
507
508
  // REQUIRES: lock is held
509
  // On success, "tp" will contain the aggregated table property among
510
  // the table properties of all sst files in this version.
511
  Status GetAggregatedTableProperties(
512
      std::shared_ptr<const TableProperties>* tp, int level = -1);
513
514
0
  uint64_t GetEstimatedActiveKeys() {
515
0
    return storage_info_.GetEstimatedActiveKeys();
516
0
  }
517
518
  size_t GetMemoryUsageByTableReaders();
519
520
  // Returns approximate middle key of the largest SST file (see TableReader::GetMiddleKey).
521
  // Returns Status(Incomplete) if there are no SST files for this version.
522
  Result<std::string> GetMiddleKey();
523
524
23.1k
  ColumnFamilyData* cfd() const { return cfd_; }
525
526
  // Return the next Version in the linked list. Used for debug only
527
329k
  Version* TEST_Next() const {
528
329k
    return next_;
529
329k
  }
530
531
36.2M
  VersionStorageInfo* storage_info() { return &storage_info_; }
532
533
676k
  VersionSet* version_set() { return vset_; }
534
535
  void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
536
537
 private:
538
  Env* env_;
539
  friend class VersionSet;
540
541
15.6M
  const InternalKeyComparatorPtr& internal_comparator() const {
542
15.6M
    return storage_info_.internal_comparator_;
543
15.6M
  }
544
14.6M
  const Comparator* user_comparator() const {
545
14.6M
    return storage_info_.user_comparator_;
546
14.6M
  }
547
548
  bool PrefixMayMatch(const ReadOptions& read_options,
549
                      InternalIterator* level_iter,
550
                      const Slice& internal_prefix) const;
551
552
  // Returns true if the filter blocks in the specified level will not be
553
  // checked during read operations. In certain cases (trivial move or preload),
554
  // the filter block may already be cached, but we still do not access it such
555
  // that it eventually expires from the cache.
556
  bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
557
558
  // The helper function of UpdateAccumulatedStats, which may fill the missing
559
  // fields of file_meta from its associated TableProperties.
560
  // Returns true if it does initialize FileMetaData.
561
  bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
562
563
  // Update the accumulated stats associated with the current version.
564
  // This accumulated stats will be used in compaction.
565
  void UpdateAccumulatedStats(bool update_stats);
566
567
  // Sort all files for this version based on their file size and
568
  // record results in files_by_compaction_pri_. The largest files are listed
569
  // first.
570
  void UpdateFilesByCompactionPri();
571
572
  ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
573
  Logger* info_log_;
574
  Statistics* db_statistics_;
575
  TableCache* table_cache_;
576
  const MergeOperator* merge_operator_;
577
578
  VersionStorageInfo storage_info_;
579
  VersionSet* vset_;            // VersionSet to which this Version belongs
580
  Version* next_;               // Next version in linked list
581
  Version* prev_;               // Previous version in linked list
582
  int refs_;                    // Number of live refs to this version
583
584
  // A version number that uniquely represents this version. This is
585
  // used for debugging and logging purposes only.
586
  uint64_t version_number_;
587
588
  Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0);
589
590
  ~Version();
591
592
  // No copying allowed
593
  Version(const Version&);
594
  void operator=(const Version&);
595
};
596
597
typedef boost::intrusive_ptr<Version> VersionPtr;
598
599
22.8k
inline void intrusive_ptr_add_ref(Version* version) {
600
22.8k
  version->Ref();
601
22.8k
}
602
603
22.8k
inline void intrusive_ptr_release(Version* version) {
604
22.8k
  version->Unref();
605
22.8k
}
606
607
class VersionSet {
608
 public:
609
  static constexpr uint64_t kInitialNextFileNumber = 2;
610
611
  VersionSet(const std::string& dbname, const DBOptions* db_options,
612
             const EnvOptions& env_options, Cache* table_cache,
613
             WriteBuffer* write_buffer, WriteController* write_controller);
614
  ~VersionSet();
615
616
  // Apply *edit to the current version to form a new descriptor that
617
  // is both saved to persistent state and installed as the new
618
  // current version.  Will release *mu while actually writing to the file.
619
  // column_family_options has to be set if edit is column family add
620
  // REQUIRES: *mu is held on entry.
621
  // REQUIRES: no other thread concurrently calls LogAndApply()
622
  Status LogAndApply(
623
      ColumnFamilyData* column_family_data,
624
      const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
625
      InstrumentedMutex* mu, Directory* db_directory = nullptr,
626
      bool new_descriptor_log = false,
627
      const ColumnFamilyOptions* column_family_options = nullptr);
628
629
  // Recover the last saved descriptor from persistent storage.
630
  // If read_only == true, Recover() will not complain if some column families
631
  // are not opened
632
  Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
633
                 bool read_only = false);
634
635
  // Reads a manifest file and returns a list of column families in
636
  // column_families.
637
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
638
                                   const std::string& dbname,
639
                                   BoundaryValuesExtractor* extractor,
640
                                   Env* env);
641
642
#ifndef ROCKSDB_LITE
643
  // Try to reduce the number of levels. This call is valid when
644
  // only one level from the new max level to the old
645
  // max level containing files.
646
  // The call is static, since number of levels is immutable during
647
  // the lifetime of a RocksDB instance. It reduces number of levels
648
  // in a DB by applying changes to manifest.
649
  // For example, a db currently has 7 levels [0-6], and a call to
650
  // to reduce to 5 [0-4] can only be executed when only one level
651
  // among [4-6] contains files.
652
  static Status ReduceNumberOfLevels(const std::string& dbname,
653
                                     const Options* options,
654
                                     const EnvOptions& env_options,
655
                                     int new_levels);
656
657
  // printf contents (for debugging)
658
  Status DumpManifest(const Options& options, const std::string& manifestFileName,
659
                      bool verbose, bool hex = false);
660
661
#endif  // ROCKSDB_LITE
662
663
  // Return the current manifest file number
664
1.47M
  uint64_t manifest_file_number() const { return manifest_file_number_; }
665
666
1.47M
  uint64_t pending_manifest_file_number() const {
667
1.47M
    return pending_manifest_file_number_;
668
1.47M
  }
669
670
0
  // Reads the next-file-number counter without advancing it.
  uint64_t current_next_file_number() const {
    return next_file_number_.load();
  }
671
672
  // Allocate and return a new file number
673
2.67M
  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
674
675
  // Return the last sequence number.
676
76.5M
  uint64_t LastSequence() const {
677
76.5M
    return last_sequence_.load(std::memory_order_acquire);
678
76.5M
  }
679
680
2.64M
  // Returns a non-owning pointer to the flushed frontier; ownership stays
  // with this object. May be null when no frontier has been set.
  UserFrontier* FlushedFrontier() const { return flushed_frontier_.get(); }
683
684
  // Set the last sequence number to s.
685
  void SetLastSequence(SequenceNumber s);
686
687
  // Set last sequence number without verifying that it always keeps increasing.
688
  void SetLastSequenceNoSanityChecking(SequenceNumber s);
689
690
  // Attempts to set the last flushed op id / hybrid time / history cutoff to the specified tuple of
691
  // values. The current flushed frontier is always updated to the maximum of its current and
692
  // supplied values for each dimension. This will DFATAL in case the supplied frontier regresses
693
  // relative to the current frontier in any of its dimensions which have non-default (defined)
694
  // values.
695
  void UpdateFlushedFrontier(UserFrontierPtr values);
696
697
  // Mark the specified file number as used.
698
  // REQUIRED: this is only called during single-threaded recovery
699
  void MarkFileNumberUsedDuringRecovery(uint64_t number);
700
701
  // Return the log file number for the log file that is currently
702
  // being compacted, or zero if there is no such log file.
703
1.84M
  uint64_t prev_log_number() const { return prev_log_number_; }
704
705
  // Returns the minimum log number such that all
706
  // log numbers less than or equal to it can be deleted
707
3.29M
  uint64_t MinLogNumber() const {
708
3.29M
    uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
709
3.78M
    for (auto cfd : *column_family_set_) {
710
      // It's safe to ignore dropped column families here:
711
      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
712
3.78M
      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
713
3.29M
        min_log_num = cfd->GetLogNumber();
714
3.29M
      }
715
3.78M
    }
716
3.29M
    return min_log_num;
717
3.29M
  }
718
719
  // Create an iterator that reads over the compaction inputs for "*c".
720
  // The caller should delete the iterator when no longer needed.
721
  InternalIterator* MakeInputIterator(Compaction* c);
722
723
  // Add all files listed in any live version to *live.
724
  void AddLiveFiles(std::vector<FileDescriptor>* live_list);
725
726
  // Return the approximate size of data to be scanned for range [start, end)
727
  // in levels [start_level, end_level). If end_level == 0 it will search
728
  // through all non-empty levels
729
  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
730
                           int start_level = 0, int end_level = -1);
731
732
  // Return the size of the current manifest file
733
1.51k
  uint64_t manifest_file_size() const { return manifest_file_size_; }
734
735
  // verify that the files that we started with for a compaction
736
  // still exist in the current version and in the same original level.
737
  // This ensures that a concurrent compaction did not erroneously
738
  // pick the same files to compact.
739
  bool VerifyCompactionFileConsistency(Compaction* c);
740
741
  Status GetMetadataForFile(uint64_t number, int* filelevel,
742
                            FileMetaData** metadata, ColumnFamilyData** cfd);
743
744
  // This function doesn't support leveldb SST filenames
745
  void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
746
747
  void GetObsoleteFiles(const FileNumbersProvider& pending_outputs,
748
                        std::vector<FileMetaData*>* files,
749
                        std::vector<std::string>* manifest_filenames);
750
751
18.5M
  // Exposes the column family set; the returned pointer is non-owning.
  ColumnFamilySet* GetColumnFamilySet() {
    return column_family_set_.get();
  }
752
653k
  // Env options used for all reads and writes except compactions.
  const EnvOptions& env_options() {
    return env_options_;
  }
753
754
  CHECKED_STATUS Import(const std::string& source_dir, SequenceNumber seqno, VersionEdit* edit);
755
756
  void UnrefFile(ColumnFamilyData* cfd, FileMetaData* f);
757
758
  static uint64_t GetNumLiveVersions(Version* dummy_versions);
759
760
  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
761
762
155
  bool has_manifest_writers() const {
763
155
    return !manifest_writers_.empty();
764
155
  }
765
766
 private:
767
  struct ManifestWriter;
768
769
  friend class Version;
770
  friend class DBImpl;
771
772
  // ApproximateSize helper
773
  uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
774
                                 const Slice& start, const Slice& end);
775
776
  uint64_t ApproximateSize(Version* v, const FdWithBoundaries& f, const Slice& key);
777
778
  // Save current contents to *log
779
  Status WriteSnapshot(log::Writer* log, UserFrontierPtr flushed_frontier_override);
780
781
  void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
782
783
  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
784
                                       VersionEdit* edit);
785
786
  static void EnsureNonDecreasingLastSequence(
787
      SequenceNumber prev_last_seq, SequenceNumber new_last_seq);
788
  static void EnsureNonDecreasingFlushedFrontier(
789
      const UserFrontier* prev_value, const UserFrontier& new_value);
790
791
  void UpdateFlushedFrontierNoSanityChecking(UserFrontierPtr values);
792
793
  std::unique_ptr<ColumnFamilySet> column_family_set_;
794
795
  Env* const env_;
796
  const std::string dbname_;
797
  const DBOptions* const db_options_;
798
  std::atomic<uint64_t> next_file_number_ = {kInitialNextFileNumber};
799
  uint64_t manifest_file_number_ = 0;
800
  uint64_t pending_manifest_file_number_ = 0;
801
  std::atomic<uint64_t> last_sequence_ = {0};
802
  uint64_t prev_log_number_ = 0; // 0 or backing store for memtable being compacted
803
  UserFrontierPtr flushed_frontier_;
804
805
  // Opened lazily
806
  std::unique_ptr<log::Writer> descriptor_log_;
807
  std::string descriptor_log_file_name_;
808
809
  // generates a increasing version number for every new version
810
  uint64_t current_version_number_ = 0;
811
812
  // Queue of writers to the manifest file
813
  std::deque<ManifestWriter*> manifest_writers_;
814
815
  // Current size of manifest file
816
  uint64_t manifest_file_size_ = 0;
817
818
  std::vector<FileMetaData*> obsolete_files_;
819
  std::vector<std::string> obsolete_manifests_;
820
821
  // env options for all reads and writes except compactions
822
  const EnvOptions& env_options_;
823
824
  // env options used for compactions. This is a copy of
825
  // env_options_ but with readaheads set to readahead_compactions_.
826
  const EnvOptions env_options_compactions_;
827
828
  // No copying allowed
829
  VersionSet(const VersionSet&);
830
  void operator=(const VersionSet&);
831
832
  void LogAndApplyCFHelper(VersionEdit* edit);
833
  void LogAndApplyHelper(
834
      ColumnFamilyData* cfd,
835
      VersionBuilder* b,
836
      VersionEdit* edit,
837
      InstrumentedMutex* mu);
838
};
839
840
}  // namespace rocksdb
841
842
#endif // YB_ROCKSDB_DB_VERSION_SET_H