YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/docdb/docdb_compaction_filter.h
Line
Count
Source
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#ifndef YB_DOCDB_DOCDB_COMPACTION_FILTER_H
15
#define YB_DOCDB_DOCDB_COMPACTION_FILTER_H
16
17
#include <atomic>
18
#include <memory>
19
#include <unordered_set>
20
#include <vector>
21
22
#include <boost/container/small_vector.hpp>
23
#include <boost/functional/hash.hpp>
24
25
#include "yb/common/column_id.h"
26
#include "yb/common/hybrid_time.h"
27
28
#include "yb/docdb/expiration.h"
29
30
#include "yb/gutil/thread_annotations.h"
31
32
#include "yb/rocksdb/compaction_filter.h"
33
#include "yb/rocksdb/metadata.h"
34
35
#include "yb/server/hybrid_clock.h"
36
37
#include "yb/util/strongly_typed_bool.h"
38
39
namespace yb {
40
namespace docdb {
41
42
YB_STRONGLY_TYPED_BOOL(IsMajorCompaction);
43
YB_STRONGLY_TYPED_BOOL(ShouldRetainDeleteMarkersInMajorCompaction);
44
45
struct Expiration;
46
using ColumnIds = std::unordered_set<ColumnId, boost::hash<ColumnId>>;
47
using ColumnIdsPtr = std::shared_ptr<ColumnIds>;
48
49
// A "directive" of how a particular compaction should retain old (overwritten or deleted) values.
50
struct HistoryRetentionDirective {
51
  // We will not keep history below this hybrid_time. The view of the database at this hybrid_time
52
  // is preserved, but after the compaction completes, we should not expect to be able to do
53
  // consistent scans at DocDB hybrid times lower than this. Those scans will result in missing
54
  // data. Therefore, it is really important to always set this to a value lower than or equal to
55
  // the lowest "read point" of any pending read operations.
56
  HybridTime history_cutoff;
57
58
  // Columns that were deleted at a timestamp lower than the history cutoff.
59
  ColumnIdsPtr deleted_cols;
60
61
  MonoDelta table_ttl;
62
63
  ShouldRetainDeleteMarkersInMajorCompaction retain_delete_markers_in_major_compaction{false};
64
};
65
66
// DocDB compaction filter. A new instance of this class is created for every compaction.
67
class DocDBCompactionFilter : public rocksdb::CompactionFilter {
68
 public:
69
  DocDBCompactionFilter(
70
      HistoryRetentionDirective retention,
71
      IsMajorCompaction is_major_compaction,
72
      const KeyBounds* key_bounds);
73
74
  ~DocDBCompactionFilter() override;
75
  rocksdb::FilterDecision Filter(
76
      int level, const Slice& key, const Slice& existing_value, std::string* new_value,
77
      bool* value_changed) override;
78
  const char* Name() const override;
79
80
  // This indicates we don't have a cached TTL. We need this to be different from kMaxTtl
81
  // and kResetTtl because a PERSIST call would lead to a cached TTL of kMaxTtl, and kResetTtl
82
  // indicates no TTL in Cassandra.
83
  const MonoDelta kNoTtl = MonoDelta::FromNanoseconds(-1);
84
85
  // This is used to provide the history_cutoff timestamp to the compaction as a field in the
86
  // ConsensusFrontier, so that it can be persisted in RocksDB metadata and recovered on bootstrap.
87
  rocksdb::UserFrontierPtr GetLargestUserFrontier() const override;
88
89
  // Returns an empty list when key_ranges_ is not set, denoting that the whole key range of the
90
  // tablet should be considered live.
91
  //
92
  // When key_ranges_ is set, returns two live ranges:
93
  // (1) A range covering any ApplyTransactionState records which may have been written
94
  // (2) A range covering all valid keys in key_ranges_, i.e. all user data this tablet is
95
  //     responsible for.
96
  std::vector<std::pair<Slice, Slice>> GetLiveRanges() const override;
97
98
 private:
99
  // Assigns prev_subdoc_key_ from memory addressed by data. The length of key is taken from
100
  // sub_key_ends_ and same_bytes are reused.
101
  void AssignPrevSubDocKey(const char* data, size_t same_bytes);
102
103
  // Actual Filter implementation.
104
  Result<rocksdb::FilterDecision> DoFilter(
105
      int level, const Slice& key, const Slice& existing_value, std::string* new_value,
106
      bool* value_changed);
107
108
  const HistoryRetentionDirective retention_;
109
  const KeyBounds* key_bounds_;
110
  const IsMajorCompaction is_major_compaction_;
111
112
  std::vector<char> prev_subdoc_key_;
113
114
  // Result of DecodeDocKeyAndSubKeyEnds for prev_subdoc_key_.
115
  boost::container::small_vector<size_t, 16> sub_key_ends_;
116
117
  // A stack of highest hybrid_times lower than or equal to history_cutoff_ at which parent
118
  // subdocuments of the key that has just been processed, or the subdocument / primitive value
119
  // itself stored at that key itself, were fully overwritten or deleted. A full overwrite of a
120
  // parent document is considered a full overwrite of all its subdocuments at every level for the
121
  // purpose of this definition. Therefore, the following inequalities hold:
122
  //
123
  // overwrite_ht_[0] <= ... <= overwrite_ht_[N - 1] <= history_cutoff_
124
  //
125
  // The following example shows contents of RocksDB being compacted, as well as the state of the
126
  // overwrite_ht_ stack and how it is being updated at each step. history_cutoff_ is 25 in this
127
  // example.
128
  //
129
  // RocksDB key/value                    | overwrite_ht_ | Filter logic
130
  // ----------------------------------------------------------------------------------------------
131
  // doc_key1 HT(30) -> {}                | [MinHT]       | Always keeping the first entry
132
  // doc_key1 HT(20) -> DEL               | [20]          | 20 >= MinHT, keeping the entry
133
  //                                      |               | ^^    ^^^^^
134
  //                                      | Note: we're comparing the hybrid_time in this key with
135
  //                                      | the previous stack top of overwrite_ht_.
136
  //                                      |               |
137
  // doc_key1 HT(10) -> {}                | [20]          | 10 < 20, deleting the entry
138
  // doc_key1 subkey1 HT(35) -> "value4"  | [20, 20]      | 35 >= 20, keeping the entry
139
  //                     ^^               |      ^^       |
140
  //                      \----------------------/ HT(35) is higher than history_cutoff_, so we're
141
  //                                      |        just duplicating the top value on the stack
142
  //                                      |        HT(20) this step.
143
  //                                      |               |
144
  // doc_key1 subkey1 HT(23) -> "value3"  | [20, 23]      | 23 >= 20, keeping the entry
145
  //                                      |      ^^       |
146
  //                                      |      Now we have actually found a hybrid_time that is
147
  //                                      |      <= history_cutoff_, so we're replacing the stack
148
  //                                      |      top with that hybrid_time.
149
  //                                      |               |
150
  // doc_key1 subkey1 HT(21) -> "value2"  | [20, 23]      | 21 < 23, deleting the entry
151
  // doc_key1 subkey1 HT(15) -> "value1"  | [20, 23]      | 15 < 23, deleting the entry
152
153
  struct OverwriteData {
154
    DocHybridTime doc_ht;
155
    Expiration expiration;
156
  };
157
158
  std::vector<OverwriteData> overwrite_;
159
160
  // We use this to only log a message that the filter is being used once on the first call to
161
  // the Filter function.
162
  bool filter_usage_logged_ = false;
163
  bool within_merge_block_ = false;
164
};
165
166
// A strategy for deciding how the history of old database operations should be retained during
167
// compactions. We may implement this differently in production and in tests.
168
class HistoryRetentionPolicy {
169
 public:
170
76.1k
  virtual ~HistoryRetentionPolicy() = default;
171
  virtual HistoryRetentionDirective GetRetentionDirective() = 0;
172
};
173
174
class DocDBCompactionFilterFactory : public rocksdb::CompactionFilterFactory {
175
 public:
176
  explicit DocDBCompactionFilterFactory(
177
      std::shared_ptr<HistoryRetentionPolicy> retention_policy, const KeyBounds* key_bounds);
178
  ~DocDBCompactionFilterFactory() override;
179
  std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
180
      const rocksdb::CompactionFilter::Context& context) override;
181
  const char* Name() const override;
182
183
 private:
184
  std::shared_ptr<HistoryRetentionPolicy> retention_policy_;
185
  const KeyBounds* key_bounds_;
186
};
187
188
// A history retention policy that can be configured manually. Useful in tests. This class is
189
// useful for testing and is thread-safe.
190
class ManualHistoryRetentionPolicy : public HistoryRetentionPolicy {
191
 public:
192
  HistoryRetentionDirective GetRetentionDirective() override;
193
194
  void SetHistoryCutoff(HybridTime history_cutoff);
195
196
  void AddDeletedColumn(ColumnId col);
197
198
  void SetTableTTLForTests(MonoDelta ttl);
199
200
 private:
201
  std::atomic<HybridTime> history_cutoff_{HybridTime::kMin};
202
203
  std::mutex deleted_cols_mtx_;
204
  ColumnIds deleted_cols_ GUARDED_BY(deleted_cols_mtx_);
205
206
  std::atomic<MonoDelta> table_ttl_{MonoDelta::kMax};
207
};
208
209
}  // namespace docdb
210
}  // namespace yb
211
212
#endif  // YB_DOCDB_DOCDB_COMPACTION_FILTER_H