/Users/deen/code/yugabyte-db/src/yb/docdb/docdb_compaction_filter.h
Line | Count | Source |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_DOCDB_DOCDB_COMPACTION_FILTER_H |
15 | | #define YB_DOCDB_DOCDB_COMPACTION_FILTER_H |
16 | | |
17 | | #include <atomic> |
18 | | #include <memory> |
19 | | #include <unordered_set> |
20 | | #include <vector> |
21 | | |
22 | | #include <boost/container/small_vector.hpp> |
23 | | #include <boost/functional/hash.hpp> |
24 | | |
25 | | #include "yb/common/column_id.h" |
26 | | #include "yb/common/hybrid_time.h" |
27 | | |
28 | | #include "yb/docdb/expiration.h" |
29 | | |
30 | | #include "yb/gutil/thread_annotations.h" |
31 | | |
32 | | #include "yb/rocksdb/compaction_filter.h" |
33 | | #include "yb/rocksdb/metadata.h" |
34 | | |
35 | | #include "yb/server/hybrid_clock.h" |
36 | | |
37 | | #include "yb/util/strongly_typed_bool.h" |
38 | | |
39 | | namespace yb { |
40 | | namespace docdb { |
41 | | |
42 | | YB_STRONGLY_TYPED_BOOL(IsMajorCompaction); |
43 | | YB_STRONGLY_TYPED_BOOL(ShouldRetainDeleteMarkersInMajorCompaction); |
44 | | |
45 | | struct Expiration; |
46 | | using ColumnIds = std::unordered_set<ColumnId, boost::hash<ColumnId>>; |
47 | | using ColumnIdsPtr = std::shared_ptr<ColumnIds>; |
48 | | |
49 | | // A "directive" of how a particular compaction should retain old (overwritten or deleted) values. |
50 | | struct HistoryRetentionDirective { |
51 | | // We will not keep history below this hybrid_time. The view of the database at this hybrid_time |
52 | | // is preserved, but after the compaction completes, we should not expect to be able to do |
53 | | // consistent scans at DocDB hybrid times lower than this. Those scans will result in missing |
54 | | // data. Therefore, it is really important to always set this to a value lower than or equal to |
55 | | // the lowest "read point" of any pending read operations. |
56 | | HybridTime history_cutoff; |
57 | | |
58 | | // Columns that were deleted at a timestamp lower than the history cutoff. |
59 | | ColumnIdsPtr deleted_cols; |
60 | | |
61 | | MonoDelta table_ttl; |
62 | | |
63 | | ShouldRetainDeleteMarkersInMajorCompaction retain_delete_markers_in_major_compaction{false}; |
64 | | }; |
65 | | |
66 | | // DocDB compaction filter. A new instance of this class is created for every compaction. |
67 | | class DocDBCompactionFilter : public rocksdb::CompactionFilter { |
68 | | public: |
69 | | DocDBCompactionFilter( |
70 | | HistoryRetentionDirective retention, |
71 | | IsMajorCompaction is_major_compaction, |
72 | | const KeyBounds* key_bounds); |
73 | | |
74 | | ~DocDBCompactionFilter() override; |
75 | | rocksdb::FilterDecision Filter( |
76 | | int level, const Slice& key, const Slice& existing_value, std::string* new_value, |
77 | | bool* value_changed) override; |
78 | | const char* Name() const override; |
79 | | |
80 | | // This indicates we don't have a cached TTL. We need this to be different from kMaxTtl |
81 | | // and kResetTtl because a PERSIST call would lead to a cached TTL of kMaxTtl, and kResetTtl |
82 | | // indicates no TTL in Cassandra. |
83 | | const MonoDelta kNoTtl = MonoDelta::FromNanoseconds(-1); |
84 | | |
85 | | // This is used to provide the history_cutoff timestamp to the compaction as a field in the |
86 | | // ConsensusFrontier, so that it can be persisted in RocksDB metadata and recovered on bootstrap. |
87 | | rocksdb::UserFrontierPtr GetLargestUserFrontier() const override; |
88 | | |
89 | | // Returns an empty list when key_ranges_ is not set, denoting that the whole key range of the |
90 | | // tablet should be considered live. |
91 | | // |
92 | | // When key_ranges_ is set, returns two live ranges: |
93 | | // (1) A range covering any ApplyTransactionState records which may have been written |
94 | | // (2) A range covering all valid keys in key_ranges_, i.e. all user data this tablet is |
95 | | // responsible for. |
96 | | std::vector<std::pair<Slice, Slice>> GetLiveRanges() const override; |
97 | | |
98 | | private: |
99 | | // Assigns prev_subdoc_key_ from memory addressed by data. The length of key is taken from |
100 | | // sub_key_ends_ and same_bytes are reused. |
101 | | void AssignPrevSubDocKey(const char* data, size_t same_bytes); |
102 | | |
103 | | // Actual Filter implementation. |
104 | | Result<rocksdb::FilterDecision> DoFilter( |
105 | | int level, const Slice& key, const Slice& existing_value, std::string* new_value, |
106 | | bool* value_changed); |
107 | | |
108 | | const HistoryRetentionDirective retention_; |
109 | | const KeyBounds* key_bounds_; |
110 | | const IsMajorCompaction is_major_compaction_; |
111 | | |
112 | | std::vector<char> prev_subdoc_key_; |
113 | | |
114 | | // Result of DecodeDocKeyAndSubKeyEnds for prev_subdoc_key_. |
115 | | boost::container::small_vector<size_t, 16> sub_key_ends_; |
116 | | |
117 | | // A stack of highest hybrid_times lower than or equal to history_cutoff_ at which parent |
118 | | // subdocuments of the key that has just been processed, or the subdocument / primitive value |
119 | | // itself stored at that key itself, were fully overwritten or deleted. A full overwrite of a |
120 | | // parent document is considered a full overwrite of all its subdocuments at every level for the |
121 | | // purpose of this definition. Therefore, the following inequalities hold: |
122 | | // |
123 | | // overwrite_ht_[0] <= ... <= overwrite_ht_[N - 1] <= history_cutoff_ |
124 | | // |
125 | | // The following example shows contents of RocksDB being compacted, as well as the state of the |
126 | | // overwrite_ht_ stack and how it is being updated at each step. history_cutoff_ is 25 in this |
127 | | // example. |
128 | | // |
129 | | // RocksDB key/value | overwrite_ht_ | Filter logic |
130 | | // ---------------------------------------------------------------------------------------------- |
131 | | // doc_key1 HT(30) -> {} | [MinHT] | Always keeping the first entry |
132 | | // doc_key1 HT(20) -> DEL | [20] | 20 >= MinHT, keeping the entry |
133 | | // | | ^^ ^^^^^ |
134 | | // | Note: we're comparing the hybrid_time in this key with |
135 | | // | the previous stack top of overwrite_ht_. |
136 | | // | | |
137 | | // doc_key1 HT(10) -> {} | [20] | 10 < 20, deleting the entry |
138 | | // doc_key1 subkey1 HT(35) -> "value4" | [20, 20] | 35 >= 20, keeping the entry |
139 | | // ^^ | ^^ | |
140 | | // \----------------------/ HT(35) is higher than history_cutoff_, so we're |
141 | | // | just duplicating the top value on the stack |
142 | | // | HT(20) this step. |
143 | | // | | |
144 | | // doc_key1 subkey1 HT(23) -> "value3" | [20, 23] | 23 >= 20, keeping the entry |
145 | | // | ^^ | |
146 | | // | Now we have actually found a hybrid_time that is |
147 | | // | <= history_cutoff_, so we're replacing the stack |
148 | | // | top with that hybrid_time. |
149 | | // | | |
150 | | // doc_key1 subkey1 HT(21) -> "value2" | [20, 23] | 21 < 23, deleting the entry |
151 | | // doc_key1 subkey1 HT(15) -> "value1" | [20, 23] | 15 < 23, deleting the entry |
152 | | |
153 | | struct OverwriteData { |
154 | | DocHybridTime doc_ht; |
155 | | Expiration expiration; |
156 | | }; |
157 | | |
158 | | std::vector<OverwriteData> overwrite_; |
159 | | |
160 | | // We use this to only log a message that the filter is being used once on the first call to |
161 | | // the Filter function. |
162 | | bool filter_usage_logged_ = false; |
163 | | bool within_merge_block_ = false; |
164 | | }; |
165 | | |
166 | | // A strategy for deciding how the history of old database operations should be retained during |
167 | | // compactions. We may implement this differently in production and in tests. |
168 | | class HistoryRetentionPolicy { |
169 | | public: |
170 | 76.1k | virtual ~HistoryRetentionPolicy() = default; |
171 | | virtual HistoryRetentionDirective GetRetentionDirective() = 0; |
172 | | }; |
173 | | |
174 | | class DocDBCompactionFilterFactory : public rocksdb::CompactionFilterFactory { |
175 | | public: |
176 | | explicit DocDBCompactionFilterFactory( |
177 | | std::shared_ptr<HistoryRetentionPolicy> retention_policy, const KeyBounds* key_bounds); |
178 | | ~DocDBCompactionFilterFactory() override; |
179 | | std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter( |
180 | | const rocksdb::CompactionFilter::Context& context) override; |
181 | | const char* Name() const override; |
182 | | |
183 | | private: |
184 | | std::shared_ptr<HistoryRetentionPolicy> retention_policy_; |
185 | | const KeyBounds* key_bounds_; |
186 | | }; |
187 | | |
188 | | // A history retention policy that can be configured manually. Useful in tests. This class is |
189 | | // useful for testing and is thread-safe. |
190 | | class ManualHistoryRetentionPolicy : public HistoryRetentionPolicy { |
191 | | public: |
192 | | HistoryRetentionDirective GetRetentionDirective() override; |
193 | | |
194 | | void SetHistoryCutoff(HybridTime history_cutoff); |
195 | | |
196 | | void AddDeletedColumn(ColumnId col); |
197 | | |
198 | | void SetTableTTLForTests(MonoDelta ttl); |
199 | | |
200 | | private: |
201 | | std::atomic<HybridTime> history_cutoff_{HybridTime::kMin}; |
202 | | |
203 | | std::mutex deleted_cols_mtx_; |
204 | | ColumnIds deleted_cols_ GUARDED_BY(deleted_cols_mtx_); |
205 | | |
206 | | std::atomic<MonoDelta> table_ttl_{MonoDelta::kMax}; |
207 | | }; |
208 | | |
209 | | } // namespace docdb |
210 | | } // namespace yb |
211 | | |
212 | | #endif // YB_DOCDB_DOCDB_COMPACTION_FILTER_H |