/Users/deen/code/yugabyte-db/src/yb/docdb/intent_aware_iterator.h
Line | Count | Source |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_DOCDB_INTENT_AWARE_ITERATOR_H_ |
15 | | #define YB_DOCDB_INTENT_AWARE_ITERATOR_H_ |
16 | | |
17 | | #include <boost/optional/optional.hpp> |
18 | | |
19 | | #include "yb/common/doc_hybrid_time.h" |
20 | | #include "yb/common/read_hybrid_time.h" |
21 | | |
22 | | #include "yb/docdb/bounded_rocksdb_iterator.h" |
23 | | #include "yb/docdb/key_bytes.h" |
24 | | #include "yb/docdb/transaction_status_cache.h" |
25 | | |
26 | | #include "yb/rocksdb/db.h" |
27 | | #include "yb/rocksdb/options.h" |
28 | | |
29 | | namespace yb { |
30 | | namespace docdb { |
31 | | |
32 | | class Value; |
33 | | struct Expiration; |
34 | | |
35 | | YB_DEFINE_ENUM(ResolvedIntentState, (kNoIntent)(kInvalidPrefix)(kValid)); |
36 | | YB_DEFINE_ENUM(Direction, (kForward)(kBackward)); |
37 | | YB_DEFINE_ENUM(SeekIntentIterNeeded, (kNoNeed)(kSeek)(kSeekForward)); |
38 | | |
39 | | struct FetchKeyResult { |
40 | | Slice key; |
41 | | DocHybridTime write_time; |
42 | | bool same_transaction; |
43 | | }; |
44 | | |
45 | | // Provides a way to iterate over DocDB (sub)keys with respect to committed intents transparently |
46 | | // for caller. Implementation relies on intents order in RocksDB, which is determined by intent key |
47 | | // format. If (sub)key A goes before/after (sub)key B, all intents for A should go before/after all |
48 | | // intents for (sub)key B. |
49 | | // |
50 | | // When seeking to some (sub)dockey, it will ignore values for this (sub)dockey |
51 | | // (except write intents written by the same transaction) with HT higher than read_time. |
52 | | // |
53 | | // For intents from the same transaction, intents with maximum HT would be picked -- ignoring |
54 | | // read_time for filtering purposes -- and returned as key with time equal to read_time. |
55 | | // Intent data format: |
56 | | // SubDocKey (no HybridTime) + IntentType + HybridTime -> TxnId + value. |
57 | | // TxnId, IntentType, HybridTime are all prefixed with their respective value types. |
58 | | // |
59 | | // KeyBytes/Slice passed to Seek* methods should not contain hybrid time. |
60 | | // HybridTime of subdoc_key in Seek* methods would be ignored. |
61 | | class IntentAwareIterator { |
62 | | public: |
63 | | IntentAwareIterator( |
64 | | const DocDB& doc_db, |
65 | | const rocksdb::ReadOptions& read_opts, |
66 | | CoarseTimePoint deadline, |
67 | | const ReadHybridTime& read_time, |
68 | | const TransactionOperationContext& txn_op_context); |
69 | | |
70 | | IntentAwareIterator(const IntentAwareIterator& other) = delete; |
71 | | void operator=(const IntentAwareIterator& other) = delete; |
72 | | |
73 | | // Seek to the smallest key which is greater or equal than doc_key. |
74 | | void Seek(const DocKey& doc_key); |
75 | | |
76 | | // Seek to specified encoded key (it is responsibility of caller to make sure it doesn't have |
77 | | // hybrid time). |
78 | | void Seek(const Slice& key); |
79 | | |
80 | | // Seek forward to specified encoded key (it is responsibility of caller to make sure it |
81 | | // doesn't have hybrid time). For efficiency, the method that takes a non-const KeyBytes pointer |
82 | | // avoids memory allocation by using the KeyBytes buffer to prepare the key to seek to, and may |
83 | | // append up to kMaxBytesPerEncodedHybridTime + 1 bytes of data to the buffer. The appended data |
84 | | // is removed when the method returns. |
85 | | void SeekForward(const Slice& key); |
86 | | void SeekForward(KeyBytes* key); |
87 | | |
88 | | // Seek past specified subdoc key (it is responsibility of caller to make sure it doesn't have |
89 | | // hybrid time). |
90 | | void SeekPastSubKey(const Slice& key); |
91 | | |
92 | | // Seek out of subdoc key (it is responsibility of caller to make sure it doesn't have hybrid |
93 | | // time). |
94 | | void SeekOutOfSubDoc(const Slice& key); |
95 | | // For efficiency, this overload takes a non-const KeyBytes pointer avoids memory allocation by |
96 | | // using the KeyBytes buffer to prepare the key to seek to by appending an extra byte. The |
97 | | // appended byte is removed when the method returns. |
98 | | void SeekOutOfSubDoc(KeyBytes* key_bytes); |
99 | | |
100 | | // Seek to last doc key. |
101 | | void SeekToLastDocKey(); |
102 | | |
103 | | // Seek to previous SubDocKey as opposed to previous DocKey. |
104 | | void PrevSubDocKey(const KeyBytes& key_bytes); |
105 | | |
106 | | // This method positions the iterator at the beginning of the DocKey found before the doc_key |
107 | | // provided. |
108 | | void PrevDocKey(const DocKey& doc_key); |
109 | | void PrevDocKey(const Slice& encoded_doc_key); |
110 | | |
111 | | // Fetches currently pointed key and also updates max_seen_ht to ht of this key. The key does not |
112 | | // contain the DocHybridTime but is returned separately and optionally. |
113 | | Result<FetchKeyResult> FetchKey(); |
114 | | |
115 | | bool valid(); |
116 | | Slice value(); |
117 | 774M | const ReadHybridTime& read_time() { return read_time_; } |
118 | 16.9M | HybridTime max_seen_ht() { return max_seen_ht_; } |
119 | | |
120 | | // Iterate through Next() until a row containing a full record (non merge record) is found, or the |
121 | | // key changes. |
122 | | // |
123 | | // If a new full record with the same key is found, latest_record_ht is set to the write time of |
124 | | // that record, result_value is set to the value, and final_key is set to the key. |
125 | | // |
126 | | // If the key changes, latest_record_ht is set to the write time of the last merge record seen, |
127 | | // result_value is set to its value, and final_key is set to the key. |
128 | | CHECKED_STATUS NextFullValue( |
129 | | DocHybridTime* latest_record_ht, |
130 | | Slice* result_value, |
131 | | Slice* final_key = nullptr); |
132 | | |
133 | | // Finds the latest record for a particular key after the provided max_overwrite_time, returns the |
134 | | // write time of the found record, and optionally also the result value. This latest record may |
135 | | // not be a full record, but instead a merge record (e.g. a TTL row). |
136 | | CHECKED_STATUS FindLatestRecord( |
137 | | const Slice& key_without_ht, |
138 | | DocHybridTime* max_overwrite_time, |
139 | | Slice* result_value = nullptr); |
140 | | |
141 | | // Finds the oldest record for a particular key that is larger than the |
142 | | // specified min_hybrid_time, returns the overwrite time. |
143 | | // This record may not be a full record, but instead a merge record (e.g. a |
144 | | // TTL row). |
145 | | // Returns HybridTime::kInvalid if no such record was found. |
146 | | Result<HybridTime> FindOldestRecord(const Slice& key_without_ht, |
147 | | HybridTime min_hybrid_time); |
148 | | |
149 | 18.9M | void SetUpperbound(const Slice& upperbound) { |
150 | 18.9M | upperbound_ = upperbound; |
151 | 18.9M | } |
152 | | |
153 | | void DebugDump(); |
154 | | |
155 | | private: |
156 | | friend class IntentAwareIteratorPrefixScope; |
157 | | |
158 | | // Adds new value to prefix stack. The top value of this stack is used to filter returned entries. |
159 | | void PushPrefix(const Slice& prefix); |
160 | | |
161 | | // Removes top value from prefix stack. This iteration could became valid after popping prefix, |
162 | | // if new top prefix is a prefix of currently pointed value. |
163 | | void PopPrefix(); |
164 | | |
165 | | Slice CurrentPrefix() const; |
166 | | |
167 | | // Seek forward on regular sub-iterator. |
168 | | void SeekForwardRegular(const Slice& slice); |
169 | | |
170 | | // Seek to latest doc key among regular and intent iterator. |
171 | | void SeekToLatestDocKeyInternal(); |
172 | | // Seek to latest subdoc key among regular and intent iterator. |
173 | | void SeekToLatestSubDocKeyInternal(); |
174 | | |
175 | | // Choose latest subkey among regular and intent iterators. |
176 | | Slice LatestSubDocKey(); |
177 | | |
178 | | // Skips regular entries with hybrid time after read limit. |
179 | | // If `is_forward` is `false` and `iter_` is positioned to the earliest record for the current |
180 | | // key, there are two cases: |
181 | | // 1 - record has hybrid time <= read limit: does nothing. |
182 | | // 2 - record has hybrid time > read limit: will skip all records for this key, since all of |
183 | | // them are after earliest record which is after read limit. Then will act the same way on |
184 | | // previous key. |
185 | | void SkipFutureRecords(Direction direction); |
186 | | |
187 | | // Skips intents with hybrid time after read limit. |
188 | | void SkipFutureIntents(); |
189 | | |
190 | | // Strong write intents which are either committed or written by the current |
191 | | // transaction (stored in txn_op_context) by considered time are considered as suitable. |
192 | | |
193 | | // Seek intent sub-iterator to latest suitable intent starting with seek_key_buffer_. |
194 | | // intent_iter_ will be positioned to first intent for the smallest key greater than |
195 | | // resolved_intent_sub_doc_key_encoded_. |
196 | | // If iterator already positioned far enough - does not perform seek. |
197 | | // If we already resolved intent after seek_key_prefix_, then it will be used. |
198 | | void SeekForwardToSuitableIntent(); |
199 | | |
200 | | // Seek intent sub-iterator forward (backward) to latest suitable intent for first available |
201 | | // key. Updates resolved_intent_XXX fields. |
202 | | // intent_iter_ will be positioned to first intent for the smallest (biggest) key |
203 | | // greater (smaller) than resolved_intent_sub_doc_key_encoded_. |
204 | | template<Direction direction> |
205 | | void SeekToSuitableIntent(); |
206 | | |
207 | | // Decodes intent at intent_iter_ position and updates resolved_intent_* fields if that intent |
208 | | // matches all following conditions: |
209 | | // 1. It is strong write intent. |
210 | | // 2. It is either (a) written at value_time by transaction associated with txn_op_context_ |
211 | | // (current transaction) earlier than high_ht_ or (b) its transaction is already committed and |
212 | | // value_time = commit time is earlier than high_ht_. |
213 | | // 3. It has value_time earlier than current resolved intent if we already have resolved intent. |
214 | | // |
215 | | // Preconditions: |
216 | | // 1. If has_resolved_intent_ is true then intent should be exactly for the same key as |
217 | | // resolved_intent_key_prefix_. |
218 | | // |
219 | | // Note: For performance reasons resolved_intent_sub_doc_key_encoded_ is not updated by this |
220 | | // function and should be updated after we have found latest suitable intent for the key by |
221 | | // calling UpdateResolvedIntentSubDocKeyEncoded. |
222 | | void ProcessIntent(); |
223 | | |
224 | | void UpdateResolvedIntentSubDocKeyEncoded(); |
225 | | |
226 | | // Seeks to the appropriate intent-prefix and returns the associated |
227 | | // DocHybridTime. |
228 | | Result<DocHybridTime> FindMatchingIntentRecordDocHybridTime(const Slice& key_without_ht); |
229 | | |
230 | | // Returns the DocHybridTime associated with the current regular record |
231 | | // pointed to, if it matches the key that is passed as the argument. |
232 | | // If the current record does not match the passed key, invalid hybrid time |
233 | | // is returned. |
234 | | Result<DocHybridTime> GetMatchingRegularRecordDocHybridTime( |
235 | | const Slice& key_without_ht); |
236 | | |
237 | | // Whether current entry is regular key-value pair. |
238 | | bool IsEntryRegular(bool descending = false); |
239 | | |
240 | | // Set the exclusive upperbound of the intent iterator to the current SubDocKey of the regular |
241 | | // iterator. This is necessary to avoid RocksDB iterator from scanning over the deleted intents |
242 | | // beyond the current regular key unnecessarily. |
243 | | CHECKED_STATUS SetIntentUpperbound(); |
244 | | |
245 | | // Resets the exclusive upperbound of the intent iterator to the beginning of the transaction |
246 | | // metadata and reverse index region. |
247 | | void ResetIntentUpperbound(); |
248 | | |
249 | | void SeekIntentIterIfNeeded(); |
250 | | |
251 | | // Does initial steps for prev doc key/sub doc key seek. |
252 | | // Returns true if prepare succeed. |
253 | | bool PreparePrev(const Slice& key); |
254 | | |
255 | | bool SatisfyBounds(const Slice& slice); |
256 | | |
257 | 10.8M | bool ResolvedIntentFromSameTransaction() const { |
258 | 10.8M | return intent_dht_from_same_txn_ != DocHybridTime::kMin; |
259 | 10.8M | } |
260 | | |
261 | 5.41M | DocHybridTime GetIntentDocHybridTime() const { |
262 | 5.41M | return ResolvedIntentFromSameTransaction() ? intent_dht_from_same_txn_4.55M |
263 | 5.41M | : resolved_intent_txn_dht_861k ; |
264 | 5.41M | } |
265 | | |
266 | | // Returns true if iterator currently points to some record. |
267 | | bool HasCurrentEntry(); |
268 | | |
269 | | // Update seek_intent_iter_needed_, seek_key_prefix_ and seek_key_buffer_ to seek forward |
270 | | // for key + suffix. |
271 | | // If use_suffix_for_prefix then suffix is used in seek_key_prefix_, otherwise it will match key. |
272 | | void UpdatePlannedIntentSeekForward( |
273 | | const Slice& key, const Slice& suffix, bool use_suffix_for_prefix = true); |
274 | | |
275 | | bool NextRegular(Direction direction); |
276 | | |
277 | | const ReadHybridTime read_time_; |
278 | | const std::string encoded_read_time_read_; |
279 | | const std::string encoded_read_time_local_limit_; |
280 | | const std::string encoded_read_time_global_limit_; |
281 | | |
282 | | // The encoded hybrid time to use to filter records in regular RocksDB. This is the maximum of |
283 | | // read_time and local_limit (in terms of hybrid time comparison), and this slice points to |
284 | | // one of the strings above. |
285 | | Slice encoded_read_time_regular_limit_; |
286 | | |
287 | | const TransactionOperationContext txn_op_context_; |
288 | | docdb::BoundedRocksDbIterator intent_iter_; |
289 | | docdb::BoundedRocksDbIterator iter_; |
290 | | // iter_valid_ is true if and only if iter_ is positioned at key which matches top prefix from |
291 | | // the stack and record time satisfies read_time_ criteria. |
292 | | bool iter_valid_ = false; |
293 | | Status status_; |
294 | | HybridTime max_seen_ht_ = HybridTime::kMin; |
295 | | |
296 | | // Upperbound for seek. If we see regular or intent record past this bound, it will be ignored. |
297 | | Slice upperbound_; |
298 | | |
299 | | // Exclusive upperbound of the intent key. |
300 | | KeyBytes intent_upperbound_keybytes_; |
301 | | Slice intent_upperbound_; |
302 | | |
303 | | // Following fields contain information related to resolved suitable intent. |
304 | | ResolvedIntentState resolved_intent_state_ = ResolvedIntentState::kNoIntent; |
305 | | // SubDocKey (no HT). |
306 | | KeyBytes resolved_intent_key_prefix_; |
307 | | // DocHybridTime of resolved_intent_sub_doc_key_encoded_ is set to commit time or intent time in |
308 | | // case of intent is written by current transaction (stored in txn_op_context_). |
309 | | DocHybridTime resolved_intent_txn_dht_; |
310 | | DocHybridTime intent_dht_from_same_txn_ = DocHybridTime::kMin; |
311 | | KeyBytes resolved_intent_sub_doc_key_encoded_; |
312 | | KeyBytes resolved_intent_value_; |
313 | | |
314 | | std::vector<Slice> prefix_stack_; |
315 | | TransactionStatusCache transaction_status_cache_; |
316 | | |
317 | | bool skip_future_records_needed_ = false; |
318 | | bool skip_future_intents_needed_ = false; |
319 | | SeekIntentIterNeeded seek_intent_iter_needed_ = SeekIntentIterNeeded::kNoNeed; |
320 | | |
321 | | // Reusable buffer to prepare seek key to avoid reallocating temporary buffers in critical paths. |
322 | | KeyBytes seek_key_buffer_; |
323 | | Slice seek_key_prefix_; |
324 | | }; |
325 | | |
326 | | class IntentAwareIteratorPrefixScope { |
327 | | public: |
328 | | IntentAwareIteratorPrefixScope(const Slice& prefix, IntentAwareIterator* iterator) |
329 | 1.54G | : iterator_(iterator) { |
330 | 1.54G | iterator->PushPrefix(prefix); |
331 | 1.54G | } |
332 | | |
333 | 1.55G | ~IntentAwareIteratorPrefixScope() { |
334 | 1.55G | iterator_->PopPrefix(); |
335 | 1.55G | } |
336 | | |
337 | | private: |
338 | | IntentAwareIterator* iterator_; |
339 | | }; |
340 | | |
341 | | } // namespace docdb |
342 | | } // namespace yb |
343 | | |
344 | | #endif // YB_DOCDB_INTENT_AWARE_ITERATOR_H_ |