YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/docdb/intent_aware_iterator.h
Line
Count
Source
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#ifndef YB_DOCDB_INTENT_AWARE_ITERATOR_H_
15
#define YB_DOCDB_INTENT_AWARE_ITERATOR_H_
16
17
#include <boost/optional/optional.hpp>
18
19
#include "yb/common/doc_hybrid_time.h"
20
#include "yb/common/read_hybrid_time.h"
21
22
#include "yb/docdb/bounded_rocksdb_iterator.h"
23
#include "yb/docdb/key_bytes.h"
24
#include "yb/docdb/transaction_status_cache.h"
25
26
#include "yb/rocksdb/db.h"
27
#include "yb/rocksdb/options.h"
28
29
namespace yb {
30
namespace docdb {
31
32
class Value;
33
struct Expiration;
34
35
YB_DEFINE_ENUM(ResolvedIntentState, (kNoIntent)(kInvalidPrefix)(kValid));
36
YB_DEFINE_ENUM(Direction, (kForward)(kBackward));
37
YB_DEFINE_ENUM(SeekIntentIterNeeded, (kNoNeed)(kSeek)(kSeekForward));
38
39
struct FetchKeyResult {
40
  Slice key;
41
  DocHybridTime write_time;
42
  bool same_transaction;
43
};
44
45
// Provides a way to iterate over DocDB (sub)keys with respect to committed intents transparently
46
// for caller. Implementation relies on intents order in RocksDB, which is determined by intent key
47
// format. If (sub)key A goes before/after (sub)key B, all intents for A should go before/after all
48
// intents for (sub)key B.
49
//
50
// When seeking to some (sub)dockey, it will ignore values for this (sub)dockey
51
// (except write intents written by the same transaction) with HT higher than read_time.
52
//
53
// For intents from the same transaction, intents with maximum HT would be picked -- ignoring
54
// read_time for filtering purposes -- and returned as key with time equal to read_time.
55
// Intent data format:
56
//   SubDocKey (no HybridTime) + IntentType + HybridTime -> TxnId + value.
57
// TxnId, IntentType, HybridTime are all prefixed with their respective value types.
58
//
59
// KeyBytes/Slice passed to Seek* methods should not contain hybrid time.
60
// HybridTime of subdoc_key in Seek* methods would be ignored.
61
class IntentAwareIterator {
62
 public:
63
  IntentAwareIterator(
64
      const DocDB& doc_db,
65
      const rocksdb::ReadOptions& read_opts,
66
      CoarseTimePoint deadline,
67
      const ReadHybridTime& read_time,
68
      const TransactionOperationContext& txn_op_context);
69
70
  IntentAwareIterator(const IntentAwareIterator& other) = delete;
71
  void operator=(const IntentAwareIterator& other) = delete;
72
73
  // Seek to the smallest key which is greater or equal than doc_key.
74
  void Seek(const DocKey& doc_key);
75
76
  // Seek to specified encoded key (it is responsibility of caller to make sure it doesn't have
77
  // hybrid time).
78
  void Seek(const Slice& key);
79
80
  // Seek forward to specified encoded key (it is responsibility of caller to make sure it
81
  // doesn't have hybrid time). For efficiency, the method that takes a non-const KeyBytes pointer
82
  // avoids memory allocation by using the KeyBytes buffer to prepare the key to seek to, and may
83
  // append up to kMaxBytesPerEncodedHybridTime + 1 bytes of data to the buffer. The appended data
84
  // is removed when the method returns.
85
  void SeekForward(const Slice& key);
86
  void SeekForward(KeyBytes* key);
87
88
  // Seek past specified subdoc key (it is responsibility of caller to make sure it doesn't have
89
  // hybrid time).
90
  void SeekPastSubKey(const Slice& key);
91
92
  // Seek out of subdoc key (it is responsibility of caller to make sure it doesn't have hybrid
93
  // time).
94
  void SeekOutOfSubDoc(const Slice& key);
95
  // For efficiency, this overload takes a non-const KeyBytes pointer avoids memory allocation by
96
  // using the KeyBytes buffer to prepare the key to seek to by appending an extra byte. The
97
  // appended byte is removed when the method returns.
98
  void SeekOutOfSubDoc(KeyBytes* key_bytes);
99
100
  // Seek to last doc key.
101
  void SeekToLastDocKey();
102
103
  // Seek to previous SubDocKey as opposed to previous DocKey.
104
  void PrevSubDocKey(const KeyBytes& key_bytes);
105
106
  // This method positions the iterator at the beginning of the DocKey found before the doc_key
107
  // provided.
108
  void PrevDocKey(const DocKey& doc_key);
109
  void PrevDocKey(const Slice& encoded_doc_key);
110
111
  // Fetches currently pointed key and also updates max_seen_ht to ht of this key. The key does not
112
  // contain the DocHybridTime but is returned separately and optionally.
113
  Result<FetchKeyResult> FetchKey();
114
115
  bool valid();
116
  Slice value();
117
260M
  const ReadHybridTime& read_time() { return read_time_; }
118
7.09M
  HybridTime max_seen_ht() { return max_seen_ht_; }
119
120
  // Iterate through Next() until a row containing a full record (non merge record) is found, or the
121
  // key changes.
122
  //
123
  // If a new full record with the same key is found, latest_record_ht is set to the write time of
124
  // that record, result_value is set to the value, and final_key is set to the key.
125
  //
126
  // If the key changes, latest_record_ht is set to the write time of the last merge record seen,
127
  // result_value is set to its value, and final_key is set to the key.
128
  CHECKED_STATUS NextFullValue(
129
      DocHybridTime* latest_record_ht,
130
      Slice* result_value,
131
      Slice* final_key = nullptr);
132
133
  // Finds the latest record for a particular key after the provided max_overwrite_time, returns the
134
  // write time of the found record, and optionally also the result value. This latest record may
135
  // not be a full record, but instead a merge record (e.g. a TTL row).
136
  CHECKED_STATUS FindLatestRecord(
137
      const Slice& key_without_ht,
138
      DocHybridTime* max_overwrite_time,
139
      Slice* result_value = nullptr);
140
141
  // Finds the oldest record for a particular key that is larger than the
142
  // specified min_hybrid_time, returns the overwrite time.
143
  // This record may not be a full record, but instead a merge record (e.g. a
144
  // TTL row).
145
  // Returns HybridTime::kInvalid if no such record was found.
146
  Result<HybridTime> FindOldestRecord(const Slice& key_without_ht,
147
                                      HybridTime min_hybrid_time);
148
149
7.96M
  void SetUpperbound(const Slice& upperbound) {
150
7.96M
    upperbound_ = upperbound;
151
7.96M
  }
152
153
  void DebugDump();
154
155
 private:
156
  friend class IntentAwareIteratorPrefixScope;
157
158
  // Adds new value to prefix stack. The top value of this stack is used to filter returned entries.
159
  void PushPrefix(const Slice& prefix);
160
161
  // Removes top value from prefix stack. This iteration could became valid after popping prefix,
162
  // if new top prefix is a prefix of currently pointed value.
163
  void PopPrefix();
164
165
  Slice CurrentPrefix() const;
166
167
  // Seek forward on regular sub-iterator.
168
  void SeekForwardRegular(const Slice& slice);
169
170
  // Seek to latest doc key among regular and intent iterator.
171
  void SeekToLatestDocKeyInternal();
172
  // Seek to latest subdoc key among regular and intent iterator.
173
  void SeekToLatestSubDocKeyInternal();
174
175
  // Choose latest subkey among regular and intent iterators.
176
  Slice LatestSubDocKey();
177
178
  // Skips regular entries with hybrid time after read limit.
179
  // If `is_forward` is `false` and `iter_` is positioned to the earliest record for the current
180
  // key, there are two cases:
181
  // 1 - record has hybrid time <= read limit: does nothing.
182
  // 2 - record has hybrid time > read limit: will skip all records for this key, since all of
183
  // them are after earliest record which is after read limit. Then will act the same way on
184
  // previous key.
185
  void SkipFutureRecords(Direction direction);
186
187
  // Skips intents with hybrid time after read limit.
188
  void SkipFutureIntents();
189
190
  // Strong write intents which are either committed or written by the current
191
  // transaction (stored in txn_op_context) by considered time are considered as suitable.
192
193
  // Seek intent sub-iterator to latest suitable intent starting with seek_key_buffer_.
194
  // intent_iter_ will be positioned to first intent for the smallest key greater than
195
  // resolved_intent_sub_doc_key_encoded_.
196
  // If iterator already positioned far enough - does not perform seek.
197
  // If we already resolved intent after seek_key_prefix_, then it will be used.
198
  void SeekForwardToSuitableIntent();
199
200
  // Seek intent sub-iterator forward (backward) to latest suitable intent for first available
201
  // key. Updates resolved_intent_XXX fields.
202
  // intent_iter_ will be positioned to first intent for the smallest (biggest) key
203
  // greater (smaller) than resolved_intent_sub_doc_key_encoded_.
204
  template<Direction direction>
205
  void SeekToSuitableIntent();
206
207
  // Decodes intent at intent_iter_ position and updates resolved_intent_* fields if that intent
208
  // matches all following conditions:
209
  // 1. It is strong write intent.
210
  // 2. It is either (a) written at value_time by transaction associated with txn_op_context_
211
  // (current transaction) earlier than high_ht_ or (b) its transaction is already committed and
212
  // value_time = commit time is earlier than high_ht_.
213
  // 3. It has value_time earlier than current resolved intent if we already have resolved intent.
214
  //
215
  // Preconditions:
216
  // 1. If has_resolved_intent_ is true then intent should be exactly for the same key as
217
  // resolved_intent_key_prefix_.
218
  //
219
  // Note: For performance reasons resolved_intent_sub_doc_key_encoded_ is not updated by this
220
  // function and should be updated after we have found latest suitable intent for the key by
221
  // calling UpdateResolvedIntentSubDocKeyEncoded.
222
  void ProcessIntent();
223
224
  void UpdateResolvedIntentSubDocKeyEncoded();
225
226
  // Seeks to the appropriate intent-prefix and returns the associated
227
  // DocHybridTime.
228
  Result<DocHybridTime> FindMatchingIntentRecordDocHybridTime(const Slice& key_without_ht);
229
230
  // Returns the DocHybridTime associated with the current regular record
231
  // pointed to, if it matches the key that is passed as the argument.
232
  // If the current record does not match the passed key, invalid hybrid time
233
  // is returned.
234
  Result<DocHybridTime> GetMatchingRegularRecordDocHybridTime(
235
      const Slice& key_without_ht);
236
237
  // Whether current entry is regular key-value pair.
238
  bool IsEntryRegular(bool descending = false);
239
240
  // Set the exclusive upperbound of the intent iterator to the current SubDocKey of the regular
241
  // iterator. This is necessary to avoid RocksDB iterator from scanning over the deleted intents
242
  // beyond the current regular key unnecessarily.
243
  CHECKED_STATUS SetIntentUpperbound();
244
245
  // Resets the exclusive upperbound of the intent iterator to the beginning of the transaction
246
  // metadata and reverse index region.
247
  void ResetIntentUpperbound();
248
249
  void SeekIntentIterIfNeeded();
250
251
  // Does initial steps for prev doc key/sub doc key seek.
252
  // Returns true if prepare succeed.
253
  bool PreparePrev(const Slice& key);
254
255
  bool SatisfyBounds(const Slice& slice);
256
257
3.45M
  bool ResolvedIntentFromSameTransaction() const {
258
3.45M
    return intent_dht_from_same_txn_ != DocHybridTime::kMin;
259
3.45M
  }
260
261
1.73M
  DocHybridTime GetIntentDocHybridTime() const {
262
1.63M
    return ResolvedIntentFromSameTransaction() ? intent_dht_from_same_txn_
263
93.9k
                                               : resolved_intent_txn_dht_;
264
1.73M
  }
265
266
  // Returns true if iterator currently points to some record.
267
  bool HasCurrentEntry();
268
269
  // Update seek_intent_iter_needed_, seek_key_prefix_ and seek_key_buffer_ to seek forward
270
  // for key + suffix.
271
  // If use_suffix_for_prefix then suffix is used in seek_key_prefix_, otherwise it will match key.
272
  void UpdatePlannedIntentSeekForward(
273
      const Slice& key, const Slice& suffix, bool use_suffix_for_prefix = true);
274
275
  bool NextRegular(Direction direction);
276
277
  const ReadHybridTime read_time_;
278
  const std::string encoded_read_time_read_;
279
  const std::string encoded_read_time_local_limit_;
280
  const std::string encoded_read_time_global_limit_;
281
282
  // The encoded hybrid time to use to filter records in regular RocksDB. This is the maximum of
283
  // read_time and local_limit (in terms of hybrid time comparison), and this slice points to
284
  // one of the strings above.
285
  Slice encoded_read_time_regular_limit_;
286
287
  const TransactionOperationContext txn_op_context_;
288
  docdb::BoundedRocksDbIterator intent_iter_;
289
  docdb::BoundedRocksDbIterator iter_;
290
  // iter_valid_ is true if and only if iter_ is positioned at key which matches top prefix from
291
  // the stack and record time satisfies read_time_ criteria.
292
  bool iter_valid_ = false;
293
  Status status_;
294
  HybridTime max_seen_ht_ = HybridTime::kMin;
295
296
  // Upperbound for seek. If we see regular or intent record past this bound, it will be ignored.
297
  Slice upperbound_;
298
299
  // Exclusive upperbound of the intent key.
300
  KeyBytes intent_upperbound_keybytes_;
301
  Slice intent_upperbound_;
302
303
  // Following fields contain information related to resolved suitable intent.
304
  ResolvedIntentState resolved_intent_state_ = ResolvedIntentState::kNoIntent;
305
  // SubDocKey (no HT).
306
  KeyBytes resolved_intent_key_prefix_;
307
  // DocHybridTime of resolved_intent_sub_doc_key_encoded_ is set to commit time or intent time in
308
  // case of intent is written by current transaction (stored in txn_op_context_).
309
  DocHybridTime resolved_intent_txn_dht_;
310
  DocHybridTime intent_dht_from_same_txn_ = DocHybridTime::kMin;
311
  KeyBytes resolved_intent_sub_doc_key_encoded_;
312
  KeyBytes resolved_intent_value_;
313
314
  std::vector<Slice> prefix_stack_;
315
  TransactionStatusCache transaction_status_cache_;
316
317
  bool skip_future_records_needed_ = false;
318
  bool skip_future_intents_needed_ = false;
319
  SeekIntentIterNeeded seek_intent_iter_needed_ = SeekIntentIterNeeded::kNoNeed;
320
321
  // Reusable buffer to prepare seek key to avoid reallocating temporary buffers in critical paths.
322
  KeyBytes seek_key_buffer_;
323
  Slice seek_key_prefix_;
324
};
325
326
class IntentAwareIteratorPrefixScope {
327
 public:
328
  IntentAwareIteratorPrefixScope(const Slice& prefix, IntentAwareIterator* iterator)
329
539M
      : iterator_(iterator) {
330
539M
    iterator->PushPrefix(prefix);
331
539M
  }
332
333
540M
  ~IntentAwareIteratorPrefixScope() {
334
540M
    iterator_->PopPrefix();
335
540M
  }
336
337
 private:
338
  IntentAwareIterator* iterator_;
339
};
340
341
} // namespace docdb
342
} // namespace yb
343
344
#endif // YB_DOCDB_INTENT_AWARE_ITERATOR_H_