YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/tablet/transaction_loader.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/tablet/transaction_loader.h"
15
16
#include "yb/docdb/bounded_rocksdb_iterator.h"
17
#include "yb/docdb/doc_key.h"
18
#include "yb/docdb/docdb_rocksdb_util.h"
19
#include "yb/docdb/intent.h"
20
21
#include "yb/tablet/transaction_status_resolver.h"
22
23
#include "yb/util/bitmap.h"
24
#include "yb/util/flag_tags.h"
25
#include "yb/util/logging.h"
26
#include "yb/util/metrics.h"
27
#include "yb/util/operation_counter.h"
28
#include "yb/util/pb_util.h"
29
#include "yb/util/scope_exit.h"
30
#include "yb/util/thread.h"
31
32
using namespace std::literals;
33
34
DEFINE_test_flag(int32, inject_load_transaction_delay_ms, 0,
35
                 "Inject delay before loading each transaction at startup.");
36
37
DECLARE_bool(TEST_fail_on_replicated_batch_idx_set_in_txn_record);
38
39
METRIC_DEFINE_simple_counter(
40
    tablet, transaction_load_attempts,
41
    "Total number of attempts to load transaction metadata from the intents RocksDB",
42
    yb::MetricUnit::kTransactions);
43
44
namespace yb {
45
namespace tablet {
46
47
namespace {
48
49
51.6k
// Creates an iterator over `db` that is not restricted by key bounds (KeyBounds::kNoBounds),
// does not use bloom filters, has no user key for filtering and uses the default query id.
docdb::BoundedRocksDbIterator CreateFullScanIterator(rocksdb::DB* db) {
50
51.6k
  return docdb::BoundedRocksDbIterator(docdb::CreateRocksDBIterator(
51
51.6k
      db, &docdb::KeyBounds::kNoBounds,
52
51.6k
      docdb::BloomFilterMode::DONT_USE_BLOOM_FILTER,
53
51.6k
      /* user_key_for_filter= */ boost::none, rocksdb::kDefaultQueryId));
54
51.6k
}
55
56
} // namespace
57
58
class TransactionLoader::Executor {
59
 public:
60
  // Binds the executor to `loader`, constructs scoped_pending_operation_ from
  // `pending_op_counter`, and instantiates the transaction_load_attempts counter on the loader's
  // metric entity.
  explicit Executor(
61
      TransactionLoader* loader,
62
      RWOperationCounter* pending_op_counter)
63
25.8k
      : loader_(*loader), scoped_pending_operation_(pending_op_counter) {
64
25.8k
    metric_transaction_load_attempts_ = METRIC_transaction_load_attempts.Instantiate(
65
25.8k
        loader_.entity_);
66
25.8k
  }
67
68
25.8k
  // Creates full-scan iterators over the regular and intents DBs of `db` and launches
  // Execute() on the loader's load_thread_.
  // Returns false, without starting the thread, when scoped_pending_operation_ is not ok().
  bool Start(const docdb::DocDB& db) {
69
25.8k
    if (!scoped_pending_operation_.ok()) {
70
0
      return false;
71
0
    }
72
25.8k
    regular_iterator_ = CreateFullScanIterator(db.regular);
73
25.8k
    intents_iterator_ = CreateFullScanIterator(db.intents);
74
25.8k
    auto& load_thread = loader_.load_thread_;
75
25.8k
    load_thread = std::thread(&Executor::Execute, this);
76
25.8k
    return true;
77
25.8k
  }
78
79
 private:
80
25.8k
  // Body of the thread started by Start(): names the thread, loads pending apply states and
  // then loads transactions. On exit it resets the loader's executor_ and reports the collected
  // pending applies to the context via LoadFinished().
  void Execute() {
81
25.8k
    CDSAttacher attacher;
82
83
25.8k
    SetThreadName("TransactionLoader");
84
85
    // pending_applies_ and the context reference are copied into locals before
    // loader_.executor_.reset(), so LoadFinished() is called without touching members.
    // NOTE(review): executor_ appears to own this object (it is assigned in
    // TransactionLoader::Start), so members must not be used after the reset - confirm.
25.8k
    auto se = ScopeExit([this] {
86
25.8k
      auto pending_applies = std::move(pending_applies_);
87
25.8k
      TransactionLoaderContext& context = loader_.context_;
88
25.8k
      loader_.executor_.reset();
89
90
25.8k
      context.LoadFinished(pending_applies);
91
25.8k
    });
92
93
25.8k
    LOG_WITH_PREFIX(INFO) << "Load transactions start";
94
95
    // Pending applies are loaded first: LoadTransaction() looks them up in pending_applies_.
25.8k
    LoadPendingApplies();
96
25.8k
    LoadTransactions();
97
25.8k
  }
98
99
25.8k
  // Scans the intents DB for keys that start with the kTransactionId value type. For each
  // transaction id found, calls LoadTransaction() when the key consists of the id only, then
  // seeks past all records with that id. When the scan ends, marks loading as complete and
  // wakes up all threads waiting on load_cond_.
  void LoadTransactions() {
100
25.8k
    size_t loaded_transactions = 0;
101
    // Start the scan from the key prefix built for the nil transaction id.
25.8k
    TransactionId id = TransactionId::Nil();
102
25.8k
    AppendTransactionKeyPrefix(id, &current_key_);
103
25.8k
    intents_iterator_.Seek(current_key_.AsSlice());
104
25.8k
    while (intents_iterator_.Valid()) {
105
4
      auto key = intents_iterator_.key();
106
      // A key that does not start with kTransactionId ends the scan.
4
      if (!key.TryConsumeByte(docdb::ValueTypeAsChar::kTransactionId)) {
107
0
        break;
108
0
      }
109
4
      auto decode_id_result = DecodeTransactionId(&key);
110
4
      if (!decode_id_result.ok()) {
111
0
        LOG_WITH_PREFIX(DFATAL)
112
0
            << "Failed to decode transaction id from: " << key.ToDebugHexString();
113
0
        intents_iterator_.Next();
114
0
        continue;
115
0
      }
116
      // Rebuild current_key_ as the key prefix of the decoded transaction id.
4
      id = *decode_id_result;
117
4
      current_key_.Clear();
118
4
      AppendTransactionKeyPrefix(id, &current_key_);
119
4
      if (key.empty()) { // The key only contains a transaction id - it is a metadata record.
120
4
        if (FLAGS_TEST_inject_load_transaction_delay_ms > 0) {
121
0
          std::this_thread::sleep_for(FLAGS_TEST_inject_load_transaction_delay_ms * 1ms);
122
0
        }
123
4
        LoadTransaction(id);
124
4
        ++loaded_transactions;
125
4
      }
126
      // Seek to prefix + kMaxByte to move past the records of this transaction id. The seek is
      // needed even after LoadTransaction(), because FetchLastBatchData() repositions
      // intents_iterator_.
4
      current_key_.AppendValueType(docdb::ValueType::kMaxByte);
127
4
      intents_iterator_.Seek(current_key_.AsSlice());
128
4
    }
129
130
25.8k
    intents_iterator_.Reset();
131
132
    // all_loaded_ is set inside the callback passed to the context's CompleteLoad().
25.8k
    context().CompleteLoad([this] {
133
25.8k
      loader_.all_loaded_.store(true, std::memory_order_release);
134
25.8k
    });
135
25.8k
    {
136
      // We need to lock and unlock the mutex here to avoid missing a notification in WaitLoaded
137
      // and WaitAllLoaded. The waiting loop in those functions is equivalent to the following,
138
      // after locking the mutex (and of course wait(...) releases the mutex while waiting):
139
      //
140
      // 1 while (!all_loaded_.load(std::memory_order_acquire)) {
141
      // 2   load_cond_.wait(lock);
142
      // 3 }
143
      //
144
      // If we did not have the lock/unlock here, it would be possible that all_loaded_ would be set
145
      // to true and notify_all() would be called between lines 1 and 2, and we would miss the
146
      // notification and wait indefinitely at line 2. With lock/unlock this is no longer possible
147
      // because if we set all_loaded_ to true between lines 1 and 2, the only time we would be able
148
      // to send a notification is at line 2, when wait(...) releases the mutex; if the waiter has
149
      // not reached line 2 yet, it will check all_loaded_ again and exit the loop at line 1.
150
25.8k
      std::lock_guard<std::mutex> lock(loader_.mutex_);
151
25.8k
    }
152
25.8k
    loader_.load_cond_.notify_all();
153
25.8k
    LOG_WITH_PREFIX(INFO) << __func__ << " done: loaded " << loaded_transactions << " transactions";
154
25.8k
  }
155
156
25.8k
  // Scans the regular DB for keys that start with the kTransactionApplyState value type and
  // fills pending_applies_ with the apply state decoded from each kString value. Tombstone
  // values are only logged. Records that fail to decode are reported with DFATAL and skipped.
  void LoadPendingApplies() {
157
    // Layout: [kTransactionApplyState][transaction id bytes][kMaxByte]. The id bytes are filled
    // in per record below; the full buffer is used to seek past the current transaction id.
25.8k
    std::array<char, 1 + sizeof(TransactionId) + 1> seek_buffer;
158
25.8k
    seek_buffer[0] = docdb::ValueTypeAsChar::kTransactionApplyState;
159
25.8k
    seek_buffer[seek_buffer.size() - 1] = docdb::ValueTypeAsChar::kMaxByte;
160
    // The initial seek uses only the first byte, i.e. the value type.
25.8k
    regular_iterator_.Seek(Slice(seek_buffer.data(), 1));
161
162
25.8k
    while (regular_iterator_.Valid()) {
163
110
      auto key = regular_iterator_.key();
164
      // A key that does not start with kTransactionApplyState ends the scan.
110
      if (!key.TryConsumeByte(docdb::ValueTypeAsChar::kTransactionApplyState)) {
165
110
        break;
166
110
      }
167
      // The rest of the key must be a transaction id followed by kGroupEnd.
0
      auto txn_id = DecodeTransactionId(&key);
168
0
      if (!txn_id.ok() || !key.TryConsumeByte(docdb::ValueTypeAsChar::kGroupEnd)) {
169
0
        LOG_WITH_PREFIX(DFATAL) << "Wrong txn id: " << regular_iterator_.key().ToDebugString();
170
0
        regular_iterator_.Next();
171
0
        continue;
172
0
      }
173
0
      Slice value = regular_iterator_.value();
174
0
      if (value.TryConsumeByte(docdb::ValueTypeAsChar::kString)) {
175
0
        auto pb = pb_util::ParseFromSlice<docdb::ApplyTransactionStatePB>(value);
176
0
        if (!pb.ok()) {
177
          // NOTE(review): `key` has already had its prefix, transaction id and kGroupEnd
          // consumed above, so this likely logs an empty remainder rather than the record key;
          // the message also has no separator after "RocksDB" - confirm intent.
0
          LOG_WITH_PREFIX(DFATAL) << "Failed to decode apply state pb from RocksDB"
178
0
                                  << key.ToDebugString() << ": " << pb.status();
179
0
          regular_iterator_.Next();
180
0
          continue;
181
0
        }
182
183
0
        auto state = docdb::ApplyTransactionState::FromPB(*pb);
184
0
        if (!state.ok()) {
185
0
          LOG_WITH_PREFIX(DFATAL) << "Failed to decode apply state from stored pb "
186
0
              << state.status();
187
0
          regular_iterator_.Next();
188
0
          continue;
189
0
        }
190
191
0
        auto it = pending_applies_.emplace(*txn_id, ApplyStateWithCommitHt {
192
0
          .state = state.get(),
193
0
          .commit_ht = HybridTime(pb->commit_ht())
194
0
        }).first;
195
196
0
        VLOG_WITH_PREFIX(4) << "Loaded pending apply for " << *txn_id << ": "
197
0
                            << it->second.ToString();
198
0
      } else if (value.TryConsumeByte(docdb::ValueTypeAsChar::kTombstone)) {
199
0
        VLOG_WITH_PREFIX(4) << "Found deleted large apply for " << *txn_id;
200
0
      } else {
201
0
        LOG_WITH_PREFIX(DFATAL)
202
0
            << "Unexpected value type in apply state: " << value.ToDebugString();
203
0
      }
204
205
      // Copy the id into the buffer and seek to [value type][id][kMaxByte] to move past the
      // records of this transaction id.
0
      memcpy(seek_buffer.data() + 1, txn_id->data(), txn_id->size());
206
0
      ROCKSDB_SEEK(&regular_iterator_, Slice(seek_buffer));
207
0
    }
208
25.8k
  }
209
210
  // Loads one transaction from the record intents_iterator_ currently points at: parses its
  // value as TransactionMetadataPB, fetches the last batch data, registers the transaction with
  // the status resolver, hands everything to the context and publishes `id` as last_loaded_.
  // Returns early (after a DFATAL log) when the stored metadata cannot be parsed or converted.
  // id - transaction id to load.
211
4
  void LoadTransaction(const TransactionId& id) {
212
4
    metric_transaction_load_attempts_->Increment();
213
0
    VLOG_WITH_PREFIX(1) << "Loading transaction: " << id;
214
215
4
    TransactionMetadataPB metadata_pb;
216
217
4
    const Slice& value = intents_iterator_.value();
218
4
    if (!metadata_pb.ParseFromArray(value.cdata(), narrow_cast<int>(value.size()))) {
219
0
      LOG_WITH_PREFIX(DFATAL) << "Unable to parse stored metadata: "
220
0
                              << value.ToDebugHexString();
221
0
      return;
222
0
    }
223
224
4
    auto metadata = TransactionMetadata::FromPB(metadata_pb);
225
4
    if (!metadata.ok()) {
226
0
      LOG_WITH_PREFIX(DFATAL) << "Loaded bad metadata: " << metadata.status();
227
0
      return;
228
0
    }
229
230
    // Metadata without a valid start time is patched to HybridTime::kMin.
4
    if (!metadata->start_time.is_valid()) {
231
0
      metadata->start_time = HybridTime::kMin;
232
0
      LOG_WITH_PREFIX(INFO) << "Patched start time " << metadata->transaction_id << ": "
233
0
                            << metadata->start_time;
234
0
    }
235
236
    // FetchLastBatchData() repositions intents_iterator_, so `value` must not be used after
    // this point.
4
    TransactionalBatchData last_batch_data;
237
4
    OneWayBitmap replicated_batches;
238
4
    FetchLastBatchData(id, &last_batch_data, &replicated_batches);
239
240
    // The status resolver is requested from the context only once, on first use.
4
    if (!status_resolver_) {
241
4
      status_resolver_ = &context().AddStatusResolver();
242
4
    }
243
4
    status_resolver_->Add(metadata->status_tablet, id);
244
245
    // Pass the pending apply state for this transaction, or nullptr when there is none.
4
    auto pending_apply_it = pending_applies_.find(id);
246
4
    context().LoadTransaction(
247
4
        std::move(*metadata), std::move(last_batch_data), std::move(replicated_batches),
248
4
        pending_apply_it != pending_applies_.end() ? &pending_apply_it->second : nullptr);
249
    // Publish progress under the mutex, then wake up threads blocked in WaitLoaded().
4
    {
250
4
      std::lock_guard<std::mutex> lock(loader_.mutex_);
251
4
      loader_.last_loaded_ = id;
252
4
    }
253
4
    loader_.load_cond_.notify_all();
254
4
  }
255
256
  // Walks the intents DB records that start with current_key_ (the key prefix of `id`) from
  // the last one backwards and stops at the first record whose value decodes to an intent key
  // with strong intent types. From that record it fills:
  //   last_batch_data->hybrid_time   - hybrid time of the decoded intent key;
  //   *replicated_batches            - bitmap decoded from the value when it starts with kBitSet;
  //   last_batch_data->next_write_id - write id of the referenced intent plus one, when that
  //                                    intent is found.
  // Expects current_key_ to hold the key prefix of `id` on entry; it holds the same prefix on
  // return. Leaves intents_iterator_ at a different position than on entry.
  void FetchLastBatchData(
257
      const TransactionId& id,
258
      TransactionalBatchData* last_batch_data,
259
4
      OneWayBitmap* replicated_batches) {
260
    // Position the iterator on the last record with this prefix: seek to prefix + kMaxByte and
    // step back one record, or go to the very last record when the seek runs off the end.
4
    current_key_.AppendValueType(docdb::ValueType::kMaxByte);
261
4
    intents_iterator_.Seek(current_key_.AsSlice());
262
4
    if (intents_iterator_.Valid()) {
263
0
      intents_iterator_.Prev();
264
4
    } else {
265
4
      intents_iterator_.SeekToLast();
266
4
    }
267
    // Drop the kMaxByte appended above so current_key_ is the plain prefix again.
4
    current_key_.RemoveLastByte();
268
12
    while (intents_iterator_.Valid() && intents_iterator_.key().starts_with(current_key_)) {
269
      // The value of a record under the transaction prefix is decoded as an intent key.
12
      auto decoded_key = docdb::DecodeIntentKey(intents_iterator_.value());
270
0
      LOG_IF_WITH_PREFIX(DFATAL, !decoded_key.ok())
271
0
          << "Failed to decode intent while loading transaction " << id << ", "
272
0
          << intents_iterator_.key().ToDebugHexString() << " => "
273
0
          << intents_iterator_.value().ToDebugHexString() << ": " << decoded_key.status();
274
12
      if (decoded_key.ok() && docdb::HasStrong(decoded_key->intent_types)) {
275
4
        last_batch_data->hybrid_time = decoded_key->doc_ht.hybrid_time();
276
4
        Slice rev_key_slice(intents_iterator_.value());
277
        // Required by the transaction sealing protocol.
278
4
        if (!rev_key_slice.empty() && rev_key_slice[0] == docdb::ValueTypeAsChar::kBitSet) {
279
0
          CHECK(!FLAGS_TEST_fail_on_replicated_batch_idx_set_in_txn_record);
280
0
          rev_key_slice.remove_prefix(1);
281
0
          auto result = OneWayBitmap::Decode(&rev_key_slice);
282
0
          if (result.ok()) {
283
0
            *replicated_batches = std::move(*result);
284
0
            VLOG_WITH_PREFIX(1) << "Decoded replicated batches for " << id << ": "
285
0
                                << replicated_batches->ToString();
286
0
          } else {
287
0
            LOG_WITH_PREFIX(DFATAL)
288
0
                << "Failed to decode replicated batches from "
289
0
                << intents_iterator_.value().ToDebugHexString() << ": " << result.status();
290
0
          }
291
0
        }
292
        // Copy the remaining bytes before seeking: the slice refers to the iterator's current
        // value, and the seek below moves the iterator.
4
        std::string rev_key = rev_key_slice.ToBuffer();
293
4
        intents_iterator_.Seek(rev_key);
294
        // Delete could run in parallel to this load, and since our deletes break snapshot read
295
        // we could get into a situation when metadata and reverse record were successfully read,
296
        // but intent record could not be found.
297
4
        if (intents_iterator_.Valid() && intents_iterator_.key().starts_with(rev_key)) {
298
0
          VLOG_WITH_PREFIX(1)
299
0
              << "Found latest record for " << id
300
0
              << ": " << docdb::SubDocKey::DebugSliceToString(intents_iterator_.key())
301
0
              << " => " << intents_iterator_.value().ToDebugHexString();
302
4
          auto txn_id_slice = id.AsSlice();
303
4
          auto decoded_value_or_status = docdb::DecodeIntentValue(
304
4
              intents_iterator_.value(), &txn_id_slice);
305
0
          LOG_IF_WITH_PREFIX(DFATAL, !decoded_value_or_status.ok())
306
0
              << "Failed to decode intent value: " << decoded_value_or_status.status() << ", "
307
0
              << docdb::SubDocKey::DebugSliceToString(intents_iterator_.key()) << " => "
308
0
              << intents_iterator_.value().ToDebugHexString();
309
4
          if (decoded_value_or_status.ok()) {
310
4
            last_batch_data->next_write_id = decoded_value_or_status->write_id;
311
4
          }
312
          // Incremented even when decoding failed; in that case the previous value of
          // next_write_id is the base.
4
          ++last_batch_data->next_write_id;
313
4
        }
314
        // Only the first strong-intent record found while walking backwards is used.
4
        break;
315
4
      }
316
8
      intents_iterator_.Prev();
317
8
    }
318
4
  }
319
320
77.5k
  // Shorthand for the context held by the loader.
  TransactionLoaderContext& context() const {
321
77.5k
    return loader_.context_;
322
77.5k
  }
323
324
51.6k
  // Returns the context's log prefix.
  // NOTE(review): presumably this is what the *_WITH_PREFIX logging macros pick up - confirm.
  const std::string& LogPrefix() const {
325
51.6k
    return context().LogPrefix();
326
51.6k
  }
327
328
  // Loader this executor works for; its context, mutex and condition variable are used here.
  TransactionLoader& loader_;
329
  // Built from the pending operation counter given to the constructor; Start() does nothing
  // when it is not ok().
  ScopedRWOperation scoped_pending_operation_;
330
331
  // Full-scan iterators over the regular and intents DBs, created in Start().
  docdb::BoundedRocksDbIterator regular_iterator_;
332
  docdb::BoundedRocksDbIterator intents_iterator_;
333
334
  // Buffer that contains key of current record, i.e. value type + transaction id.
335
  docdb::KeyBytes current_key_;
336
337
  // Obtained from the context on the first LoadTransaction() call.
  TransactionStatusResolver* status_resolver_ = nullptr;
338
339
  // Apply states collected by LoadPendingApplies(), keyed by transaction id.
  ApplyStatesMap pending_applies_;
340
341
  // Incremented once per LoadTransaction() call.
  scoped_refptr<Counter> metric_transaction_load_attempts_;
342
};
343
344
// Stores a reference to `*context` and the metric entity; loading does not begin until Start().
TransactionLoader::TransactionLoader(
345
    TransactionLoaderContext* context, const scoped_refptr<MetricEntity>& entity)
346
25.8k
    : context_(*context), entity_(entity) {}
347
348
19.2k
// Intentionally empty.
// NOTE(review): load_thread_ is not joined here, only in Shutdown(); presumably callers must
// invoke Shutdown() before destruction - confirm.
TransactionLoader::~TransactionLoader() {
349
19.2k
}
350
351
25.8k
// Creates the Executor and asks it to start loading from `db`. The executor is dropped again
// when it reports that it could not start.
// NOTE(review): in that failure case nothing in this file sets all_loaded_, so WaitLoaded() and
// WaitAllLoaded() would keep waiting - confirm this is handled by the caller.
void TransactionLoader::Start(RWOperationCounter* pending_op_counter, const docdb::DocDB& db) {
352
25.8k
  executor_ = std::make_unique<Executor>(this, pending_op_counter);
353
25.8k
  if (!executor_->Start(db)) {
354
0
    executor_ = nullptr;
355
0
  }
356
25.8k
}
357
358
namespace {
359
360
// Timeout of a single wait on load_cond_ in WaitLoaded() and WaitAllLoaded().
// Waiting threads will only wake up on a timeout if there is still an uncaught race condition that
361
// causes us to miss a notification on the condition variable.
362
constexpr auto kWaitLoadedWakeUpInterval = 10s;
363
364
}  // namespace
365
366
5.93M
// Blocks until either all transactions are loaded or last_loaded_ >= id.
// Returns immediately, without taking the mutex, when all_loaded_ is already set.
// NOTE(review): the last_loaded_ >= id check presumably relies on transactions being loaded in
// increasing id order - confirm.
void TransactionLoader::WaitLoaded(const TransactionId& id) NO_THREAD_SAFETY_ANALYSIS {
367
5.93M
  if (all_loaded_.load(std::memory_order_acquire)) {
368
5.93M
    return;
369
5.93M
  }
370
18.4E
  std::unique_lock<std::mutex> lock(mutex_);
371
  // Defensively wake up at least once per kWaitLoadedWakeUpInterval to avoid deadlock due to any
  // issue similar to #8696.
372
18.4E
  while (!all_loaded_.load(std::memory_order_acquire)) {
373
0
    if (last_loaded_ >= id) {
374
0
      break;
375
0
    }
376
0
    load_cond_.wait_for(lock, kWaitLoadedWakeUpInterval);
377
0
  }
378
18.4E
}
379
380
// Blocks until all_loaded_ is set. Returns immediately, without taking the mutex, when it is
// already set.
// Disable thread safety analysis because std::unique_lock is used.
381
82.1k
void TransactionLoader::WaitAllLoaded() NO_THREAD_SAFETY_ANALYSIS {
382
82.1k
  if (all_loaded_.load(std::memory_order_acquire)) {
383
82.1k
    return;
384
82.1k
  }
385
  // Defensively wake up at least once per kWaitLoadedWakeUpInterval to avoid deadlock due to any
  // issue similar to #8696.
386
0
  std::unique_lock<std::mutex> lock(mutex_);
387
0
  while (!all_loaded_.load(std::memory_order_acquire)) {
388
0
    load_cond_.wait_for(lock, kWaitLoadedWakeUpInterval);
389
0
  }
390
0
}
391
392
19.3k
// Waits for the load thread to finish, if one was started and has not been joined yet.
void TransactionLoader::Shutdown() {
393
19.3k
  if (load_thread_.joinable()) {
394
19.3k
    load_thread_.join();
395
19.3k
  }
396
19.3k
}
397
398
} // namespace tablet
399
} // namespace yb