YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/consensus/consensus.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
#ifndef YB_CONSENSUS_CONSENSUS_H_
33
#define YB_CONSENSUS_CONSENSUS_H_
34
35
#include <iosfwd>
36
#include <memory>
37
#include <string>
38
#include <vector>
39
40
#include <boost/optional/optional_fwd.hpp>
41
42
#include "yb/common/entity_ids_types.h"
43
44
#include "yb/consensus/consensus_fwd.h"
45
#include "yb/consensus/consensus_types.pb.h"
46
#include "yb/consensus/metadata.pb.h"
47
48
#include "yb/gutil/ref_counted.h"
49
#include "yb/gutil/stringprintf.h"
50
#include "yb/gutil/strings/substitute.h"
51
52
#include "yb/tserver/tserver_types.pb.h"
53
54
#include "yb/util/status_fwd.h"
55
#include "yb/util/enums.h"
56
#include "yb/util/monotime.h"
57
#include "yb/util/opid.h"
58
#include "yb/util/opid.pb.h"
59
#include "yb/util/physical_time.h"
60
#include "yb/util/status_callback.h"
61
#include "yb/util/strongly_typed_bool.h"
62
63
namespace yb {
64
65
namespace server {
66
class Clock;
67
}
68
69
namespace tablet {
70
class TabletPeer;
71
}
72
73
namespace tserver {
74
class TabletServerErrorPB;
75
}
76
77
namespace consensus {
78
79
// After completing bootstrap, some of the results need to be plumbed through
80
// into the consensus implementation.
81
struct ConsensusBootstrapInfo {
82
  // Only declared here; the definition lives outside this header.
  ConsensusBootstrapInfo();
83
84
  // The id of the last operation in the log.
85
  OpIdPB last_id;
86
87
  // The id of the last committed operation in the log.
88
  OpIdPB last_committed_id;
89
90
  // REPLICATE messages which were in the log with no accompanying
91
  // COMMIT. These need to be passed along to consensus init in order
92
  // to potentially commit them.
93
  //
94
  // These are owned by the ConsensusBootstrapInfo instance.
95
  ReplicateMsgs orphaned_replicates;
96
97
 private:
98
  // As the macro name says, copying and assignment are disabled, so instances are handed
  // around by reference (Consensus::Start takes a const reference).
  DISALLOW_COPY_AND_ASSIGN(ConsensusBootstrapInfo);
99
};
100
101
struct LeaderState;
102
103
// How a candidate asks its peers to treat a currently live leader during an election.
// Mode is orthogonal to pre-elections, so any combination could be used.
// LeaderElectionData::mode defaults to NORMAL_ELECTION.
104
YB_DEFINE_ENUM(ElectionMode,
105
    // A normal leader election. Peers will not vote for this node
106
    // if they believe that a leader is alive.
107
    (NORMAL_ELECTION)
108
    // In this mode, peers will vote for this candidate even if they
109
    // think a leader is alive. This can be used for a faster hand-off
110
    // between a leader and one of its replicas.
111
    (ELECT_EVEN_IF_LEADER_IS_ALIVE));
112
113
// Arguments for StartElection.
114
struct LeaderElectionData {
115
  // mode - how peers should treat a live leader when voting (see ElectionMode).
  //    Defaults to a normal election.
  ElectionMode mode = ElectionMode::NORMAL_ELECTION;
116
117
  // pending_commit - we should start election only after we have specified entry committed.
  //    The member is const, so it can only be set when the struct is initialized, and the
  //    struct is consequently not copy-assignable.
118
  const bool pending_commit = false;
119
120
  // must_be_committed_opid - only matters if pending_commit is true.
121
  //    If this is specified, we would wait until this entry is committed. If not specified
122
  //    (i.e. if this has the default OpId value) it is taken from the last call to StartElection
123
  //    with pending_commit = true.
124
  OpId must_be_committed_opid;
125
126
  // originator_uuid - if election is initiated by an old leader as part of a stepdown procedure,
127
  //    this would contain the uuid of the old leader. Empty by default.
128
  std::string originator_uuid = std::string();
129
130
  // suppress_vote_request - off (kFalse) by default.
  //    NOTE(review): the TEST_ prefix suggests a test-only knob that keeps vote requests from
  //    being sent; the exact effect is not visible in this header - confirm.
  TEST_SuppressVoteRequest suppress_vote_request = TEST_SuppressVoteRequest::kFalse;
131
132
  // initial_election - false by default.
  //    NOTE(review): presumably marks the first election of a newly started peer; confirm
  //    against the StartElection implementation.
  bool initial_election = false;
133
134
  // Returns a string representation of this struct; defined outside this header.
  std::string ToString() const;
135
};
136
137
// The external interface for a consensus peer.
138
//
139
// Note: Even though Consensus points to Log, it needs to be destroyed
140
// after it. See Log class header comment for the reason why. On the other
141
// hand Consensus must be quiesced before closing the log, otherwise it
142
// will try to write to a destroyed/closed log.
143
//
144
// The order of these operations on shutdown must therefore be:
145
// 1 - quiesce Consensus
146
// 2 - close/destroy Log
147
// 3 - destroy Consensus
148
class Consensus {
149
 public:
150
  class ConsensusFaultHooks;
151
152
88.7k
  Consensus() {}
153
47.8k
  virtual ~Consensus() {}
154
155
  // Starts running the consensus algorithm.
156
  virtual CHECKED_STATUS Start(const ConsensusBootstrapInfo& info) = 0;
157
158
  // Returns true if consensus is running.
159
  virtual bool IsRunning() const = 0;
160
161
  // Emulates a leader election by simply making this peer leader.
162
  virtual CHECKED_STATUS EmulateElection() = 0;
163
164
  // Starts a leader election. `data` carries the election mode and the other options that are
  // documented on LeaderElectionData.
  virtual CHECKED_STATUS StartElection(const LeaderElectionData& data) = 0;
165
166
  // We tried to step down so that our protege would become leader,
167
  // but it failed to win the election, so we should reset our withhold time and try to re-elect
  // ourselves.
168
  // election_lost_by_uuid - uuid of the protege that lost the election.
169
  virtual CHECKED_STATUS ElectionLostByProtege(const std::string& election_lost_by_uuid) = 0;
170
171
  // Implement a LeaderStepDown() request.
  //
  // Unlike most of this interface, this method is not pure virtual: a default implementation
  // is defined outside this header and implementations may override it.
  // `req` is the incoming request; `resp` is filled in for the caller.
172
  virtual CHECKED_STATUS StepDown(const LeaderStepDownRequestPB* req,
173
                                  LeaderStepDownResponsePB* resp);
174
175
  // Wait until the node has LEADER role.
176
  // Returns Status::TimedOut if the role is not LEADER within 'timeout'.
177
  virtual CHECKED_STATUS WaitUntilLeaderForTests(const MonoDelta& timeout) = 0;
178
179
  // Called by a Leader to replicate an entry to the state machine.
180
  //
181
  // From the leader instance perspective execution proceeds as follows:
182
  //
183
  //           Leader                               RaftConfig
184
  //             +                                     +
185
  //     1) Req->| Replicate()                         |
186
  //             |                                     |
187
  //     2)      +-------------replicate-------------->|
188
  //             |<---------------ACK------------------+
189
  //             |                                     |
190
  //     3)      +--+                                  |
191
  //           <----+ round.NotifyReplicationFinished()|
192
  //             |                                     |
193
  //     3a)     |  +------ update commitIndex ------->|
194
  //             |                                     |
195
  //
196
  // 1) Caller calls Replicate(), method returns immediately to the caller and
197
  //    runs asynchronously.
198
  //
199
  // 2) Leader replicates the entry to the peers using the consensus
200
  //    algorithm, proceeds as soon as a majority of voters acknowledges the
201
  //    entry.
202
  //
203
  // 3) Leader defers to the caller by calling ConsensusRound::NotifyReplicationFinished,
204
  //    which calls the ConsensusReplicatedCallback.
205
  //
206
  // 3a) The leader asynchronously notifies other peers of the new
207
  //     commit index, which tells them to apply the operation.
208
  //
209
  // This method can only be called on the leader, i.e. role() == LEADER
  //
  // NOTE(review): the description above talks about Replicate(), while the method declared
  // here is TEST_Replicate and takes a single round. ReplicateBatch() below is described as the
  // batch version that is preferred for performance. The TEST_ prefix suggests this single-round
  // entry point is meant for tests only - confirm.
210
211
  virtual CHECKED_STATUS TEST_Replicate(const ConsensusRoundPtr& round) = 0;
212
213
  // A batch version of Replicate, which is what we try to use as much as possible for performance.
214
  virtual CHECKED_STATUS ReplicateBatch(const ConsensusRounds& rounds) = 0;
215
216
  // Messages sent from LEADER to FOLLOWERS and LEARNERS to update their
217
  // state machines. This is equivalent to "AppendEntries()" in Raft
218
  // terminology.
219
  //
220
  // ConsensusRequestPB contains a sequence of 0 or more operations to apply
221
  // on the replica. If there are 0 operations the request is considered
222
  // 'status-only' i.e. the leader is communicating with the follower only
223
  // in order to pass back and forth information on watermarks (eg committed
224
  // operation ID, replicated op id, etc).
225
  //
226
  // If the sequence contains 1 or more operations they will be replicated
227
  // in the same order as the leader, and submitted for asynchronous Prepare
228
  // in the same order.
229
  //
230
  // The leader also provides information on the index of the latest
231
  // operation considered committed by consensus. The replica uses this
232
  // information to update the state of any pending (previously replicated/prepared)
233
  // transactions.
234
  //
235
  // Returns Status::OK if the response has been filled (regardless of accepting
236
  // or rejecting the specific request). Returns non-OK Status if a specific
237
  // error response could not be formed, which will result in the service
238
  // returning an UNKNOWN_ERROR RPC error code to the caller and including the
239
  // stringified Status message.
240
  virtual CHECKED_STATUS Update(
241
      ConsensusRequestPB* request,
242
      ConsensusResponsePB* response,
243
      CoarseTimePoint deadline) = 0;
244
245
  // Messages sent from CANDIDATEs to voting peers to request their vote
246
  // in leader election.
247
  virtual CHECKED_STATUS RequestVote(const VoteRequestPB* request,
248
                                     VoteResponsePB* response) = 0;
249
250
  // Implement a ChangeConfig() request.
  //
  // Not pure virtual: a default implementation is defined outside this header.
  // `req` describes the requested change, `client_cb` is a status callback supplied by the
  // caller, and `error` is an optional tablet server error code output.
  // NOTE(review): when `client_cb` is invoked and when `error` is populated cannot be seen
  // from this header - confirm against the implementation.
251
  virtual CHECKED_STATUS ChangeConfig(const ChangeConfigRequestPB& req,
252
                                      const StdStatusCallback& client_cb,
253
                                      boost::optional<tserver::TabletServerErrorPB::Code>* error);
254
255
  // Implement an UnsafeChangeConfig() request. `error_code` is an optional tablet server error
  // code output, mirroring the `error` parameter of ChangeConfig().
  // NOTE(review): what makes this variant "unsafe" is not described in this header - see the
  // implementation.
  // NOTE(review): this is the only declaration in the class that returns plain Status; all
  // sibling methods use CHECKED_STATUS - confirm whether that is intentional.
  virtual Status UnsafeChangeConfig(
256
      const UnsafeChangeConfigRequestPB& req,
257
      boost::optional<tserver::TabletServerErrorPB::Code>* error_code) = 0;
258
259
  // Returns the current Raft role of this instance.
260
  virtual PeerRole role() const = 0;
261
262
  // Returns the leader status (see LeaderStatus type description for details).
263
  // If leader is ready, then also returns term, otherwise OpId::kUnknownTerm is returned.
264
  //
265
  // allow_stale could be used to avoid refreshing cache, when we are OK to read slightly outdated
266
  // value.
267
  virtual LeaderState GetLeaderState(bool allow_stale = false) const = 0;
268
269
  // Non-virtual helpers, defined outside this header.
  // NOTE(review): presumably thin wrappers over GetLeaderState() that return only the status
  // and only the term respectively - confirm in the implementation file.
  LeaderStatus GetLeaderStatus(bool allow_stale = false) const;
270
  int64_t LeaderTerm() const;
271
272
  // Returns the uuid of this peer.
273
  virtual std::string peer_uuid() const = 0;
274
275
  // Returns the id of the tablet whose updates this consensus instance helps coordinate.
276
  virtual std::string tablet_id() const = 0;
277
278
  virtual const TabletId& split_parent_tablet_id() const = 0;
279
280
  // Returns a copy of the committed state of the Consensus system. Also allows returning the
281
  // leader lease status captured under the same lock.
282
  virtual ConsensusStatePB ConsensusState(
283
      ConsensusConfigType type,
284
      LeaderLeaseStatus* leader_lease_status = nullptr) const = 0;
285
286
  // Returns a copy of the committed state of the Consensus system, assuming caller holds the needed
287
  // locks.
288
  virtual ConsensusStatePB ConsensusStateUnlocked(
289
      ConsensusConfigType type,
290
      LeaderLeaseStatus* leader_lease_status = nullptr) const = 0;
291
292
  // Returns a copy of the current committed Raft configuration.
293
  virtual RaftConfigPB CommittedConfig() const = 0;
294
295
  virtual void DumpStatusHtml(std::ostream& out) const = 0;
296
297
  // Installs the fault hooks used by tests (stored in fault_hooks_, which is always null in
  // production code).
  void SetFaultHooks(const std::shared_ptr<ConsensusFaultHooks>& hooks);
298
299
  // Returns the currently installed fault hooks by const reference.
  const std::shared_ptr<ConsensusFaultHooks>& GetFaultHooks() const;
300
301
  // Stops running the consensus algorithm.
302
  virtual void Shutdown() = 0;
303
304
  // Returns the last OpId (either received or committed, depending on the 'type' argument) that the
305
  // Consensus implementation knows about.  Primarily used for testing purposes.
306
  Result<yb::OpId> GetLastOpId(OpIdType type);
307
308
  virtual yb::OpId GetLastReceivedOpId() = 0;
309
310
  virtual yb::OpId GetLastCommittedOpId() = 0;
311
312
  virtual yb::OpId GetLastCDCedOpId() = 0;
313
314
  virtual yb::OpId GetLastAppliedOpId() = 0;
315
316
  // Assuming we are the leader, wait until we have a valid leader lease (i.e. the old leader's
317
  // lease has expired, and we have replicated a new lease that has not expired yet).
318
  virtual CHECKED_STATUS WaitForLeaderLeaseImprecise(CoarseTimePoint deadline) = 0;
319
320
  // Check that this Consensus is a leader and has lease, returns Status::OK in this case.
321
  // Otherwise error status is returned.
322
  virtual CHECKED_STATUS CheckIsActiveLeaderAndHasLease() const = 0;
323
324
  // Returns majority replicated ht lease, so we know that after leader change
325
  // operations would not be added with hybrid time below this lease.
326
  //
327
  // `min_allowed` - result should be greater or equal to `min_allowed`, otherwise
328
  // it tries to wait until ht lease reaches this value or `deadline` happens.
329
  //
330
  // Returns 0 if timeout happened.
331
  virtual Result<MicrosTime> MajorityReplicatedHtLeaseExpiration(
332
      MicrosTime min_allowed, CoarseTimePoint deadline) const = 0;
333
334
  // Read majority replicated messages for CDC producer.
335
  virtual Result<ReadOpsResult> ReadReplicatedMessagesForCDC(const yb::OpId& from,
336
                                                             int64_t* repl_index,
337
                                                             const CoarseTimePoint deadline) = 0;
338
339
  virtual void UpdateCDCConsumerOpId(const yb::OpId& op_id) = 0;
340
341
 protected:
342
  friend class RefCountedThreadSafe<Consensus>;
343
  friend class tablet::TabletPeer;
344
345
  // Fault hooks for tests. In production code this will always be null.
346
  std::shared_ptr<ConsensusFaultHooks> fault_hooks_;
347
348
  // Points at which a test fault hook can run; passed to ExecuteHook().
  // Each PRE_/POST_ pair has a matching Pre*/Post* method on ConsensusFaultHooks, except
  // PRE_COMMIT and POST_COMMIT, for which that class declares no method.
  // NOTE(review): confirm how ExecuteHook() treats the two COMMIT points.
  enum HookPoint {
349
    PRE_START,
350
    POST_START,
351
    PRE_CONFIG_CHANGE,
352
    POST_CONFIG_CHANGE,
353
    PRE_REPLICATE,
354
    POST_REPLICATE,
355
    PRE_COMMIT,
356
    POST_COMMIT,
357
    PRE_UPDATE,
358
    POST_UPDATE,
359
    PRE_SHUTDOWN,
360
    POST_SHUTDOWN
361
  };
362
363
  // Runs the fault hook associated with `point`; defined outside this header.
  // NOTE(review): presumably a no-op returning OK when fault_hooks_ is null, and otherwise a
  // dispatch to the matching ConsensusFaultHooks method - confirm in the implementation.
  CHECKED_STATUS ExecuteHook(HookPoint point);
364
365
  // Lifecycle states available to implementations.
  // NOTE(review): nothing else in this header refers to this enum, so which subclasses use it
  // and the allowed transitions cannot be determined from here.
  enum State {
366
    kNotInitialized,
367
    kInitializing,
368
    kConfiguring,
369
    kRunning,
370
  };
371
372
 private:
373
  DISALLOW_COPY_AND_ASSIGN(Consensus);
374
};
375
376
// Enumerates the reasons that can be attached to a state change.
// NOTE(review): nothing else in this header uses this enum, so where the values are produced
// and consumed has to be confirmed against the callers.
YB_DEFINE_ENUM(StateChangeReason,
377
    (INVALID_REASON)
378
    (TABLET_PEER_STARTED)
379
    (CONSENSUS_STARTED)
380
    (NEW_LEADER_ELECTED)
381
    (FOLLOWER_NO_OP_COMPLETE)
382
    (LEADER_CONFIG_CHANGE_COMPLETE)
383
    (FOLLOWER_CONFIG_CHANGE_COMPLETE));
384
385
// Fault-injection hooks for tests, installed through Consensus::SetFaultHooks().
// There is one virtual method per Consensus::HookPoint value, except PRE_COMMIT and POST_COMMIT,
// which have no method here. None of the methods is pure virtual, so default implementations
// are defined outside this header and a test overrides only the hooks it needs.
// NOTE(review): the defaults presumably return OK - confirm in the implementation file.
class Consensus::ConsensusFaultHooks {
386
 public:
387
  virtual CHECKED_STATUS PreStart();
388
  virtual CHECKED_STATUS PostStart();
389
  virtual CHECKED_STATUS PreConfigChange();
390
  virtual CHECKED_STATUS PostConfigChange();
391
  virtual CHECKED_STATUS PreReplicate();
392
  virtual CHECKED_STATUS PostReplicate();
393
  virtual CHECKED_STATUS PreUpdate();
394
  virtual CHECKED_STATUS PostUpdate();
395
  virtual CHECKED_STATUS PreShutdown();
396
  virtual CHECKED_STATUS PostShutdown();
397
2
  // Virtual so that hooks can be destroyed through a ConsensusFaultHooks pointer.
  virtual ~ConsensusFaultHooks() {}
398
};
399
400
// Interface with a single operation: wait for a safe op id to apply.
class SafeOpIdWaiter {
401
 public:
402
  // Takes an op id and returns an op id.
  // NOTE(review): presumably blocks until it is safe to apply `op_id` and returns the op id
  // that is actually safe; the exact contract is defined by the implementers - confirm.
  virtual yb::OpId WaitForSafeOpIdToApply(const yb::OpId& op_id) = 0;
403
404
 protected:
405
47.8k
  // Protected and non-virtual: objects cannot be deleted through a SafeOpIdWaiter pointer by
  // outside code, so no virtual destructor is needed.
  ~SafeOpIdWaiter() {}
406
};
407
408
// Value returned by Consensus::GetLeaderState().
// `status` and `term` have no in-class initializers, so a default-initialized instance leaves
// them indeterminate until they are assigned.
struct LeaderState {
409
  // Leader status of the peer; LEADER_AND_READY is the only value for which ok() is true.
  LeaderStatus status;
410
  // Term, returned when the leader is ready; otherwise OpId::kUnknownTerm (per the
  // GetLeaderState() contract).
  int64_t term;
411
  // NOTE(review): presumably the time left on the previous leader's lease when this peer is a
  // leader that is not yet ready - confirm against the implementation.
  MonoDelta remaining_old_leader_lease;
412
413
  // Takes a status and returns a reference to a LeaderState (presumably *this, for chaining).
  // NOTE(review): presumably stores a non-ready `status` and resets `term`; defined outside
  // this header - confirm.
  LeaderState& MakeNotReadyLeader(LeaderStatus status);
414
415
21.9M
  // True only when this peer is a leader that is ready.
  bool ok() const {
416
21.9M
    return status == LeaderStatus::LEADER_AND_READY;
417
21.9M
  }
418
419
  // Builds a Status from this state; defined outside this header.
  // NOTE(review): presumably OK when ok() is true and an error describing `status`
  // otherwise - confirm.
  CHECKED_STATUS CreateStatus() const;
420
};
421
422
// Produces a Status from a LeaderState that is passed as an rvalue; defined outside this header.
// NOTE(review): presumably equivalent to state.CreateStatus() - confirm.
CHECKED_STATUS MoveStatus(LeaderState&& state);
423
424
} // namespace consensus
425
} // namespace yb
426
427
#endif // YB_CONSENSUS_CONSENSUS_H_