/Users/deen/code/yugabyte-db/src/yb/consensus/consensus.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | #ifndef YB_CONSENSUS_CONSENSUS_H_ |
33 | | #define YB_CONSENSUS_CONSENSUS_H_ |
34 | | |
35 | | #include <iosfwd> |
36 | | #include <memory> |
37 | | #include <string> |
38 | | #include <vector> |
39 | | |
40 | | #include <boost/optional/optional_fwd.hpp> |
41 | | |
42 | | #include "yb/common/entity_ids_types.h" |
43 | | |
44 | | #include "yb/consensus/consensus_fwd.h" |
45 | | #include "yb/consensus/consensus_types.pb.h" |
46 | | #include "yb/consensus/metadata.pb.h" |
47 | | |
48 | | #include "yb/gutil/ref_counted.h" |
49 | | #include "yb/gutil/stringprintf.h" |
50 | | #include "yb/gutil/strings/substitute.h" |
51 | | |
52 | | #include "yb/tserver/tserver_types.pb.h" |
53 | | |
54 | | #include "yb/util/status_fwd.h" |
55 | | #include "yb/util/enums.h" |
56 | | #include "yb/util/monotime.h" |
57 | | #include "yb/util/opid.h" |
58 | | #include "yb/util/opid.pb.h" |
59 | | #include "yb/util/physical_time.h" |
60 | | #include "yb/util/status_callback.h" |
61 | | #include "yb/util/strongly_typed_bool.h" |
62 | | |
63 | | namespace yb { |
64 | | |
65 | | namespace server { |
66 | | class Clock; |
67 | | } |
68 | | |
69 | | namespace tablet { |
70 | | class TabletPeer; |
71 | | } |
72 | | |
73 | | namespace tserver { |
74 | | class TabletServerErrorPB; |
75 | | } |
76 | | |
77 | | namespace consensus { |
78 | | |
79 | | // After completing bootstrap, some of the results need to be plumbed through |
80 | | // into the consensus implementation. |
81 | | struct ConsensusBootstrapInfo { |
82 | | ConsensusBootstrapInfo(); |
83 | | |
84 | | // The id of the last operation in the log. |
85 | | OpIdPB last_id; |
86 | | |
87 | | // The id of the last committed operation in the log. |
88 | | OpIdPB last_committed_id; |
89 | | |
90 | | // REPLICATE messages which were in the log with no accompanying |
91 | | // COMMIT. These need to be passed along to consensus init in order |
92 | | // to potentially commit them. |
93 | | // |
94 | | // These are owned by the ConsensusBootstrapInfo instance. |
95 | | ReplicateMsgs orphaned_replicates; |
96 | | |
97 | | private: |
98 | | DISALLOW_COPY_AND_ASSIGN(ConsensusBootstrapInfo); |
99 | | }; |
100 | | |
101 | | struct LeaderState; |
102 | | |
103 | | // Mode is orthogonal to pre-elections, so any combination could be used. |
104 | | YB_DEFINE_ENUM(ElectionMode, |
105 | | // A normal leader election. Peers will not vote for this node |
106 | | // if they believe that a leader is alive. |
107 | | (NORMAL_ELECTION) |
108 | | // In this mode, peers will vote for this candidate even if they |
109 | | // think a leader is alive. This can be used for a faster hand-off |
110 | | // between a leader and one of its replicas. |
111 | | (ELECT_EVEN_IF_LEADER_IS_ALIVE)); |
112 | | |
113 | | // Arguments for StartElection. |
114 | | struct LeaderElectionData { |
115 | | ElectionMode mode = ElectionMode::NORMAL_ELECTION; |
116 | | |
117 | | // pending_commit - we should start the election only after the specified entry is committed. |
118 | | const bool pending_commit = false; |
119 | | |
120 | | // must_be_committed_opid - only matters if pending_commit is true. |
121 | | // If this is specified, we would wait until this entry is committed. If not specified |
122 | | // (i.e. if this has the default OpId value) it is taken from the last call to StartElection |
123 | | // with pending_commit = true. |
124 | | OpId must_be_committed_opid; |
125 | | |
126 | | // originator_uuid - if election is initiated by an old leader as part of a stepdown procedure, |
127 | | // this would contain the uuid of the old leader. |
128 | | std::string originator_uuid = std::string(); |
129 | | |
130 | | TEST_SuppressVoteRequest suppress_vote_request = TEST_SuppressVoteRequest::kFalse; |
131 | | |
132 | | bool initial_election = false; |
133 | | |
134 | | std::string ToString() const; |
135 | | }; |
136 | | |
137 | | // The external interface for a consensus peer. |
138 | | // |
139 | | // Note: Even though Consensus points to Log, it needs to be destroyed |
140 | | // after it. See Log class header comment for the reason why. On the other |
141 | | // hand Consensus must be quiesced before closing the log, otherwise it |
142 | | // will try to write to a destroyed/closed log. |
143 | | // |
144 | | // The order of these operations on shutdown must therefore be: |
145 | | // 1 - quiesce Consensus |
146 | | // 2 - close/destroy Log |
147 | | // 3 - destroy Consensus |
148 | | class Consensus { |
149 | | public: |
150 | | class ConsensusFaultHooks; |
151 | | |
152 | 88.7k | Consensus() {} |
153 | 47.8k | virtual ~Consensus() {} |
154 | | |
155 | | // Starts running the consensus algorithm. |
156 | | virtual CHECKED_STATUS Start(const ConsensusBootstrapInfo& info) = 0; |
157 | | |
158 | | // Returns true if consensus is running. |
159 | | virtual bool IsRunning() const = 0; |
160 | | |
161 | | // Emulates a leader election by simply making this peer leader. |
162 | | virtual CHECKED_STATUS EmulateElection() = 0; |
163 | | |
164 | | virtual CHECKED_STATUS StartElection(const LeaderElectionData& data) = 0; |
165 | | |
166 | | // We tried to step down so that our protege could become leader. |
167 | | // But it failed to win the election, so we should reset our withhold time and try to re-elect ourselves. |
168 | | // election_lost_by_uuid - uuid of the protege that lost the election. |
169 | | virtual CHECKED_STATUS ElectionLostByProtege(const std::string& election_lost_by_uuid) = 0; |
170 | | |
171 | | // Implement a LeaderStepDown() request. |
172 | | virtual CHECKED_STATUS StepDown(const LeaderStepDownRequestPB* req, |
173 | | LeaderStepDownResponsePB* resp); |
174 | | |
175 | | // Wait until the node has LEADER role. |
176 | | // Returns Status::TimedOut if the role is not LEADER within 'timeout'. |
177 | | virtual CHECKED_STATUS WaitUntilLeaderForTests(const MonoDelta& timeout) = 0; |
178 | | |
179 | | // Called by a Leader to replicate an entry to the state machine. |
180 | | // |
181 | | // From the leader instance perspective execution proceeds as follows: |
182 | | // |
183 | | // Leader RaftConfig |
184 | | // + + |
185 | | // 1) Req->| Replicate() | |
186 | | // | | |
187 | | // 2) +-------------replicate-------------->| |
188 | | // |<---------------ACK------------------+ |
189 | | // | | |
190 | | // 3) +--+ | |
191 | | // <----+ round.NotifyReplicationFinished()| |
192 | | // | | |
193 | | // 3a) | +------ update commitIndex ------->| |
194 | | // | | |
195 | | // |
196 | | // 1) Caller calls Replicate(), method returns immediately to the caller and |
197 | | // runs asynchronously. |
198 | | // |
199 | | // 2) Leader replicates the entry to the peers using the consensus |
200 | | // algorithm, proceeds as soon as a majority of voters acknowledges the |
201 | | // entry. |
202 | | // |
203 | | // 3) Leader defers to the caller by calling ConsensusRound::NotifyReplicationFinished, |
204 | | // which calls the ConsensusReplicatedCallback. |
205 | | // |
206 | | // 3a) The leader asynchronously notifies other peers of the new |
207 | | // commit index, which tells them to apply the operation. |
208 | | // |
209 | | // This method can only be called on the leader, i.e. role() == LEADER. |
210 | | |
211 | | virtual CHECKED_STATUS TEST_Replicate(const ConsensusRoundPtr& round) = 0; |
212 | | |
213 | | // A batch version of Replicate, which is what we try to use as much as possible for performance. |
214 | | virtual CHECKED_STATUS ReplicateBatch(const ConsensusRounds& rounds) = 0; |
215 | | |
216 | | // Messages sent from LEADER to FOLLOWERS and LEARNERS to update their |
217 | | // state machines. This is equivalent to "AppendEntries()" in Raft |
218 | | // terminology. |
219 | | // |
220 | | // ConsensusRequestPB contains a sequence of 0 or more operations to apply |
221 | | // on the replica. If there are 0 operations the request is considered |
222 | | // 'status-only' i.e. the leader is communicating with the follower only |
223 | | // in order to pass back and forth information on watermarks (e.g. committed |
224 | | // operation ID, replicated op id, etc). |
225 | | // |
226 | | // If the sequence contains 1 or more operations they will be replicated |
227 | | // in the same order as the leader, and submitted for asynchronous Prepare |
228 | | // in the same order. |
229 | | // |
230 | | // The leader also provides information on the index of the latest |
231 | | // operation considered committed by consensus. The replica uses this |
232 | | // information to update the state of any pending (previously replicated/prepared) |
233 | | // transactions. |
234 | | // |
235 | | // Returns Status::OK if the response has been filled (regardless of accepting |
236 | | // or rejecting the specific request). Returns non-OK Status if a specific |
237 | | // error response could not be formed, which will result in the service |
238 | | // returning an UNKNOWN_ERROR RPC error code to the caller and including the |
239 | | // stringified Status message. |
240 | | virtual CHECKED_STATUS Update( |
241 | | ConsensusRequestPB* request, |
242 | | ConsensusResponsePB* response, |
243 | | CoarseTimePoint deadline) = 0; |
244 | | |
245 | | // Messages sent from CANDIDATEs to voting peers to request their vote |
246 | | // in leader election. |
247 | | virtual CHECKED_STATUS RequestVote(const VoteRequestPB* request, |
248 | | VoteResponsePB* response) = 0; |
249 | | |
250 | | // Implement a ChangeConfig() request. |
251 | | virtual CHECKED_STATUS ChangeConfig(const ChangeConfigRequestPB& req, |
252 | | const StdStatusCallback& client_cb, |
253 | | boost::optional<tserver::TabletServerErrorPB::Code>* error); |
254 | | |
255 | | virtual Status UnsafeChangeConfig( |
256 | | const UnsafeChangeConfigRequestPB& req, |
257 | | boost::optional<tserver::TabletServerErrorPB::Code>* error_code) = 0; |
258 | | |
259 | | // Returns the current Raft role of this instance. |
260 | | virtual PeerRole role() const = 0; |
261 | | |
262 | | // Returns the leader status (see LeaderStatus type description for details). |
263 | | // If leader is ready, then also returns term, otherwise OpId::kUnknownTerm is returned. |
264 | | // |
265 | | // allow_stale can be used to avoid refreshing the cache when we are OK with reading a slightly |
266 | | // outdated value. |
267 | | virtual LeaderState GetLeaderState(bool allow_stale = false) const = 0; |
268 | | |
269 | | LeaderStatus GetLeaderStatus(bool allow_stale = false) const; |
270 | | int64_t LeaderTerm() const; |
271 | | |
272 | | // Returns the uuid of this peer. |
273 | | virtual std::string peer_uuid() const = 0; |
274 | | |
275 | | // Returns the id of the tablet whose updates this consensus instance helps coordinate. |
276 | | virtual std::string tablet_id() const = 0; |
277 | | |
278 | | virtual const TabletId& split_parent_tablet_id() const = 0; |
279 | | |
280 | | // Returns a copy of the committed state of the Consensus system. Also allows returning the |
281 | | // leader lease status captured under the same lock. |
282 | | virtual ConsensusStatePB ConsensusState( |
283 | | ConsensusConfigType type, |
284 | | LeaderLeaseStatus* leader_lease_status = nullptr) const = 0; |
285 | | |
286 | | // Returns a copy of the committed state of the Consensus system, assuming caller holds the needed |
287 | | // locks. |
288 | | virtual ConsensusStatePB ConsensusStateUnlocked( |
289 | | ConsensusConfigType type, |
290 | | LeaderLeaseStatus* leader_lease_status = nullptr) const = 0; |
291 | | |
292 | | // Returns a copy of the current committed Raft configuration. |
293 | | virtual RaftConfigPB CommittedConfig() const = 0; |
294 | | |
295 | | virtual void DumpStatusHtml(std::ostream& out) const = 0; |
296 | | |
297 | | void SetFaultHooks(const std::shared_ptr<ConsensusFaultHooks>& hooks); |
298 | | |
299 | | const std::shared_ptr<ConsensusFaultHooks>& GetFaultHooks() const; |
300 | | |
301 | | // Stops running the consensus algorithm. |
302 | | virtual void Shutdown() = 0; |
303 | | |
304 | | // Returns the last OpId (either received or committed, depending on the 'type' argument) that the |
305 | | // Consensus implementation knows about. Primarily used for testing purposes. |
306 | | Result<yb::OpId> GetLastOpId(OpIdType type); |
307 | | |
308 | | virtual yb::OpId GetLastReceivedOpId() = 0; |
309 | | |
310 | | virtual yb::OpId GetLastCommittedOpId() = 0; |
311 | | |
312 | | virtual yb::OpId GetLastCDCedOpId() = 0; |
313 | | |
314 | | virtual yb::OpId GetLastAppliedOpId() = 0; |
315 | | |
316 | | // Assuming we are the leader, wait until we have a valid leader lease (i.e. the old leader's |
317 | | // lease has expired, and we have replicated a new lease that has not expired yet). |
318 | | virtual CHECKED_STATUS WaitForLeaderLeaseImprecise(CoarseTimePoint deadline) = 0; |
319 | | |
320 | | // Checks that this Consensus is a leader and has a lease; returns Status::OK in this case. |
321 | | // Otherwise an error status is returned. |
322 | | virtual CHECKED_STATUS CheckIsActiveLeaderAndHasLease() const = 0; |
323 | | |
324 | | // Returns the majority replicated ht lease, so we know that after a leader change |
325 | | // operations would not be added with a hybrid time below this lease. |
326 | | // |
327 | | // `min_allowed` - the result should be greater than or equal to `min_allowed`; otherwise |
328 | | // it tries to wait until the ht lease reaches this value or `deadline` is reached. |
329 | | // |
330 | | // Returns 0 if the wait timed out. |
331 | | virtual Result<MicrosTime> MajorityReplicatedHtLeaseExpiration( |
332 | | MicrosTime min_allowed, CoarseTimePoint deadline) const = 0; |
333 | | |
334 | | // Read majority replicated messages for CDC producer. |
335 | | virtual Result<ReadOpsResult> ReadReplicatedMessagesForCDC(const yb::OpId& from, |
336 | | int64_t* repl_index, |
337 | | const CoarseTimePoint deadline) = 0; |
338 | | |
339 | | virtual void UpdateCDCConsumerOpId(const yb::OpId& op_id) = 0; |
340 | | |
341 | | protected: |
342 | | friend class RefCountedThreadSafe<Consensus>; |
343 | | friend class tablet::TabletPeer; |
344 | | |
345 | | // Fault hooks for tests. In production code this will always be null. |
346 | | std::shared_ptr<ConsensusFaultHooks> fault_hooks_; |
347 | | |
348 | | enum HookPoint { |
349 | | PRE_START, |
350 | | POST_START, |
351 | | PRE_CONFIG_CHANGE, |
352 | | POST_CONFIG_CHANGE, |
353 | | PRE_REPLICATE, |
354 | | POST_REPLICATE, |
355 | | PRE_COMMIT, |
356 | | POST_COMMIT, |
357 | | PRE_UPDATE, |
358 | | POST_UPDATE, |
359 | | PRE_SHUTDOWN, |
360 | | POST_SHUTDOWN |
361 | | }; |
362 | | |
363 | | CHECKED_STATUS ExecuteHook(HookPoint point); |
364 | | |
365 | | enum State { |
366 | | kNotInitialized, |
367 | | kInitializing, |
368 | | kConfiguring, |
369 | | kRunning, |
370 | | }; |
371 | | |
372 | | private: |
373 | | DISALLOW_COPY_AND_ASSIGN(Consensus); |
374 | | }; |
375 | | |
376 | | YB_DEFINE_ENUM(StateChangeReason, |
377 | | (INVALID_REASON) |
378 | | (TABLET_PEER_STARTED) |
379 | | (CONSENSUS_STARTED) |
380 | | (NEW_LEADER_ELECTED) |
381 | | (FOLLOWER_NO_OP_COMPLETE) |
382 | | (LEADER_CONFIG_CHANGE_COMPLETE) |
383 | | (FOLLOWER_CONFIG_CHANGE_COMPLETE)); |
384 | | |
385 | | class Consensus::ConsensusFaultHooks { |
386 | | public: |
387 | | virtual CHECKED_STATUS PreStart(); |
388 | | virtual CHECKED_STATUS PostStart(); |
389 | | virtual CHECKED_STATUS PreConfigChange(); |
390 | | virtual CHECKED_STATUS PostConfigChange(); |
391 | | virtual CHECKED_STATUS PreReplicate(); |
392 | | virtual CHECKED_STATUS PostReplicate(); |
393 | | virtual CHECKED_STATUS PreUpdate(); |
394 | | virtual CHECKED_STATUS PostUpdate(); |
395 | | virtual CHECKED_STATUS PreShutdown(); |
396 | | virtual CHECKED_STATUS PostShutdown(); |
397 | 2 | virtual ~ConsensusFaultHooks() {} |
398 | | }; |
399 | | |
400 | | class SafeOpIdWaiter { |
401 | | public: |
402 | | virtual yb::OpId WaitForSafeOpIdToApply(const yb::OpId& op_id) = 0; |
403 | | |
404 | | protected: |
405 | 47.8k | ~SafeOpIdWaiter() {} |
406 | | }; |
407 | | |
408 | | struct LeaderState { |
409 | | LeaderStatus status; |
410 | | int64_t term; |
411 | | MonoDelta remaining_old_leader_lease; |
412 | | |
413 | | LeaderState& MakeNotReadyLeader(LeaderStatus status); |
414 | | |
415 | 21.9M | bool ok() const { |
416 | 21.9M | return status == LeaderStatus::LEADER_AND_READY; |
417 | 21.9M | } |
418 | | |
419 | | CHECKED_STATUS CreateStatus() const; |
420 | | }; |
421 | | |
422 | | CHECKED_STATUS MoveStatus(LeaderState&& state); |
423 | | |
424 | | } // namespace consensus |
425 | | } // namespace yb |
426 | | |
427 | | #endif // YB_CONSENSUS_CONSENSUS_H_ |