/Users/deen/code/yugabyte-db/src/yb/consensus/leader_election.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #ifndef YB_CONSENSUS_LEADER_ELECTION_H |
34 | | #define YB_CONSENSUS_LEADER_ELECTION_H |
35 | | |
36 | | #include <functional> |
37 | | #include <map> |
38 | | #include <string> |
39 | | #include <unordered_map> |
40 | | #include <vector> |
41 | | |
42 | | #include <boost/optional/optional.hpp> |
43 | | |
44 | | #include "yb/common/hybrid_time.h" |
45 | | |
46 | | #include "yb/consensus/consensus_fwd.h" |
47 | | #include "yb/consensus/consensus.pb.h" |
48 | | #include "yb/consensus/leader_lease.h" |
49 | | |
50 | | #include "yb/gutil/macros.h" |
51 | | #include "yb/gutil/ref_counted.h" |
52 | | |
53 | | #include "yb/rpc/rpc_controller.h" |
54 | | |
55 | | #include "yb/util/locks.h" |
56 | | |
57 | | namespace yb { |
58 | | class Status; |
59 | | |
60 | | namespace metadata { |
61 | | class RaftPeerPB; |
62 | | } |
63 | | |
64 | | namespace consensus { |
65 | | class PeerProxyFactory; |
66 | | |
67 | | // The vote a peer has given. |
68 | | YB_DEFINE_ENUM(ElectionVote, (kDenied)(kGranted)(kUnknown)); |
69 | | |
70 | | // Simple class to count votes (in-memory, not persisted to disk). |
71 | | // This class is not thread safe and requires external synchronization. |
72 | | class VoteCounter { |
73 | | public: |
74 | | // Create new VoteCounter with the given majority size. |
75 | | VoteCounter(size_t num_voters, size_t majority_size); |
76 | | |
77 | | // Register a peer's vote. |
78 | | // |
79 | | // If the voter already has a vote recorded, but it has a different value than |
80 | | // the vote specified, returns Status::IllegalArgument. |
81 | | // |
82 | | // If the same vote is duplicated, 'is_duplicate' is set to true. |
83 | | // Otherwise, it is set to false. |
84 | | // If an OK status is not returned, the value in 'is_duplicate' is undefined. |
85 | | CHECKED_STATUS RegisterVote(const std::string& voter_uuid, ElectionVote vote, bool* is_duplicate); |
86 | | |
87 | | // If vote is not yet decided, returns ElectionVote::kUnknown. |
88 | | ElectionVote GetDecision() const; |
89 | | |
90 | | // Return the total of "Yes" and "No" votes. |
91 | | size_t GetTotalVotesCounted() const; |
92 | | |
93 | | // Return total number of expected votes. |
94 | 76.1k | size_t GetTotalExpectedVotes() const { return num_voters_; } |
95 | | |
96 | | // Return true iff GetTotalVotesCounted() == num_voters_; |
97 | | bool AreAllVotesIn() const; |
98 | | |
99 | | private: |
100 | | friend class VoteCounterTest; |
101 | | |
102 | | typedef std::map<std::string, ElectionVote> VoteMap; |
103 | | |
104 | | const size_t num_voters_; |
105 | | const size_t majority_size_; |
106 | | VoteMap votes_; // Voting record. |
107 | | size_t yes_votes_ = 0; // Accumulated yes votes, for quick counting. |
108 | | size_t no_votes_ = 0; // Accumulated no votes. |
109 | | |
110 | | DISALLOW_COPY_AND_ASSIGN(VoteCounter); |
111 | | }; |
112 | | |
113 | | // The result of a leader election. |
114 | | struct ElectionResult { |
115 | | explicit ElectionResult(PreElection preelection_, ConsensusTerm election_term_) |
116 | 76.1k | : preelection(preelection_), election_term(election_term_) {} |
117 | | |
118 | 476k | bool decided() const { |
119 | 476k | return decision != ElectionVote::kUnknown; |
120 | 476k | } |
121 | | |
122 | | // Whether this result is related to preelection. |
123 | | const PreElection preelection; |
124 | | |
125 | | // UUID of node that does not support preelections. |
126 | | std::string preelections_not_supported_by_uuid; |
127 | | |
128 | | // Term the election was run for. |
129 | | const ConsensusTerm election_term; |
130 | | |
131 | | // The overall election GRANTED/DENIED decision of the configuration. |
132 | | ElectionVote decision = ElectionVote::kUnknown; |
133 | | |
134 | | // At least one voter had a higher term than the candidate. |
135 | | boost::optional<ConsensusTerm> higher_term; |
136 | | |
137 | | // Human-readable explanation of the vote result, if any. |
138 | | std::string message; |
139 | | |
140 | | CoarseTimeLease old_leader_lease; |
141 | | |
142 | | PhysicalComponentLease old_leader_ht_lease; |
143 | | }; |
144 | | |
145 | | // Driver class to run a leader election. |
146 | | // |
147 | | // The caller must pass a callback to the driver, which will be called exactly |
148 | | // once when a Yes/No decision has been made, except in case of Shutdown() |
149 | | // on the Messenger or test ThreadPool, in which case no guarantee of a |
150 | | // callback is provided. In that case, we should not care about the election |
151 | | // result, because the server is ostensibly shutting down. |
152 | | // |
153 | | // For a "Yes" decision, a majority of voters must grant their vote. |
154 | | // |
155 | | // A "No" decision may be caused by either one of the following: |
156 | | // - One of the peers replies with a higher term before a decision is made. |
157 | | // - A majority of the peers votes "No". |
158 | | // |
159 | | // Any votes that come in after a decision has been made and the callback has |
160 | | // been invoked are logged but ignored. Note that this somewhat strays from the |
161 | | // letter of the Raft paper, in that replies that come after a "Yes" decision |
162 | | // do not immediately cause the candidate/leader to step down, but this keeps |
163 | | // our implementation and API simple, and the newly-minted leader will soon |
164 | | // discover that it must step down when it attempts to replicate its first |
165 | | // message to the peers. |
166 | | // |
167 | | // This class is thread-safe. |
168 | | class LeaderElection : public RefCountedThreadSafe<LeaderElection> { |
169 | | public: |
170 | | typedef std::function<void(const ElectionResult&)> ElectionDecisionCallback; |
171 | | typedef std::unordered_map<std::string, PeerProxy*> ProxyMap; |
172 | | |
173 | | // Set up a new leader election driver. |
174 | | // |
175 | | // The 'vote_counter' must be initialized with the candidate's own yes vote. |
176 | | LeaderElection(const RaftConfigPB& config, |
177 | | PeerProxyFactory* proxy_factory, |
178 | | const VoteRequestPB& request, |
179 | | std::unique_ptr<VoteCounter> vote_counter, |
180 | | MonoDelta timeout, |
181 | | PreElection preelection, |
182 | | TEST_SuppressVoteRequest suppress_vote_request, |
183 | | ElectionDecisionCallback decision_callback); |
184 | | |
185 | | // Run the election: send the vote request to followers. |
186 | | void Run(); |
187 | | |
188 | | private: |
189 | | friend class RefCountedThreadSafe<LeaderElection>; |
190 | | |
191 | | struct VoterState { |
192 | | PeerProxyPtr proxy; |
193 | | |
194 | | rpc::RpcController rpc; |
195 | | VoteRequestPB request; |
196 | | VoteResponsePB response; |
197 | | }; |
198 | | |
199 | | typedef std::unordered_map<std::string, std::unique_ptr<VoterState>> VoterStateMap; |
200 | | typedef simple_spinlock Lock; |
201 | | |
202 | | // This class is refcounted. |
203 | | ~LeaderElection(); |
204 | | |
205 | | // Check to see if a decision has been made. If so, invoke decision callback. |
206 | | // Calls the callback outside of holding a lock. |
207 | | void CheckForDecision(); |
208 | | |
209 | | // Callback called when the RPC responds. |
210 | | void VoteResponseRpcCallback(const std::string& voter_uuid, const LeaderElectionPtr& self); |
211 | | |
212 | | // Record vote from specified peer. |
213 | | void RecordVoteUnlocked(const std::string& voter_uuid, ElectionVote vote); |
214 | | |
215 | | // Handle a peer that reponded with a term greater than the election term. |
216 | | void HandleHigherTermUnlocked(const std::string& voter_uuid, const VoterState& state); |
217 | | |
218 | | // Log and record a granted vote. |
219 | | void HandleVoteGrantedUnlocked(const std::string& voter_uuid, const VoterState& state); |
220 | | |
221 | | // Log the reason for a denied vote and record it. |
222 | | void HandleVoteDeniedUnlocked(const std::string& voter_uuid, const VoterState& state); |
223 | | |
224 | | void TrySendRequestToVoters(std::chrono::steady_clock::time_point deadline, size_t* voters_left); |
225 | | |
226 | | // Returns a string to be prefixed to all log entries. |
227 | | // This method accesses const members and is thread safe. |
228 | | std::string LogPrefix() const; |
229 | | |
230 | | // Helper to reference the term we are running the election for. |
231 | 69.7k | ConsensusTerm election_term() const { return request_.candidate_term(); } |
232 | | |
233 | | // Helper to reference the term we are running the election for. |
234 | 1.10k | ConsensusTerm consensus_term() const { |
235 | | // If we execute preelection with term X, then our term is X - 1. |
236 | 567 | return request_.candidate_term() - (result_.preelection ? 1 : 0); |
237 | 1.10k | } |
238 | | |
239 | | // All non-const fields are protected by 'lock_'. |
240 | | Lock lock_; |
241 | | |
242 | | // Whether we have responded via the callback yet. |
243 | | bool has_responded_ = false; |
244 | | |
245 | | // Election request to send to voters. |
246 | | const VoteRequestPB request_; |
247 | | |
248 | | // The result returned by the ElectionDecisionCallback. |
249 | | ElectionResult result_; |
250 | | |
251 | | // Object to count the votes. |
252 | | const std::unique_ptr<VoteCounter> vote_counter_; |
253 | | |
254 | | // Timeout for sending RPCs. |
255 | | const MonoDelta timeout_; |
256 | | |
257 | | TEST_SuppressVoteRequest suppress_vote_request_; |
258 | | |
259 | | // Callback invoked to notify the caller of an election decision. |
260 | | const ElectionDecisionCallback decision_callback_; |
261 | | |
262 | | // List of all potential followers to request votes from. |
263 | | // The candidate's own UUID must not be included. |
264 | | std::vector<std::string> voting_follower_uuids_; |
265 | | |
266 | | // Map of UUID -> VoterState. |
267 | | VoterStateMap voter_state_; |
268 | | }; |
269 | | |
270 | | } // namespace consensus |
271 | | } // namespace yb |
272 | | |
273 | | #endif /* YB_CONSENSUS_LEADER_ELECTION_H */ |