/Users/deen/code/yugabyte-db/src/yb/consensus/replica_state.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | #ifndef YB_CONSENSUS_REPLICA_STATE_H |
33 | | #define YB_CONSENSUS_REPLICA_STATE_H |
34 | | |
35 | | #include <atomic> |
36 | | #include <deque> |
37 | | #include <mutex> |
38 | | #include <string> |
39 | | #include <utility> |
40 | | #include <vector> |
41 | | |
42 | | #include <boost/atomic.hpp> |
43 | | |
44 | | #include "yb/common/hybrid_time.h" |
45 | | |
46 | | #include "yb/consensus/consensus_meta.h" |
47 | | #include "yb/consensus/consensus_queue.h" |
48 | | #include "yb/consensus/consensus_types.h" |
49 | | #include "yb/consensus/retryable_requests.h" |
50 | | #include "yb/consensus/leader_lease.h" |
51 | | |
52 | | #include "yb/gutil/port.h" |
53 | | #include "yb/util/locks.h" |
54 | | #include "yb/util/status_fwd.h" |
55 | | #include "yb/util/enums.h" |
56 | | |
57 | | namespace yb { |
58 | | |
59 | | class HostPort; |
60 | | class ReplicaState; |
61 | | class ThreadPool; |
62 | | |
63 | | namespace consensus { |
64 | | |
65 | | class RetryableRequests; |
66 | | |
67 | | YB_DEFINE_ENUM(SetMajorityReplicatedLeaseExpirationFlag, |
68 | | (kResetOldLeaderLease)(kResetOldLeaderHtLease)); |
69 | | |
70 | | YB_STRONGLY_TYPED_BOOL(CouldStop); |
71 | | |
72 | | // Whether we add pending operation while running as leader or follower. |
73 | | YB_DEFINE_ENUM(OperationMode, (kLeader)(kFollower)); |
74 | | |
75 | | // Class that coordinates access to the replica state (independently of Role). |
76 | | // This has a 1-1 relationship with RaftConsensus and is essentially responsible for |
77 | | // keeping state and checking if state changes are viable. |
78 | | // |
79 | | // Note that, in the case of a LEADER role, there are two configuration states that |
80 | | // are tracked: a pending and a committed configuration. The "active" state is |
81 | | // considered to be the pending configuration if it is non-null, otherwise the |
82 | | // committed configuration is the active configuration. |
83 | | // |
84 | | // When a replica becomes a leader of a configuration, it sets the pending configuration to |
85 | | // a new configuration declaring itself as leader and sets its "active" role to LEADER. |
86 | | // It then starts up ConsensusPeers for each member of the pending configuration and |
87 | | // tries to push a new configuration to the peers. Once that configuration is |
88 | | // pushed to a majority of the cluster, it is considered committed and the |
89 | | // replica flushes that configuration to disk as the committed configuration. |
90 | | // |
91 | | // Each time an operation is to be performed on the replica the appropriate LockFor*() |
92 | | // method should be called. The LockFor*() methods check that the replica is in the |
93 | | // appropriate state to perform the requested operation and return the lock, or return |
94 | | // Status::IllegalState if that is not the case. |
95 | | // |
96 | | // All state reading/writing methods acquire the lock, unless suffixed by "Unlocked", in |
97 | | // which case a lock should be obtained prior to calling them. |
98 | | class ReplicaState { |
99 | | public: |
100 | | enum State { |
101 | | // Initial state, right after the replica is built (state_ defaults to this value). |
102 | | kInitialized, |
103 | | |
104 | | // The replica accepts requests (from clients if it is the leader, |
105 | | // from the leader if it is a follower). |
106 | | kRunning, |
107 | | |
108 | | // The replica is shutting down and is no longer accepting |
109 | | // new transactions or commits. |
110 | | kShuttingDown, |
111 | | |
112 | | // The replica is shut down and does not accept |
113 | | // any more requests. |
114 | | kShutDown |
115 | | }; |
116 | | |
117 | | typedef std::unique_lock<std::mutex> UniqueLock; |
118 | | |
119 | | ReplicaState( |
120 | | ConsensusOptions options, std::string peer_uuid, std::unique_ptr<ConsensusMetadata> cmeta, |
121 | | ConsensusContext* consensus_context, SafeOpIdWaiter* safe_op_id_waiter, |
122 | | RetryableRequests* retryable_requests, |
123 | | std::function<void(const OpIds&)> applied_ops_tracker); |
124 | | |
125 | | ~ReplicaState(); |
126 | | |
127 | | CHECKED_STATUS StartUnlocked(const OpIdPB& last_in_wal); |
128 | | |
129 | | // Should be used only to assert that the update_lock_ is held. |
130 | | bool IsLocked() const WARN_UNUSED_RESULT; |
131 | | |
132 | | // Locks a replica in preparation for StartUnlocked(). Makes |
133 | | // sure the replica is in kInitialized state. |
134 | | CHECKED_STATUS LockForStart(UniqueLock* lock) const; |
135 | | |
136 | | // Locks a replica down until the critical section of an append completes, |
137 | | // i.e. until the replicate message has been assigned an id and placed in |
138 | | // the log queue. |
139 | | // This also checks that the replica is in the appropriate |
140 | | // state (role) to replicate the provided operation, that the operation |
141 | | // contains a replicate message and is of the appropriate type, and returns |
142 | | // Status::IllegalState if that is not the case. |
143 | | CHECKED_STATUS LockForReplicate(UniqueLock* lock, const ReplicateMsg& msg) const; |
144 | | CHECKED_STATUS LockForReplicate(UniqueLock* lock) const; |
145 | | |
146 | | Status CheckIsActiveLeaderAndHasLease() const; |
147 | | |
148 | | // Locks a replica down until the critical section of an update completes. |
149 | | // Further updates from the same or some other leader will be blocked until |
150 | | // this completes. This also checks that the replica is in the appropriate |
151 | | // state (role) to be updated and returns Status::IllegalState if that |
152 | | // is not the case. |
153 | | CHECKED_STATUS LockForUpdate(UniqueLock* lock) const; |
154 | | |
155 | | // Changes the role to non-participant and returns a lock that can be |
156 | | // used to make sure no state updates come in until Shutdown() is |
157 | | // completed. |
158 | | CHECKED_STATUS LockForShutdown(UniqueLock* lock); |
159 | | |
160 | | CHECKED_STATUS LockForConfigChange(UniqueLock* lock) const; |
161 | | |
162 | | // Obtains the lock for a state read, does not check state. |
163 | | UniqueLock LockForRead() const; |
164 | | |
165 | | // Obtains the lock so that we can advance the majority replicated |
166 | | // index and possibly the committed index. |
167 | | // Requires that this peer is leader. |
168 | | CHECKED_STATUS LockForMajorityReplicatedIndexUpdate( |
169 | | UniqueLock* lock) const; |
170 | | |
171 | | // Ensure the local peer is the active leader. |
172 | | // Returns OK if leader, IllegalState otherwise. |
173 | | CHECKED_STATUS CheckActiveLeaderUnlocked(LeaderLeaseCheckMode lease_check_mode) const; |
174 | | |
175 | | LeaderState GetLeaderState(bool allow_stale = false) const; |
176 | | |
177 | | // 'now' is used as a cache for current time. It is an in/out parameter and could contain or receive |
178 | | // current time if it was used during leader state calculation. |
179 | | LeaderState GetLeaderStateUnlocked( |
180 | | LeaderLeaseCheckMode lease_check_mode = LeaderLeaseCheckMode::NEED_LEASE, |
181 | | CoarseTimePoint* now = nullptr) const; |
182 | | |
183 | | // Completes the Shutdown() of this replica. No more operations, local |
184 | | // or otherwise can happen after this point. |
185 | | // Called after the quiescing phase (started with LockForShutdown()) |
186 | | // finishes. |
187 | | CHECKED_STATUS ShutdownUnlocked(); |
188 | | |
189 | | // Return current consensus state summary. |
190 | | ConsensusStatePB ConsensusStateUnlocked(ConsensusConfigType type) const; |
191 | | |
192 | | // Returns the currently active Raft role. |
193 | | PeerRole GetActiveRoleUnlocked() const; |
194 | | |
195 | | // Returns true if there is a configuration change currently in-flight but not yet |
196 | | // committed. |
197 | | bool IsConfigChangePendingUnlocked() const; |
198 | | |
199 | | // Inverse of IsConfigChangePendingUnlocked(): returns OK if there is |
200 | | // currently *no* configuration change pending, and IllegalState if there *is* a |
201 | | // configuration change pending. |
202 | | CHECKED_STATUS CheckNoConfigChangePendingUnlocked() const; |
203 | | |
204 | | // Returns true if an operation is in this replica's log, namely: |
205 | | // - If the op's index is lower than or equal to our committed index |
206 | | // - If the op id matches an inflight op. |
207 | | // If an operation with the same index is in our log but the terms |
208 | | // are different, 'term_mismatch' is set to true; it is false otherwise. |
209 | | bool IsOpCommittedOrPending(const yb::OpId& op_id, bool* term_mismatch); |
210 | | |
211 | | // Sets the given configuration as pending commit. Does not persist into the peers |
212 | | // metadata. In order to be persisted, SetCommittedConfigUnlocked() must be called. |
213 | | CHECKED_STATUS SetPendingConfigUnlocked(const RaftConfigPB& new_config); |
214 | | |
215 | | // Clears the pending config. |
216 | | CHECKED_STATUS ClearPendingConfigUnlocked(); |
217 | | |
218 | | // Return the pending configuration, or crash if one is not set. |
219 | | const RaftConfigPB& GetPendingConfigUnlocked() const; |
220 | | |
221 | | // Changes the committed config for this replica. Checks that there is a |
222 | | // pending configuration and that it is equal to this one. Persists changes to disk. |
223 | | // Resets the pending configuration to null. |
224 | | CHECKED_STATUS SetCommittedConfigUnlocked(const RaftConfigPB& new_config); |
225 | | |
226 | | // Return the persisted configuration. |
227 | | const RaftConfigPB& GetCommittedConfigUnlocked() const; |
228 | | |
229 | | // Return the "active" configuration - if there is a pending configuration return it; |
230 | | // otherwise return the committed configuration. |
231 | | const RaftConfigPB& GetActiveConfigUnlocked() const; |
232 | | |
233 | | // Checks if the term change is legal. If so, sets 'current_term' |
234 | | // to 'new_term' and sets 'has voted' to no for the current term. |
235 | | CHECKED_STATUS SetCurrentTermUnlocked(int64_t new_term); |
236 | | |
237 | | // Returns the term set in the last config change round. |
238 | | const int64_t GetCurrentTermUnlocked() const; |
239 | | |
240 | | // Accessors for the UUID of the current term's leader; an empty UUID means no known leader. |
241 | | void SetLeaderUuidUnlocked(const std::string& uuid); |
242 | | const std::string& GetLeaderUuidUnlocked() const; |
243 | 52.0M | bool HasLeaderUnlocked() const { return !GetLeaderUuidUnlocked().empty(); } |
244 | 493k | void ClearLeaderUnlocked() { SetLeaderUuidUnlocked(""); } |
245 | | |
246 | | // Return whether this peer has voted in the current term. |
247 | | const bool HasVotedCurrentTermUnlocked() const; |
248 | | |
249 | | // Record replica's vote for the current term, then flush the consensus |
250 | | // metadata to disk. |
251 | | CHECKED_STATUS SetVotedForCurrentTermUnlocked(const std::string& uuid); |
252 | | |
253 | | // Return replica's vote for the current term. |
254 | | // The vote must be set; use HasVotedCurrentTermUnlocked() to check. |
255 | | const std::string& GetVotedForCurrentTermUnlocked() const; |
256 | | |
257 | 61.1M | ConsensusContext* context() const { /* Accessor for the stored context_ pointer. */ |
258 | 61.1M | return context_; |
259 | 61.1M | } |
260 | | |
261 | | // Returns the uuid of the peer to which this replica state belongs. |
262 | | // Safe to call with or without locks held. |
263 | | const std::string& GetPeerUuid() const; |
264 | | |
265 | | const ConsensusOptions& GetOptions() const; |
266 | | |
267 | | // Aborts pending operations after, but not including 'index'. The OpId with 'index' |
268 | | // will become our new last received id. If there are pending operations with indexes |
269 | | // higher than 'index' those operations are aborted. |
270 | | CHECKED_STATUS AbortOpsAfterUnlocked(int64_t index); |
271 | | |
272 | | // Returns the ConsensusRound with the provided index, if there is any, or NULL |
273 | | // if there isn't. |
274 | | scoped_refptr<ConsensusRound> GetPendingOpByIndexOrNullUnlocked(int64_t index); |
275 | | |
276 | | // Add 'round' to the set of rounds waiting to be committed. |
277 | | CHECKED_STATUS AddPendingOperation(const ConsensusRoundPtr& round, OperationMode mode); |
278 | | |
279 | | // Marks ReplicaOperations up to 'id' as majority replicated, meaning the |
280 | | // transaction may Apply() (immediately if Prepare() has completed or when Prepare() |
281 | | // completes, if not). |
282 | | // Sets last_applied_op_id to the ID of last operation applied. |
283 | | // |
284 | | // If this advanced the committed index, sets *committed_op_id_changed to true. |
285 | | CHECKED_STATUS UpdateMajorityReplicatedUnlocked( |
286 | | const OpId& majority_replicated, OpId* committed_op_id, bool* committed_op_id_changed, |
287 | | OpId* last_applied_op_id); |
288 | | |
289 | | // Advances the committed index. |
290 | | // This is a no-op if the committed index has not changed. |
291 | | // Returns whether the operation actually advanced the index. |
292 | | Result<bool> AdvanceCommittedOpIdUnlocked(const yb::OpId& committed_op_id, CouldStop could_stop); |
293 | | |
294 | | // Initializes the committed index. |
295 | | // Function checks that we are in initial state, then updates committed index. |
296 | | CHECKED_STATUS InitCommittedOpIdUnlocked(const yb::OpId& committed_op_id); |
297 | | |
298 | | // Returns the watermark below which all operations are known to |
299 | | // be committed according to consensus. |
300 | | // |
301 | | // This must be called under a lock. |
302 | | const OpId& GetCommittedOpIdUnlocked() const; |
303 | | |
304 | | // Returns the watermark below which all operations are known to be applied according to |
305 | | // consensus. This simply forwards to GetCommittedOpIdUnlocked(). |
306 | 60.3M | const OpId& GetLastAppliedOpIdUnlocked() const { |
307 | | // See the comment on last_committed_op_id_ for why the committed op ID is returned here. |
308 | 60.3M | return GetCommittedOpIdUnlocked(); |
309 | 60.3M | } |
310 | | |
311 | | // Returns true if an op from the current term has been committed. |
312 | | bool AreCommittedAndCurrentTermsSameUnlocked() const; |
313 | | |
314 | | // Updates the last received operation. |
315 | | // This must be called under a lock. |
316 | | void UpdateLastReceivedOpIdUnlocked(const OpIdPB& op_id); |
317 | | |
318 | | // Returns the last received op id. This must be called under the lock. |
319 | | const OpId& GetLastReceivedOpIdUnlocked() const; |
320 | | |
321 | | // Returns the id of the last op received from the current leader. |
322 | | const OpId& GetLastReceivedOpIdCurLeaderUnlocked() const; |
323 | | |
324 | | // Returns the id of the latest pending transaction (i.e. the one with the |
325 | | // latest index). This must be called under the lock. |
326 | | OpId GetLastPendingOperationOpIdUnlocked() const; |
327 | | |
328 | | // Used by replicas to cancel pending transactions. Pending transactions are those |
329 | | // that have completed prepare/replicate but are waiting on the LEADER's commit |
330 | | // to complete. This does not cancel transactions being applied. |
331 | | CHECKED_STATUS CancelPendingOperations(); |
332 | | |
333 | | // API to dump pending transactions. Added to debug ENG-520. |
334 | | void DumpPendingOperationsUnlocked(); |
335 | | |
336 | | OpId NewIdUnlocked(); |
337 | | |
338 | | // Used when, for some reason, an operation failed before it could be considered |
339 | | // a part of the state machine. Basically restores the id gen to the state it was before |
340 | | // generating 'id'. So that we reuse these ids later, when we can actually append to the |
341 | | // state machine. This makes the state machine have continuous ids for the same term, even if |
342 | | // the queue refused to add any more operations. |
343 | | // should_exist indicates whether we expect that this operation is already added. |
344 | | // Used for debugging purposes only. |
345 | | void CancelPendingOperation(const OpId& id, bool should_exist); |
346 | | |
347 | | // Accessors for the pending election op id (pending_election_opid_); Clear resets it to OpId(). These must be called under a lock. |
348 | 25.4M | const OpId& GetPendingElectionOpIdUnlocked() const { return pending_election_opid_; } |
349 | 0 | void SetPendingElectionOpIdUnlocked(const OpId& opid) { pending_election_opid_ = opid; } |
350 | 170k | void ClearPendingElectionOpIdUnlocked() { pending_election_opid_ = OpId(); } |
351 | | |
352 | | std::string ToString() const; |
353 | | std::string ToStringUnlocked() const; |
354 | | |
355 | | // A common prefix that should be in any log messages emitted, |
356 | | // identifying the tablet and peer. |
357 | | std::string LogPrefix() const; |
358 | | |
359 | | // Checks that 'current' correctly follows 'previous'. Specifically it checks |
360 | | // that the term is the same or higher and that the index is sequential. |
361 | | static CHECKED_STATUS CheckOpInSequence(const yb::OpId& previous, const yb::OpId& current); |
362 | | |
363 | | // Return the current state of this object. |
364 | | // The update_lock_ must be held. |
365 | | ReplicaState::State state() const; |
366 | | |
367 | | // Update the point in time we have to wait until before starting to act as a leader in case |
368 | | // we win an election. |
369 | | void UpdateOldLeaderLeaseExpirationOnNonLeaderUnlocked( |
370 | | const CoarseTimeLease& lease, const PhysicalComponentLease& ht_lease); |
371 | | |
372 | | void SetMajorityReplicatedLeaseExpirationUnlocked( |
373 | | const MajorityReplicatedData& majority_replicated_data, |
374 | | EnumBitSet<SetMajorityReplicatedLeaseExpirationFlag> flags); |
375 | | |
376 | | // Checks two conditions: |
377 | | // - That the old leader definitely does not have a lease. |
378 | | // - That this leader has a committed lease. |
379 | | LeaderLeaseStatus GetLeaderLeaseStatusUnlocked( |
380 | | MonoDelta* remaining_old_leader_lease = nullptr, CoarseTimePoint* now = nullptr) const; |
381 | | |
382 | | LeaderLeaseStatus GetHybridTimeLeaseStatusAtUnlocked(MicrosTime micros_time) const; |
383 | | |
384 | | // Get the remaining duration of the old leader's lease. Optionally, return the current time in |
385 | | // the "now" output parameter. In case the old leader's lease has already expired or is not known, |
386 | | // returns an uninitialized MonoDelta value. |
387 | | MonoDelta RemainingOldLeaderLeaseDuration(CoarseTimePoint* now = nullptr) const; |
388 | | |
389 | 40.6M | const PhysicalComponentLease& old_leader_ht_lease() const { /* Read-only access to old_leader_ht_lease_, the hybrid time counterpart of old_leader_lease_. */ |
390 | 40.6M | return old_leader_ht_lease_; |
391 | 40.6M | } |
392 | | |
393 | 34.8M | const CoarseTimeLease& old_leader_lease() const { /* Read-only access to old_leader_lease_. */ |
394 | 34.8M | return old_leader_lease_; |
395 | 34.8M | } |
396 | | |
397 | | bool MajorityReplicatedLeaderLeaseExpired(CoarseTimePoint* now = nullptr) const; |
398 | | |
399 | | bool MajorityReplicatedHybridTimeLeaseExpiredAt(MicrosTime hybrid_time) const; |
400 | | |
401 | | // Get the current majority-replicated hybrid time leader lease expiration time as a microsecond |
402 | | // timestamp. |
403 | | // @param min_allowed - will wait until the majority-replicated hybrid time leader lease reaches |
404 | | // at least this microsecond timestamp. |
405 | | // @param deadline - won't wait past this deadline. |
406 | | // @return leader lease or 0 if timed out. |
407 | | Result<MicrosTime> MajorityReplicatedHtLeaseExpiration( |
408 | | MicrosTime min_allowed, CoarseTimePoint deadline) const; |
409 | | |
410 | | // The on-disk size of the consensus metadata. |
411 | | uint64_t OnDiskSize() const; |
412 | | |
413 | | OpId MinRetryableRequestOpId(); |
414 | | |
415 | | bool RegisterRetryableRequest(const ConsensusRoundPtr& round); |
416 | | |
417 | | RestartSafeCoarseMonoClock& Clock(); |
418 | | |
419 | | RetryableRequestsCounts TEST_CountRetryableRequests(); |
420 | | |
421 | | void SetLeaderNoOpCommittedUnlocked(bool value); |
422 | | |
423 | | void NotifyReplicationFinishedUnlocked( |
424 | | const ConsensusRoundPtr& round, const Status& status, int64_t leader_term, |
425 | | OpIds* applied_op_ids); |
426 | | |
427 | | private: |
428 | | typedef std::deque<ConsensusRoundPtr> PendingOperations; |
429 | | |
430 | | template <class Policy> |
431 | | LeaderLeaseStatus GetLeaseStatusUnlocked(Policy policy) const; |
432 | | |
433 | | // Applies pending operations up to and including committed_op_id. |
434 | | // Updates last_committed_op_id_ to committed_op_id. |
435 | | CHECKED_STATUS ApplyPendingOperationsUnlocked( |
436 | | const yb::OpId& committed_op_id, CouldStop could_stop); |
437 | | |
438 | | void SetLastCommittedIndexUnlocked(const yb::OpId& committed_op_id); |
439 | | |
440 | | // Applies committed config change. |
441 | | void ApplyConfigChangeUnlocked(const ConsensusRoundPtr& round); |
442 | | |
443 | | consensus::LeaderState RefreshLeaderStateCacheUnlocked( |
444 | | CoarseTimePoint* now) const ATTRIBUTE_NONNULL(2); |
445 | | |
446 | | PendingOperations::iterator FindPendingOperation(int64_t index); |
447 | | |
448 | | // Checks whether first pending operation matches last committed op index + 1. |
449 | | void CheckPendingOperationsHead() const; |
450 | | |
451 | | const ConsensusOptions options_; |
452 | | |
453 | | // The UUID of the local peer. |
454 | | const std::string peer_uuid_; |
455 | | |
456 | | mutable std::mutex update_lock_; |
457 | | mutable std::condition_variable cond_; |
458 | | |
459 | | // Consensus metadata persistence object. |
460 | | std::unique_ptr<ConsensusMetadata> cmeta_; |
461 | | |
462 | | // Used by the LEADER. This is the index of the next operation generated |
463 | | // by this LEADER. |
464 | | int64_t next_index_ = 0; |
465 | | |
466 | | // Queue of pending operations. Ordered by growing operation index. |
467 | | PendingOperations pending_operations_; |
468 | | |
469 | | // When we receive a message from a remote peer telling us to start an operation, we use |
470 | | // this factory to start it. |
471 | | ConsensusContext* context_; |
472 | | |
473 | | // Used to wait for safe op id during apply of committed entries. |
474 | | SafeOpIdWaiter* safe_op_id_waiter_; |
475 | | |
476 | | // The id of the last received operation, which corresponds to the last entry |
477 | | // written to the local log. Operations whose id is lower than or equal to |
478 | | // this id do not need to be resent by the leader. This is not guaranteed to |
479 | | // be monotonically increasing due to the possibility for log truncation and |
480 | | // aborted operations when a leader change occurs. |
481 | | OpId last_received_op_id_; |
482 | | |
483 | | // Same as last_received_op_id_ but only includes operations sent by the |
484 | | // current leader. The "term" in this op may not actually match the current |
485 | | // term, since leaders may replicate ops from prior terms. |
486 | | // |
487 | | // As an implementation detail, this field is reset to MinimumOpId() every |
488 | | // time there is a term advancement on the local node, to simplify the logic |
489 | | // involved in resetting this every time a new node becomes leader. |
490 | | yb::OpId last_received_op_id_current_leader_; |
491 | | |
492 | | // The ID of the operation that was last committed. Initialized to MinimumOpId(). |
493 | | // NOTE: due to implementation details at this and lower layers all operations up to |
494 | | // last_committed_op_id_ are guaranteed to be already applied. |
495 | | OpId last_committed_op_id_; |
496 | | |
497 | | // If set, a leader election is pending upon the specific op id commitment to this peer's log. |
498 | | OpId pending_election_opid_; |
499 | | |
500 | | State state_ = State::kInitialized; |
501 | | |
502 | | // When a follower becomes the leader, it uses this field to wait out the old leader's lease |
503 | | // before accepting writes or serving up-to-date reads. This is also used by candidates by |
504 | | // granting a vote. We compute the amount of time the new leader has to wait to make sure the old |
505 | | // leader's lease has expired. |
506 | | // |
507 | | // This is marked mutable because it can be reset to MonoTime::kMin on the read path after the |
508 | | // deadline has passed, so that we avoid querying the clock unnecessarily from that point on. |
509 | | mutable CoarseTimeLease old_leader_lease_; |
510 | | |
511 | | // The same as old_leader_lease_ but for hybrid time. |
512 | | mutable PhysicalComponentLease old_leader_ht_lease_; |
513 | | |
514 | | // LEADER only: the latest committed lease expiration deadline for the current leader. The leader |
515 | | // is allowed to serve up-to-date reads and accept writes only while the current time is less than |
516 | | // this. However, the leader might manage to replicate a lease extension without losing its |
517 | | // leadership. |
518 | | CoarseTimePoint majority_replicated_lease_expiration_; |
519 | | |
520 | | // LEADER only: the latest committed hybrid time lease expiration deadline for the current leader. |
521 | | // The leader is allowed to add new log entries only when lease of old leader is expired. |
522 | | std::atomic<MicrosTime> majority_replicated_ht_lease_expiration_{ |
523 | | PhysicalComponentLease::NoneValue()}; |
524 | | |
525 | | RetryableRequests retryable_requests_; |
526 | | |
527 | | // This leader is ready to serve only if NoOp was successfully committed |
528 | | // after the new leader's successful election. |
529 | | bool leader_no_op_committed_ = false; |
530 | | |
531 | | std::function<void(const OpIds&)> applied_ops_tracker_; |
532 | | |
533 | | struct LeaderStateCache { |
534 | | static constexpr size_t kStatusBits = 3; |
535 | | static_assert(kLeaderStatusMapSize <= (1 << kStatusBits), |
536 | | "Leader status does not fit into kStatusBits"); |
537 | | |
538 | | static constexpr uint64_t kStatusMask = (1 << kStatusBits) -1; |
539 | | |
540 | | // packed_status holds a LeaderStatus in its low kStatusBits bits and an extra value above them. |
541 | | // The meaning of the extra value depends on the actual status: |
542 | | // LEADER_AND_READY: leader term. |
543 | | // LEADER_BUT_OLD_LEADER_MAY_HAVE_LEASE: number of microseconds in remaining_old_leader_lease. |
544 | | uint64_t packed_status; |
545 | | CoarseTimePoint expire_at; |
546 | | |
547 | 35.9M | LeaderStateCache() noexcept {} /* NOTE(review): leaves packed_status without an initial value; presumably Set() always runs before status()/extra_value() are read - confirm. */ |
548 | | |
549 | 81.9M | LeaderStatus status() const { /* Extracts the low kStatusBits bits of packed_status. */ |
550 | 81.9M | return static_cast<LeaderStatus>(packed_status & kStatusMask); |
551 | 81.9M | } |
552 | | |
553 | 50.1M | uint64_t extra_value() const { /* Extracts the bits of packed_status above the status bits. */ |
554 | 50.1M | return packed_status >> kStatusBits; |
555 | 50.1M | } |
556 | | |
557 | 35.7M | void Set(LeaderStatus status, uint64_t extra_value, CoarseTimePoint expire_at_) { |
558 | 35.7M | DCHECK_EQ(extra_value << kStatusBits >> kStatusBits, extra_value); /* extra_value must survive the shift, i.e. its top kStatusBits bits must be zero. */ |
559 | 35.7M | packed_status = static_cast<uint64_t>(status) | (extra_value << kStatusBits); |
560 | 35.7M | expire_at = expire_at_; |
561 | 35.7M | } |
562 | | }; |
563 | | |
564 | | mutable boost::atomic<LeaderStateCache> leader_state_cache_; |
565 | | }; |
566 | | |
567 | | } // namespace consensus |
568 | | } // namespace yb |
569 | | |
570 | | #endif // YB_CONSENSUS_REPLICA_STATE_H |