/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_MASTER_CLUSTER_BALANCE_H |
15 | | #define YB_MASTER_CLUSTER_BALANCE_H |
16 | | |
17 | | #include <atomic> |
18 | | #include <map> |
19 | | #include <memory> |
20 | | #include <set> |
21 | | #include <string> |
22 | | #include <unordered_map> |
23 | | #include <unordered_set> |
24 | | #include <vector> |
25 | | |
26 | | #include <boost/circular_buffer.hpp> |
27 | | |
28 | | #include "yb/master/catalog_manager.h" |
29 | | #include "yb/master/master_fwd.h" |
30 | | #include "yb/master/cluster_balance_util.h" |
31 | | #include "yb/master/ts_descriptor.h" |
32 | | |
33 | | #include "yb/util/random.h" |
34 | | |
35 | | DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps); |
36 | | DECLARE_int32(load_balancer_max_over_replicated_tablets); |
37 | | DECLARE_int32(load_balancer_max_concurrent_adds); |
38 | | DECLARE_int32(load_balancer_max_concurrent_moves); |
39 | | DECLARE_int32(load_balancer_max_concurrent_removals); |
40 | | |
41 | | YB_STRONGLY_TYPED_BOOL(StepdownIfLeader); |
42 | | |
43 | | namespace yb { |
44 | | namespace master { |
45 | | |
46 | | // This class keeps state with regards to the full cluster load of tablets on tablet servers. We |
47 | | // count a tablet towards a tablet server's load if it is either RUNNING, or is in the process of |
48 | | // starting up, hence NOT_STARTED or BOOTSTRAPPING. |
49 | | // |
50 | | // This class also keeps state for the process of balancing load, which is done by temporarily |
51 | | // enlarging the replica set for a tablet by adding a new peer on a less loaded TS, and |
52 | | // subsequently removing a peer that is more loaded. |
53 | | // |
54 | | // The policy for balancing the replicas involves a two-step process: |
55 | | // 1) Add replicas to tablet peer groups, if required, potentially leading to temporary |
56 | | // over-replication. |
57 | | // 1.1) If any tablet has fewer replicas than the configured RF, or if there is any placement block |
58 | | // with fewer replicas than the specified minimum in that placement, we will try to add replicas, |
59 | | // so as to reach the client requirements. |
60 | | // 1.2) If any tablet has replicas placed on tablet servers that do not conform to the specified |
61 | | // placement, then we should remove these. However, we never want to under-replicate a whole peer |
62 | | // group, or any individual placement block, so we will first add a new replica that will allow |
63 | | // the invalid ones to be removed. |
64 | | // 1.3) If we have no placement related issues, then we just try to equalize load |
65 | | // distribution across the cluster, while still maintaining the placement requirements. |
66 | | // 2) Remove replicas from tablet peer groups if they are either over-replicated, or placed on |
67 | | // tablet servers they shouldn't be. |
68 | | // 2.1) If we have replicas living on tablet servers where they should not, due to placement |
69 | | // constraints, or tablet servers being blacklisted, we try to remove those replicas with high |
70 | | // priority, but naturally, only if removing said replica does not lead to under-replication. |
71 | | // 2.2) If we have no placement related issues, then we just try to shrink back any temporarily |
72 | | // over-replicated tablet peer groups, while still conforming to the placement requirements. |
73 | | // |
74 | | // This class also balances the leaders on tablet servers, starting from the server with the most |
75 | | // leaders and moving some leaders to the servers with fewer to achieve an even distribution. If |
76 | | // a threshold is set in the configuration, the balancer will just keep the numbers of leaders |
77 | | // on each server below it instead of maintaining an even distribution. |
78 | | class ClusterLoadBalancer { |
79 | | public: |
80 | | explicit ClusterLoadBalancer(CatalogManager* cm); |
81 | | virtual ~ClusterLoadBalancer(); |
82 | | |
83 | | // Executes one run of the load balancing algorithm. This currently does not persist any state, |
84 | | // so it needs to scan the in-memory tablet and TS data in the CatalogManager on every run and |
85 | | // create a new PerTableLoadState object. |
86 | | void RunLoadBalancerWithOptions(Options* options); |
87 | | |
88 | | // Runs the load balancer once for the live and all read only clusters, in order |
89 | | // of the cluster config. |
90 | | virtual void RunLoadBalancer(Options* options = nullptr); |
91 | | |
92 | | // Sets whether to enable or disable the load balancer, on demand. |
93 | 0 | void SetLoadBalancerEnabled(bool is_enabled) { is_enabled_ = is_enabled; } |
94 | | |
95 | | bool IsLoadBalancerEnabled() const; |
96 | | |
97 | | bool CanBalanceGlobalLoad() const; |
98 | | |
99 | | CHECKED_STATUS IsIdle() const; |
100 | | |
101 | | // Returns the TableInfo of all the tables for whom load balancing is being skipped. |
102 | | // As of today, this constitutes all the system tables, colocated user tables |
103 | | // and tables which have been marked as DELETING OR DELETED. |
104 | | vector<scoped_refptr<TableInfo>> GetAllTablesLoadBalancerSkipped(); |
105 | | |
106 | | // |
107 | | // Catalog manager indirection methods. |
108 | | // |
109 | | protected: |
110 | | // |
111 | | // Indirection methods to CatalogManager that we override in the subclasses (or testing). |
112 | | // |
113 | | |
114 | | void InitializeTSDescriptors(); |
115 | | |
116 | | // Get the list of all live TSDescriptors which reported their tablets. |
117 | | virtual void GetAllReportedDescriptors(TSDescriptorVector* ts_descs) const; |
118 | | |
119 | | // Get the list of all TSDescriptors. |
120 | | virtual void GetAllDescriptors(TSDescriptorVector* ts_descs) const; |
121 | | |
122 | | // Get access to the tablet map across the cluster. |
123 | | virtual const TabletInfoMap& GetTabletMap() const REQUIRES_SHARED(catalog_manager_->mutex_); |
124 | | |
125 | | // Get access to the table map. |
126 | | virtual const TableInfoMap& GetTableMap() const REQUIRES_SHARED(catalog_manager_->mutex_); |
127 | | |
128 | | // Get the table info object for given table uuid. |
129 | | virtual const scoped_refptr<TableInfo> GetTableInfo(const TableId& table_uuid) const |
130 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
131 | | |
132 | | // Get the replication info from the cluster configuration. |
133 | | virtual const ReplicationInfoPB& GetClusterReplicationInfo() const; |
134 | | |
135 | | // Get the placement information from the cluster configuration. |
136 | | // Gets appropriate live or read only cluster placement, |
137 | | // depending on placement_uuid_. |
138 | | virtual const PlacementInfoPB& GetClusterPlacementInfo() const; |
139 | | |
140 | | // Get the blacklist information. |
141 | | virtual const BlacklistPB& GetServerBlacklist() const; |
142 | | |
143 | | // Get the leader blacklist information. |
144 | | virtual const BlacklistPB& GetLeaderBlacklist() const; |
145 | | |
146 | | // Should skip load-balancing of this table? |
147 | | virtual bool SkipLoadBalancing(const TableInfo& table) const |
148 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
149 | | |
150 | | // Increment the provided variables by the number of pending tasks that were found. Do not call |
151 | | // more than once for the same table because it also modifies the internal state. |
152 | | virtual CHECKED_STATUS CountPendingTasksUnlocked(const TableId& table_uuid, |
153 | | int* pending_add_replica_tasks, |
154 | | int* pending_remove_replica_tasks, |
155 | | int* pending_stepdown_leader_tasks) |
156 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
157 | | |
158 | | // Wrapper around CatalogManager::GetPendingTasks so it can be mocked by TestLoadBalancer. |
159 | | virtual void GetPendingTasks(const string& table_uuid, |
160 | | TabletToTabletServerMap* add_replica_tasks, |
161 | | TabletToTabletServerMap* remove_replica_tasks, |
162 | | TabletToTabletServerMap* stepdown_leader_tasks) |
163 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
164 | | |
165 | | // Issue the call to CatalogManager to change the config for this particular tablet, either |
166 | | // adding or removing the peer at ts_uuid, based on the is_add argument. Removing the peer |
167 | | // is optional. When neither adding nor removing peer, it means just moving a leader from one |
168 | | // tablet server to another. If new_leader_ts_uuid is empty, a server will be picked by random |
169 | | // to be the new leader. Also takes in the role of the tablet for the creation flow. |
170 | | virtual Status SendReplicaChanges( |
171 | | scoped_refptr<TabletInfo> tablet, const TabletServerId& ts_uuid, const bool is_add, |
172 | | const bool should_remove_leader, const TabletServerId& new_leader_ts_uuid = ""); |
173 | | |
174 | | // If type_ is live, return PRE_VOTER, otherwise, return PRE_OBSERVER. |
175 | | consensus::PeerMemberType GetDefaultMemberType(); |
176 | | |
177 | | // |
178 | | // Higher level methods and members. |
179 | | // |
180 | | |
181 | | // Resets the global_state_ object, and the map of per-table states. |
182 | | virtual void ResetGlobalState(bool initialize_ts_descs = true); |
183 | | |
184 | | |
185 | | // Resets the pointer state_ to point to the correct table's state. |
186 | | virtual void ResetTableStatePtr(const TableId& table_id, Options* options = nullptr); |
187 | | |
188 | | // Goes over the tablet_map_ and the set of live TSDescriptors to compute the load distribution |
189 | | // across the tablets for the given table. Returns an OK status if the method succeeded or an |
190 | | // error if there are transient errors in updating the internal state. |
191 | | virtual CHECKED_STATUS AnalyzeTabletsUnlocked(const TableId& table_uuid) |
192 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
193 | | |
194 | | // Processes any required replica additions, as part of moving load from a highly loaded TS to |
195 | | // one that is less loaded. |
196 | | // |
197 | | // Returns true if a move was actually made. |
198 | | Result<bool> HandleAddReplicas( |
199 | | TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts) |
200 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
201 | | |
202 | | // Processes any required replica removals, as part of having added an extra replica to a |
203 | | // tablet's set of peers, which caused its quorum to be larger than the configured number. |
204 | | // |
205 | | // Returns true if a move was actually made. |
206 | | Result<bool> HandleRemoveReplicas(TabletId* out_tablet_id, TabletServerId* out_from_ts) |
207 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
208 | | |
209 | | // Methods for load preparation, called by ClusterLoadBalancer while analyzing tablets and |
210 | | // building the initial state. |
211 | | |
212 | | virtual void InitTablespaceManager(); |
213 | | |
214 | | // Return the replication info for 'table'. |
215 | | Result<ReplicationInfoPB> GetTableReplicationInfo(const scoped_refptr<TableInfo>& table) const; |
216 | | |
217 | | // Method called when initially analyzing tablets, to build up load and usage information. |
218 | | // Returns an OK status if the method succeeded or an error if there are transient errors in |
219 | | // updating the internal state. |
220 | | CHECKED_STATUS UpdateTabletInfo(TabletInfo* tablet); |
221 | | |
222 | | // If a tablet is under-replicated, or has certain placements that have less than the minimum |
223 | | // required number of replicas, we need to add extra tablets to its peer set. |
224 | | // |
225 | | // Returns true if a move was actually made. |
226 | | Result<bool> HandleAddIfMissingPlacement(TabletId* out_tablet_id, TabletServerId* out_to_ts) |
227 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
228 | | |
229 | | // If we find a tablet with peers that violate the placement information, we want to move load |
230 | | // away from the invalid placement peers, to new peers that are valid. To ensure we do not |
231 | | // under-replicate a tablet, we first find the tablet server to add load to, essentially |
232 | | // over-replicating the tablet temporarily. |
233 | | // |
234 | | // Returns true if a move was actually made. |
235 | | Result<bool> HandleAddIfWrongPlacement( |
236 | | TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts) |
237 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
238 | | |
239 | | // If we find a tablet with peers that violate the placement information, we first over-replicate |
240 | | // the peer group, in the add portion of the algorithm. We then eventually remove extra replicas |
241 | | // on the remove path, here. |
242 | | // |
243 | | // Returns true if a move was actually made. |
244 | | Result<bool> HandleRemoveIfWrongPlacement(TabletId* out_tablet_id, TabletServerId* out_from_ts) |
245 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
246 | | |
247 | | // This function handles leader load from non-affinitized to affinitized nodes. |
248 | | // If it can find a way to move leader load from a non-affinitized to affinitized node, |
249 | | // returns true, if not returns false, if error is found, returns Status. |
250 | | // This is called before normal leader load balancing. |
251 | | Result<bool> HandleLeaderLoadIfNonAffinitized(TabletId* moving_tablet_id, |
252 | | TabletServerId* from_ts, |
253 | | TabletServerId* to_ts, |
254 | | std::string* to_ts_path); |
255 | | |
256 | | // Processes any tablet leaders that are on a highly loaded tablet server and need to be moved. |
257 | | // |
258 | | // Returns true if a move was actually made. |
259 | | Result<bool> HandleLeaderMoves( |
260 | | TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts) |
261 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
262 | | |
263 | | virtual void GetAllAffinitizedZones(AffinitizedZonesSet* affinitized_zones) const; |
264 | | |
265 | | // Go through sorted_load_ and figure out which tablet to rebalance and from which TS that is |
266 | | // serving it to which other TS. |
267 | | // |
268 | | // Returns true if we could find a tablet to rebalance and sets the three output parameters. |
269 | | // Returns false otherwise. |
270 | | Result<bool> GetLoadToMove( |
271 | | TabletId* moving_tablet_id, TabletServerId* from_ts, TabletServerId* to_ts) |
272 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
273 | | |
274 | | Result<bool> GetTabletToMove( |
275 | | const TabletServerId& from_ts, const TabletServerId& to_ts, TabletId* moving_tablet_id) |
276 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
277 | | |
278 | | // Go through sorted_leader_load_ and figure out which leader to rebalance and from which TS |
279 | | // that is serving it to which other TS. |
280 | | // |
281 | | // Returns true if we could find a leader to rebalance and sets the three output parameters. |
282 | | // Returns false otherwise. |
283 | | Result<bool> GetLeaderToMove(TabletId* moving_tablet_id, |
284 | | TabletServerId* from_ts, |
285 | | TabletServerId* to_ts, |
286 | | std::string* to_ts_path); |
287 | | |
288 | | // Issue the change config and modify the in-memory state for moving a replica from one tablet |
289 | | // server to another. |
290 | | CHECKED_STATUS MoveReplica( |
291 | | const TabletId& tablet_id, const TabletServerId& from_ts, const TabletServerId& to_ts) |
292 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
293 | | |
294 | | // Issue the change config and modify the in-memory state for adding a replica on the specified |
295 | | // tablet server. |
296 | | CHECKED_STATUS AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts) |
297 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
298 | | |
299 | | // Issue the change config and modify the in-memory state for removing a replica on the specified |
300 | | // tablet server. |
301 | | CHECKED_STATUS RemoveReplica( |
302 | | const TabletId& tablet_id, const TabletServerId& ts_uuid) |
303 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
304 | | |
305 | | // Issue the change config and modify the in-memory state for moving a tablet leader on the |
306 | | // specified tablet server to the other specified tablet server. |
307 | | CHECKED_STATUS MoveLeader(const TabletId& tablet_id, |
308 | | const TabletServerId& from_ts, |
309 | | const TabletServerId& to_ts, |
310 | | const std::string& to_ts_path) |
311 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
312 | | |
313 | | // Methods called for returning tablet id sets, for figuring out tablets to move around. |
314 | | |
315 | | const PlacementInfoPB& GetPlacementByTablet(const TabletId& tablet_id) const |
316 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
317 | | |
318 | | // Get access to all the tablets for the given table. |
319 | | Result<TabletInfos> GetTabletsForTable(const TableId& table_uuid) const |
320 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
321 | | |
322 | | // Populates pb with the placement info in tablet's config at cluster placement_uuid_. |
323 | | Status PopulatePlacementInfo(TabletInfo* tablet, PlacementInfoPB* pb); |
324 | | |
325 | | // Returns the read only placement info from placement_uuid_. |
326 | | const PlacementInfoPB& GetReadOnlyPlacementFromUuid( |
327 | | const ReplicationInfoPB& replication_info) const; |
328 | | |
329 | | virtual const PlacementInfoPB& GetLiveClusterPlacementInfo() const; |
330 | | |
331 | | // |
332 | | // Generic load information methods. |
333 | | // |
334 | | |
335 | | // Get the total number of extra replicas. |
336 | | size_t get_total_over_replication() const; |
337 | | |
338 | | size_t get_total_under_replication() const; |
339 | | |
340 | | // Convenience methods for getting totals of starting or running tablets. |
341 | | size_t get_total_starting_tablets() const; |
342 | | int get_total_running_tablets() const; |
343 | | |
344 | | size_t get_total_wrong_placement() const; |
345 | | size_t get_total_blacklisted_servers() const; |
346 | | size_t get_total_leader_blacklisted_servers() const; |
347 | | |
348 | | std::unordered_map<TableId, std::unique_ptr<PerTableLoadState>> per_table_states_; |
349 | | // The state of the table load in the cluster, as far as this run of the algorithm is concerned. |
350 | | // It points to the appropriate object in per_table_states_. |
351 | | PerTableLoadState* state_ = nullptr; |
352 | | |
353 | | std::unique_ptr<GlobalLoadState> global_state_; |
354 | | |
355 | | // The catalog manager of the Master that actually has the Tablet and TS state. The object is not |
356 | | // managed by this class, but by the Master's unique_ptr. |
357 | | CatalogManager* catalog_manager_; |
358 | | |
359 | | std::shared_ptr<YsqlTablespaceManager> tablespace_manager_; |
360 | | |
361 | | template <class ClusterLoadBalancerClass> friend class TestLoadBalancerBase; |
362 | | |
363 | | private: |
364 | | // Returns true if at least one member in the tablet's configuration is transitioning into a |
365 | | // VOTER, but it's not a VOTER yet. |
366 | | Result<bool> IsConfigMemberInTransitionMode(const TabletId& tablet_id) const |
367 | | REQUIRES_SHARED(catalog_manager_->mutex_); |
368 | | |
369 | | // Dump the sorted load on tservers (it is usually per table). |
370 | | void DumpSortedLoad() const; |
371 | | |
372 | | // Report unusual state at the beginning of an LB run which may prevent LB from making moves. |
373 | | void ReportUnusualLoadBalancerState() const; |
374 | | |
375 | | // Random number generator for picking items at random from sets, using ReservoirSample. |
376 | | ThreadSafeRandom random_; |
377 | | |
378 | | // Controls whether to run the load balancing algorithm or not. |
379 | | std::atomic<bool> is_enabled_; |
380 | | |
381 | | // Information representing activity of load balancer. |
382 | | struct ActivityInfo { |
383 | | uint32_t table_tasks = 0; |
384 | | uint32_t master_errors = 0; |
385 | | |
386 | 133k | bool IsIdle() const { |
387 | 133k | return table_tasks == 0 && master_errors == 0; |
388 | 133k | } |
389 | | }; |
390 | | |
391 | | // Circular buffer of load balancer activity. |
392 | | boost::circular_buffer<ActivityInfo> cbuf_activities_; |
393 | | |
394 | | // Summary of circular buffer of load balancer activity. |
395 | | size_t num_idle_runs_ = 0; |
396 | | std::atomic<bool> is_idle_ {true}; |
397 | | |
398 | | // Check if we are able to balance global load. With the current algorithm, we only allow for |
399 | | // global load balancing once all tables are themselves balanced. |
400 | | // This value is only set to true once is_idle_ becomes true, and this value is only set to false |
401 | | // once we perform a non-global move. |
402 | | bool can_perform_global_operations_ = false; |
403 | | |
404 | | // Record load balancer activity for tables and tservers. |
405 | | void RecordActivity(uint32_t master_errors) REQUIRES_SHARED(catalog_manager_->mutex_); |
406 | | |
407 | | typedef rw_spinlock LockType; |
408 | | mutable LockType mutex_; |
409 | | |
410 | | // Maintains a list of all tables for whom LB has been skipped. |
411 | | // Currently for consumption by components outside the LB. |
412 | | // Protected by a readers-writers lock. Only the LB writes to it. |
413 | | // Other components such as test, admin UI, etc. should |
414 | | // ideally read from it using a shared_lock<>. |
415 | | vector<scoped_refptr<TableInfo>> skipped_tables_ GUARDED_BY(mutex_); |
416 | | // Internal to LB structure to keep track of skipped tables. |
417 | | // skipped_tables_ is set at the end of each LB run using |
418 | | // skipped_tables_per_run_. |
419 | | vector<scoped_refptr<TableInfo>> skipped_tables_per_run_; |
420 | | |
421 | | DISALLOW_COPY_AND_ASSIGN(ClusterLoadBalancer); |
422 | | }; |
423 | | |
424 | | } // namespace master |
425 | | } // namespace yb |
426 | | #endif /* YB_MASTER_CLUSTER_BALANCE_H */ |