/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance_util.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #ifndef YB_MASTER_CLUSTER_BALANCE_UTIL_H |
15 | | #define YB_MASTER_CLUSTER_BALANCE_UTIL_H |
16 | | |
17 | | #include <memory> |
18 | | #include <set> |
19 | | #include <string> |
20 | | #include <unordered_map> |
21 | | #include <unordered_set> |
22 | | #include <vector> |
23 | | |
24 | | #include "yb/gutil/casts.h" |
25 | | |
26 | | #include "yb/master/catalog_entity_info.pb.h" |
27 | | #include "yb/master/ts_descriptor.h" |
28 | | |
29 | | DECLARE_int32(leader_balance_threshold); |
30 | | |
31 | | DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps); |
32 | | |
33 | | DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps_per_table); |
34 | | |
35 | | DECLARE_int32(load_balancer_max_over_replicated_tablets); |
36 | | |
37 | | DECLARE_int32(load_balancer_max_concurrent_adds); |
38 | | |
39 | | DECLARE_int32(load_balancer_max_concurrent_removals); |
40 | | |
41 | | DECLARE_int32(load_balancer_max_concurrent_moves); |
42 | | |
43 | | DECLARE_int32(load_balancer_max_concurrent_moves_per_table); |
44 | | |
45 | | namespace yb { |
46 | | namespace master { |
47 | | |
48 | | // enum for replica type, either live (synchronous) or read only (timeline consistent) |
49 | | enum ReplicaType { |
50 | | LIVE, |
51 | | READ_ONLY, |
52 | | }; |
53 | | |
54 | | struct cloud_equal_to { |
55 | 65.1k | bool operator()(const yb::CloudInfoPB& x, const yb::CloudInfoPB& y) const { |
56 | 65.1k | return x.placement_cloud() == y.placement_cloud() && |
57 | 65.1k | x.placement_region() == y.placement_region() && |
58 | 65.1k | x.placement_zone() == y.placement_zone(); |
59 | 65.1k | } |
60 | | }; |
61 | | |
62 | | struct cloud_hash { |
63 | 104k | std::size_t operator()(const yb::CloudInfoPB& ci) const { |
64 | 104k | return std::hash<std::string>{} (TSDescriptor::generate_placement_id(ci)); |
65 | 104k | } |
66 | | }; |
67 | | |
68 | | struct CBTabletMetadata { |
69 | 967k | bool is_missing_replicas() { return is_under_replicated || !under_replicated_placements.empty(); } |
70 | | |
71 | 967k | bool has_wrong_placements() { |
72 | 967k | return !wrong_placement_tablet_servers.empty() || !blacklisted_tablet_servers.empty(); |
73 | 967k | } |
74 | | |
75 | 0 | bool has_blacklisted_leader() { |
76 | 0 | return !leader_blacklisted_tablet_servers.empty(); |
77 | 0 | } |
78 | | |
79 | | // Can the TS be added to any of the placements that lack a replica for this tablet. |
80 | | bool CanAddTSToMissingPlacements(const std::shared_ptr<TSDescriptor> ts_descriptor) const; |
81 | | |
82 | | // Number of running replicas for this tablet. |
83 | | int running = 0; |
84 | | |
85 | | // TODO(bogdan): actually use this! |
86 | | // |
87 | | // Number of starting replicas for this tablet. |
88 | | int starting = 0; |
89 | | |
90 | | // If this tablet has fewer replicas than the configured number in the PlacementInfoPB. |
91 | | bool is_under_replicated = false; |
92 | | |
93 | | // Set of placement ids that have less replicas available than the configured minimums. |
94 | | std::unordered_set<CloudInfoPB, cloud_hash, cloud_equal_to> under_replicated_placements; |
95 | | |
96 | | // If this tablet has more replicas than the configured number in the PlacementInfoPB. |
97 | | bool is_over_replicated; |
98 | | |
99 | | // Set of tablet server ids that can be candidates for removal, due to tablet being |
100 | | // over-replicated. For tablets with placement information, this will be all tablet servers |
101 | | // that are housing replicas of this tablet, in a placement with strictly more replicas than the |
102 | | // configured minimum (as that means there is at least one of them we can remove, and still |
103 | | // respect the minimum). |
104 | | // |
105 | | // For tablets with no placement information, this will be all the tablet servers currently |
106 | | // serving this tablet, as we can downsize with no restrictions in this case. |
107 | | std::set<TabletServerId> over_replicated_tablet_servers; |
108 | | |
109 | | // Set of tablet server ids whose placement information does not match that listed in the |
110 | | // table's PlacementInfoPB. This will happen when we change the configuration for the table or |
111 | | // the cluster. |
112 | | std::set<TabletServerId> wrong_placement_tablet_servers; |
113 | | |
114 | | // Set of tablet server ids that have been blacklisted and as such, should not get any more load |
115 | | // assigned to them and should be prioritized for removing load. |
116 | | std::set<TabletServerId> blacklisted_tablet_servers; |
117 | | std::set<TabletServerId> leader_blacklisted_tablet_servers; |
118 | | |
119 | | // The tablet server id of the leader in this tablet's peer group. |
120 | | TabletServerId leader_uuid; |
121 | | |
122 | | // Leader stepdown failures. We use this to prevent retrying the same leader stepdown too soon. |
123 | | LeaderStepDownFailureTimes leader_stepdown_failures; |
124 | | |
125 | | std::string ToString() const; |
126 | | }; |
127 | | |
128 | | using AffinitizedZonesSet = std::unordered_set<CloudInfoPB, cloud_hash, cloud_equal_to>; |
129 | | using PathToTablets = std::unordered_map<std::string, std::set<TabletId>>; |
130 | | |
131 | | struct CBTabletServerMetadata { |
132 | | // The TSDescriptor for this tablet server. |
133 | | std::shared_ptr<TSDescriptor> descriptor = nullptr; |
134 | | |
135 | | // Map from path to the set of tablet ids that this tablet server is currently running |
136 | | // on the path. |
137 | | PathToTablets path_to_tablets; |
138 | | |
139 | | // Map from path to the number of replicas that this tablet server is currently starting |
140 | | // on the path. |
141 | | std::unordered_map<std::string, int> path_to_starting_tablets_count; |
142 | | |
143 | | // Set of paths sorted descending by tablets count. |
144 | | vector<std::string> sorted_path_load_by_tablets_count; |
145 | | |
146 | | // Set of paths sorted ascending by tablet leaders count. |
147 | | vector<std::string> sorted_path_load_by_leader_count; |
148 | | |
149 | | // The set of tablet ids that this tablet server is currently running. |
150 | | std::set<TabletId> running_tablets; |
151 | | |
152 | | // The set of tablet ids that this tablet server is currently starting. |
153 | | std::set<TabletId> starting_tablets; |
154 | | |
155 | | // The set of tablet leader ids that this tablet server is currently running. |
156 | | std::set<TabletId> leaders; |
157 | | |
158 | | // Map from path to the set of tablet leader ids that this tablet server is currently running |
159 | | // on the path. |
160 | | PathToTablets path_to_leaders; |
161 | | |
162 | | // The set of tablet ids that this tablet server disabled (ex. after split). |
163 | | std::set<TabletId> disabled_by_ts_tablets; |
164 | | }; |
165 | | |
166 | | struct CBTabletServerLoadCounts { |
167 | | // Stores global load counts for a tablet server. |
168 | | // See definitions of these counts in CBTabletServerMetadata. |
169 | | int running_tablets_count = 0; |
170 | | int starting_tablets_count = 0; |
171 | | int leaders_count = 0; |
172 | | }; |
173 | | |
174 | | struct Options { |
175 | 77.4k | Options() {} |
176 | 77.4k | virtual ~Options() {} |
177 | | // If variance between load on TS goes past this number, we should try to balance. |
178 | | double kMinLoadVarianceToBalance = 2.0; |
179 | | |
180 | | // If variance between global load on TS goes past this number, we should try to balance. |
181 | | double kMinGlobalLoadVarianceToBalance = 2.0; |
182 | | |
183 | | // If variance between leader load on TS goes past this number, we should try to balance. |
184 | | double kMinLeaderLoadVarianceToBalance = 2.0; |
185 | | |
186 | | // If variance between global leader load on TS goes past this number, we should try to balance. |
187 | | double kMinGlobalLeaderLoadVarianceToBalance = 2.0; |
188 | | |
189 | | // Whether to limit the number of tablets being spun up on the cluster at any given time. |
190 | | bool kAllowLimitStartingTablets = true; |
191 | | |
192 | | // Max number of tablets being remote bootstrapped across the cluster, if we enable limiting |
193 | | // this. |
194 | | int kMaxTabletRemoteBootstraps = FLAGS_load_balancer_max_concurrent_tablet_remote_bootstraps; |
195 | | |
196 | | // Max number of tablets being remote bootstrapped for a specific table, if we enable limiting |
197 | | // this. |
198 | | int kMaxTabletRemoteBootstrapsPerTable = |
199 | | FLAGS_load_balancer_max_concurrent_tablet_remote_bootstraps_per_table; |
200 | | |
201 | | // Whether to limit the number of tablets that have more peers than configured at any given |
202 | | // time. |
203 | | bool kAllowLimitOverReplicatedTablets = true; |
204 | | |
205 | | // Max number of running tablet replicas that are over the configured limit. |
206 | | int kMaxOverReplicatedTablets = FLAGS_load_balancer_max_over_replicated_tablets; |
207 | | |
208 | | // Max number of over-replicated tablet peer removals to do in any one run of the load balancer. |
209 | | int kMaxConcurrentRemovals = FLAGS_load_balancer_max_concurrent_removals; |
210 | | |
211 | | // Max number of tablet peer replicas to add in any one run of the load balancer. |
212 | | int kMaxConcurrentAdds = FLAGS_load_balancer_max_concurrent_adds; |
213 | | |
214 | | // Max number of tablet leaders on tablet servers (across the cluster) to move in any one run of |
215 | | // the load balancer. |
216 | | int kMaxConcurrentLeaderMoves = FLAGS_load_balancer_max_concurrent_moves; |
217 | | |
218 | | // Max number of tablet leaders per table to move in any one run of the load balancer. |
219 | | int kMaxConcurrentLeaderMovesPerTable = FLAGS_load_balancer_max_concurrent_moves_per_table; |
220 | | |
221 | | // Either a live replica or a read. |
222 | | ReplicaType type; |
223 | | |
224 | | string placement_uuid; |
225 | | string live_placement_uuid; |
226 | | |
227 | | // TODO(bogdan): add state for leaders starting remote bootstraps, to limit on that end too. |
228 | | }; |
229 | | |
230 | | // Cluster-wide state and metrics. |
231 | | // For now it's used to determine how many tablets are being remote bootstrapped across the cluster, |
232 | | // as well as keeping track of global load counts in order to do global load balancing moves. |
233 | | class GlobalLoadState { |
234 | | public: |
235 | | // Get the global load for a certain TS. |
236 | | int GetGlobalLoad(const TabletServerId& ts_uuid) const; |
237 | | |
238 | | // Get global leader load for a certain TS. |
239 | | int GetGlobalLeaderLoad(const TabletServerId& ts_uuid) const; |
240 | | |
241 | | // Used to determine how many tablets are being remote bootstrapped across the cluster. |
242 | | int total_starting_tablets_ = 0; |
243 | | |
244 | | TSDescriptorVector ts_descs_; |
245 | | |
246 | | bool drive_aware_ = true; |
247 | | |
248 | | private: |
249 | | // Map from tablet server ids to the global metadata we store for each. |
250 | | std::unordered_map<TabletServerId, CBTabletServerLoadCounts> per_ts_global_meta_; |
251 | | |
252 | | friend class PerTableLoadState; |
253 | | }; |
254 | | |
255 | | class PerTableLoadState { |
256 | | public: |
257 | | TableId table_id_; |
258 | | explicit PerTableLoadState(GlobalLoadState* global_state); |
259 | | |
260 | | virtual ~PerTableLoadState(); |
261 | | |
262 | | // Comparators used for sorting by load. |
263 | | bool CompareByUuid(const TabletServerId& a, const TabletServerId& b); |
264 | | |
265 | | bool CompareByReplica(const TabletReplica& a, const TabletReplica& b); |
266 | | |
267 | | // Comparator functor to be able to wrap around the public but non-static compare methods that |
268 | | // end up using internal state of the class. |
269 | | struct Comparator { |
270 | 246k | explicit Comparator(PerTableLoadState* state) : state_(state) {} |
271 | 627k | bool operator()(const TabletServerId& a, const TabletServerId& b) { |
272 | 627k | return state_->CompareByUuid(a, b); |
273 | 627k | } |
274 | | |
275 | 0 | bool operator()(const TabletReplica& a, const TabletReplica& b) { |
276 | 0 | return state_->CompareByReplica(a, b); |
277 | 0 | } |
278 | | |
279 | | PerTableLoadState* state_; |
280 | | }; |
281 | | |
282 | | // Comparator to sort tablet servers' leader load. |
283 | | struct LeaderLoadComparator { |
284 | 127k | explicit LeaderLoadComparator(PerTableLoadState* state) : state_(state) {} |
285 | | bool operator()(const TabletServerId& a, const TabletServerId& b); |
286 | | |
287 | | PerTableLoadState* state_; |
288 | | }; |
289 | | |
290 | | // Get the load for a certain TS. |
291 | | size_t GetLoad(const TabletServerId& ts_uuid) const; |
292 | | |
293 | | // Get the load for a certain TS. |
294 | | size_t GetLeaderLoad(const TabletServerId& ts_uuid) const; |
295 | | |
296 | 121k | void SetBlacklist(const BlacklistPB& blacklist) { blacklist_ = blacklist; } |
297 | 121k | void SetLeaderBlacklist(const BlacklistPB& leader_blacklist) { |
298 | 121k | leader_blacklist_ = leader_blacklist; |
299 | 121k | } |
300 | | |
301 | 3.36M | bool IsTsInLivePlacement(TSDescriptor* ts_desc) { |
302 | 3.36M | return ts_desc->placement_uuid() == options_->live_placement_uuid; |
303 | 3.36M | } |
304 | | |
305 | | // Update the per-tablet information for this tablet. |
306 | | CHECKED_STATUS UpdateTablet(TabletInfo* tablet); |
307 | | |
308 | | virtual void UpdateTabletServer(std::shared_ptr<TSDescriptor> ts_desc); |
309 | | |
310 | | Result<bool> CanAddTabletToTabletServer( |
311 | | const TabletId& tablet_id, const TabletServerId& to_ts, |
312 | | const PlacementInfoPB* placement_info = nullptr); |
313 | | |
314 | | // For a TS specified by ts_uuid, this function checks if there is a placement |
315 | | // block in placement_info where this TS can be placed. If there doesn't exist |
316 | | // any, it returns boost::none. On the other hand if there is a placement block |
317 | | // that satisfies the criteria then it returns the cloud info of that block. |
318 | | // If there wasn't any placement information passed in placement_info then |
319 | | // it returns the cloud info of the TS itself. |
320 | | boost::optional<CloudInfoPB> GetValidPlacement(const TabletServerId& ts_uuid, |
321 | | const PlacementInfoPB* placement_info); |
322 | | |
323 | | Result<bool> CanSelectWrongReplicaToMove( |
324 | | const TabletId& tablet_id, const PlacementInfoPB& placement_info, TabletServerId* out_from_ts, |
325 | | TabletServerId* out_to_ts); |
326 | | |
327 | | CHECKED_STATUS AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts); |
328 | | |
329 | | CHECKED_STATUS RemoveReplica(const TabletId& tablet_id, const TabletServerId& from_ts); |
330 | | |
331 | | void SortLoad(); |
332 | | |
333 | | void SortDriveLoad(); |
334 | | |
335 | | CHECKED_STATUS MoveLeader(const TabletId& tablet_id, |
336 | | const TabletServerId& from_ts, |
337 | | const TabletServerId& to_ts = "", |
338 | | const TabletServerId& to_ts_path = ""); |
339 | | |
340 | | void SortLeaderLoad(); |
341 | | |
342 | | void SortDriveLeaderLoad(); |
343 | | |
344 | | void LogSortedLeaderLoad(); |
345 | | |
346 | 117k | inline bool IsLeaderLoadBelowThreshold(const TabletServerId& ts_uuid) { |
347 | 117k | return ((leader_balance_threshold_ > 0) && |
348 | 16 | (GetLeaderLoad(ts_uuid) <= implicit_cast<size_t>(leader_balance_threshold_))); |
349 | 117k | } |
350 | | |
351 | | void AdjustLeaderBalanceThreshold(); |
352 | | |
353 | | std::shared_ptr<const TabletReplicaMap> GetReplicaLocations(TabletInfo* tablet); |
354 | | |
355 | | CHECKED_STATUS AddRunningTablet(const TabletId& tablet_id, |
356 | | const TabletServerId& ts_uuid, |
357 | | const std::string& path); |
358 | | |
359 | | CHECKED_STATUS RemoveRunningTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid); |
360 | | |
361 | | CHECKED_STATUS AddStartingTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid); |
362 | | |
363 | | CHECKED_STATUS AddLeaderTablet(const TabletId& tablet_id, |
364 | | const TabletServerId& ts_uuid, |
365 | | const TabletServerId& ts_path); |
366 | | |
367 | | CHECKED_STATUS RemoveLeaderTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid); |
368 | | |
369 | | CHECKED_STATUS AddDisabledByTSTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid); |
370 | | |
371 | | // PerTableLoadState member fields |
372 | | |
373 | | // Map from tablet ids to the metadata we store for each. |
374 | | std::unordered_map<TabletId, CBTabletMetadata> per_tablet_meta_; |
375 | | |
376 | | // Map from tablet server ids to the metadata we store for each. |
377 | | std::unordered_map<TabletServerId, CBTabletServerMetadata> per_ts_meta_; |
378 | | |
379 | | // Map from table id to placement information for this table. This will be used for both |
380 | | // determining over-replication, by checking num_replicas, but also for az awareness, by keeping |
381 | | // track of the placement block policies between cluster and table level. |
382 | | std::unordered_map<TableId, PlacementInfoPB> placement_by_table_; |
383 | | |
384 | | // Total number of running tablets in the clusters (including replicas). |
385 | | int total_running_ = 0; |
386 | | |
387 | | // Total number of tablet replicas being started across the cluster. |
388 | | int total_starting_ = 0; |
389 | | |
390 | | // Set of ts_uuid sorted ascending by load. This is the actual raw data of TS load. |
391 | | std::vector<TabletServerId> sorted_load_; |
392 | | |
393 | | // Set of tablet ids that have been determined to have missing replicas. This can mean they are |
394 | | // generically under-replicated (2 replicas active, but 3 configured), or missing replicas in |
395 | | // certain placements (3 replicas active out of 3 configured, but no replicas in one of the AZs |
396 | | // listed in the placement blocks). |
397 | | std::set<TabletId> tablets_missing_replicas_; |
398 | | |
399 | | // Set of tablet ids that have been temporarily over-replicated. This is used to pick tablets |
400 | | // to potentially bring back down to their proper configured size, if there are more running than |
401 | | // expected. |
402 | | std::set<TabletId> tablets_over_replicated_; |
403 | | |
404 | | // Set of tablet ids that have been determined to have replicas in incorrect placements. |
405 | | std::set<TabletId> tablets_wrong_placement_; |
406 | | |
407 | | // The cached blacklist setting of the cluster. We store this upfront, as we add to the list of |
408 | | // tablet servers one by one, so we compare against it once per tablet server. |
409 | | BlacklistPB blacklist_; |
410 | | BlacklistPB leader_blacklist_; |
411 | | |
412 | | // The list of tablet server ids that match the cached blacklist. |
413 | | std::set<TabletServerId> blacklisted_servers_; |
414 | | std::set<TabletServerId> leader_blacklisted_servers_; |
415 | | |
416 | | // List of tablet server ids that have pending deletes. |
417 | | std::set<TabletServerId> servers_with_pending_deletes_; |
418 | | |
419 | | // List of tablet ids that have been added to a new tablet server. |
420 | | std::set<TabletId> tablets_added_; |
421 | | |
422 | | // Number of leaders per each tablet server to balance below. |
423 | | int leader_balance_threshold_ = 0; |
424 | | |
425 | | // List of table server ids sorted by whether leader blacklisted and their leader load. |
426 | | // If affinitized leaders is enabled, stores leader load for affinitized nodes. |
427 | | vector<TabletServerId> sorted_leader_load_; |
428 | | |
429 | | std::unordered_map<TableId, TabletToTabletServerMap> pending_add_replica_tasks_; |
430 | | std::unordered_map<TableId, TabletToTabletServerMap> pending_remove_replica_tasks_; |
431 | | std::unordered_map<TableId, TabletToTabletServerMap> pending_stepdown_leader_tasks_; |
432 | | |
433 | | // Time at which we started the current round of load balancing. |
434 | | MonoTime current_time_; |
435 | | |
436 | | // The knobs we use for tweaking the flow of the algorithm. |
437 | | Options* options_; |
438 | | |
439 | | // Pointer to the cluster global state so that it can be updated when operations like add or |
440 | | // remove are executed. |
441 | | GlobalLoadState* global_state_; |
442 | | |
443 | | // Boolean whether tablets for this table should respect the affinited zones. |
444 | | bool use_preferred_zones_ = true; |
445 | | |
446 | | // check_ts_liveness_ is used to indicate if the TS descriptors |
447 | | // need to be checked if they are live and considered for Load balancing. |
448 | | // In most scenarios, this would be true, except when we use the cluster_balance_mocked.h |
449 | | // for triggering LB scenarios. |
450 | | bool check_ts_liveness_ = true; |
451 | | // Allow only leader balancing for this table. |
452 | | bool allow_only_leader_balancing_ = false; |
453 | | |
454 | | // If affinitized leaders is enabled, stores leader load for non affinitized nodes. |
455 | | vector<TabletServerId> sorted_non_affinitized_leader_load_; |
456 | | // List of availability zones for affinitized leaders. |
457 | | AffinitizedZonesSet affinitized_zones_; |
458 | | |
459 | | private: |
460 | | const std::string uninitialized_ts_meta_format_msg = |
461 | | "Found uninitialized ts_meta: ts_uuid: $0, table_uuid: $1"; |
462 | | |
463 | | DISALLOW_COPY_AND_ASSIGN(PerTableLoadState); |
464 | | }; // PerTableLoadState |
465 | | |
466 | | } // namespace master |
467 | | } // namespace yb |
468 | | |
469 | | #endif // YB_MASTER_CLUSTER_BALANCE_UTIL_H |