YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance_util.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#ifndef YB_MASTER_CLUSTER_BALANCE_UTIL_H
15
#define YB_MASTER_CLUSTER_BALANCE_UTIL_H
16
17
#include <memory>
18
#include <set>
19
#include <string>
20
#include <unordered_map>
21
#include <unordered_set>
22
#include <vector>
23
24
#include "yb/gutil/casts.h"
25
26
#include "yb/master/catalog_entity_info.pb.h"
27
#include "yb/master/ts_descriptor.h"
28
29
DECLARE_int32(leader_balance_threshold);
30
31
DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps);
32
33
DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps_per_table);
34
35
DECLARE_int32(load_balancer_max_over_replicated_tablets);
36
37
DECLARE_int32(load_balancer_max_concurrent_adds);
38
39
DECLARE_int32(load_balancer_max_concurrent_removals);
40
41
DECLARE_int32(load_balancer_max_concurrent_moves);
42
43
DECLARE_int32(load_balancer_max_concurrent_moves_per_table);
44
45
namespace yb {
46
namespace master {
47
48
// enum for replica type, either live (synchronous) or read only (timeline consistent)
49
enum ReplicaType {
50
  LIVE,
51
  READ_ONLY,
52
};
53
54
struct cloud_equal_to {
55
65.1k
  bool operator()(const yb::CloudInfoPB& x, const yb::CloudInfoPB& y) const {
56
65.1k
    return x.placement_cloud() == y.placement_cloud() &&
57
65.1k
        x.placement_region() == y.placement_region() &&
58
65.1k
        x.placement_zone() == y.placement_zone();
59
65.1k
  }
60
};
61
62
struct cloud_hash {
63
104k
  std::size_t operator()(const yb::CloudInfoPB& ci) const {
64
104k
    return std::hash<std::string>{} (TSDescriptor::generate_placement_id(ci));
65
104k
  }
66
};
67
68
struct CBTabletMetadata {
69
967k
  bool is_missing_replicas() { return is_under_replicated || !under_replicated_placements.empty(); }
70
71
967k
  bool has_wrong_placements() {
72
967k
    return !wrong_placement_tablet_servers.empty() || !blacklisted_tablet_servers.empty();
73
967k
  }
74
75
0
  bool has_blacklisted_leader() {
76
0
    return !leader_blacklisted_tablet_servers.empty();
77
0
  }
78
79
  // Can the TS be added to any of the placements that lack a replica for this tablet.
80
  bool CanAddTSToMissingPlacements(const std::shared_ptr<TSDescriptor> ts_descriptor) const;
81
82
  // Number of running replicas for this tablet.
83
  int running = 0;
84
85
  // TODO(bogdan): actually use this!
86
  //
87
  // Number of starting replicas for this tablet.
88
  int starting = 0;
89
90
  // If this tablet has fewer replicas than the configured number in the PlacementInfoPB.
91
  bool is_under_replicated = false;
92
93
  // Set of placement ids that have less replicas available than the configured minimums.
94
  std::unordered_set<CloudInfoPB, cloud_hash, cloud_equal_to> under_replicated_placements;
95
96
  // If this tablet has more replicas than the configured number in the PlacementInfoPB.
97
  bool is_over_replicated;
98
99
  // Set of tablet server ids that can be candidates for removal, due to tablet being
100
  // over-replicated. For tablets with placement information, this will be all tablet servers
101
  // that are housing replicas of this tablet, in a placement with strictly more replicas than the
102
  // configured minimum (as that means there is at least one of them we can remove, and still
103
  // respect the minimum).
104
  //
105
  // For tablets with no placement information, this will be all the tablet servers currently
106
  // serving this tablet, as we can downsize with no restrictions in this case.
107
  std::set<TabletServerId> over_replicated_tablet_servers;
108
109
  // Set of tablet server ids whose placement information does not match that listed in the
110
  // table's PlacementInfoPB. This will happen when we change the configuration for the table or
111
  // the cluster.
112
  std::set<TabletServerId> wrong_placement_tablet_servers;
113
114
  // Set of tablet server ids that have been blacklisted and as such, should not get any more load
115
  // assigned to them and should be prioritized for removing load.
116
  std::set<TabletServerId> blacklisted_tablet_servers;
117
  std::set<TabletServerId> leader_blacklisted_tablet_servers;
118
119
  // The tablet server id of the leader in this tablet's peer group.
120
  TabletServerId leader_uuid;
121
122
  // Leader stepdown failures. We use this to prevent retrying the same leader stepdown too soon.
123
  LeaderStepDownFailureTimes leader_stepdown_failures;
124
125
  std::string ToString() const;
126
};
127
128
using AffinitizedZonesSet = std::unordered_set<CloudInfoPB, cloud_hash, cloud_equal_to>;
129
using PathToTablets = std::unordered_map<std::string, std::set<TabletId>>;
130
131
struct CBTabletServerMetadata {
132
  // The TSDescriptor for this tablet server.
133
  std::shared_ptr<TSDescriptor> descriptor = nullptr;
134
135
  // Map from path to the set of tablet ids that this tablet server is currently running
136
  // on the path.
137
  PathToTablets path_to_tablets;
138
139
  // Map from path to the number of replicas that this tablet server is currently starting
140
  // on the path.
141
  std::unordered_map<std::string, int> path_to_starting_tablets_count;
142
143
  // Set of paths sorted descending by tablets count.
144
  vector<std::string> sorted_path_load_by_tablets_count;
145
146
  // Set of paths sorted ascending by tablet leaders count.
147
  vector<std::string> sorted_path_load_by_leader_count;
148
149
  // The set of tablet ids that this tablet server is currently running.
150
  std::set<TabletId> running_tablets;
151
152
  // The set of tablet ids that this tablet server is currently starting.
153
  std::set<TabletId> starting_tablets;
154
155
  // The set of tablet leader ids that this tablet server is currently running.
156
  std::set<TabletId> leaders;
157
158
  // Map from path to the set of tablet leader ids that this tablet server is currently running
159
  // on the path.
160
  PathToTablets path_to_leaders;
161
162
  // The set of tablet ids that this tablet server disabled (ex. after split).
163
  std::set<TabletId> disabled_by_ts_tablets;
164
};
165
166
struct CBTabletServerLoadCounts {
167
  // Stores global load counts for a tablet server.
168
  // See definitions of these counts in CBTabletServerMetadata.
169
  int running_tablets_count = 0;
170
  int starting_tablets_count = 0;
171
  int leaders_count = 0;
172
};
173
174
struct Options {
175
77.4k
  Options() {}
176
77.4k
  virtual ~Options() {}
177
  // If variance between load on TS goes past this number, we should try to balance.
178
  double kMinLoadVarianceToBalance = 2.0;
179
180
  // If variance between global load on TS goes past this number, we should try to balance.
181
  double kMinGlobalLoadVarianceToBalance = 2.0;
182
183
  // If variance between leader load on TS goes past this number, we should try to balance.
184
  double kMinLeaderLoadVarianceToBalance = 2.0;
185
186
  // If variance between global leader load on TS goes past this number, we should try to balance.
187
  double kMinGlobalLeaderLoadVarianceToBalance = 2.0;
188
189
  // Whether to limit the number of tablets being spun up on the cluster at any given time.
190
  bool kAllowLimitStartingTablets = true;
191
192
  // Max number of tablets being remote bootstrapped across the cluster, if we enable limiting
193
  // this.
194
  int kMaxTabletRemoteBootstraps = FLAGS_load_balancer_max_concurrent_tablet_remote_bootstraps;
195
196
  // Max number of tablets being remote bootstrapped for a specific table, if we enable limiting
197
  // this.
198
  int kMaxTabletRemoteBootstrapsPerTable =
199
      FLAGS_load_balancer_max_concurrent_tablet_remote_bootstraps_per_table;
200
201
  // Whether to limit the number of tablets that have more peers than configured at any given
202
  // time.
203
  bool kAllowLimitOverReplicatedTablets = true;
204
205
  // Max number of running tablet replicas that are over the configured limit.
206
  int kMaxOverReplicatedTablets = FLAGS_load_balancer_max_over_replicated_tablets;
207
208
  // Max number of over-replicated tablet peer removals to do in any one run of the load balancer.
209
  int kMaxConcurrentRemovals = FLAGS_load_balancer_max_concurrent_removals;
210
211
  // Max number of tablet peer replicas to add in any one run of the load balancer.
212
  int kMaxConcurrentAdds = FLAGS_load_balancer_max_concurrent_adds;
213
214
  // Max number of tablet leaders on tablet servers (across the cluster) to move in any one run of
215
  // the load balancer.
216
  int kMaxConcurrentLeaderMoves = FLAGS_load_balancer_max_concurrent_moves;
217
218
  // Max number of tablet leaders per table to move in any one run of the load balancer.
219
  int kMaxConcurrentLeaderMovesPerTable = FLAGS_load_balancer_max_concurrent_moves_per_table;
220
221
  // Either a live replica or a read.
222
  ReplicaType type;
223
224
  string placement_uuid;
225
  string live_placement_uuid;
226
227
  // TODO(bogdan): add state for leaders starting remote bootstraps, to limit on that end too.
228
};
229
230
// Cluster-wide state and metrics.
231
// For now it's used to determine how many tablets are being remote bootstrapped across the cluster,
232
// as well as keeping track of global load counts in order to do global load balancing moves.
233
class GlobalLoadState {
234
 public:
235
  // Get the global load for a certain TS.
236
  int GetGlobalLoad(const TabletServerId& ts_uuid) const;
237
238
  // Get global leader load for a certain TS.
239
  int GetGlobalLeaderLoad(const TabletServerId& ts_uuid) const;
240
241
  // Used to determine how many tablets are being remote bootstrapped across the cluster.
242
  int total_starting_tablets_ = 0;
243
244
  TSDescriptorVector ts_descs_;
245
246
  bool drive_aware_ = true;
247
248
 private:
249
  // Map from tablet server ids to the global metadata we store for each.
250
  std::unordered_map<TabletServerId, CBTabletServerLoadCounts> per_ts_global_meta_;
251
252
  friend class PerTableLoadState;
253
};
254
255
class PerTableLoadState {
256
 public:
257
  TableId table_id_;
258
  explicit PerTableLoadState(GlobalLoadState* global_state);
259
260
  virtual ~PerTableLoadState();
261
262
  // Comparators used for sorting by load.
263
  bool CompareByUuid(const TabletServerId& a, const TabletServerId& b);
264
265
  bool CompareByReplica(const TabletReplica& a, const TabletReplica& b);
266
267
  // Comparator functor to be able to wrap around the public but non-static compare methods that
268
  // end up using internal state of the class.
269
  struct Comparator {
270
246k
    explicit Comparator(PerTableLoadState* state) : state_(state) {}
271
627k
    bool operator()(const TabletServerId& a, const TabletServerId& b) {
272
627k
      return state_->CompareByUuid(a, b);
273
627k
    }
274
275
0
    bool operator()(const TabletReplica& a, const TabletReplica& b) {
276
0
      return state_->CompareByReplica(a, b);
277
0
    }
278
279
    PerTableLoadState* state_;
280
  };
281
282
  // Comparator to sort tablet servers' leader load.
283
  struct LeaderLoadComparator {
284
127k
    explicit LeaderLoadComparator(PerTableLoadState* state) : state_(state) {}
285
    bool operator()(const TabletServerId& a, const TabletServerId& b);
286
287
    PerTableLoadState* state_;
288
  };
289
290
  // Get the load for a certain TS.
291
  size_t GetLoad(const TabletServerId& ts_uuid) const;
292
293
  // Get the load for a certain TS.
294
  size_t GetLeaderLoad(const TabletServerId& ts_uuid) const;
295
296
121k
  void SetBlacklist(const BlacklistPB& blacklist) { blacklist_ = blacklist; }
297
121k
  void SetLeaderBlacklist(const BlacklistPB& leader_blacklist) {
298
121k
    leader_blacklist_ = leader_blacklist;
299
121k
  }
300
301
3.36M
  bool IsTsInLivePlacement(TSDescriptor* ts_desc) {
302
3.36M
    return ts_desc->placement_uuid() == options_->live_placement_uuid;
303
3.36M
  }
304
305
  // Update the per-tablet information for this tablet.
306
  CHECKED_STATUS UpdateTablet(TabletInfo* tablet);
307
308
  virtual void UpdateTabletServer(std::shared_ptr<TSDescriptor> ts_desc);
309
310
  Result<bool> CanAddTabletToTabletServer(
311
    const TabletId& tablet_id, const TabletServerId& to_ts,
312
    const PlacementInfoPB* placement_info = nullptr);
313
314
  // For a TS specified by ts_uuid, this function checks if there is a placement
315
  // block in placement_info where this TS can be placed. If there doesn't exist
316
  // any, it returns boost::none. On the other hand if there is a placement block
317
  // that satisfies the criteria then it returns the cloud info of that block.
318
  // If there wasn't any placement information passed in placement_info then
319
  // it returns the cloud info of the TS itself.
320
  boost::optional<CloudInfoPB> GetValidPlacement(const TabletServerId& ts_uuid,
321
                                                 const PlacementInfoPB* placement_info);
322
323
  Result<bool> CanSelectWrongReplicaToMove(
324
    const TabletId& tablet_id, const PlacementInfoPB& placement_info, TabletServerId* out_from_ts,
325
    TabletServerId* out_to_ts);
326
327
  CHECKED_STATUS AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts);
328
329
  CHECKED_STATUS RemoveReplica(const TabletId& tablet_id, const TabletServerId& from_ts);
330
331
  void SortLoad();
332
333
  void SortDriveLoad();
334
335
  CHECKED_STATUS MoveLeader(const TabletId& tablet_id,
336
                            const TabletServerId& from_ts,
337
                            const TabletServerId& to_ts = "",
338
                            const TabletServerId& to_ts_path = "");
339
340
  void SortLeaderLoad();
341
342
  void SortDriveLeaderLoad();
343
344
  void LogSortedLeaderLoad();
345
346
117k
  inline bool IsLeaderLoadBelowThreshold(const TabletServerId& ts_uuid) {
347
117k
    return ((leader_balance_threshold_ > 0) &&
348
16
            (GetLeaderLoad(ts_uuid) <= implicit_cast<size_t>(leader_balance_threshold_)));
349
117k
  }
350
351
  void AdjustLeaderBalanceThreshold();
352
353
  std::shared_ptr<const TabletReplicaMap> GetReplicaLocations(TabletInfo* tablet);
354
355
  CHECKED_STATUS AddRunningTablet(const TabletId& tablet_id,
356
                                  const TabletServerId& ts_uuid,
357
                                  const std::string& path);
358
359
  CHECKED_STATUS RemoveRunningTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid);
360
361
  CHECKED_STATUS AddStartingTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid);
362
363
  CHECKED_STATUS AddLeaderTablet(const TabletId& tablet_id,
364
                                 const TabletServerId& ts_uuid,
365
                                 const TabletServerId& ts_path);
366
367
  CHECKED_STATUS RemoveLeaderTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid);
368
369
  CHECKED_STATUS AddDisabledByTSTablet(const TabletId& tablet_id, const TabletServerId& ts_uuid);
370
371
  // PerTableLoadState member fields
372
373
  // Map from tablet ids to the metadata we store for each.
374
  std::unordered_map<TabletId, CBTabletMetadata> per_tablet_meta_;
375
376
  // Map from tablet server ids to the metadata we store for each.
377
  std::unordered_map<TabletServerId, CBTabletServerMetadata> per_ts_meta_;
378
379
  // Map from table id to placement information for this table. This will be used for both
380
  // determining over-replication, by checking num_replicas, but also for az awareness, by keeping
381
  // track of the placement block policies between cluster and table level.
382
  std::unordered_map<TableId, PlacementInfoPB> placement_by_table_;
383
384
  // Total number of running tablets in the clusters (including replicas).
385
  int total_running_ = 0;
386
387
  // Total number of tablet replicas being started across the cluster.
388
  int total_starting_ = 0;
389
390
  // Set of ts_uuid sorted ascending by load. This is the actual raw data of TS load.
391
  std::vector<TabletServerId> sorted_load_;
392
393
  // Set of tablet ids that have been determined to have missing replicas. This can mean they are
394
  // generically under-replicated (2 replicas active, but 3 configured), or missing replicas in
395
  // certain placements (3 replicas active out of 3 configured, but no replicas in one of the AZs
396
  // listed in the placement blocks).
397
  std::set<TabletId> tablets_missing_replicas_;
398
399
  // Set of tablet ids that have been temporarily over-replicated. This is used to pick tablets
400
  // to potentially bring back down to their proper configured size, if there are more running than
401
  // expected.
402
  std::set<TabletId> tablets_over_replicated_;
403
404
  // Set of tablet ids that have been determined to have replicas in incorrect placements.
405
  std::set<TabletId> tablets_wrong_placement_;
406
407
  // The cached blacklist setting of the cluster. We store this upfront, as we add to the list of
408
  // tablet servers one by one, so we compare against it once per tablet server.
409
  BlacklistPB blacklist_;
410
  BlacklistPB leader_blacklist_;
411
412
  // The list of tablet server ids that match the cached blacklist.
413
  std::set<TabletServerId> blacklisted_servers_;
414
  std::set<TabletServerId> leader_blacklisted_servers_;
415
416
  // List of tablet server ids that have pending deletes.
417
  std::set<TabletServerId> servers_with_pending_deletes_;
418
419
  // List of tablet ids that have been added to a new tablet server.
420
  std::set<TabletId> tablets_added_;
421
422
  // Number of leaders per each tablet server to balance below.
423
  int leader_balance_threshold_ = 0;
424
425
  // List of table server ids sorted by whether leader blacklisted and their leader load.
426
  // If affinitized leaders is enabled, stores leader load for affinitized nodes.
427
  vector<TabletServerId> sorted_leader_load_;
428
429
  std::unordered_map<TableId, TabletToTabletServerMap> pending_add_replica_tasks_;
430
  std::unordered_map<TableId, TabletToTabletServerMap> pending_remove_replica_tasks_;
431
  std::unordered_map<TableId, TabletToTabletServerMap> pending_stepdown_leader_tasks_;
432
433
  // Time at which we started the current round of load balancing.
434
  MonoTime current_time_;
435
436
  // The knobs we use for tweaking the flow of the algorithm.
437
  Options* options_;
438
439
  // Pointer to the cluster global state so that it can be updated when operations like add or
440
  // remove are executed.
441
  GlobalLoadState* global_state_;
442
443
  // Boolean whether tablets for this table should respect the affinited zones.
444
  bool use_preferred_zones_ = true;
445
446
  // check_ts_liveness_ is used to indicate if the TS descriptors
447
  // need to be checked if they are live and considered for Load balancing.
448
  // In most scenarios, this would be true, except when we use the cluster_balance_mocked.h
449
  // for triggering LB scenarios.
450
  bool check_ts_liveness_ = true;
451
  // Allow only leader balancing for this table.
452
  bool allow_only_leader_balancing_ = false;
453
454
  // If affinitized leaders is enabled, stores leader load for non affinitized nodes.
455
  vector<TabletServerId> sorted_non_affinitized_leader_load_;
456
  // List of availability zones for affinitized leaders.
457
  AffinitizedZonesSet affinitized_zones_;
458
459
 private:
460
  const std::string uninitialized_ts_meta_format_msg =
461
      "Found uninitialized ts_meta: ts_uuid: $0, table_uuid: $1";
462
463
  DISALLOW_COPY_AND_ASSIGN(PerTableLoadState);
464
}; // PerTableLoadState
465
466
} // namespace master
467
} // namespace yb
468
469
#endif // YB_MASTER_CLUSTER_BALANCE_UTIL_H