YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance.h
// Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//

#ifndef YB_MASTER_CLUSTER_BALANCE_H
#define YB_MASTER_CLUSTER_BALANCE_H

#include <atomic>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <boost/circular_buffer.hpp>

#include "yb/master/catalog_manager.h"
#include "yb/master/master_fwd.h"
#include "yb/master/cluster_balance_util.h"
#include "yb/master/ts_descriptor.h"

#include "yb/util/random.h"

DECLARE_int32(load_balancer_max_concurrent_tablet_remote_bootstraps);
DECLARE_int32(load_balancer_max_over_replicated_tablets);
DECLARE_int32(load_balancer_max_concurrent_adds);
DECLARE_int32(load_balancer_max_concurrent_moves);
DECLARE_int32(load_balancer_max_concurrent_removals);

YB_STRONGLY_TYPED_BOOL(StepdownIfLeader);

namespace yb {
namespace master {

//  This class keeps state with regards to the full cluster load of tablets on tablet servers. We
//  count a tablet towards a tablet server's load if it is either RUNNING, or is in the process of
//  starting up, hence NOT_STARTED or BOOTSTRAPPING.
//
//  This class also keeps state for the process of balancing load, which is done by temporarily
//  enlarging the replica set for a tablet by adding a new peer on a less loaded TS, and
//  subsequently removing a peer that is more loaded.
//
//  The policy for balancing the replicas involves a two-step process:
//  1) Add replicas to tablet peer groups, if required, potentially leading to temporary
//  over-replication.
//  1.1) If any tablet has fewer replicas than the configured RF, or if there is any placement block
//  with fewer replicas than the specified minimum in that placement, we will try to add replicas,
//  so as to reach the client requirements.
//  1.2) If any tablet has replicas placed on tablet servers that do not conform to the specified
//  placement, then we should remove these. However, we never want to under-replicate a whole peer
//  group, or any individual placement block, so we will first add a new replica that will allow
//  the invalid ones to be removed.
//  1.3) If we have no placement related issues, then we just try to equalize load
//  distribution across the cluster, while still maintaining the placement requirements.
//  2) Remove replicas from tablet peer groups if they are either over-replicated, or placed on
//  tablet servers they shouldn't be.
//  2.1) If we have replicas living on tablet servers where they should not, due to placement
//  constraints, or tablet servers being blacklisted, we try to remove those replicas with high
//  priority, but naturally, only if removing said replica does not lead to under-replication.
//  2.2) If we have no placement related issues, then we just try to shrink back any temporarily
//  over-replicated tablet peer groups, while still conforming to the placement requirements.
//
//  This class also balances the leaders on tablet servers, starting from the server with the most
//  leaders and moving some leaders to the servers with fewer, to achieve an even distribution. If
//  a threshold is set in the configuration, the balancer will just keep the number of leaders
//  on each server below it instead of maintaining an even distribution.
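//
//  A rough sketch of the order in which one balancing pass applies the rules above
//  (illustrative only; the actual driver logic lives in the implementation file):
//
//    HandleAddIfMissingPlacement(...);   // 1.1: fix under-replication first
//    HandleAddIfWrongPlacement(...);     // 1.2: over-replicate before evicting misplaced peers
//    HandleAddReplicas(...);             // 1.3: plain load equalization
//    HandleRemoveIfWrongPlacement(...);  // 2.1: evict misplaced or blacklisted peers
//    HandleRemoveReplicas(...);          // 2.2: shrink temporary over-replication
//    HandleLeaderMoves(...);             // finally, balance leader load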
class ClusterLoadBalancer {
 public:
  explicit ClusterLoadBalancer(CatalogManager* cm);
  virtual ~ClusterLoadBalancer();

  // Executes one run of the load balancing algorithm. This currently does not persist any state,
  // so it needs to scan the in-memory tablet and TS data in the CatalogManager on every run and
  // create a new PerTableLoadState object.
  void RunLoadBalancerWithOptions(Options* options);

  // Runs the load balancer once for the live and all read only clusters, in order
  // of the cluster config.
  virtual void RunLoadBalancer(Options* options = nullptr);
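
  // A minimal usage sketch (hypothetical caller; in the real Master this is driven
  // by a background task, and the loop/sleep details here are assumptions):
  //
  //   ClusterLoadBalancer balancer(catalog_manager);
  //   balancer.SetLoadBalancerEnabled(true);
  //   while (!shutting_down) {
  //     balancer.RunLoadBalancer();  // re-scans in-memory state on every run
  //     SleepFor(MonoDelta::FromSeconds(1));
  //   }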

  // Sets whether to enable or disable the load balancer, on demand.
  void SetLoadBalancerEnabled(bool is_enabled) { is_enabled_ = is_enabled; }

  bool IsLoadBalancerEnabled() const;

  bool CanBalanceGlobalLoad() const;

  CHECKED_STATUS IsIdle() const;

  // Returns the TableInfo of all the tables for which load balancing is being skipped.
  // As of today, this constitutes all the system tables, colocated user tables
  // and tables which have been marked as DELETING or DELETED.
  vector<scoped_refptr<TableInfo>> GetAllTablesLoadBalancerSkipped();

  //
  // Catalog manager indirection methods.
  //
 protected:
  //
  // Indirection methods to CatalogManager that we override in the subclasses (or testing).
  //

  void InitializeTSDescriptors();

  // Get the list of all live TSDescriptors which reported their tablets.
  virtual void GetAllReportedDescriptors(TSDescriptorVector* ts_descs) const;

  // Get the list of all TSDescriptors.
  virtual void GetAllDescriptors(TSDescriptorVector* ts_descs) const;

  // Get access to the tablet map across the cluster.
  virtual const TabletInfoMap& GetTabletMap() const REQUIRES_SHARED(catalog_manager_->mutex_);

  // Get access to the table map.
  virtual const TableInfoMap& GetTableMap() const REQUIRES_SHARED(catalog_manager_->mutex_);

  // Get the table info object for the given table uuid.
  virtual const scoped_refptr<TableInfo> GetTableInfo(const TableId& table_uuid) const
    REQUIRES_SHARED(catalog_manager_->mutex_);

  // Get the replication info from the cluster configuration.
  virtual const ReplicationInfoPB& GetClusterReplicationInfo() const;

  // Get the placement information from the cluster configuration.
  // Gets the appropriate live or read only cluster placement,
  // depending on placement_uuid_.
  virtual const PlacementInfoPB& GetClusterPlacementInfo() const;

  // Get the blacklist information.
  virtual const BlacklistPB& GetServerBlacklist() const;

  // Get the leader blacklist information.
  virtual const BlacklistPB& GetLeaderBlacklist() const;

  // Should we skip load-balancing of this table?
  virtual bool SkipLoadBalancing(const TableInfo& table) const
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Increment the provided variables by the number of pending tasks that were found. Do not call
  // more than once for the same table because it also modifies the internal state.
  virtual CHECKED_STATUS CountPendingTasksUnlocked(const TableId& table_uuid,
                                                   int* pending_add_replica_tasks,
                                                   int* pending_remove_replica_tasks,
                                                   int* pending_stepdown_leader_tasks)
    REQUIRES_SHARED(catalog_manager_->mutex_);

  // Wrapper around CatalogManager::GetPendingTasks so it can be mocked by TestLoadBalancer.
  virtual void GetPendingTasks(const string& table_uuid,
                               TabletToTabletServerMap* add_replica_tasks,
                               TabletToTabletServerMap* remove_replica_tasks,
                               TabletToTabletServerMap* stepdown_leader_tasks)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Issue the call to CatalogManager to change the config for this particular tablet, either
  // adding or removing the peer at ts_uuid, based on the is_add argument. Removing the peer
  // is optional. When neither adding nor removing a peer, it means just moving a leader from one
  // tablet server to another. If new_leader_ts_uuid is empty, a server will be picked at random
  // to be the new leader. Also takes in the role of the tablet for the creation flow.
  virtual Status SendReplicaChanges(
      scoped_refptr<TabletInfo> tablet, const TabletServerId& ts_uuid, const bool is_add,
      const bool should_remove_leader, const TabletServerId& new_leader_ts_uuid = "");
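
  // How the flag combinations compose, per the comment above (illustrative call
  // sites only; the exact flag semantics live in the implementation):
  //
  //   // Add a peer on ts:
  //   SendReplicaChanges(tablet, ts, /* is_add */ true,  /* should_remove_leader */ false);
  //   // Remove the peer on ts, stepping it down first if it is the leader:
  //   SendReplicaChanges(tablet, ts, /* is_add */ false, /* should_remove_leader */ true);
  //   // Neither add nor remove: just move leadership away from ts:
  //   SendReplicaChanges(tablet, ts, /* is_add */ false, /* should_remove_leader */ false,
  //                      new_leader_ts);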

  // If type_ is live, returns PRE_VOTER; otherwise, returns PRE_OBSERVER.
  consensus::PeerMemberType GetDefaultMemberType();

  //
  // Higher level methods and members.
  //

  // Resets the global_state_ object, and the map of per-table states.
  virtual void ResetGlobalState(bool initialize_ts_descs = true);

  // Resets the pointer state_ to point to the correct table's state.
  virtual void ResetTableStatePtr(const TableId& table_id, Options* options = nullptr);

  // Goes over the tablet_map_ and the set of live TSDescriptors to compute the load distribution
  // across the tablets for the given table. Returns an OK status if the method succeeded or an
  // error if there are transient errors in updating the internal state.
  virtual CHECKED_STATUS AnalyzeTabletsUnlocked(const TableId& table_uuid)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Processes any required replica additions, as part of moving load from a highly loaded TS to
  // one that is less loaded.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleAddReplicas(
      TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Processes any required replica removals, as part of having added an extra replica to a
  // tablet's set of peers, which caused its quorum to be larger than the configured number.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleRemoveReplicas(TabletId* out_tablet_id, TabletServerId* out_from_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);
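
  // Typical consumption pattern for these Result<bool> handlers (a sketch; the real
  // driver also enforces the per-run concurrency limits from the flags above):
  //
  //   TabletId tablet_id;
  //   TabletServerId from_ts, to_ts;
  //   while (VERIFY_RESULT(HandleAddReplicas(&tablet_id, &from_ts, &to_ts))) {
  //     // A move was issued; keep going until no further adds are possible.
  //   }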

  // Methods for load preparation, called by ClusterLoadBalancer while analyzing tablets and
  // building the initial state.

  virtual void InitTablespaceManager();

  // Return the replication info for 'table'.
  Result<ReplicationInfoPB> GetTableReplicationInfo(const scoped_refptr<TableInfo>& table) const;

  // Method called when initially analyzing tablets, to build up load and usage information.
  // Returns an OK status if the method succeeded or an error if there are transient errors in
  // updating the internal state.
  CHECKED_STATUS UpdateTabletInfo(TabletInfo* tablet);

  // If a tablet is under-replicated, or has certain placements that have fewer than the minimum
  // required number of replicas, we need to add extra replicas to its peer set.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleAddIfMissingPlacement(TabletId* out_tablet_id, TabletServerId* out_to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // If we find a tablet with peers that violate the placement information, we want to move load
  // away from the invalid placement peers, to new peers that are valid. To ensure we do not
  // under-replicate a tablet, we first find the tablet server to add load to, essentially
  // over-replicating the tablet temporarily.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleAddIfWrongPlacement(
      TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // If we find a tablet with peers that violate the placement information, we first over-replicate
  // the peer group, in the add portion of the algorithm. We then eventually remove extra replicas
  // on the remove path, here.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleRemoveIfWrongPlacement(TabletId* out_tablet_id, TabletServerId* out_from_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // This function handles moving leader load from non-affinitized to affinitized nodes.
  // Returns true if it can find a way to move leader load from a non-affinitized to an
  // affinitized node, false if it cannot, and a non-OK status if an error is encountered.
  // This is called before normal leader load balancing.
  Result<bool> HandleLeaderLoadIfNonAffinitized(TabletId* moving_tablet_id,
                                                TabletServerId* from_ts,
                                                TabletServerId* to_ts,
                                                std::string* to_ts_path);

  // Processes any tablet leaders that are on a highly loaded tablet server and need to be moved.
  //
  // Returns true if a move was actually made.
  Result<bool> HandleLeaderMoves(
      TabletId* out_tablet_id, TabletServerId* out_from_ts, TabletServerId* out_to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  virtual void GetAllAffinitizedZones(AffinitizedZonesSet* affinitized_zones) const;

  // Go through sorted_load_ and figure out which tablet to rebalance, and from which TS that is
  // serving it to which other TS.
  //
  // Returns true if we could find a tablet to rebalance and sets the three output parameters.
  // Returns false otherwise.
  Result<bool> GetLoadToMove(
      TabletId* moving_tablet_id, TabletServerId* from_ts, TabletServerId* to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);
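
  // Conceptually, this scans the load-sorted TS list from both ends, pairing the
  // most loaded server with the least loaded one and asking GetTabletToMove (below)
  // for a concrete tablet. A simplified sketch, assuming sorted_load_ is ordered
  // from least to most loaded:
  //
  //   for (auto hi = sorted_load_.rbegin(); hi != sorted_load_.rend(); ++hi) {
  //     for (auto lo = sorted_load_.begin(); lo != sorted_load_.end(); ++lo) {
  //       TabletId tablet_id;
  //       if (VERIFY_RESULT(GetTabletToMove(*hi, *lo, &tablet_id))) {
  //         *moving_tablet_id = tablet_id; *from_ts = *hi; *to_ts = *lo;
  //         return true;
  //       }
  //     }
  //   }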

  Result<bool> GetTabletToMove(
      const TabletServerId& from_ts, const TabletServerId& to_ts, TabletId* moving_tablet_id)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Go through sorted_leader_load_ and figure out which leader to rebalance, and from which TS
  // that is serving it to which other TS.
  //
  // Returns true if we could find a leader to rebalance and sets the three output parameters.
  // Returns false otherwise.
  Result<bool> GetLeaderToMove(TabletId* moving_tablet_id,
                               TabletServerId* from_ts,
                               TabletServerId* to_ts,
                               std::string* to_ts_path);

  // Issue the change config and modify the in-memory state for moving a replica from one tablet
  // server to another.
  CHECKED_STATUS MoveReplica(
      const TabletId& tablet_id, const TabletServerId& from_ts, const TabletServerId& to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Issue the change config and modify the in-memory state for adding a replica on the specified
  // tablet server.
  CHECKED_STATUS AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Issue the change config and modify the in-memory state for removing a replica on the specified
  // tablet server.
  CHECKED_STATUS RemoveReplica(
      const TabletId& tablet_id, const TabletServerId& ts_uuid)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Issue the change config and modify the in-memory state for moving a tablet leader on the
  // specified tablet server to the other specified tablet server.
  CHECKED_STATUS MoveLeader(const TabletId& tablet_id,
                            const TabletServerId& from_ts,
                            const TabletServerId& to_ts,
                            const std::string& to_ts_path)
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Methods called for returning tablet id sets, for figuring out tablets to move around.

  const PlacementInfoPB& GetPlacementByTablet(const TabletId& tablet_id) const
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Get access to all the tablets for the given table.
  Result<TabletInfos> GetTabletsForTable(const TableId& table_uuid) const
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Populates pb with the placement info in tablet's config at cluster placement_uuid_.
  Status PopulatePlacementInfo(TabletInfo* tablet, PlacementInfoPB* pb);

  // Returns the read only placement info from placement_uuid_.
  const PlacementInfoPB& GetReadOnlyPlacementFromUuid(
      const ReplicationInfoPB& replication_info) const;

  virtual const PlacementInfoPB& GetLiveClusterPlacementInfo() const;

  //
  // Generic load information methods.
  //

  // Get the total number of extra replicas.
  size_t get_total_over_replication() const;

  size_t get_total_under_replication() const;

  // Convenience methods for getting totals of starting or running tablets.
  size_t get_total_starting_tablets() const;
  int get_total_running_tablets() const;

  size_t get_total_wrong_placement() const;
  size_t get_total_blacklisted_servers() const;
  size_t get_total_leader_blacklisted_servers() const;

  std::unordered_map<TableId, std::unique_ptr<PerTableLoadState>> per_table_states_;
  // The state of the table load in the cluster, as far as this run of the algorithm is concerned.
  // It points to the appropriate object in per_table_states_.
  PerTableLoadState* state_ = nullptr;

  std::unique_ptr<GlobalLoadState> global_state_;

  // The catalog manager of the Master that actually has the Tablet and TS state. The object is not
  // managed by this class, but by the Master's unique_ptr.
  CatalogManager* catalog_manager_;

  std::shared_ptr<YsqlTablespaceManager> tablespace_manager_;

  template <class ClusterLoadBalancerClass> friend class TestLoadBalancerBase;

 private:
  // Returns true if at least one member in the tablet's configuration is transitioning into a
  // VOTER, but it's not a VOTER yet.
  Result<bool> IsConfigMemberInTransitionMode(const TabletId& tablet_id) const
      REQUIRES_SHARED(catalog_manager_->mutex_);

  // Dump the sorted load on tservers (it is usually per table).
  void DumpSortedLoad() const;

  // Report unusual state at the beginning of an LB run which may prevent LB from making moves.
  void ReportUnusualLoadBalancerState() const;

  // Random number generator for picking items at random from sets, using ReservoirSample.
  ThreadSafeRandom random_;

  // Controls whether to run the load balancing algorithm or not.
  std::atomic<bool> is_enabled_;

  // Information representing activity of load balancer.
  struct ActivityInfo {
    uint32_t table_tasks = 0;
    uint32_t master_errors = 0;

    bool IsIdle() const {
      return table_tasks == 0 && master_errors == 0;
    }
  };

  // Circular buffer of load balancer activity.
  boost::circular_buffer<ActivityInfo> cbuf_activities_;

  // Summary of circular buffer of load balancer activity.
  size_t num_idle_runs_ = 0;
  std::atomic<bool> is_idle_ {true};

  // Check if we are able to balance global load. With the current algorithm, we only allow for
  // global load balancing once all tables are themselves balanced.
  // This value is only set to true once is_idle_ becomes true, and this value is only set to false
  // once we perform a non-global move.
  bool can_perform_global_operations_ = false;
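
  // Sketch of how these fields could interact at the end of a run (illustrative
  // only; the real bookkeeping is in RecordActivity below):
  //
  //   ActivityInfo info;
  //   info.table_tasks = table_tasks;      // tasks observed this run
  //   info.master_errors = master_errors;  // errors observed this run
  //   cbuf_activities_.push_back(info);
  //   num_idle_runs_ += info.IsIdle() ? 1 : 0;
  //   is_idle_.store(info.IsIdle(), std::memory_order_release);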

  // Record load balancer activity for tables and tservers.
  void RecordActivity(uint32_t master_errors) REQUIRES_SHARED(catalog_manager_->mutex_);

  typedef rw_spinlock LockType;
  mutable LockType mutex_;

  // Maintains a list of all tables for which LB has been skipped.
  // Currently for consumption by components outside the LB.
  // Protected by a readers-writers lock. Only the LB writes to it.
  // Other components such as tests, the admin UI, etc. should
  // ideally read from it using a shared_lock<>.
  vector<scoped_refptr<TableInfo>> skipped_tables_ GUARDED_BY(mutex_);
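  // For example, an outside reader might take the lock in shared mode (a sketch;
  // the exact lock wrapper used elsewhere in the codebase may differ):
  //
  //   std::shared_lock<LockType> l(mutex_);
  //   for (const auto& table : skipped_tables_) {
  //     LOG(INFO) << "LB skipped table " << table->id();
  //   }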
  // Internal to LB structure to keep track of skipped tables.
  // skipped_tables_ is set at the end of each LB run using
  // skipped_tables_per_run_.
  vector<scoped_refptr<TableInfo>> skipped_tables_per_run_;

  DISALLOW_COPY_AND_ASSIGN(ClusterLoadBalancer);
};

}  // namespace master
}  // namespace yb

#endif /* YB_MASTER_CLUSTER_BALANCE_H */