YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance_util.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/master/cluster_balance_util.h"
15
16
#include "yb/gutil/map-util.h"
17
18
#include "yb/master/catalog_entity_info.h"
19
#include "yb/master/master_cluster.pb.h"
20
21
#include "yb/util/atomic.h"
22
23
DECLARE_int32(min_leader_stepdown_retry_interval_ms);
24
25
DECLARE_int32(leader_balance_threshold);
26
27
DECLARE_int32(leader_balance_unresponsive_timeout_ms);
28
29
DECLARE_int32(replication_factor);
30
31
DECLARE_bool(allow_leader_balancing_dead_node);
32
33
namespace yb {
34
namespace master {
35
36
bool CBTabletMetadata::CanAddTSToMissingPlacements(
37
294
    const std::shared_ptr<TSDescriptor> ts_descriptor) const {
38
298
  for (const auto& under_replicated_ci : under_replicated_placements) {
39
    // An under-replicated placement may be just a prefix of the tserver's cloud info.
40
298
    if (ts_descriptor->MatchesCloudInfo(under_replicated_ci)) {
41
118
      return true;
42
118
    }
43
298
  }
44
176
  return false;
45
294
}
46
47
0
std::string CBTabletMetadata::ToString() const {
48
0
  return YB_STRUCT_TO_STRING(
49
0
      running, starting, is_under_replicated, under_replicated_placements,
50
0
      is_over_replicated, over_replicated_tablet_servers,
51
0
      wrong_placement_tablet_servers, blacklisted_tablet_servers,
52
0
      leader_uuid, leader_stepdown_failures, leader_blacklisted_tablet_servers);
53
0
}
54
55
8.28M
int GlobalLoadState::GetGlobalLoad(const TabletServerId& ts_uuid) const {
56
8.28M
  const auto& ts_meta = per_ts_global_meta_.at(ts_uuid);
57
8.28M
  return ts_meta.starting_tablets_count + ts_meta.running_tablets_count;
58
8.28M
}
59
60
4.54M
int GlobalLoadState::GetGlobalLeaderLoad(const TabletServerId& ts_uuid) const {
61
4.54M
  const auto& ts_meta = per_ts_global_meta_.at(ts_uuid);
62
4.54M
  return ts_meta.leaders_count;
63
4.54M
}
64
65
PerTableLoadState::PerTableLoadState(GlobalLoadState* global_state)
66
    : leader_balance_threshold_(FLAGS_leader_balance_threshold),
67
      current_time_(MonoTime::Now()),
68
891k
      global_state_(global_state) {}
69
70
889k
PerTableLoadState::~PerTableLoadState() {}
71
72
bool PerTableLoadState::LeaderLoadComparator::operator()(
73
2.77M
    const TabletServerId& a, const TabletServerId& b) {
74
  // Primary criteria: whether tserver is leader blacklisted.
75
2.77M
  auto a_leader_blacklisted =
76
2.77M
    state_->leader_blacklisted_servers_.find(a) != state_->leader_blacklisted_servers_.end();
77
2.77M
  auto b_leader_blacklisted =
78
2.77M
    state_->leader_blacklisted_servers_.find(b) != state_->leader_blacklisted_servers_.end();
79
2.77M
  if (a_leader_blacklisted != b_leader_blacklisted) {
80
3.14k
    return !a_leader_blacklisted;
81
3.14k
  }
82
83
  // Use global leader load as tie-breaker.
84
2.76M
  auto a_load = state_->GetLeaderLoad(a);
85
2.76M
  auto b_load = state_->GetLeaderLoad(b);
86
2.76M
  if (a_load == b_load) {
87
2.03M
    a_load = state_->global_state_->GetGlobalLeaderLoad(a);
88
2.03M
    b_load = state_->global_state_->GetGlobalLeaderLoad(b);
89
2.03M
    if (a_load == b_load) {
90
1.75M
      return a < b;
91
1.75M
    }
92
2.03M
  }
93
  // Secondary criteria: tserver leader load.
94
1.01M
  return a_load < b_load;
95
2.76M
}
96
97
4.49M
bool PerTableLoadState::CompareByUuid(const TabletServerId& a, const TabletServerId& b) {
98
4.49M
  auto load_a = GetLoad(a);
99
4.49M
  auto load_b = GetLoad(b);
100
4.49M
  if (load_a == load_b) {
101
    // Use global load as a heuristic to help break ties.
102
4.10M
    load_a = global_state_->GetGlobalLoad(a);
103
4.10M
    load_b = global_state_->GetGlobalLoad(b);
104
4.10M
    if (load_a == load_b) {
105
3.99M
      return a < b;
106
3.99M
    }
107
4.10M
  }
108
498k
  return load_a < load_b;
109
4.49M
}
110
111
11.3M
size_t PerTableLoadState::GetLoad(const TabletServerId& ts_uuid) const {
112
11.3M
  const auto& ts_meta = per_ts_meta_.at(ts_uuid);
113
11.3M
  return ts_meta.starting_tablets.size() + ts_meta.running_tablets.size();
114
11.3M
}
115
116
7.56M
size_t PerTableLoadState::GetLeaderLoad(const TabletServerId& ts_uuid) const {
117
7.56M
  return per_ts_meta_.at(ts_uuid).leaders.size();
118
7.56M
}
119
120
3.94M
Status PerTableLoadState::UpdateTablet(TabletInfo *tablet) {
121
3.94M
  const auto& tablet_id = tablet->id();
122
  // Set the per-tablet entry to empty default and get the reference for filling up information.
123
3.94M
  auto& tablet_meta = per_tablet_meta_[tablet_id];
124
125
  // Get the placement for this tablet.
126
3.94M
  const auto& placement = placement_by_table_[tablet->table()->id()];
127
128
  // Get replicas for this tablet.
129
3.94M
  auto replica_map = GetReplicaLocations(tablet);
130
  // Set state information for both the tablet and the tablet server replicas.
131
11.5M
  for (const auto& replica_it : *replica_map) {
132
11.5M
    const auto& ts_uuid = replica_it.first;
133
11.5M
    const auto& replica = replica_it.second;
134
    // If we do not have ts_meta information for this particular replica, then we are in the
135
    // rare case where we just became the master leader and started doing load balancing, but we
136
    // have yet to receive heartbeats from all the tablet servers. We will just return an error
137
    // across the stack and stop load balancing and log errors, until we get all the needed info.
138
    //
139
    // Worst case scenario, there is a network partition that is stopping us from actually
140
    // getting the heartbeats from a certain tablet server, but we anticipate that to be a
141
    // temporary matter. We should monitor error logs for this and see that it never actually
142
    // becomes a problem!
143
11.5M
    if (per_ts_meta_.find(ts_uuid) == per_ts_meta_.end()) {
144
11
      return STATUS_SUBSTITUTE(LeaderNotReadyToServe, "Master leader has not yet received "
145
11
          "heartbeat from ts $0, either master just became leader or a network partition.",
146
11
                                ts_uuid);
147
11
    }
148
11.5M
    auto& meta_ts = per_ts_meta_[ts_uuid];
149
150
    // If the TS of this replica is deemed DEAD then perform LBing only if it is blacklisted.
151
11.5M
    if (check_ts_liveness_ && 
!meta_ts.descriptor->IsLiveAndHasReported()11.5M
) {
152
360k
      if (!blacklisted_servers_.count(ts_uuid)) {
153
360k
        if (GetAtomicFlag(&FLAGS_allow_leader_balancing_dead_node)) {
154
360k
          allow_only_leader_balancing_ = true;
155
360k
          YB_LOG_EVERY_N_SECS(INFO, 30)
156
2.03k
              << strings::Substitute("Master leader not received heartbeat from ts $0. "
157
2.03k
                                     "Only performing leader balancing for tables with replicas"
158
2.03k
                                     " in this TS.", ts_uuid);
159
360k
        } else {
160
0
          return STATUS_SUBSTITUTE(LeaderNotReadyToServe, "Master leader has not yet received "
161
0
              "heartbeat from ts $0. Aborting load balancing.", ts_uuid);
162
0
        }
163
360k
      } else {
164
349
        YB_LOG_EVERY_N_SECS(INFO, 30)
165
4
            << strings::Substitute("Master leader not received heartbeat from ts $0 but it is "
166
4
                                   "blacklisted. Continuing LB operations for tables with replicas"
167
4
                                   " in this TS.", ts_uuid);
168
349
      }
169
360k
    }
170
171
    // Fill leader info.
172
11.5M
    if (replica.role == PeerRole::LEADER) {
173
3.89M
      tablet_meta.leader_uuid = ts_uuid;
174
3.89M
      RETURN_NOT_OK(AddLeaderTablet(tablet_id, ts_uuid, replica.fs_data_dir));
175
3.89M
    }
176
177
11.5M
    const tablet::RaftGroupStatePB& tablet_state = replica.state;
178
11.5M
    const bool replica_is_stale = replica.IsStale();
179
11.5M
    VLOG(2) << "Tablet " << tablet_id << " for table " << table_id_
180
0
              << " is in state " << RaftGroupStatePB_Name(tablet_state);
181
11.5M
    if (tablet_state == tablet::RUNNING) {
182
11.5M
      RETURN_NOT_OK(AddRunningTablet(tablet_id, ts_uuid, replica.fs_data_dir));
183
11.5M
    } else 
if (1.38k
!replica_is_stale1.38k
&&
184
1.38k
                (tablet_state == tablet::BOOTSTRAPPING || 
tablet_state == tablet::NOT_STARTED1.10k
)) {
185
      // Keep track of transitioning state (not running, but not in a stopped or failed state).
186
1.38k
      RETURN_NOT_OK(AddStartingTablet(tablet_id, ts_uuid));
187
1.38k
      VLOG(1) << "Increased total_starting to "
188
0
                  << total_starting_ << " for tablet " << tablet_id << " and table " << table_id_;
189
1.38k
      auto counter_it = meta_ts.path_to_starting_tablets_count.find(replica.fs_data_dir);
190
1.38k
      if (counter_it != meta_ts.path_to_starting_tablets_count.end()) {
191
501
        ++counter_it->second;
192
884
      } else {
193
884
        meta_ts.path_to_starting_tablets_count.insert({replica.fs_data_dir, 1});
194
884
      }
195
1.38k
    } else 
if (0
replica_is_stale0
) {
196
0
      VLOG(1) << "Replica is stale: " << replica.ToString();
197
0
    }
198
199
11.5M
    if (replica.should_disable_lb_move) {
200
0
      RETURN_NOT_OK(AddDisabledByTSTablet(tablet_id, ts_uuid));
201
0
      VLOG(1) << "Replica was disabled by TS: " << replica.ToString();
202
0
    }
203
204
    // If this replica is blacklisted, we want to keep track of these specially, so we can
205
    // prioritize accordingly.
206
11.5M
    if (blacklisted_servers_.count(ts_uuid)) {
207
23.0k
      tablet_meta.blacklisted_tablet_servers.insert(ts_uuid);
208
23.0k
    }
209
210
    // If this replica has blacklisted leader, we want to keep track of these specially, so we can
211
    // prioritize accordingly.
212
11.5M
    if (leader_blacklisted_servers_.count(ts_uuid)) {
213
7.41k
      tablet_meta.leader_blacklisted_tablet_servers.insert(ts_uuid);
214
7.41k
    }
215
11.5M
  }
216
217
  // Only set the over-replication section if we need to.
218
3.94M
  size_t placement_num_replicas = placement.num_replicas() > 0 ?
219
3.42M
      
placement.num_replicas()522k
: FLAGS_replication_factor;
220
3.94M
  tablet_meta.is_over_replicated = placement_num_replicas < replica_map->size();
221
3.94M
  tablet_meta.is_under_replicated = placement_num_replicas > replica_map->size();
222
223
  // If no placement information, we will have already set the over and under replication flags.
224
  // For under-replication, we cannot use any placement_id, so we just leave the set empty and
225
  // use that as a marker that we are in this situation.
226
  //
227
  // For over-replication, we just add all the ts_uuids as candidates.
228
3.94M
  if (placement.placement_blocks().empty()) {
229
3.61M
    if (tablet_meta.is_over_replicated) {
230
14.1k
      for (auto& replica_entry : *replica_map) {
231
14.1k
        tablet_meta.over_replicated_tablet_servers.insert(std::move(replica_entry.first));
232
14.1k
      }
233
3.60k
    }
234
3.61M
  } else {
235
    // If we do have placement information, figure out how the load is distributed based on
236
    // placement blocks, for this tablet.
237
333k
    std::unordered_map<CloudInfoPB, vector<TabletReplica>, cloud_hash, cloud_equal_to>
238
333k
                                                                    placement_to_replicas;
239
333k
    std::unordered_map<CloudInfoPB, int, cloud_hash, cloud_equal_to> placement_to_min_replicas;
240
    // Preset the min_replicas, so we know if we're missing replicas somewhere as well.
241
630k
    for (const auto& pb : placement.placement_blocks()) {
242
      // Default empty vector.
243
630k
      placement_to_replicas[pb.cloud_info()];
244
630k
      placement_to_min_replicas[pb.cloud_info()] = pb.min_num_replicas();
245
630k
    }
246
    // Now actually fill the structures with matching TSs.
247
927k
    for (auto& replica_entry : *replica_map) {
248
927k
      auto ci = GetValidPlacement(replica_entry.first, &placement);
249
927k
      if (ci.has_value()) {
250
855k
        placement_to_replicas[*ci].push_back(std::move(replica_entry.second));
251
855k
      } else {
252
        // If placement does not match, we likely changed the config or the schema and this
253
        // tablet should no longer live on this tablet server.
254
71.3k
        tablet_meta.wrong_placement_tablet_servers.insert(std::move(replica_entry.first));
255
71.3k
      }
256
927k
    }
257
258
    // Loop over the data and populate extra replica as well as missing replica information.
259
630k
    for (const auto& entry : placement_to_replicas) {
260
630k
      const auto& cloud_info = entry.first;
261
630k
      const auto& replica_set = entry.second;
262
630k
      const size_t min_num_replicas = placement_to_min_replicas[cloud_info];
263
630k
      if (min_num_replicas > replica_set.size()) {
264
        // Placements that are under-replicated should be handled ASAP.
265
1.04k
        tablet_meta.under_replicated_placements.insert(cloud_info);
266
629k
      } else if (tablet_meta.is_over_replicated && 
min_num_replicas < replica_set.size()142k
) {
267
        // If this tablet is over-replicated, consider all the placements that have more than the
268
        // minimum number of tablets, as candidates for removing a replica.
269
944
        for (auto& replica : replica_set) {
270
944
          tablet_meta.over_replicated_tablet_servers.insert(
271
944
            std::move(replica.ts_desc->permanent_uuid()));
272
944
        }
273
322
      }
274
630k
    }
275
333k
  }
276
3.94M
  tablet->GetLeaderStepDownFailureTimes(
277
3.94M
      current_time_ - MonoDelta::FromMilliseconds(FLAGS_min_leader_stepdown_retry_interval_ms),
278
3.94M
      &tablet_meta.leader_stepdown_failures);
279
280
  // Prepare placement related sets for tablets that have placement info.
281
3.94M
  if (tablet_meta.is_missing_replicas()) {
282
132k
    tablets_missing_replicas_.insert(tablet_id);
283
132k
  }
284
3.94M
  if (tablet_meta.is_over_replicated) {
285
75.1k
    tablets_over_replicated_.insert(tablet_id);
286
75.1k
  }
287
3.94M
  if (tablet_meta.has_wrong_placements()) {
288
80.1k
    tablets_wrong_placement_.insert(tablet_id);
289
80.1k
  }
290
291
3.94M
  return Status::OK();
292
3.94M
}
293
294
2.86M
void PerTableLoadState::UpdateTabletServer(std::shared_ptr<TSDescriptor> ts_desc) {
295
2.86M
  const auto& ts_uuid = ts_desc->permanent_uuid();
296
  // Set and get, so we can use this for both tablet servers we've added data to, as well as
297
  // tablet servers that happen to not be serving any tablets, so were not in the map yet.
298
2.86M
  auto& ts_meta = per_ts_meta_[ts_uuid];
299
2.86M
  ts_meta.descriptor = ts_desc;
300
301
  // Also insert into per_ts_global_meta_ if we have yet to.
302
2.86M
  global_state_->per_ts_global_meta_.emplace(ts_uuid, CBTabletServerLoadCounts());
303
304
  // Only add TS for LBing if it is not dead.
305
  // check_ts_liveness_ is an artifact of cluster_balance_mocked.h
306
  // and is used to ensure that we don't perform a liveness check
307
  // while mimicking load balancers.
308
2.86M
  if (!check_ts_liveness_ || 
ts_desc->IsLiveAndHasReported()2.86M
) {
309
2.69M
    sorted_load_.push_back(ts_uuid);
310
2.69M
  }
311
312
  // Mark as blacklisted if it matches.
313
2.86M
  bool is_blacklisted = false;
314
2.86M
  for (const auto& hp : blacklist_.hosts()) {
315
80.7k
    if (ts_meta.descriptor->IsRunningOn(hp)) {
316
16.1k
      blacklisted_servers_.insert(ts_uuid);
317
16.1k
      is_blacklisted = true;
318
16.1k
      break;
319
16.1k
    }
320
80.7k
  }
321
322
  // Mark as blacklisted leader if it matches.
323
2.86M
  for (const auto& hp : leader_blacklist_.hosts()) {
324
4.52k
    if (ts_meta.descriptor->IsRunningOn(hp)) {
325
1.15k
      leader_blacklisted_servers_.insert(ts_uuid);
326
1.15k
      break;
327
1.15k
    }
328
4.52k
  }
329
330
  // Add this tablet server for leader load-balancing only if it is not blacklisted and it has
331
  // heartbeated recently enough to be considered responsive for leader balancing.
332
  // Also, don't add it if it isn't live or hasn't reported all its tablets.
333
  // check_ts_liveness_ is an artifact of cluster_balance_mocked.h
334
  // and is used to ensure that we don't perform a liveness check
335
  // while mimicking load balancers.
336
2.86M
  if (!is_blacklisted &&
337
2.86M
      ts_desc->TimeSinceHeartbeat().ToMilliseconds() <
338
2.84M
      FLAGS_leader_balance_unresponsive_timeout_ms &&
339
2.86M
      
(2.66M
!check_ts_liveness_2.66M
||
ts_desc->IsLiveAndHasReported()2.66M
)) {
340
2.66M
    sorted_leader_load_.push_back(ts_uuid);
341
2.66M
  }
342
343
2.86M
  if (ts_desc->HasTabletDeletePending()) {
344
233k
    servers_with_pending_deletes_.insert(ts_uuid);
345
233k
  }
346
347
  // If the TS is perceived as DEAD then ignore it.
348
2.86M
  if (check_ts_liveness_ && 
!ts_desc->IsLiveAndHasReported()2.86M
) {
349
174k
    return;
350
174k
  }
351
352
2.69M
  bool is_ts_live = IsTsInLivePlacement(ts_desc.get());
353
2.69M
  switch (options_->type) {
354
2.66M
    case LIVE: {
355
2.66M
      if (!is_ts_live) {
356
        // LIVE cb run with READ_ONLY ts, ignore this ts
357
8.07k
        sorted_load_.pop_back();
358
8.07k
      }
359
2.66M
      break;
360
0
    }
361
22.9k
    case READ_ONLY: {
362
22.9k
      if (is_ts_live) {
363
        // READ_ONLY cb run with LIVE ts, ignore this ts
364
16.0k
        sorted_load_.pop_back();
365
16.0k
      } else {
366
6.93k
        string placement_uuid = ts_desc->placement_uuid();
367
6.93k
        if (placement_uuid == "") {
368
0
          LOG(WARNING) << "Read only ts " << ts_desc->permanent_uuid()
369
0
                        << " does not have placement uuid";
370
6.93k
        } else if (placement_uuid != options_->placement_uuid) {
371
          // Do not include this ts in load balancing.
372
342
          sorted_load_.pop_back();
373
342
        }
374
6.93k
      }
375
22.9k
      sorted_leader_load_.clear();
376
22.9k
      return;
377
0
    }
378
2.69M
  }
379
380
2.66M
  if (sorted_leader_load_.empty() ||
381
2.66M
      
sorted_leader_load_.back() != ts_uuid2.65M
||
382
2.66M
      
affinitized_zones_.empty()2.64M
) {
383
2.66M
    return;
384
2.66M
  }
385
865
  TSRegistrationPB registration = ts_desc->GetRegistration();
386
865
  if (affinitized_zones_.find(registration.common().cloud_info()) == affinitized_zones_.end()) {
387
    // This tablet server is not in an affinitized leader zone, so track it as non-affinitized.
388
762
    sorted_leader_load_.pop_back();
389
762
    sorted_non_affinitized_leader_load_.push_back(ts_uuid);
390
762
  }
391
865
}
392
393
Result<bool> PerTableLoadState::CanAddTabletToTabletServer(
394
2.71M
    const TabletId& tablet_id, const TabletServerId& to_ts, const PlacementInfoPB* placement_info) {
395
2.71M
  const auto& ts_meta = per_ts_meta_[to_ts];
396
397
  // If this server is deemed DEAD then don't add it.
398
2.71M
  if (check_ts_liveness_ && 
!ts_meta.descriptor->IsLiveAndHasReported()2.71M
) {
399
133
    return false;
400
133
  }
401
402
  // If this tablet has already been added to a new tablet server, don't add it again.
403
2.71M
  if (tablets_added_.count(tablet_id)) {
404
1.25k
    return false;
405
1.25k
  }
406
  // We do not add load to blacklisted servers.
407
2.71M
  if (blacklisted_servers_.count(to_ts)) {
408
486k
    return false;
409
486k
  }
410
  // We cannot add a tablet to a tablet server if it is already serving it.
411
2.23M
  if (ts_meta.running_tablets.count(tablet_id) || 
ts_meta.starting_tablets.count(tablet_id)1.70M
) {
412
522k
    return false;
413
522k
  }
414
  // If we ask to use placement information, check against it.
415
1.70M
  if (placement_info && 
!GetValidPlacement(to_ts, placement_info).has_value()1.70M
) {
416
1.36M
    YB_LOG_EVERY_N_SECS(INFO, 30) << "tablet server " << to_ts << " has invalid placement info. "
417
1.23k
                                  << "Not allowing it to take more tablets.";
418
1.36M
    return false;
419
1.36M
  }
420
  // If this server has a pending tablet delete, don't use it.
421
340k
  if (servers_with_pending_deletes_.count(to_ts)) {
422
28.2k
    LOG(INFO) << "tablet server " << to_ts << " has a pending delete. "
423
28.2k
              << "Not allowing it to take more tablets";
424
28.2k
    return false;
425
28.2k
  }
426
  // If all checks pass, return true.
427
312k
  return true;
428
340k
}
429
430
boost::optional<CloudInfoPB> PerTableLoadState::GetValidPlacement(
431
3.23M
    const TabletServerId& ts_uuid, const PlacementInfoPB* placement_info) {
432
3.23M
  if (!placement_info->placement_blocks().empty()) {
433
4.94M
    for (const auto& pb : placement_info->placement_blocks()) {
434
4.94M
      if (per_ts_meta_[ts_uuid].descriptor->MatchesCloudInfo(pb.cloud_info())) {
435
1.73M
        return pb.cloud_info();
436
1.73M
      }
437
4.94M
    }
438
1.43M
    return boost::none;
439
3.17M
  }
440
  // Return the cloudInfoPB of TS if no placement policy is specified
441
59.0k
  return per_ts_meta_[ts_uuid].descriptor->GetCloudInfo();
442
3.23M
}
443
444
Result<bool> PerTableLoadState::CanSelectWrongReplicaToMove(
445
    const TabletId& tablet_id, const PlacementInfoPB& placement_info, TabletServerId* out_from_ts,
446
863
    TabletServerId* out_to_ts) {
447
  // We consider both invalid placements (potentially due to config or schema changes), as well
448
  // as servers being blacklisted, as wrong placement.
449
863
  const auto& tablet_meta = per_tablet_meta_[tablet_id];
450
  // Prioritize taking away load from blacklisted servers, then from wrong placements.
451
863
  bool found_match = false;
452
  // Use these to do a fallback move, if placement id is the only thing that does not match.
453
863
  TabletServerId fallback_to_uuid;
454
863
  TabletServerId fallback_from_uuid;
455
863
  for (const auto& from_uuid : tablet_meta.blacklisted_tablet_servers) {
456
857
    bool invalid_placement = tablet_meta.wrong_placement_tablet_servers.count(from_uuid);
457
3.73k
    for (const auto& to_uuid : sorted_load_) {
458
      // TODO(bogdan): this could be made smarter if we kept track of per-placement numbers and
459
      // allowed to remove one from one placement, as long as it is still above the minimum.
460
      //
461
      // If this is a blacklisted server, we should aim to still respect placement and for now,
462
      // just try to move the load to the same placement. However, if the from_uuid was
463
      // previously invalidly placed, then we should ignore its placement.
464
3.73k
      if (invalid_placement &&
465
3.73k
          
VERIFY_RESULT0
(CanAddTabletToTabletServer(tablet_id, to_uuid, &placement_info))) {
466
0
        found_match = true;
467
3.73k
      } else {
468
3.73k
        if (VERIFY_RESULT(CanAddTabletToTabletServer(tablet_id, to_uuid))) {
469
          // If we have placement information, we want to only pick the tablet if it's moving
470
          // to the same placement, so we guarantee we're keeping the same type of distribution.
471
          // Since we allow prefixes as well, we can still respect the placement of this tablet
472
          // even if their placement ids aren't the same. For example:
473
          // placement info of tablet: C.R1.*
474
          // placement info of from_ts: C.R1.Z1
475
          // placement info of to_ts: C.R2.Z2
476
          // Note that we've assumed that for every TS there is a unique placement block
477
          // to which it can be mapped (see the validation rules in yb_admin-client).
478
          // If there is no unique placement block then it is simply the C.R.Z of the TS itself.
479
449
          auto ci_from_ts = GetValidPlacement(from_uuid, &placement_info);
480
449
          auto ci_to_ts = GetValidPlacement(to_uuid, &placement_info);
481
449
          if (ci_to_ts.has_value() && ci_from_ts.has_value() &&
482
449
          TSDescriptor::generate_placement_id(*ci_from_ts) ==
483
449
                                    TSDescriptor::generate_placement_id(*ci_to_ts)) {
484
415
            found_match = true;
485
415
          } else {
486
            // ENG-500 : Placement does not match, but we can still use this combo as a fallback.
487
            // It uses the last such pair, which should be fine.
488
34
            fallback_to_uuid = to_uuid;
489
34
            fallback_from_uuid = from_uuid;
490
34
          }
491
449
        }
492
3.73k
      }
493
3.73k
      if (found_match) {
494
415
        *out_from_ts = from_uuid;
495
415
        *out_to_ts = to_uuid;
496
415
        return true;
497
415
      }
498
3.73k
    }
499
857
  }
500
501
448
  if (!fallback_to_uuid.empty()) {
502
26
    *out_from_ts = fallback_from_uuid;
503
26
    *out_to_ts = fallback_to_uuid;
504
26
    return true;
505
26
  }
506
507
  // TODO(bogdan): sort and pick the highest load as source.
508
  //
509
  // If we didn't have or find any blacklisted server to move load from, move to the wrong
510
  // placement tablet servers. We can pick any of them as the source for now.
511
422
  if (!tablet_meta.wrong_placement_tablet_servers.empty()) {
512
10
    for (const auto& to_uuid : sorted_load_) {
513
10
      if (VERIFY_RESULT(CanAddTabletToTabletServer(tablet_id, to_uuid, &placement_info))) {
514
6
        *out_from_ts = *tablet_meta.wrong_placement_tablet_servers.begin();
515
6
        *out_to_ts = to_uuid;
516
6
        return true;
517
6
      }
518
10
    }
519
6
  }
520
521
416
  return false;
522
422
}
523
524
2.65k
Status PerTableLoadState::AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts) {
525
2.65k
  RETURN_NOT_OK(AddStartingTablet(tablet_id, to_ts));
526
2.65k
  tablets_added_.insert(tablet_id);
527
2.65k
  SortLoad();
528
2.65k
  return Status::OK();
529
2.65k
}
530
531
3.20k
Status PerTableLoadState::RemoveReplica(const TabletId& tablet_id, const TabletServerId& from_ts) {
532
3.20k
  RETURN_NOT_OK(RemoveRunningTablet(tablet_id, from_ts));
533
3.10k
  if (per_ts_meta_[from_ts].starting_tablets.count(tablet_id)) {
534
0
    LOG(DFATAL) << "Invalid request: remove starting tablet " << tablet_id
535
0
                << " from ts " << from_ts;
536
0
  }
537
3.10k
  if (per_tablet_meta_[tablet_id].leader_uuid == from_ts) {
538
815
    RETURN_NOT_OK(MoveLeader(tablet_id, from_ts));
539
815
  }
540
  // This artificially constrains the removes to only handle one over_replication/wrong_placement
541
  // per run.
542
  // Create a copy of tablet_id because tablet_id could be a const reference from
543
  // tablets_wrong_placement_ (if the requests comes from HandleRemoveIfWrongPlacement) or a const
544
  // reference from tablets_over_replicated_ (if the request comes from HandleRemoveReplicas).
545
3.10k
  TabletId tablet_id_key(tablet_id);
546
3.10k
  tablets_over_replicated_.erase(tablet_id_key);
547
3.10k
  tablets_wrong_placement_.erase(tablet_id_key);
548
3.10k
  SortLoad();
549
3.10k
  return Status::OK();
550
3.10k
}
551
552
1.78M
void PerTableLoadState::SortLoad() {
553
1.78M
  auto comparator = Comparator(this);
554
1.78M
  sort(sorted_load_.begin(), sorted_load_.end(), comparator);
555
556
1.78M
  if (global_state_->drive_aware_) {
557
1.78M
    SortDriveLoad();
558
1.78M
  }
559
1.78M
}
560
561
1.78M
void PerTableLoadState::SortDriveLoad() {
562
  // Sort drives on each ts by the tablets count to use a sorted list while
563
  // looking for the tablet to move from the drive with the highest tablet count.
564
5.36M
  for (const auto& ts : sorted_load_) {
565
5.36M
    auto& ts_meta = per_ts_meta_[ts];
566
5.36M
    std::vector<std::pair<std::string, uint64>> drive_load;
567
5.36M
    bool empty_path_found = false;
568
5.72M
    for (const auto& path_to_tablet : ts_meta.path_to_tablets) {
569
5.72M
      if (path_to_tablet.first.empty()) {
570
        // TS reported tablet without path (rolling restart case).
571
763
        empty_path_found = true;
572
763
        continue;
573
763
      }
574
5.72M
      int starting_tablets_count = FindWithDefault(ts_meta.path_to_starting_tablets_count,
575
5.72M
                                                   path_to_tablet.first, 0);
576
5.72M
      drive_load.emplace_back(std::pair<std::string, uint>(
577
5.72M
                                {path_to_tablet.first,
578
5.72M
                                 starting_tablets_count + path_to_tablet.second.size()}));
579
5.72M
    }
580
581
    // Sort by decreasing load.
582
5.36M
    sort(drive_load.begin(), drive_load.end(),
583
5.36M
          [](const std::pair<std::string, uint64>& l, const std::pair<std::string, uint64>& r) {
584
544k
              return l.second > r.second;
585
544k
            });
586
5.36M
    ts_meta.sorted_path_load_by_tablets_count.reserve(drive_load.size());
587
5.36M
    std::transform(drive_load.begin(), drive_load.end(),
588
5.36M
                    std::back_inserter(ts_meta.sorted_path_load_by_tablets_count),
589
5.72M
                    [](const std::pair<std::string, uint64>& v) { return v.first;});
590
5.36M
    if (empty_path_found) {
591
      // Empty path was found at path_to_tablets, so add the empty path to the
592
      // end so that it has the lowest priority.
593
763
      ts_meta.sorted_path_load_by_tablets_count.push_back(std::string());
594
763
    }
595
5.36M
  }
596
1.78M
}
597
598
Status PerTableLoadState::MoveLeader(const TabletId& tablet_id,
599
                                     const TabletServerId& from_ts,
600
                                     const TabletServerId& to_ts,
601
54.6k
                                     const TabletServerId& to_ts_path) {
602
54.6k
  if (per_tablet_meta_[tablet_id].leader_uuid != from_ts) {
603
1
    return STATUS_SUBSTITUTE(IllegalState, "Tablet $0 has leader $1, but $2 expected.",
604
1
                              tablet_id, per_tablet_meta_[tablet_id].leader_uuid, from_ts);
605
1
  }
606
54.6k
  per_tablet_meta_[tablet_id].leader_uuid = to_ts;
607
54.6k
  RETURN_NOT_OK(RemoveLeaderTablet(tablet_id, from_ts));
608
54.5k
  if (!to_ts.empty()) {
609
53.7k
    RETURN_NOT_OK(AddLeaderTablet(tablet_id, to_ts, to_ts_path));
610
53.7k
  }
611
54.5k
  SortLeaderLoad();
612
54.5k
  return Status::OK();
613
54.5k
}
614
615
946k
void PerTableLoadState::SortLeaderLoad() {
616
946k
  auto leader_count_comparator = LeaderLoadComparator(this);
617
946k
  sort(sorted_non_affinitized_leader_load_.begin(),
618
946k
        sorted_non_affinitized_leader_load_.end(),
619
946k
        leader_count_comparator);
620
946k
  sort(sorted_leader_load_.begin(), sorted_leader_load_.end(), leader_count_comparator);
621
622
946k
  if (global_state_->drive_aware_) {
623
946k
    SortDriveLeaderLoad();
624
946k
  }
625
946k
}
626
627
946k
void PerTableLoadState::SortDriveLeaderLoad() {
628
  // Sort drives on each ts by the leaders count to use a sorted list while
629
  // looking the leader to move to the drive with the least leaders count.
630
2.81M
  for (const auto& ts : sorted_leader_load_) {
631
2.81M
    auto& ts_meta = per_ts_meta_[ts];
632
2.81M
    std::vector<std::pair<std::string, uint64>> drive_load;
633
2.81M
    bool empty_path_found = false;
634
    // Add drives with leaders
635
2.81M
    for (const auto& path_to_tablet : ts_meta.path_to_leaders) {
636
2.35M
      if (path_to_tablet.first.empty()) {
637
525
        empty_path_found = true;
638
525
        continue;
639
525
      }
640
2.35M
      drive_load.emplace_back(std::pair<std::string, uint>(
641
2.35M
                                {path_to_tablet.first, path_to_tablet.second.size()}));
642
2.35M
    }
643
    // Add drives without leaders, but with tablets
644
2.99M
    for (const auto& path_to_tablet : ts_meta.path_to_tablets) {
645
2.99M
      const auto& path = path_to_tablet.first;
646
2.99M
      if (path.empty()) {
647
421
        continue;
648
421
      }
649
650
2.99M
      if (ts_meta.path_to_leaders.find(path) == ts_meta.path_to_leaders.end()) {
651
636k
        drive_load.emplace_back(std::pair<std::string, uint>({path_to_tablet.first, 0}));
652
636k
      }
653
2.99M
    }
654
655
    // Sort by ascending load.
656
2.81M
    sort(drive_load.begin(), drive_load.end(),
657
2.81M
          [](const std::pair<std::string, uint64>& l, const std::pair<std::string, uint64>& r) {
658
268k
              return l.second < r.second;
659
268k
            });
660
2.81M
    bool add_empty_path = empty_path_found || 
ts_meta.path_to_leaders.empty()2.81M
;
661
2.81M
    ts_meta.sorted_path_load_by_leader_count.reserve(drive_load.size() + (add_empty_path ? 
1462k
:
02.34M
));
662
2.81M
    if (add_empty_path) {
663
      // Empty path was found at path_to_leaders or no leaders on TS, so add the empty path.
664
462k
      ts_meta.sorted_path_load_by_leader_count.push_back(std::string());
665
462k
    }
666
2.81M
    std::transform(drive_load.begin(), drive_load.end(),
667
2.81M
                    std::back_inserter(ts_meta.sorted_path_load_by_leader_count),
668
2.99M
                    [](const std::pair<std::string, uint64>& v) { return v.first;});
669
2.81M
  }
670
946k
}
671
672
51
void PerTableLoadState::LogSortedLeaderLoad() {
673
  // Sample output:
674
  // ts1_uuid[ts1_load] ts2_uuid[ts2_load] ts4_uuid[ts4_load] -- ts3_uuid[ts3_load]
675
  // Note: entries following "--" are leader blacklisted tservers
676
677
51
  bool blacklisted_leader = false;
678
51
  std::string s;
679
186
  for (const auto& ts_uuid : sorted_leader_load_) {
680
186
    if (!blacklisted_leader) {
681
186
      blacklisted_leader = (leader_blacklisted_servers_.find(ts_uuid) !=
682
186
          leader_blacklisted_servers_.end());
683
186
      if (blacklisted_leader) {
684
51
        s += " --";
685
51
      }
686
186
    }
687
688
186
    s +=  " " + ts_uuid + "[" + strings::Substitute("$0", GetLeaderLoad(ts_uuid)) + "]";
689
186
  }
690
51
  if (s.size() > 0) {
691
51
    LOG(INFO) << "tservers sorted by whether leader blacklisted and load: " << s;
692
51
  }
693
51
}
694
695
891k
void PerTableLoadState::AdjustLeaderBalanceThreshold() {
696
891k
  if (leader_balance_threshold_ != 0) {
697
12
    int min_threshold = sorted_leader_load_.empty() ? 
00
:
698
12
                        static_cast<int>(std::ceil(
699
12
                          static_cast<double>(per_tablet_meta_.size()) /
700
12
                          static_cast<double>(sorted_leader_load_.size())));
701
12
    if (leader_balance_threshold_ < min_threshold) {
702
0
      LOG(WARNING) << strings::Substitute(
703
0
        "leader_balance_threshold flag is set to $0 but is too low for the current "
704
0
          "configuration. Adjusting it to $1.",
705
0
        leader_balance_threshold_, min_threshold);
706
0
      leader_balance_threshold_ = min_threshold;
707
0
    }
708
12
  }
709
891k
}
710
711
std::shared_ptr<const TabletReplicaMap> PerTableLoadState::GetReplicaLocations(
712
3.94M
    TabletInfo* tablet) {
713
3.94M
  auto replica_locations = std::make_shared<TabletReplicaMap>();
714
3.94M
  auto replica_map = tablet->GetReplicaLocations();
715
11.7M
  for (const auto& it : *replica_map) {
716
11.7M
    const TabletReplica& replica = it.second;
717
11.7M
    bool is_replica_live =  IsTsInLivePlacement(replica.ts_desc);
718
11.7M
    if (is_replica_live && 
options_->type == LIVE11.5M
) {
719
11.3M
      replica_locations->emplace(it.first, replica);
720
11.3M
    } else 
if (374k
!is_replica_live374k
&&
options_->type == READ_ONLY251k
) {
721
126k
      const string& placement_uuid = replica.ts_desc->placement_uuid();
722
126k
      if (placement_uuid == options_->placement_uuid) {
723
124k
        replica_locations->emplace(it.first, replica);
724
124k
      }
725
126k
    }
726
11.7M
  }
727
3.94M
  return replica_locations;
728
3.94M
}
729
730
// Registers tablet_id as running on tserver ts_uuid, on drive `path`.
//
// Returns IllegalState when no metadata exists for ts_uuid. The running counters (global
// per-tserver, per-table total and per-tablet) are bumped only the first time the tablet is seen
// on this tserver; the tablet is recorded under `path` in path_to_tablets on every call.
Status PerTableLoadState::AddRunningTablet(const TabletId& tablet_id,
                                           const TabletServerId& ts_uuid,
                                           const std::string& path) {
  const auto ts_meta_it = per_ts_meta_.find(ts_uuid);
  SCHECK(ts_meta_it != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));
  auto& meta_ts = ts_meta_it->second;

  // insert() reports through `.second` whether the element was newly added.
  const bool newly_running = meta_ts.running_tablets.insert(tablet_id).second;
  if (newly_running) {
    ++global_state_->per_ts_global_meta_[ts_uuid].running_tablets_count;
    ++total_running_;
    ++per_tablet_meta_[tablet_id].running;
  }

  meta_ts.path_to_tablets[path].insert(tablet_id);
  return Status::OK();
}
746
747
// Unregisters tablet_id as running on tserver ts_uuid.
//
// Returns IllegalState when no metadata exists for ts_uuid or when the tablet is not in that
// tserver's running set. Otherwise the running counters (global per-tserver, per-table total and
// per-tablet) are decremented and the tablet is removed from the drive that holds it.
Status PerTableLoadState::RemoveRunningTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));
  auto& meta_ts = per_ts_meta_.at(ts_uuid);
  auto num_erased = meta_ts.running_tablets.erase(tablet_id);
  if (num_erased == 0) {
    return STATUS_FORMAT(
        IllegalState,
        "Could not find running tablet to remove: ts_uuid: $0, tablet_id: $1",
        ts_uuid, tablet_id);
  }
  global_state_->per_ts_global_meta_[ts_uuid].running_tablets_count -= num_erased;
  total_running_ -= num_erased;
  per_tablet_meta_[tablet_id].running -= num_erased;

  // erase() returns the number of removed elements, so a non-zero result means this drive held
  // the tablet. Stop at the first drive it was actually removed from.
  bool found = false;
  for (auto& path : meta_ts.path_to_tablets) {
    if (path.second.erase(tablet_id) != 0) {
      found = true;
      break;
    }
  }
  VLOG_IF(1, !found) << "Updated replica wasn't found, tablet id: " << tablet_id;
  return Status::OK();
}
772
773
// Registers tablet_id as starting on tserver ts_uuid.
//
// Returns IllegalState when no metadata exists for ts_uuid. Counters are updated only when the
// tablet was not already recorded as starting on this tserver.
Status PerTableLoadState::AddStartingTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  const auto ts_meta_it = per_ts_meta_.find(ts_uuid);
  SCHECK(ts_meta_it != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  const bool newly_starting = ts_meta_it->second.starting_tablets.insert(tablet_id).second;
  if (!newly_starting) {
    return Status::OK();
  }

  ++global_state_->per_ts_global_meta_[ts_uuid].starting_tablets_count;
  ++total_starting_;
  ++global_state_->total_starting_tablets_;
  ++per_tablet_meta_[tablet_id].starting;

  // A tablet that is not tracked as missing replicas is recorded as over replicated once this
  // extra starting replica is added.
  if (tablets_missing_replicas_.count(tablet_id) == 0) {
    tablets_over_replicated_.insert(tablet_id);
  }
  return Status::OK();
}
790
791
// Registers tserver ts_uuid as hosting the leader of tablet_id on drive ts_path.
//
// Returns IllegalState when no metadata exists for ts_uuid. The global leader count of the
// tserver is bumped only when the tablet was not already in its leader set; the tablet is
// recorded under ts_path in path_to_leaders on every call.
Status PerTableLoadState::AddLeaderTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid, const TabletServerId& ts_path) {
  const auto ts_meta_it = per_ts_meta_.find(ts_uuid);
  SCHECK(ts_meta_it != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));
  auto& meta_ts = ts_meta_it->second;

  const bool newly_leader = meta_ts.leaders.insert(tablet_id).second;
  if (newly_leader) {
    ++global_state_->per_ts_global_meta_[ts_uuid].leaders_count;
  }

  meta_ts.path_to_leaders[ts_path].insert(tablet_id);
  return Status::OK();
}
803
804
// Unregisters tserver ts_uuid as hosting the leader of tablet_id.
//
// Returns IllegalState when no metadata exists for ts_uuid. The tablet is removed from the
// tserver's leader set, the global leader count is decremented by the number of removed entries
// (zero when the tablet was not a leader there), and the tablet is dropped from the per-drive
// leader map that AddLeaderTablet populated.
Status PerTableLoadState::RemoveLeaderTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));
  auto& meta_ts = per_ts_meta_.at(ts_uuid);
  auto num_erased = meta_ts.leaders.erase(tablet_id);
  global_state_->per_ts_global_meta_[ts_uuid].leaders_count -= num_erased;

  // Keep path_to_leaders in sync with `leaders`, otherwise per-drive leader counts stay inflated
  // after a leader has been moved away. A drive left without leaders is removed from the map so
  // that the map looks the same as if it had been built without this leader.
  auto& path_to_leaders = meta_ts.path_to_leaders;
  for (auto it = path_to_leaders.begin(); it != path_to_leaders.end(); ++it) {
    if (it->second.erase(tablet_id) != 0) {
      if (it->second.empty()) {
        path_to_leaders.erase(it);
      }
      break;
    }
  }
  return Status::OK();
}
812
813
// Adds tablet_id to the disabled_by_ts_tablets set of tserver ts_uuid.
// Returns IllegalState when no metadata exists for ts_uuid.
Status PerTableLoadState::AddDisabledByTSTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  const auto ts_meta_it = per_ts_meta_.find(ts_uuid);
  SCHECK(ts_meta_it != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  auto& disabled_tablets = ts_meta_it->second.disabled_by_ts_tablets;
  disabled_tablets.insert(tablet_id);
  return Status::OK();
}
820
821
0
bool PerTableLoadState::CompareByReplica(const TabletReplica& a, const TabletReplica& b) {
822
0
  return CompareByUuid(a.ts_desc->permanent_uuid(), b.ts_desc->permanent_uuid());
823
0
}
824
825
} // namespace master
826
} // namespace yb