YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/master/cluster_balance_util.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/master/cluster_balance_util.h"
15
16
#include "yb/gutil/map-util.h"
17
18
#include "yb/master/catalog_entity_info.h"
19
#include "yb/master/master_cluster.pb.h"
20
21
#include "yb/util/atomic.h"
22
23
DECLARE_int32(min_leader_stepdown_retry_interval_ms);
24
25
DECLARE_int32(leader_balance_threshold);
26
27
DECLARE_int32(leader_balance_unresponsive_timeout_ms);
28
29
DECLARE_int32(replication_factor);
30
31
DECLARE_bool(allow_leader_balancing_dead_node);
32
33
namespace yb {
34
namespace master {
35
36
bool CBTabletMetadata::CanAddTSToMissingPlacements(
37
315
    const std::shared_ptr<TSDescriptor> ts_descriptor) const {
38
319
  for (const auto& under_replicated_ci : under_replicated_placements) {
39
    // A prefix.
40
319
    if (ts_descriptor->MatchesCloudInfo(under_replicated_ci)) {
41
115
      return true;
42
115
    }
43
319
  }
44
200
  return false;
45
315
}
46
47
0
std::string CBTabletMetadata::ToString() const {
48
0
  return YB_STRUCT_TO_STRING(
49
0
      running, starting, is_under_replicated, under_replicated_placements,
50
0
      is_over_replicated, over_replicated_tablet_servers,
51
0
      wrong_placement_tablet_servers, blacklisted_tablet_servers,
52
0
      leader_uuid, leader_stepdown_failures, leader_blacklisted_tablet_servers);
53
0
}
54
55
1.15M
int GlobalLoadState::GetGlobalLoad(const TabletServerId& ts_uuid) const {
56
1.15M
  const auto& ts_meta = per_ts_global_meta_.at(ts_uuid);
57
1.15M
  return ts_meta.starting_tablets_count + ts_meta.running_tablets_count;
58
1.15M
}
59
60
599k
int GlobalLoadState::GetGlobalLeaderLoad(const TabletServerId& ts_uuid) const {
61
599k
  const auto& ts_meta = per_ts_global_meta_.at(ts_uuid);
62
599k
  return ts_meta.leaders_count;
63
599k
}
64
65
// Captures the leader balance threshold flag and the current time at construction, and keeps
// the (non-owning) pointer to the shared global load state.
PerTableLoadState::PerTableLoadState(GlobalLoadState* global_state)
    : leader_balance_threshold_(FLAGS_leader_balance_threshold),
      current_time_(MonoTime::Now()),
      global_state_(global_state) {
}
69
70
120k
// Nothing to release explicitly; members clean up after themselves.
PerTableLoadState::~PerTableLoadState() = default;
71
72
bool PerTableLoadState::LeaderLoadComparator::operator()(
73
352k
    const TabletServerId& a, const TabletServerId& b) {
74
  // Primary criteria: whether tserver is leader blacklisted.
75
352k
  auto a_leader_blacklisted =
76
352k
    state_->leader_blacklisted_servers_.find(a) != state_->leader_blacklisted_servers_.end();
77
352k
  auto b_leader_blacklisted =
78
352k
    state_->leader_blacklisted_servers_.find(b) != state_->leader_blacklisted_servers_.end();
79
352k
  if (a_leader_blacklisted != b_leader_blacklisted) {
80
1.21k
    return !a_leader_blacklisted;
81
1.21k
  }
82
83
  // Use global leader load as tie-breaker.
84
351k
  auto a_load = state_->GetLeaderLoad(a);
85
351k
  auto b_load = state_->GetLeaderLoad(b);
86
351k
  if (a_load == b_load) {
87
284k
    a_load = state_->global_state_->GetGlobalLeaderLoad(a);
88
284k
    b_load = state_->global_state_->GetGlobalLeaderLoad(b);
89
284k
    if (a_load == b_load) {
90
262k
      return a < b;
91
262k
    }
92
88.3k
  }
93
  // Secondary criteria: tserver leader load.
94
88.3k
  return a_load < b_load;
95
88.3k
}
96
97
627k
bool PerTableLoadState::CompareByUuid(const TabletServerId& a, const TabletServerId& b) {
98
627k
  auto load_a = GetLoad(a);
99
627k
  auto load_b = GetLoad(b);
100
627k
  if (load_a == load_b) {
101
    // Use global load as a heuristic to help break ties.
102
575k
    load_a = global_state_->GetGlobalLoad(a);
103
575k
    load_b = global_state_->GetGlobalLoad(b);
104
575k
    if (load_a == load_b) {
105
563k
      return a < b;
106
563k
    }
107
63.6k
  }
108
63.6k
  return load_a < load_b;
109
63.6k
}
110
111
1.54M
size_t PerTableLoadState::GetLoad(const TabletServerId& ts_uuid) const {
112
1.54M
  const auto& ts_meta = per_ts_meta_.at(ts_uuid);
113
1.54M
  return ts_meta.starting_tablets.size() + ts_meta.running_tablets.size();
114
1.54M
}
115
116
986k
size_t PerTableLoadState::GetLeaderLoad(const TabletServerId& ts_uuid) const {
117
986k
  return per_ts_meta_.at(ts_uuid).leaders.size();
118
986k
}
119
120
967k
// Recomputes the load-balancing state for one tablet: registers each replica with its tablet
// server (leader / running / starting / disabled), records blacklisted and leader-blacklisted
// replicas, derives the over/under-replication and wrong-placement information from the
// table's placement policy, and finally adds the tablet to the relevant per-table sets.
//
// Returns a LeaderNotReadyToServe status if a replica lives on a tablet server we have no
// metadata for, or on a dead, non-blacklisted tablet server while
// FLAGS_allow_leader_balancing_dead_node is off. Errors from the Add*Tablet helpers are
// propagated as well.
Status PerTableLoadState::UpdateTablet(TabletInfo *tablet) {
  const auto& tablet_id = tablet->id();
  // Set the per-tablet entry to empty default and get the reference for filling up information.
  auto& tablet_meta = per_tablet_meta_[tablet_id];

  // Get the placement for this tablet.
  const auto& placement = placement_by_table_[tablet->table()->id()];

  // Get replicas for this tablet.
  auto replica_map = GetReplicaLocations(tablet);
  // Set state information for both the tablet and the tablet server replicas.
  for (const auto& replica_it : *replica_map) {
    const auto& ts_uuid = replica_it.first;
    const auto& replica = replica_it.second;
    // If we do not have ts_meta information for this particular replica, then we are in the
    // rare case where we just became the master leader and started doing load balancing, but we
    // have yet to receive heartbeats from all the tablet servers. We will just return false
    // across the stack and stop load balancing and log errors, until we get all the needed info.
    //
    // Worst case scenario, there is a network partition that is stopping us from actually
    // getting the heartbeats from a certain tablet server, but we anticipate that to be a
    // temporary matter. We should monitor error logs for this and see that it never actually
    // becomes a problem!
    //
    // A single find() serves both the existence check and the access below.
    const auto ts_meta_it = per_ts_meta_.find(ts_uuid);
    if (ts_meta_it == per_ts_meta_.end()) {
      return STATUS_SUBSTITUTE(LeaderNotReadyToServe, "Master leader has not yet received "
          "heartbeat from ts $0, either master just became leader or a network partition.",
                                ts_uuid);
    }
    auto& meta_ts = ts_meta_it->second;

    // If the TS of this replica is deemed DEAD then perform LBing only if it is blacklisted.
    if (check_ts_liveness_ && !meta_ts.descriptor->IsLiveAndHasReported()) {
      if (!blacklisted_servers_.count(ts_uuid)) {
        if (GetAtomicFlag(&FLAGS_allow_leader_balancing_dead_node)) {
          allow_only_leader_balancing_ = true;
          YB_LOG_EVERY_N_SECS(INFO, 30)
              << strings::Substitute("Master leader not received heartbeat from ts $0. "
                                     "Only performing leader balancing for tables with replicas"
                                     " in this TS.", ts_uuid);
        } else {
          return STATUS_SUBSTITUTE(LeaderNotReadyToServe, "Master leader has not yet received "
              "heartbeat from ts $0. Aborting load balancing.", ts_uuid);
        }
      } else {
        YB_LOG_EVERY_N_SECS(INFO, 30)
            << strings::Substitute("Master leader not received heartbeat from ts $0 but it is "
                                   "blacklisted. Continuing LB operations for tables with replicas"
                                   " in this TS.", ts_uuid);
      }
    }

    // Fill leader info.
    if (replica.role == PeerRole::LEADER) {
      tablet_meta.leader_uuid = ts_uuid;
      RETURN_NOT_OK(AddLeaderTablet(tablet_id, ts_uuid, replica.fs_data_dir));
    }

    const tablet::RaftGroupStatePB& tablet_state = replica.state;
    const bool replica_is_stale = replica.IsStale();
    VLOG(2) << "Tablet " << tablet_id << " for table " << table_id_
              << " is in state " << RaftGroupStatePB_Name(tablet_state);
    if (tablet_state == tablet::RUNNING) {
      RETURN_NOT_OK(AddRunningTablet(tablet_id, ts_uuid, replica.fs_data_dir));
    } else if (!replica_is_stale &&
                (tablet_state == tablet::BOOTSTRAPPING || tablet_state == tablet::NOT_STARTED)) {
      // Keep track of transitioning state (not running, but not in a stopped or failed state).
      RETURN_NOT_OK(AddStartingTablet(tablet_id, ts_uuid));
      VLOG(1) << "Increased total_starting to "
                  << total_starting_ << " for tablet " << tablet_id << " and table " << table_id_;
      // operator[] value-initializes a missing counter to zero, so this covers both the
      // "first starting tablet on this drive" and the "one more" case.
      ++meta_ts.path_to_starting_tablets_count[replica.fs_data_dir];
    } else if (replica_is_stale) {
      VLOG(1) << "Replica is stale: " << replica.ToString();
    }

    if (replica.should_disable_lb_move) {
      RETURN_NOT_OK(AddDisabledByTSTablet(tablet_id, ts_uuid));
      VLOG(1) << "Replica was disabled by TS: " << replica.ToString();
    }

    // If this replica is blacklisted, we want to keep track of these specially, so we can
    // prioritize accordingly.
    if (blacklisted_servers_.count(ts_uuid)) {
      tablet_meta.blacklisted_tablet_servers.insert(ts_uuid);
    }

    // If this replica has blacklisted leader, we want to keep track of these specially, so we can
    // prioritize accordingly.
    if (leader_blacklisted_servers_.count(ts_uuid)) {
      tablet_meta.leader_blacklisted_tablet_servers.insert(ts_uuid);
    }
  }

  // Only set the over-replication section if we need to.
  // A placement without an explicit replica count falls back to FLAGS_replication_factor.
  size_t placement_num_replicas = placement.num_replicas() > 0 ?
      placement.num_replicas() : FLAGS_replication_factor;
  tablet_meta.is_over_replicated = placement_num_replicas < replica_map->size();
  tablet_meta.is_under_replicated = placement_num_replicas > replica_map->size();

  // If no placement information, we will have already set the over and under replication flags.
  // For under-replication, we cannot use any placement_id, so we just leave the set empty and
  // use that as a marker that we are in this situation.
  //
  // For over-replication, we just add all the ts_uuids as candidates.
  if (placement.placement_blocks().empty()) {
    if (tablet_meta.is_over_replicated) {
      // Map keys are const, so they can only be copied (the original std::move was a no-op).
      for (const auto& replica_entry : *replica_map) {
        tablet_meta.over_replicated_tablet_servers.insert(replica_entry.first);
      }
    }
  } else {
    // If we do have placement information, figure out how the load is distributed based on
    // placement blocks, for this tablet.
    std::unordered_map<CloudInfoPB, vector<TabletReplica>, cloud_hash, cloud_equal_to>
                                                                    placement_to_replicas;
    std::unordered_map<CloudInfoPB, int, cloud_hash, cloud_equal_to> placement_to_min_replicas;
    // Preset the min_replicas, so we know if we're missing replicas somewhere as well.
    for (const auto& pb : placement.placement_blocks()) {
      // Default empty vector.
      placement_to_replicas[pb.cloud_info()];
      placement_to_min_replicas[pb.cloud_info()] = pb.min_num_replicas();
    }
    // Now actually fill the structures with matching TSs.
    for (auto& replica_entry : *replica_map) {
      auto ci = GetValidPlacement(replica_entry.first, &placement);
      if (ci.has_value()) {
        // NOTE(review): kept as in the original; if GetReplicaLocations hands out a
        // pointer-to-const map this move degrades to a copy -- confirm against its declaration.
        placement_to_replicas[*ci].push_back(std::move(replica_entry.second));
      } else {
        // If placement does not match, we likely changed the config or the schema and this
        // tablet should no longer live on this tablet server.
        tablet_meta.wrong_placement_tablet_servers.insert(replica_entry.first);
      }
    }

    // Loop over the data and populate extra replica as well as missing replica information.
    for (const auto& entry : placement_to_replicas) {
      const auto& cloud_info = entry.first;
      const auto& replica_set = entry.second;
      const size_t min_num_replicas = placement_to_min_replicas[cloud_info];
      if (min_num_replicas > replica_set.size()) {
        // Placements that are under-replicated should be handled ASAP.
        tablet_meta.under_replicated_placements.insert(cloud_info);
      } else if (tablet_meta.is_over_replicated && min_num_replicas < replica_set.size()) {
        // If this tablet is over-replicated, consider all the placements that have more than the
        // minimum number of tablets, as candidates for removing a replica.
        for (const auto& replica : replica_set) {
          tablet_meta.over_replicated_tablet_servers.insert(replica.ts_desc->permanent_uuid());
        }
      }
    }
  }
  // Collect the leader step-down failures newer than the retry interval cut-off.
  tablet->GetLeaderStepDownFailureTimes(
      current_time_ - MonoDelta::FromMilliseconds(FLAGS_min_leader_stepdown_retry_interval_ms),
      &tablet_meta.leader_stepdown_failures);

  // Prepare placement related sets for tablets that have placement info.
  if (tablet_meta.is_missing_replicas()) {
    tablets_missing_replicas_.insert(tablet_id);
  }
  if (tablet_meta.is_over_replicated) {
    tablets_over_replicated_.insert(tablet_id);
  }
  if (tablet_meta.has_wrong_placements()) {
    tablets_wrong_placement_.insert(tablet_id);
  }

  return Status::OK();
}
293
294
382k
void PerTableLoadState::UpdateTabletServer(std::shared_ptr<TSDescriptor> ts_desc) {
295
382k
  const auto& ts_uuid = ts_desc->permanent_uuid();
296
  // Set and get, so we can use this for both tablet servers we've added data to, as well as
297
  // tablet servers that happen to not be serving any tablets, so were not in the map yet.
298
382k
  auto& ts_meta = per_ts_meta_[ts_uuid];
299
382k
  ts_meta.descriptor = ts_desc;
300
301
  // Also insert into per_ts_global_meta_ if we have yet to.
302
382k
  global_state_->per_ts_global_meta_.emplace(ts_uuid, CBTabletServerLoadCounts());
303
304
  // Only add TS for LBing if it is not dead.
305
  // check_ts_liveness_ is an artifact of cluster_balance_mocked.h
306
  // and is used to ensure that we don't perform a liveness check
307
  // during mimicing load balancers.
308
382k
  if (!check_ts_liveness_ || ts_desc->IsLiveAndHasReported()) {
309
377k
    sorted_load_.push_back(ts_uuid);
310
377k
  }
311
312
  // Mark as blacklisted if it matches.
313
382k
  bool is_blacklisted = false;
314
39.9k
  for (const auto& hp : blacklist_.hosts()) {
315
39.9k
    if (ts_meta.descriptor->IsRunningOn(hp)) {
316
8.03k
      blacklisted_servers_.insert(ts_uuid);
317
8.03k
      is_blacklisted = true;
318
8.03k
      break;
319
8.03k
    }
320
39.9k
  }
321
322
  // Mark as blacklisted leader if it matches.
323
2.34k
  for (const auto& hp : leader_blacklist_.hosts()) {
324
2.34k
    if (ts_meta.descriptor->IsRunningOn(hp)) {
325
604
      leader_blacklisted_servers_.insert(ts_uuid);
326
604
      break;
327
604
    }
328
2.34k
  }
329
330
  // Add this tablet server for leader load-balancing only if it is not blacklisted and it has
331
  // heartbeated recently enough to be considered responsive for leader balancing.
332
  // Also, don't add it if isn't live or hasn't reported all its tablets.
333
  // check_ts_liveness_ is an artifact of cluster_balance_mocked.h
334
  // and is used to ensure that we don't perform a liveness check
335
  // during mimicing load balancers.
336
382k
  if (!is_blacklisted &&
337
374k
      ts_desc->TimeSinceHeartbeat().ToMilliseconds() <
338
374k
      FLAGS_leader_balance_unresponsive_timeout_ms &&
339
362k
      (!check_ts_liveness_ || ts_desc->IsLiveAndHasReported())) {
340
362k
    sorted_leader_load_.push_back(ts_uuid);
341
362k
  }
342
343
382k
  if (ts_desc->HasTabletDeletePending()) {
344
45.2k
    servers_with_pending_deletes_.insert(ts_uuid);
345
45.2k
  }
346
347
  // If the TS is perceived as DEAD then ignore it.
348
382k
  if (check_ts_liveness_ && !ts_desc->IsLiveAndHasReported()) {
349
4.70k
    return;
350
4.70k
  }
351
352
377k
  bool is_ts_live = IsTsInLivePlacement(ts_desc.get());
353
377k
  switch (options_->type) {
354
369k
    case LIVE: {
355
369k
      if (!is_ts_live) {
356
        // LIVE cb run with READ_ONLY ts, ignore this ts
357
2.78k
        sorted_load_.pop_back();
358
2.78k
      }
359
369k
      break;
360
0
    }
361
8.60k
    case READ_ONLY: {
362
8.60k
      if (is_ts_live) {
363
        // READ_ONLY cb run with LIVE ts, ignore this ts
364
5.64k
        sorted_load_.pop_back();
365
2.96k
      } else {
366
2.96k
        string placement_uuid = ts_desc->placement_uuid();
367
2.96k
        if (placement_uuid == "") {
368
0
          LOG(WARNING) << "Read only ts " << ts_desc->permanent_uuid()
369
0
                        << " does not have placement uuid";
370
2.96k
        } else if (placement_uuid != options_->placement_uuid) {
371
          // Do not include this ts in load balancing.
372
362
          sorted_load_.pop_back();
373
362
        }
374
2.96k
      }
375
8.60k
      sorted_leader_load_.clear();
376
8.60k
      return;
377
369k
    }
378
369k
  }
379
380
369k
  if (sorted_leader_load_.empty() ||
381
361k
      sorted_leader_load_.back() != ts_uuid ||
382
368k
      affinitized_zones_.empty()) {
383
368k
    return;
384
368k
  }
385
646
  TSRegistrationPB registration = ts_desc->GetRegistration();
386
646
  if (affinitized_zones_.find(registration.common().cloud_info()) == affinitized_zones_.end()) {
387
    // This tablet server is in an affinitized leader zone.
388
515
    sorted_leader_load_.pop_back();
389
515
    sorted_non_affinitized_leader_load_.push_back(ts_uuid);
390
515
  }
391
646
}
392
393
// Decides whether tablet_id may be placed on to_ts. The answer is false when the server is
// considered dead, the tablet was already added somewhere during this run, the server is
// blacklisted, the server already hosts the tablet (running or starting), the server does not
// fit the supplied placement_info (when one is supplied), or the server has a tablet delete
// pending. Otherwise the answer is true.
Result<bool> PerTableLoadState::CanAddTabletToTabletServer(
    const TabletId& tablet_id, const TabletServerId& to_ts, const PlacementInfoPB* placement_info) {
  const auto& target_meta = per_ts_meta_[to_ts];

  // A server deemed DEAD never receives new tablets.
  const bool target_is_dead =
      check_ts_liveness_ && !target_meta.descriptor->IsLiveAndHasReported();
  if (target_is_dead) {
    return false;
  }

  // A tablet that has already been added to a new tablet server is not added again.
  if (tablets_added_.count(tablet_id) != 0) {
    return false;
  }

  // Blacklisted servers do not take on load.
  if (blacklisted_servers_.count(to_ts) != 0) {
    return false;
  }

  // The server must not already be serving this tablet.
  const bool already_running = target_meta.running_tablets.count(tablet_id) != 0;
  if (already_running || target_meta.starting_tablets.count(tablet_id) != 0) {
    return false;
  }

  // Honor placement information when the caller asked for it.
  if (placement_info != nullptr && !GetValidPlacement(to_ts, placement_info).has_value()) {
    YB_LOG_EVERY_N_SECS(INFO, 30) << "tablet server " << to_ts << " has invalid placement info. "
                                  << "Not allowing it to take more tablets.";
    return false;
  }

  // A server with a pending tablet delete is skipped.
  if (servers_with_pending_deletes_.count(to_ts) != 0) {
    LOG(INFO) << "tablet server " << to_ts << " has a pending delete. "
              << "Not allowing it to take more tablets";
    return false;
  }

  // Every check passed.
  return true;
}
429
430
// Maps a tablet server to the placement it satisfies.
//
// With a non-empty placement policy, returns the cloud info of the first placement block the
// tablet server matches, or boost::none if it matches none of them. Without placement blocks,
// returns the tablet server's own cloud info.
boost::optional<CloudInfoPB> PerTableLoadState::GetValidPlacement(
    const TabletServerId& ts_uuid, const PlacementInfoPB* placement_info) {
  // The descriptor does not change while we scan the blocks, so look it up only once instead
  // of once per placement block.
  const auto& descriptor = per_ts_meta_[ts_uuid].descriptor;

  if (placement_info->placement_blocks().empty()) {
    // Return the cloudInfoPB of TS if no placement policy is specified
    return descriptor->GetCloudInfo();
  }

  for (const auto& pb : placement_info->placement_blocks()) {
    if (descriptor->MatchesCloudInfo(pb.cloud_info())) {
      return pb.cloud_info();
    }
  }
  return boost::none;
}
443
444
// Looks for a (source, destination) tablet server pair to move a wrongly placed replica of
// tablet_id. Replicas on blacklisted servers are considered first, then replicas on servers
// with an invalid placement. On success the pair is written to out_from_ts / out_to_ts and
// true is returned; false means no move was found. Errors from CanAddTabletToTabletServer are
// propagated.
Result<bool> PerTableLoadState::CanSelectWrongReplicaToMove(
    const TabletId& tablet_id, const PlacementInfoPB& placement_info, TabletServerId* out_from_ts,
    TabletServerId* out_to_ts) {
  // We consider both invalid placements (potentially due to config or schema changes), as well
  // as servers being blacklisted, as wrong placement.
  const auto& tablet_meta = per_tablet_meta_[tablet_id];

  // Remembered for a fallback move, used if placement id is the only thing that does not match.
  TabletServerId fallback_to_uuid;
  TabletServerId fallback_from_uuid;

  // Prioritize taking away load from blacklisted servers, then from wrong placements.
  for (const auto& from_uuid : tablet_meta.blacklisted_tablet_servers) {
    const bool invalid_placement =
        tablet_meta.wrong_placement_tablet_servers.count(from_uuid) != 0;
    for (const auto& to_uuid : sorted_load_) {
      // TODO(bogdan): this could be made smarter if we kept track of per-placement numbers and
      // allowed to remove one from one placement, as long as it is still above the minimum.
      //
      // If this is a blacklisted server, we should aim to still respect placement and for now,
      // just try to move the load to the same placement. However, if the from_uuid was
      // previously invalidly placed, then we should ignore its placement.
      bool pair_matches = false;
      if (invalid_placement &&
          VERIFY_RESULT(CanAddTabletToTabletServer(tablet_id, to_uuid, &placement_info))) {
        pair_matches = true;
      } else if (VERIFY_RESULT(CanAddTabletToTabletServer(tablet_id, to_uuid))) {
        // If we have placement information, we want to only pick the tablet if it's moving
        // to the same placement, so we guarantee we're keeping the same type of distribution.
        // Since we allow prefixes as well, we can still respect the placement of this tablet
        // even if their placement ids aren't the same. An e.g.
        // placement info of tablet: C.R1.*
        // placement info of from_ts: C.R1.Z1
        // placement info of to_ts: C.R2.Z2
        // Note that we've assumed that for every TS there is a unique placement block
        // to which it can be mapped (see the validation rules in yb_admin-client).
        // If there is no unique placement block then it is simply the C.R.Z of the TS itself.
        auto ci_from_ts = GetValidPlacement(from_uuid, &placement_info);
        auto ci_to_ts = GetValidPlacement(to_uuid, &placement_info);
        const bool same_placement =
            ci_to_ts.has_value() && ci_from_ts.has_value() &&
            TSDescriptor::generate_placement_id(*ci_from_ts) ==
                TSDescriptor::generate_placement_id(*ci_to_ts);
        if (same_placement) {
          pair_matches = true;
        } else {
          // ENG-500 : Placement does not match, but we can still use this combo as a fallback.
          // It uses the last such pair, which should be fine.
          fallback_to_uuid = to_uuid;
          fallback_from_uuid = from_uuid;
        }
      }

      if (pair_matches) {
        *out_from_ts = from_uuid;
        *out_to_ts = to_uuid;
        return true;
      }
    }
  }

  // No exact match among the blacklisted replicas: use the remembered fallback pair, if any.
  if (!fallback_to_uuid.empty()) {
    *out_from_ts = fallback_from_uuid;
    *out_to_ts = fallback_to_uuid;
    return true;
  }

  // TODO(bogdan): sort and pick the highest load as source.
  //
  // If we didn't have or find any blacklisted server to move load from, move to the wrong
  // placement tablet servers. We can pick any of them as the source for now.
  if (tablet_meta.wrong_placement_tablet_servers.empty()) {
    return false;
  }
  for (const auto& to_uuid : sorted_load_) {
    if (VERIFY_RESULT(CanAddTabletToTabletServer(tablet_id, to_uuid, &placement_info))) {
      *out_from_ts = *tablet_meta.wrong_placement_tablet_servers.begin();
      *out_to_ts = to_uuid;
      return true;
    }
  }

  return false;
}
523
524
1.28k
// Records that tablet_id is being added to to_ts: registers it as a starting tablet there,
// remembers that the tablet has been added during this run, and re-sorts the load.
// Fails (without touching tablets_added_ or the ordering) if AddStartingTablet fails.
Status PerTableLoadState::AddReplica(const TabletId& tablet_id, const TabletServerId& to_ts) {
  const Status add_status = AddStartingTablet(tablet_id, to_ts);
  RETURN_NOT_OK(add_status);

  tablets_added_.insert(tablet_id);
  SortLoad();
  return Status::OK();
}
530
531
1.51k
// Records the removal of tablet_id's replica from from_ts: drops the running tablet, moves
// leadership away if from_ts was the leader, removes the tablet from the over-replicated and
// wrong-placement sets, and re-sorts the load. Errors from RemoveRunningTablet and MoveLeader
// are propagated.
Status PerTableLoadState::RemoveReplica(const TabletId& tablet_id, const TabletServerId& from_ts) {
  RETURN_NOT_OK(RemoveRunningTablet(tablet_id, from_ts));

  const bool is_starting_on_source = per_ts_meta_[from_ts].starting_tablets.count(tablet_id) != 0;
  if (is_starting_on_source) {
    LOG(DFATAL) << "Invalid request: remove starting tablet " << tablet_id
                << " from ts " << from_ts;
  }

  const bool source_is_leader = per_tablet_meta_[tablet_id].leader_uuid == from_ts;
  if (source_is_leader) {
    RETURN_NOT_OK(MoveLeader(tablet_id, from_ts));
  }

  // This artificially constrains the removes to only handle one over_replication/wrong_placement
  // per run.
  // Work on a copy of tablet_id because tablet_id could be a const reference from
  // tablets_wrong_placement_ (if the requests comes from HandleRemoveIfWrongPlacement) or a const
  // reference from tablets_over_replicated_ (if the request comes from HandleRemoveReplicas).
  const TabletId erased_id(tablet_id);
  tablets_over_replicated_.erase(erased_id);
  tablets_wrong_placement_.erase(erased_id);

  SortLoad();
  return Status::OK();
}
551
552
245k
void PerTableLoadState::SortLoad() {
553
245k
  auto comparator = Comparator(this);
554
245k
  sort(sorted_load_.begin(), sorted_load_.end(), comparator);
555
556
245k
  if (global_state_->drive_aware_) {
557
245k
    SortDriveLoad();
558
245k
  }
559
245k
}
560
561
245k
void PerTableLoadState::SortDriveLoad() {
562
  // Sort drives on each ts by the tablets count to use a sorted list while
563
  // looking the tablet to move from the drive with the most tablets count.
564
749k
  for (const auto& ts : sorted_load_) {
565
749k
    auto& ts_meta = per_ts_meta_[ts];
566
749k
    std::vector<std::pair<std::string, uint64>> drive_load;
567
749k
    bool empty_path_found = false;
568
751k
    for (const auto& path_to_tablet : ts_meta.path_to_tablets) {
569
751k
      if (path_to_tablet.first.empty()) {
570
        // TS reported tablet without path (rolling restart case).
571
763
        empty_path_found = true;
572
763
        continue;
573
763
      }
574
750k
      int starting_tablets_count = FindWithDefault(ts_meta.path_to_starting_tablets_count,
575
750k
                                                   path_to_tablet.first, 0);
576
750k
      drive_load.emplace_back(std::pair<std::string, uint>(
577
750k
                                {path_to_tablet.first,
578
750k
                                 starting_tablets_count + path_to_tablet.second.size()}));
579
750k
    }
580
581
    // Sort by decreasing load.
582
749k
    sort(drive_load.begin(), drive_load.end(),
583
14.5k
          [](const std::pair<std::string, uint64>& l, const std::pair<std::string, uint64>& r) {
584
14.5k
              return l.second > r.second;
585
14.5k
            });
586
749k
    ts_meta.sorted_path_load_by_tablets_count.reserve(drive_load.size());
587
749k
    std::transform(drive_load.begin(), drive_load.end(),
588
749k
                    std::back_inserter(ts_meta.sorted_path_load_by_tablets_count),
589
750k
                    [](const std::pair<std::string, uint64>& v) { return v.first;});
590
749k
    if (empty_path_found) {
591
      // Empty path was found at path_to_tablets, so add the empty path to the
592
      // end so that it has the lowest priority.
593
763
      ts_meta.sorted_path_load_by_tablets_count.push_back(std::string());
594
763
    }
595
749k
  }
596
245k
}
597
598
// Moves the recorded leadership of tablet_id from from_ts to to_ts (to_ts may be empty, in
// which case no new leader is registered) and re-sorts the leader load.
// Returns IllegalState if from_ts is not the currently recorded leader; errors from
// RemoveLeaderTablet / AddLeaderTablet are propagated.
Status PerTableLoadState::MoveLeader(const TabletId& tablet_id,
                                     const TabletServerId& from_ts,
                                     const TabletServerId& to_ts,
                                     const TabletServerId& to_ts_path) {
  auto& recorded_leader = per_tablet_meta_[tablet_id].leader_uuid;
  if (recorded_leader != from_ts) {
    return STATUS_SUBSTITUTE(IllegalState, "Tablet $0 has leader $1, but $2 expected.",
                              tablet_id, recorded_leader, from_ts);
  }

  // The new leader is recorded before the per-server bookkeeping is updated.
  recorded_leader = to_ts;
  RETURN_NOT_OK(RemoveLeaderTablet(tablet_id, from_ts));

  const bool has_new_leader = !to_ts.empty();
  if (has_new_leader) {
    RETURN_NOT_OK(AddLeaderTablet(tablet_id, to_ts, to_ts_path));
  }

  SortLeaderLoad();
  return Status::OK();
}
614
615
127k
void PerTableLoadState::SortLeaderLoad() {
616
127k
  auto leader_count_comparator = LeaderLoadComparator(this);
617
127k
  sort(sorted_non_affinitized_leader_load_.begin(),
618
127k
        sorted_non_affinitized_leader_load_.end(),
619
127k
        leader_count_comparator);
620
127k
  sort(sorted_leader_load_.begin(), sorted_leader_load_.end(), leader_count_comparator);
621
622
127k
  if (global_state_->drive_aware_) {
623
127k
    SortDriveLeaderLoad();
624
127k
  }
625
127k
}
626
627
127k
void PerTableLoadState::SortDriveLeaderLoad() {
628
  // Sort drives on each ts by the leaders count to use a sorted list while
629
  // looking the leader to move to the drive with the least leaders count.
630
373k
  for (const auto& ts : sorted_leader_load_) {
631
373k
    auto& ts_meta = per_ts_meta_[ts];
632
373k
    std::vector<std::pair<std::string, uint64>> drive_load;
633
373k
    bool empty_path_found = false;
634
    // Add drives with leaders
635
343k
    for (const auto& path_to_tablet : ts_meta.path_to_leaders) {
636
343k
      if (path_to_tablet.first.empty()) {
637
323
        empty_path_found = true;
638
323
        continue;
639
323
      }
640
343k
      drive_load.emplace_back(std::pair<std::string, uint>(
641
343k
                                {path_to_tablet.first, path_to_tablet.second.size()}));
642
343k
    }
643
    // Add drives without leaders, but with tablets
644
373k
    for (const auto& path_to_tablet : ts_meta.path_to_tablets) {
645
373k
      const auto& path = path_to_tablet.first;
646
373k
      if (path.empty()) {
647
421
        continue;
648
421
      }
649
650
373k
      if (ts_meta.path_to_leaders.find(path) == ts_meta.path_to_leaders.end()) {
651
30.1k
        drive_load.emplace_back(std::pair<std::string, uint>({path_to_tablet.first, 0}));
652
30.1k
      }
653
373k
    }
654
655
    // Sort by ascending load.
656
373k
    sort(drive_load.begin(), drive_load.end(),
657
5.16k
          [](const std::pair<std::string, uint64>& l, const std::pair<std::string, uint64>& r) {
658
5.16k
              return l.second < r.second;
659
5.16k
            });
660
373k
    bool add_empty_path = empty_path_found || ts_meta.path_to_leaders.empty();
661
341k
    ts_meta.sorted_path_load_by_leader_count.reserve(drive_load.size() + (add_empty_path ? 1 : 0));
662
373k
    if (add_empty_path) {
663
      // Empty path was found at path_to_leaders or no leaders on TS, so add the empty path.
664
31.9k
      ts_meta.sorted_path_load_by_leader_count.push_back(std::string());
665
31.9k
    }
666
373k
    std::transform(drive_load.begin(), drive_load.end(),
667
373k
                    std::back_inserter(ts_meta.sorted_path_load_by_leader_count),
668
373k
                    [](const std::pair<std::string, uint64>& v) { return v.first;});
669
373k
  }
670
127k
}
671
672
32
void PerTableLoadState::LogSortedLeaderLoad() {
673
  // Sample output:
674
  // ts1_uuid[ts1_load] ts2_uuid[ts2_load] ts4_uuid[ts4_load] -- ts3_uuid[ts3_load]
675
  // Note: entries following "--" are leader blacklisted tservers
676
677
32
  bool blacklisted_leader = false;
678
32
  std::string s;
679
111
  for (const auto& ts_uuid : sorted_leader_load_) {
680
111
    if (!blacklisted_leader) {
681
111
      blacklisted_leader = (leader_blacklisted_servers_.find(ts_uuid) !=
682
111
          leader_blacklisted_servers_.end());
683
111
      if (blacklisted_leader) {
684
32
        s += " --";
685
32
      }
686
111
    }
687
688
111
    s +=  " " + ts_uuid + "[" + strings::Substitute("$0", GetLeaderLoad(ts_uuid)) + "]";
689
111
  }
690
32
  if (s.size() > 0) {
691
32
    LOG(INFO) << "tservers sorted by whether leader blacklisted and load: " << s;
692
32
  }
693
32
}
694
695
121k
void PerTableLoadState::AdjustLeaderBalanceThreshold() {
696
121k
  if (leader_balance_threshold_ != 0) {
697
0
    int min_threshold = sorted_leader_load_.empty() ? 0 :
698
12
                        static_cast<int>(std::ceil(
699
12
                          static_cast<double>(per_tablet_meta_.size()) /
700
12
                          static_cast<double>(sorted_leader_load_.size())));
701
12
    if (leader_balance_threshold_ < min_threshold) {
702
0
      LOG(WARNING) << strings::Substitute(
703
0
        "leader_balance_threshold flag is set to $0 but is too low for the current "
704
0
          "configuration. Adjusting it to $1.",
705
0
        leader_balance_threshold_, min_threshold);
706
0
      leader_balance_threshold_ = min_threshold;
707
0
    }
708
12
  }
709
121k
}
710
711
std::shared_ptr<const TabletReplicaMap> PerTableLoadState::GetReplicaLocations(
712
967k
    TabletInfo* tablet) {
713
967k
  auto replica_locations = std::make_shared<TabletReplicaMap>();
714
967k
  auto replica_map = tablet->GetReplicaLocations();
715
2.98M
  for (const auto& it : *replica_map) {
716
2.98M
    const TabletReplica& replica = it.second;
717
2.98M
    bool is_replica_live =  IsTsInLivePlacement(replica.ts_desc);
718
2.98M
    if (is_replica_live && options_->type == LIVE) {
719
2.80M
      replica_locations->emplace(it.first, replica);
720
182k
    } else if (!is_replica_live && options_->type  == READ_ONLY) {
721
62.1k
      const string& placement_uuid = replica.ts_desc->placement_uuid();
722
62.1k
      if (placement_uuid == options_->placement_uuid) {
723
60.0k
        replica_locations->emplace(it.first, replica);
724
60.0k
      }
725
62.1k
    }
726
2.98M
  }
727
967k
  return replica_locations;
728
967k
}
729
730
// Registers tablet_id as running on ts_uuid and records it under the given drive path.
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_. The running counters (global
// per-tserver, per-table total and per-tablet) are bumped only when the tablet was not already
// in the tserver's running set; the path_to_tablets entry is updated on every call.
Status PerTableLoadState::AddRunningTablet(const TabletId& tablet_id,
                                           const TabletServerId& ts_uuid,
                                           const std::string& path) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  auto& ts_meta = per_ts_meta_.at(ts_uuid);
  // insert() reports through .second whether the element was actually added.
  const bool newly_running = ts_meta.running_tablets.insert(tablet_id).second;
  if (newly_running) {
    ++global_state_->per_ts_global_meta_[ts_uuid].running_tablets_count;
    ++total_running_;
    ++per_tablet_meta_[tablet_id].running;
  }

  ts_meta.path_to_tablets[path].insert(tablet_id);
  return Status::OK();
}
746
747
// Removes tablet_id from the running tablets of ts_uuid and from the drive path that holds it.
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_ or when the tablet is not in
// the tserver's running set. On success the global per-tserver, per-table and per-tablet running
// counters are decremented. If no drive path contained the tablet, this is only reported
// via VLOG(1).
Status PerTableLoadState::RemoveRunningTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));
  auto& meta_ts = per_ts_meta_.at(ts_uuid);
  auto num_erased = meta_ts.running_tablets.erase(tablet_id);
  if (num_erased == 0) {
    return STATUS_FORMAT(
      IllegalState,
      "Could not find running tablet to remove: ts_uuid: $0, tablet_id: $1",
      ts_uuid, tablet_id);
  }
  global_state_->per_ts_global_meta_[ts_uuid].running_tablets_count -= num_erased;
  total_running_ -= num_erased;
  per_tablet_meta_[tablet_id].running -= num_erased;

  bool found = false;
  for (auto& path : meta_ts.path_to_tablets) {
    // erase() returns the number of removed elements, so a non-zero result means this path held
    // the tablet. Keep scanning the remaining paths otherwise.
    if (path.second.erase(tablet_id) != 0) {
      found = true;
      break;
    }
  }
  VLOG_IF(1, !found) << "Updated replica wasn't found, tablet id: " << tablet_id;
  return Status::OK();
}
772
773
// Registers tablet_id as starting on ts_uuid.
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_. When the tablet was not
// already in the tserver's starting set, the starting counters (global per-tserver, per-table,
// global total and per-tablet) are incremented.
Status PerTableLoadState::AddStartingTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  const bool newly_starting = per_ts_meta_.at(ts_uuid).starting_tablets.insert(tablet_id).second;
  if (!newly_starting) {
    return Status::OK();
  }

  ++global_state_->per_ts_global_meta_[ts_uuid].starting_tablets_count;
  ++total_starting_;
  ++global_state_->total_starting_tablets_;
  ++per_tablet_meta_[tablet_id].starting;

  // A tablet that is not tracked as missing replicas becomes over replicated by this add.
  if (tablets_missing_replicas_.count(tablet_id) == 0) {
    tablets_over_replicated_.insert(tablet_id);
  }
  return Status::OK();
}
790
791
// Registers ts_uuid as hosting the leader of tablet_id and records the leader under ts_path.
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_. The global per-tserver leader
// count is bumped only when the tablet was not already in the tserver's leader set; the
// path_to_leaders entry is updated on every call.
Status PerTableLoadState::AddLeaderTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid, const TabletServerId& ts_path) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  auto& ts_meta = per_ts_meta_.at(ts_uuid);
  const bool newly_leader = ts_meta.leaders.insert(tablet_id).second;
  if (newly_leader) {
    ++global_state_->per_ts_global_meta_[ts_uuid].leaders_count;
  }

  ts_meta.path_to_leaders[ts_path].insert(tablet_id);
  return Status::OK();
}
803
804
// Removes tablet_id from the leader set of ts_uuid and lowers the global per-tserver leader
// count by the number of erased entries (zero when the tablet was not a leader there).
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_.
//
// NOTE(review): path_to_leaders is not updated here, while AddLeaderTablet inserts into it.
// Presumably the tablet should be dropped from its path as well - confirm against the
// consumers of path_to_leaders.
Status PerTableLoadState::RemoveLeaderTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  auto& ts_leaders = per_ts_meta_.at(ts_uuid).leaders;
  auto erased_count = ts_leaders.erase(tablet_id);

  auto& global_ts_meta = global_state_->per_ts_global_meta_[ts_uuid];
  global_ts_meta.leaders_count -= erased_count;
  return Status::OK();
}
812
813
// Adds tablet_id to the disabled_by_ts_tablets set of ts_uuid.
//
// Returns IllegalState when ts_uuid has no entry in per_ts_meta_.
Status PerTableLoadState::AddDisabledByTSTablet(
    const TabletId& tablet_id, const TabletServerId& ts_uuid) {
  SCHECK(per_ts_meta_.find(ts_uuid) != per_ts_meta_.end(), IllegalState,
         Format(uninitialized_ts_meta_format_msg, ts_uuid, table_id_));

  auto& ts_meta = per_ts_meta_.at(ts_uuid);
  ts_meta.disabled_by_ts_tablets.insert(tablet_id);
  return Status::OK();
}
820
821
0
bool PerTableLoadState::CompareByReplica(const TabletReplica& a, const TabletReplica& b) {
822
0
  return CompareByUuid(a.ts_desc->permanent_uuid(), b.ts_desc->permanent_uuid());
823
0
}
824
825
} // namespace master
826
} // namespace yb