YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/master/tablet_split_manager.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include <chrono>
15
16
#include "yb/common/constants.h"
17
18
#include "yb/gutil/casts.h"
19
#include "yb/gutil/map-util.h"
20
21
#include "yb/common/schema.h"
22
23
#include "yb/master/async_rpc_tasks.h"
24
#include "yb/master/master_error.h"
25
#include "yb/master/master_fwd.h"
26
#include "yb/master/tablet_split_manager.h"
27
#include "yb/master/ts_descriptor.h"
28
29
#include "yb/master/xcluster_split_driver.h"
30
#include "yb/server/monitored_task.h"
31
32
#include "yb/util/flag_tags.h"
33
#include "yb/util/monotime.h"
34
#include "yb/util/result.h"
35
#include "yb/util/unique_lock.h"
36
37
// Minimum spacing, in milliseconds, between automatic splitting passes. MaybeDoSplitting()
// compares the time since its last completed pass against this value.
DEFINE_int32(process_split_tablet_candidates_interval_msec, 0,
38
             "The minimum time between automatic splitting attempts. The actual splitting time "
39
             "between runs is also affected by catalog_manager_bg_task_wait_ms, which controls how "
40
             "long the bg tasks thread sleeps at the end of each loop. The top-level automatic "
41
             "tablet splitting method, which checks for the time since last run, is run once per "
42
             "loop.");
43
// Marked DEPRECATED by its own help text; no read of this flag is visible in this listing.
DEFINE_int32(max_queued_split_candidates, 0,
44
             "DEPRECATED. The max number of pending tablet split candidates we will hold onto. We "
45
             "potentially iterate through every candidate in the queue for each tablet we process "
46
             "in a tablet report so this size should be kept relatively small to avoid any "
47
             "issues.");
48
49
// Only declared here (the definition lives elsewhere). MaybeDoSplitting() returns immediately
// when this flag is false.
DECLARE_bool(enable_automatic_tablet_splitting);
50
51
// Upper bound used by DoSplitting() on the sum of splits that have a task, splits whose children
// are compacting, and splits about to be scheduled. 0 disables the bound.
DEFINE_uint64(outstanding_tablet_split_limit, 1,
52
              "Limit of the number of outstanding tablet splits. Limitation is disabled if this "
53
              "value is set to 0.");
54
55
// Only declared here. When set, ValidateSplitCandidateTable() and ValidateSplitCandidateTablet()
// return OK without running any of their checks.
DECLARE_bool(TEST_validate_all_tablet_candidates);
56
57
// When false, ValidateSplitCandidateTable() rejects tables for which
// filter_->IsTablePartOfSomeSnapshotSchedule() reports true.
DEFINE_bool(enable_tablet_split_of_pitr_tables, true,
58
            "When set, it enables automatic tablet splitting of tables covered by "
59
            "Point In Time Restore schedules.");
60
TAG_FLAG(enable_tablet_split_of_pitr_tables, runtime);
61
62
// When false, ValidateSplitCandidateTable() rejects tables for which filter_->IsCdcEnabled()
// reports true.
DEFINE_bool(enable_tablet_split_of_xcluster_replicated_tables, false,
63
            "When set, it enables automatic tablet splitting for tables that are part of an "
64
            "xCluster replication setup");
65
TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, runtime);
66
TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, hidden);
67
68
// Compared against table.NumPartitions() in ValidateSplitCandidateTable(). 0 disables the check.
DEFINE_uint64(tablet_split_limit_per_table, 256,
69
              "Limit of the number of tablets per table for tablet splitting. Limitation is "
70
              "disabled if this value is set to 0.");
71
72
// Length, in seconds, of the ignore window that MarkTtlTableForSplitIgnore() records for a table.
// When 0, no window is recorded.
DEFINE_uint64(prevent_split_for_ttl_tables_for_seconds, 86400,
73
              "Seconds between checks for whether to split a table with TTL. Checks are disabled "
74
              "if this value is set to 0.");
75
76
namespace yb {
77
namespace master {
78
79
using strings::Substitute;
80
using namespace std::literals;
81
82
// Stores the three collaborator pointers exactly as given. The manager starts in the
// "not running" state, and both the "splitting disabled until" and "last run" time points are
// initialized from a zero duration.
TabletSplitManager::TabletSplitManager(
    TabletSplitCandidateFilterIf* filter,
    TabletSplitDriverIf* driver,
    XClusterSplitDriverIf* xcluster_split_driver)
    : filter_(filter),
      driver_(driver),
      xcluster_split_driver_(xcluster_split_driver),
      is_running_(false),
      splitting_disabled_until_(CoarseDuration::zero()),
      last_run_time_(CoarseDuration::zero()) {
}
92
93
94.3M
// Decides whether `table` may currently be considered for tablet splitting.
//
// Returns Status::OK() when the table passes every check (or unconditionally when
// FLAGS_TEST_validate_all_tablet_candidates is set). Otherwise returns, in check order:
//   - NotSupported: the table is deleted;
//   - NotSupported: the table has an unexpired entry in ignore_table_for_splitting_until_;
//   - NotSupported: PITR splitting is disabled and the table is part of a snapshot schedule;
//   - NotSupported: xCluster splitting is disabled and filter_->IsCdcEnabled(table) is true;
//   - NotSupported: the table is a transaction status table or a YEDIS table;
//   - IllegalState with MasterErrorPB::REACHED_SPLIT_LIMIT: the per-table tablet limit is reached;
//   - IllegalState with MasterErrorPB::SPLIT_OR_BACKFILL_IN_PROGRESS: a backfill is running.
// A failure of filter_->IsTablePartOfSomeSnapshotSchedule() is passed through VERIFY_RESULT.
//
// Side effect: an expired entry for this table in ignore_table_for_splitting_until_ is erased
// (under mutex_).
Status TabletSplitManager::ValidateSplitCandidateTable(const TableInfo& table) {
  if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) {
    return Status::OK();
  }
  if (table.is_deleted()) {
    VLOG(1) << Substitute("Table is deleted; ignoring for splitting. table_id: $0", table.id());
    return STATUS_FORMAT(
        NotSupported,
        "Table is deleted; ignoring for splitting. table_id: $0", table.id());
  }
  {
    // The ignore map is shared with MarkTtlTableForSplitIgnore(), hence the lock.
    UniqueLock<decltype(mutex_)> lock(mutex_);
    const auto entry = ignore_table_for_splitting_until_.find(table.id());
    if (entry != ignore_table_for_splitting_until_.end()) {
      const auto ignore_for_split_ttl_until = entry->second;
      if (ignore_for_split_ttl_until > CoarseMonoClock::Now()) {
        VLOG(1) << Substitute("Table has file expiration for TTL enabled; ignored for "
            "splitting until $0. table_id: $1",  ToString(ignore_for_split_ttl_until), table.id());
        return STATUS_FORMAT(
            NotSupported,
            "Table has file expiration for TTL enabled; ignored for splitting until $0. "
            "table_id: $1", ToString(ignore_for_split_ttl_until), table.id());
      } else {
        // The ignore window has elapsed; drop the entry so the table is evaluated normally.
        ignore_table_for_splitting_until_.erase(entry);
      }
    }
  }
  // Check if this table is covered by a PITR schedule.
  if (!FLAGS_enable_tablet_split_of_pitr_tables &&
      VERIFY_RESULT(filter_->IsTablePartOfSomeSnapshotSchedule(table))) {
    VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of"
                          " some active PITR schedule, table_id: $0", table.id());
    return STATUS_FORMAT(
        NotSupported,
        "Tablet splitting is not supported for tables that are a part of"
        " some active PITR schedule, table_id: $0", table.id());
  }
  // Check if this table is part of a cdc stream.
  if (PREDICT_TRUE(!FLAGS_enable_tablet_split_of_xcluster_replicated_tables) &&
      filter_->IsCdcEnabled(table)) {
    VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of"
                          " a CDC stream, table_id: $0", table.id());
    // The value reported is a table id, so it is labelled "table_id" to match the VLOG above and
    // the other messages in this function.
    return STATUS_FORMAT(
        NotSupported,
        "Tablet splitting is not supported for tables that are a part of"
        " a CDC stream, table_id: $0", table.id());
  }
  if (table.GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) {
    VLOG(1) << Substitute("Tablet splitting is not supported for transaction status tables, "
                          "table_id: $0", table.id());
    return STATUS_FORMAT(
        NotSupported,
        "Tablet splitting is not supported for transaction status tables, table_id: $0",
        table.id());
  }
  if (table.GetTableType() == REDIS_TABLE_TYPE) {
    VLOG(1) << Substitute("Tablet splitting is not supported for YEDIS tables, table_id: $0",
                          table.id());
    return STATUS_FORMAT(
        NotSupported,
        "Tablet splitting is not supported for YEDIS tables, table_id: $0", table.id());
  }
  if (FLAGS_tablet_split_limit_per_table != 0 &&
      table.NumPartitions() >= FLAGS_tablet_split_limit_per_table) {
    // TODO(tsplit): Avoid having tablet servers scan tablets of tables that have already reached
    //  the per-table split limit (#6220).
    VLOG(1) << Substitute("Too many tablets for the table, table_id: $0, limit: $1",
                          table.id(), FLAGS_tablet_split_limit_per_table);
    return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::REACHED_SPLIT_LIMIT),
                            "Too many tablets for the table, table_id: $0, limit: $1",
                            table.id(), FLAGS_tablet_split_limit_per_table);
  }
  if (table.IsBackfilling()) {
    VLOG(1) << Substitute("Backfill operation in progress, table_id: $0", table.id());
    return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::SPLIT_OR_BACKFILL_IN_PROGRESS),
                            "Backfill operation in progress, table_id: $0", table.id());
  }
  return Status::OK();
}
172
173
3.50M
// Decides whether `tablet` itself may be split.
//
// Returns Status::OK() when all checks pass (or unconditionally when
// FLAGS_TEST_validate_all_tablet_candidates is set). Failures from fetching the table schema or
// the tablet leader are returned as-is. Otherwise:
//   - NotSupported when the table has a default TTL and the leader's tablet server reports
//     get_disable_tablet_split_if_default_ttl(); the table is also marked to be ignored via
//     MarkTtlTableForSplitIgnore();
//   - NotSupported when the tablet is colocated;
//   - IllegalState with MasterErrorPB::TABLET_NOT_RUNNING when the tablet state is not RUNNING.
Status TabletSplitManager::ValidateSplitCandidateTablet(const TabletInfo& tablet) {
  if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) {
    return Status::OK();
  }

  Schema table_schema;
  RETURN_NOT_OK(tablet.table()->GetSchema(&table_schema));
  auto leader_ts = VERIFY_RESULT(tablet.GetLeader());
  if (table_schema.table_properties().HasDefaultTimeToLive()) {
    if (leader_ts->get_disable_tablet_split_if_default_ttl()) {
      MarkTtlTableForSplitIgnore(tablet.table()->id());
      return STATUS_FORMAT(
          NotSupported,
          "Tablet splitting is not supported for tables with default time to live, "
          "tablet_id: $0",
          tablet.tablet_id());
    }
  }

  if (tablet.colocated()) {
    return STATUS_FORMAT(
        NotSupported,
        "Tablet splitting is not supported for colocated tables, tablet_id: $0",
        tablet.tablet_id());
  }

  // The read lock is only held for the duration of this one expression.
  const auto current_state = tablet.LockForRead()->pb.state();
  if (current_state == SysTabletsEntryPB::RUNNING) {
    return Status::OK();
  }
  return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::TABLET_NOT_RUNNING),
                          "Tablet is not in running state: $0",
                          current_state);
}
204
205
206
207
0
void TabletSplitManager::MarkTtlTableForSplitIgnore(const TableId& table_id) {
208
0
  if (FLAGS_prevent_split_for_ttl_tables_for_seconds != 0) {
209
0
    const auto recheck_at = CoarseMonoClock::Now()
210
0
        + MonoDelta::FromSeconds(FLAGS_prevent_split_for_ttl_tables_for_seconds);
211
0
    UniqueLock<decltype(mutex_)> lock(mutex_);
212
0
    ignore_table_for_splitting_until_.insert({table_id, recheck_at});
213
0
  }
214
0
}
215
216
217
399
bool AllReplicasHaveFinishedCompaction(const TabletInfo& tablet_info) {
218
399
  auto replica_map = tablet_info.GetReplicaLocations();
219
492
  for (auto const& replica : *replica_map) {
220
492
    if (replica.second.drive_info.may_have_orphaned_post_split_data) {
221
26
      return false;
222
26
    }
223
492
  }
224
373
  return true;
225
399
}
226
227
95.3M
bool TabletSplitManager::ShouldSplitTablet(const TabletInfo& tablet) {
228
95.3M
  auto tablet_lock = tablet.LockForRead();
229
  // If no leader for this tablet, skip it for now.
230
95.3M
  auto drive_info_opt = tablet.GetLeaderReplicaDriveInfo();
231
95.3M
  if (!drive_info_opt.ok()) {
232
91.8M
    return false;
233
91.8M
  }
234
3.50M
  if (ValidateSplitCandidateTablet(tablet).ok() &&
235
3.50M
      
filter_->ShouldSplitValidCandidate(tablet, drive_info_opt.get())3.49M
&&
236
3.50M
      
AllReplicasHaveFinishedCompaction(tablet)94
) {
237
88
    return true;
238
88
  }
239
3.50M
  return false;
240
3.50M
}
241
242
1.52M
void TabletSplitManager::ScheduleSplits(const unordered_set<TabletId>& splits_to_schedule) {
243
1.52M
  for (const auto& tablet_id : splits_to_schedule) {
244
133
    auto s = driver_->SplitTablet(tablet_id, false /* select_all_tablets_for_split */);
245
133
    if (!s.ok()) {
246
0
      WARN_NOT_OK(s, Format("Failed to start/restart split for tablet_id: $0.", tablet_id));
247
133
    } else {
248
133
      LOG(INFO) << Substitute("Scheduled split for tablet_id: $0.", tablet_id);
249
133
    }
250
133
  }
251
1.52M
}
252
253
1.52M
void TabletSplitManager::DoSplitting(const TableInfoMap& table_info_map) {
254
  // Splits which are tracked by an AsyncGetTabletSplitKey or AsyncSplitTablet task.
255
1.52M
  unordered_set<TabletId> splits_with_task;
256
  // Splits for which at least one child tablet is still undergoing compaction.
257
1.52M
  unordered_set<TabletId> compacting_splits;
258
  // Splits that need to be started / restarted.
259
1.52M
  unordered_set<TabletId> splits_to_schedule;
260
  // New split candidates. The chosen candidates are eventually added to splits_to_schedule.
261
1.52M
  TabletInfos new_split_candidates;
262
263
  // Helper method to determine if more splits can be scheduled, or if we should exit early.
264
188M
  const auto can_split_more = [&]() {
265
188M
    uint64_t outstanding_splits = splits_with_task.size() +
266
188M
                                  compacting_splits.size() +
267
188M
                                  splits_to_schedule.size();
268
188M
    return FLAGS_outstanding_tablet_split_limit == 0 ||
269
188M
           outstanding_splits < FLAGS_outstanding_tablet_split_limit;
270
188M
  };
271
272
  // TODO(asrivastava): We might want to loop over all running tables when determining outstanding
273
  // splits, to avoid missing outstanding splits for tables that have recently become invalid for
274
  // splitting. This is most critical for tables that frequently switch between being valid and
275
  // invalid for splitting (e.g. for tables with frequent PITR schedules).
276
  // https://github.com/yugabyte/yugabyte-db/issues/11459
277
1.52M
  vector<TableInfoPtr> valid_tables;
278
94.3M
  for (const auto& table : table_info_map) {
279
94.3M
    if (ValidateSplitCandidateTable(*table.second).ok()) {
280
93.1M
      valid_tables.push_back(table.second);
281
93.1M
    }
282
94.3M
  }
283
284
93.1M
  for (const auto& table : valid_tables) {
285
93.1M
    for (const auto& task : table->GetTasks()) {
286
      // These tasks will retry automatically until they succeed or fail.
287
290k
      if (task->type() == yb::server::MonitoredTask::ASYNC_GET_TABLET_SPLIT_KEY ||
288
290k
          task->type() == yb::server::MonitoredTask::ASYNC_SPLIT_TABLET) {
289
6
        const TabletId tablet_id = static_cast<AsyncTabletLeaderTask*>(task.get())->tablet_id();
290
6
        splits_with_task.insert(tablet_id);
291
6
        LOG(INFO) << Substitute("Found split with ongoing task. Task type: $0. "
292
6
                                "Split parent id: $1.", task->type_name(), tablet_id);
293
6
        if (!can_split_more()) {
294
6
          return;
295
6
        }
296
6
      }
297
290k
    }
298
93.1M
  }
299
300
93.1M
  
for (const auto& table : valid_tables)1.52M
{
301
95.3M
    for (const auto& tablet : table->GetTablets()) {
302
95.3M
      if (!can_split_more()) {
303
114
        break;
304
114
      }
305
95.3M
      if (splits_with_task.count(tablet->id())) {
306
0
        continue;
307
0
      }
308
309
95.3M
      auto tablet_lock = tablet->LockForRead();
310
      // Ignore a tablet as a new split candidate if it is part of an outstanding split.
311
95.3M
      bool ignore_as_candidate = false;
312
95.3M
      if (tablet_lock->pb.has_split_parent_tablet_id()) {
313
401
        const TabletId& parent_id = tablet_lock->pb.split_parent_tablet_id();
314
401
        if (splits_with_task.count(parent_id) != 0) {
315
0
          continue;
316
0
        }
317
401
        if (!tablet_lock->is_running()) {
318
          // Recently split child is not running; restart the split.
319
96
          ignore_as_candidate = true;
320
96
          LOG(INFO) << Substitute("Found split child ($0) that is not running. Adding parent ($1) "
321
96
                                  "to list of splits to reschedule.", tablet->id(), parent_id);
322
96
          splits_to_schedule.insert(parent_id);
323
305
        } else if (!AllReplicasHaveFinishedCompaction(*tablet)) {
324
          // This (running) tablet is the child of a split and is still compacting. We assume that
325
          // this split will eventually complete for both tablets.
326
20
          ignore_as_candidate = true;
327
20
          LOG(INFO) << Substitute("Found split child ($0) that is compacting. Adding parent ($1) "
328
20
                                  " to list of compacting splits.", tablet->id(), parent_id);
329
20
          compacting_splits.insert(parent_id);
330
20
        }
331
401
        if (splits_to_schedule.count(parent_id) != 0 && 
compacting_splits.count(parent_id) != 096
) {
332
          // It's possible that one child subtablet leads us to insert the parent tablet id into
333
          // splits_to_schedule, and another leads us to insert into compacting_splits. In this
334
          // case, it means one of the children is live, thus both children have been created and
335
          // the split RPC does not need to be scheduled.
336
0
          LOG(INFO) << Substitute("Found compacting split child ($0), so removing split parent "
337
0
                                  "($1) from splits to schedule.", tablet->id(), parent_id);
338
0
          splits_to_schedule.erase(parent_id);
339
0
        }
340
401
      }
341
95.3M
      if (!ignore_as_candidate && 
ShouldSplitTablet(*tablet)95.3M
) {
342
88
        new_split_candidates.push_back(tablet);
343
88
      }
344
95.3M
    }
345
93.1M
    if (!can_split_more()) {
346
116
      break;
347
116
    }
348
93.1M
  }
349
350
  // Add any new splits to the set of splits to schedule (while respecting the max number of
351
  // outstanding splits).
352
1.52M
  for (const auto& tablet : new_split_candidates) {
353
58
    if (!can_split_more()) {
354
21
      break;
355
21
    }
356
37
    splits_to_schedule.insert(tablet->id());
357
37
  }
358
359
1.52M
  ScheduleSplits(splits_to_schedule);
360
1.52M
}
361
362
0
bool TabletSplitManager::HasOutstandingTabletSplits(const TableInfoMap& table_info_map) {
363
0
  vector<TableInfoPtr> valid_tables;
364
0
  for (const auto& table : table_info_map) {
365
0
    if (ValidateSplitCandidateTable(*table.second).ok()) {
366
0
      valid_tables.push_back(table.second);
367
0
    }
368
0
  }
369
370
0
  for (const auto& table : valid_tables) {
371
0
    for (const auto& task : table->GetTasks()) {
372
0
      if (task->type() == yb::server::MonitoredTask::ASYNC_GET_TABLET_SPLIT_KEY ||
373
0
          task->type() == yb::server::MonitoredTask::ASYNC_SPLIT_TABLET) {
374
0
        return true;
375
0
      }
376
0
    }
377
0
  }
378
379
0
  for (const auto& table : valid_tables) {
380
0
    for (const auto& tablet : table->GetTablets()) {
381
0
      auto tablet_lock = tablet->LockForRead();
382
0
      if (tablet_lock->pb.has_split_parent_tablet_id() && !tablet_lock->is_running()) {
383
0
        return true;
384
0
      }
385
0
    }
386
0
  }
387
0
  return false;
388
0
}
389
390
0
bool TabletSplitManager::IsRunning() {
391
0
  return is_running_;
392
0
}
393
394
0
bool TabletSplitManager::IsTabletSplittingComplete(const TableInfoMap& table_info_map) {
395
0
  return !HasOutstandingTabletSplits(table_info_map) && !is_running_;
396
0
}
397
398
0
void TabletSplitManager::DisableSplittingFor(const MonoDelta& disable_duration) {
399
0
  LOG(INFO) << Substitute("Disabling tablet splitting for $0 milliseconds.",
400
0
                          disable_duration.ToMilliseconds());
401
0
  splitting_disabled_until_ = CoarseMonoClock::Now() + disable_duration;
402
0
}
403
404
1.56M
void TabletSplitManager::MaybeDoSplitting(const TableInfoMap& table_info_map) {
405
1.56M
  if (!FLAGS_enable_automatic_tablet_splitting) {
406
46.4k
    return;
407
46.4k
  }
408
409
1.52M
  auto time_since_last_run = CoarseMonoClock::Now() - last_run_time_;
410
1.52M
  if (time_since_last_run < (FLAGS_process_split_tablet_candidates_interval_msec * 1ms)) {
411
457
    return;
412
457
  }
413
414
  // Setting and unsetting is_running_ could also be accomplished using a scoped object, but this is
415
  // simpler for now.
416
1.52M
  is_running_ = true;
417
1.52M
  if (CoarseMonoClock::Now() < splitting_disabled_until_) {
418
0
    is_running_ = false;
419
0
    return;
420
0
  }
421
1.52M
  DoSplitting(table_info_map);
422
1.52M
  is_running_ = false;
423
1.52M
  last_run_time_ = CoarseMonoClock::Now();
424
1.52M
}
425
426
void TabletSplitManager::ProcessSplitTabletResult(
427
    const Status& status,
428
    const TableId& split_table_id,
429
140
    const SplitTabletIds& split_tablet_ids) {
430
140
  if (!status.ok()) {
431
0
    LOG(WARNING) << "AsyncSplitTablet task failed with status: " << status;
432
140
  } else {
433
    // TODO(JHE) Handle failure cases here (github issue #11030).
434
    // Update the xCluster tablet mapping.
435
140
    Status s = xcluster_split_driver_->UpdateXClusterConsumerOnTabletSplit(
436
140
        split_table_id, split_tablet_ids);
437
140
    WARN_NOT_OK(s, Format(
438
140
        "Encountered an error while updating the xCluster consumer tablet mapping. "
439
140
        "Table id: $0, Split Tablets: $1",
440
140
        split_table_id, split_tablet_ids.ToString()));
441
    // Also process tablet splits for producer side splits.
442
140
    s = xcluster_split_driver_->UpdateXClusterProducerOnTabletSplit(
443
140
        split_table_id, split_tablet_ids);
444
140
    WARN_NOT_OK(s, Format(
445
140
        "Encountered an error while updating the xCluster producer tablet mapping. "
446
140
        "Table id: $0, Split Tablets: $1",
447
140
        split_table_id, split_tablet_ids.ToString()));
448
140
  }
449
140
}
450
451
}  // namespace master
452
}  // namespace yb