YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/master/tablet_split_manager.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include <chrono>
15
16
#include "yb/common/constants.h"
17
18
#include "yb/gutil/casts.h"
19
#include "yb/gutil/map-util.h"
20
21
#include "yb/master/async_rpc_tasks.h"
22
#include "yb/master/master_error.h"
23
#include "yb/master/tablet_split_manager.h"
24
25
#include "yb/master/xcluster_split_driver.h"
26
#include "yb/server/monitored_task.h"
27
28
#include "yb/util/flag_tags.h"
29
#include "yb/util/result.h"
30
31
DEFINE_int32(process_split_tablet_candidates_interval_msec, 0,
32
             "The minimum time between automatic splitting attempts. The actual splitting time "
33
             "between runs is also affected by catalog_manager_bg_task_wait_ms, which controls how "
34
             "long the bg tasks thread sleeps at the end of each loop. The top-level automatic "
35
             "tablet splitting method, which checks for the time since last run, is run once per "
36
             "loop.");
37
DEFINE_int32(max_queued_split_candidates, 0,
38
             "DEPRECATED. The max number of pending tablet split candidates we will hold onto. We "
39
             "potentially iterate through every candidate in the queue for each tablet we process "
40
             "in a tablet report so this size should be kept relatively small to avoid any "
41
             "issues.");
42
43
DECLARE_bool(enable_automatic_tablet_splitting);
44
45
DEFINE_uint64(outstanding_tablet_split_limit, 1,
46
              "Limit of the number of outstanding tablet splits. Limitation is disabled if this "
47
              "value is set to 0.");
48
49
DECLARE_bool(TEST_validate_all_tablet_candidates);
50
51
DEFINE_bool(enable_tablet_split_of_pitr_tables, true,
52
            "When set, it enables automatic tablet splitting of tables covered by "
53
            "Point In Time Restore schedules.");
54
TAG_FLAG(enable_tablet_split_of_pitr_tables, runtime);
55
56
DEFINE_bool(enable_tablet_split_of_xcluster_replicated_tables, false,
57
            "When set, it enables automatic tablet splitting for tables that are part of an "
58
            "xCluster replication setup");
59
TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, runtime);
60
TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, hidden);
61
62
DEFINE_uint64(tablet_split_limit_per_table, 256,
63
              "Limit of the number of tablets per table for tablet splitting. Limitation is "
64
              "disabled if this value is set to 0.");
65
66
namespace yb {
67
namespace master {
68
69
using strings::Substitute;
70
using namespace std::literals;
71
72
TabletSplitManager::TabletSplitManager(
73
    TabletSplitCandidateFilterIf* filter,
74
    TabletSplitDriverIf* driver,
75
    XClusterSplitDriverIf* xcluster_split_driver):
76
    filter_(filter),
77
    driver_(driver),
78
    xcluster_split_driver_(xcluster_split_driver),
79
5.45k
    last_run_time_(CoarseDuration::zero()) {}
80
81
11.5M
Status TabletSplitManager::ValidateSplitCandidateTable(const TableInfo& table) {
82
11.5M
  if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) {
83
0
    return Status::OK();
84
0
  }
85
11.5M
  if (table.is_deleted()) {
86
0
    VLOG(1) << Substitute("Table is deleted; ignoring for splitting. table_id: $0", table.id());
87
184k
    return STATUS_FORMAT(
88
184k
        NotSupported,
89
184k
        "Table is deleted; ignoring for splitting. table_id: $0", table.id());
90
184k
  }
91
  // Check if this table is covered by a PITR schedule.
92
11.3M
  if (!FLAGS_enable_tablet_split_of_pitr_tables &&
93
0
      VERIFY_RESULT(filter_->IsTablePartOfSomeSnapshotSchedule(table))) {
94
0
    VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of"
95
0
                          " some active PITR schedule, table_id: $0", table.id());
96
0
    return STATUS_FORMAT(
97
0
        NotSupported,
98
0
        "Tablet splitting is not supported for tables that are a part of"
99
0
        " some active PITR schedule, table_id: $0", table.id());
100
0
  }
101
  // Check if this table is part of a CDC stream.
102
11.3M
  if (PREDICT_TRUE(!FLAGS_enable_tablet_split_of_xcluster_replicated_tables) &&
103
11.3M
      filter_->IsCdcEnabled(table)) {
104
0
    VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of"
105
0
                          " a CDC stream, table_id: $0", table.id());
106
4
    return STATUS_FORMAT(
107
4
        NotSupported,
108
4
        "Tablet splitting is not supported for tables that are a part of"
109
4
        " a CDC stream, tablet_id: $0", table.id());
110
4
  }
111
11.3M
  if (table.GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) {
112
0
    VLOG(1) << Substitute("Tablet splitting is not supported for transaction status tables, "
113
0
                          "table_id: $0", table.id());
114
29.1k
    return STATUS_FORMAT(
115
29.1k
        NotSupported,
116
29.1k
        "Tablet splitting is not supported for transaction status tables, table_id: $0",
117
29.1k
        table.id());
118
29.1k
  }
119
11.3M
  if (table.GetTableType() == REDIS_TABLE_TYPE) {
120
0
    VLOG(1) << Substitute("Tablet splitting is not supported for YEDIS tables, table_id: $0",
121
0
                          table.id());
122
4.59k
    return STATUS_FORMAT(
123
4.59k
        NotSupported,
124
4.59k
        "Tablet splitting is not supported for YEDIS tables, table_id: $0", table.id());
125
4.59k
  }
126
11.3M
  if (FLAGS_tablet_split_limit_per_table != 0 &&
127
11.3M
      table.NumPartitions() >= FLAGS_tablet_split_limit_per_table) {
128
    // TODO(tsplit): Avoid having tablet servers scan tablets of tables that have already
129
    //  reached the per-table tablet split limit (#6220).
130
0
    VLOG(1) << Substitute("Too many tablets for the table, table_id: $0, limit: $1",
131
0
                          table.id(), FLAGS_tablet_split_limit_per_table);
132
0
    return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::REACHED_SPLIT_LIMIT),
133
0
                            "Too many tablets for the table, table_id: $0, limit: $1",
134
0
                            table.id(), FLAGS_tablet_split_limit_per_table);
135
0
  }
136
11.3M
  if (table.IsBackfilling()) {
137
0
    VLOG(1) << Substitute("Backfill operation in progress, table_id: $0", table.id());
138
2.41k
    return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::SPLIT_OR_BACKFILL_IN_PROGRESS),
139
2.41k
                            "Backfill operation in progress, table_id: $0", table.id());
140
2.41k
  }
141
11.2M
  return Status::OK();
142
11.2M
}
143
144
546k
Status TabletSplitManager::ValidateSplitCandidateTablet(const TabletInfo& tablet) {
145
546k
  if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) {
146
0
    return Status::OK();
147
0
  }
148
546k
  if (tablet.colocated()) {
149
1.22k
    return STATUS_FORMAT(
150
1.22k
        NotSupported, "Tablet splitting is not supported for colocated tables, tablet_id: $0",
151
1.22k
        tablet.tablet_id());
152
1.22k
  }
153
545k
  {
154
545k
    auto tablet_state = tablet.LockForRead()->pb.state();
155
545k
    if (tablet_state != SysTabletsEntryPB::RUNNING) {
156
4
      return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::TABLET_NOT_RUNNING),
157
4
                              "Tablet is not in running state: $0",
158
4
                              tablet_state);
159
4
    }
160
545k
  }
161
545k
  return Status::OK();
162
545k
}
163
164
94
bool AllReplicasHaveFinishedCompaction(const TabletInfo& tablet_info) {
165
94
  auto replica_map = tablet_info.GetReplicaLocations();
166
186
  for (auto const& replica : *replica_map) {
167
186
    if (replica.second.drive_info.may_have_orphaned_post_split_data) {
168
26
      return false;
169
26
    }
170
186
  }
171
68
  return true;
172
94
}
173
174
11.7M
bool TabletSplitManager::ShouldSplitTablet(const TabletInfo& tablet) {
175
11.7M
  auto tablet_lock = tablet.LockForRead();
176
  // If no leader for this tablet, skip it for now.
177
11.7M
  auto drive_info_opt = tablet.GetLeaderReplicaDriveInfo();
178
11.7M
  if (!drive_info_opt.ok()) {
179
11.1M
    return false;
180
11.1M
  }
181
546k
  if (ValidateSplitCandidateTablet(tablet).ok() &&
182
545k
      filter_->ShouldSplitValidCandidate(tablet, drive_info_opt.get()) &&
183
46
      AllReplicasHaveFinishedCompaction(tablet)) {
184
40
    return true;
185
40
  }
186
546k
  return false;
187
546k
}
188
189
89.3k
void TabletSplitManager::ScheduleSplits(const unordered_set<TabletId>& splits_to_schedule) {
190
35
  for (const auto& tablet_id : splits_to_schedule) {
191
35
    auto s = driver_->SplitTablet(tablet_id, false /* select_all_tablets_for_split */);
192
35
    if (!s.ok()) {
193
0
      WARN_NOT_OK(s, Format("Failed to start/restart split for tablet_id: $0.", tablet_id));
194
35
    } else {
195
35
      LOG(INFO) << Substitute("Scheduled split for tablet_id: $0.", tablet_id);
196
35
    }
197
35
  }
198
89.3k
}
199
200
89.3k
void TabletSplitManager::DoSplitting(const TableInfoMap& table_info_map) {
201
  // Splits which are tracked by an AsyncGetTabletSplitKey or AsyncSplitTablet task.
202
89.3k
  unordered_set<TabletId> splits_with_task;
203
  // Splits for which at least one child tablet is still undergoing compaction.
204
89.3k
  unordered_set<TabletId> compacting_splits;
205
  // Splits that need to be started / restarted.
206
89.3k
  unordered_set<TabletId> splits_to_schedule;
207
  // New split candidates. The chosen candidates are eventually added to splits_to_schedule.
208
89.3k
  TabletInfos new_split_candidates;
209
210
  // Helper method to determine if more splits can be scheduled, or if we should exit early.
211
23.0M
  const auto can_split_more = [&]() {
212
23.0M
    uint64_t outstanding_splits = splits_with_task.size() +
213
23.0M
                                  compacting_splits.size() +
214
23.0M
                                  splits_to_schedule.size();
215
23.0M
    return outstanding_splits < FLAGS_outstanding_tablet_split_limit;
216
23.0M
  };
217
218
  // TODO(asrivastava): We might want to loop over all running tables when determining outstanding
219
  // splits, to avoid missing outstanding splits for tables that have recently become invalid for
220
  // splitting. This is most critical for tables that frequently switch between being valid and
221
  // invalid for splitting (e.g. for tables with frequent PITR schedules).
222
  // https://github.com/yugabyte/yugabyte-db/issues/11459
223
89.3k
  vector<TableInfoPtr> valid_tables;
224
11.5M
  for (const auto& table : table_info_map) {
225
11.5M
    if (ValidateSplitCandidateTable(*table.second).ok()) {
226
11.2M
      valid_tables.push_back(table.second);
227
11.2M
    }
228
11.5M
  }
229
230
11.2M
  for (const auto& table : valid_tables) {
231
217k
    for (const auto& task : table->GetTasks()) {
232
      // These tasks will retry automatically until they succeed or fail.
233
217k
      if (task->type() == yb::server::MonitoredTask::ASYNC_GET_TABLET_SPLIT_KEY ||
234
217k
          task->type() == yb::server::MonitoredTask::ASYNC_SPLIT_TABLET) {
235
0
        const TabletId tablet_id = static_cast<AsyncTabletLeaderTask*>(task.get())->tablet_id();
236
0
        splits_with_task.insert(tablet_id);
237
0
        LOG(INFO) << Substitute("Found split with ongoing task. Task type: $0. "
238
0
                                "Split parent id: $1.", task->type_name(), tablet_id);
239
0
        if (!can_split_more()) {
240
0
          return;
241
0
        }
242
0
      }
243
217k
    }
244
11.2M
  }
245
246
11.2M
  for (const auto& table : valid_tables) {
247
11.7M
    for (const auto& tablet : table->GetTablets()) {
248
11.7M
      if (!can_split_more()) {
249
45
        break;
250
45
      }
251
11.7M
      if (splits_with_task.count(tablet->id())) {
252
0
        continue;
253
0
      }
254
255
11.7M
      auto tablet_lock = tablet->LockForRead();
256
      // Ignore a tablet as a new split candidate if it is part of an outstanding split.
257
11.7M
      bool ignore_as_candidate = false;
258
11.7M
      if (tablet_lock->pb.has_split_parent_tablet_id()) {
259
73
        const TabletId& parent_id = tablet_lock->pb.split_parent_tablet_id();
260
73
        if (splits_with_task.count(parent_id) != 0) {
261
0
          continue;
262
0
        }
263
73
        if (!tablet_lock->is_running()) {
264
          // Recently split child is not running; restart the split.
265
25
          ignore_as_candidate = true;
266
25
          LOG(INFO) << Substitute("Found split child ($0) that is not running. Adding parent ($1) "
267
25
                                  "to list of splits to reschedule.", tablet->id(), parent_id);
268
25
          splits_to_schedule.insert(parent_id);
269
48
        } else if (!AllReplicasHaveFinishedCompaction(*tablet)) {
270
          // This (running) tablet is the child of a split and is still compacting. We assume that
271
          // this split will eventually complete for both tablets.
272
20
          ignore_as_candidate = true;
273
20
          LOG(INFO) << Substitute("Found split child ($0) that is compacting. Adding parent ($1) "
274
20
                                  " to list of compacting splits.", tablet->id(), parent_id);
275
20
          compacting_splits.insert(parent_id);
276
20
        }
277
73
        if (splits_to_schedule.count(parent_id) != 0 && compacting_splits.count(parent_id) != 0) {
278
          // It's possible that one child subtablet leads us to insert the parent tablet id into
279
          // splits_to_schedule, and another leads us to insert into compacting_splits. In this
280
          // case, it means one of the children is live, thus both children have been created and
281
          // the split RPC does not need to be scheduled.
282
0
          LOG(INFO) << Substitute("Found compacting split child ($0), so removing split parent "
283
0
                                  "($1) from splits to schedule.", tablet->id(), parent_id);
284
0
          splits_to_schedule.erase(parent_id);
285
0
        }
286
73
      }
287
11.7M
      if (!ignore_as_candidate && ShouldSplitTablet(*tablet)) {
288
40
        new_split_candidates.push_back(tablet);
289
40
      }
290
11.7M
    }
291
11.2M
    if (!can_split_more()) {
292
45
      break;
293
45
    }
294
11.2M
  }
295
296
  // Add any new splits to the set of splits to schedule (while respecting the max number of
297
  // outstanding splits).
298
19
  for (const auto& tablet : new_split_candidates) {
299
19
    if (!can_split_more()) {
300
9
      break;
301
9
    }
302
10
    splits_to_schedule.insert(tablet->id());
303
10
  }
304
305
89.3k
  ScheduleSplits(splits_to_schedule);
306
89.3k
}
307
308
90.0k
void TabletSplitManager::MaybeDoSplitting(const TableInfoMap& table_info_map) {
309
90.0k
  if (!FLAGS_enable_automatic_tablet_splitting || FLAGS_outstanding_tablet_split_limit == 0) {
310
685
    return;
311
685
  }
312
313
89.3k
  auto time_since_last_run = CoarseMonoClock::Now() - last_run_time_;
314
89.3k
  if (time_since_last_run < (FLAGS_process_split_tablet_candidates_interval_msec * 1ms)) {
315
0
    return;
316
0
  }
317
89.3k
  DoSplitting(table_info_map);
318
89.3k
  last_run_time_ = CoarseMonoClock::Now();
319
89.3k
}
320
321
void TabletSplitManager::ProcessSplitTabletResult(
322
    const Status& status,
323
    const TableId& split_table_id,
324
43
    const SplitTabletIds& split_tablet_ids) {
325
43
  if (!status.ok()) {
326
1
    LOG(WARNING) << "AsyncSplitTablet task failed with status: " << status;
327
42
  } else {
328
    // TODO(JHE) Handle failure cases here (github issue #11030).
329
    // Update the xCluster tablet mapping.
330
42
    Status s = xcluster_split_driver_->UpdateXClusterConsumerOnTabletSplit(
331
42
        split_table_id, split_tablet_ids);
332
42
    WARN_NOT_OK(s, Format(
333
42
        "Encountered an error while updating the xCluster consumer tablet mapping. "
334
42
        "Table id: $0, Split Tablets: $1",
335
42
        split_table_id, split_tablet_ids.ToString()));
336
    // Also process tablet splits for producer side splits.
337
42
    s = xcluster_split_driver_->UpdateXClusterProducerOnTabletSplit(
338
42
        split_table_id, split_tablet_ids);
339
42
    WARN_NOT_OK(s, Format(
340
42
        "Encountered an error while updating the xCluster producer tablet mapping. "
341
42
        "Table id: $0, Split Tablets: $1",
342
42
        split_table_id, split_tablet_ids.ToString()));
343
42
  }
344
43
}
345
346
}  // namespace master
347
}  // namespace yb