/Users/deen/code/yugabyte-db/src/yb/master/tablet_split_manager.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include <chrono> |
15 | | |
16 | | #include "yb/common/constants.h" |
17 | | |
18 | | #include "yb/gutil/casts.h" |
19 | | #include "yb/gutil/map-util.h" |
20 | | |
21 | | #include "yb/master/async_rpc_tasks.h" |
22 | | #include "yb/master/master_error.h" |
23 | | #include "yb/master/tablet_split_manager.h" |
24 | | |
25 | | #include "yb/master/xcluster_split_driver.h" |
26 | | #include "yb/server/monitored_task.h" |
27 | | |
28 | | #include "yb/util/flag_tags.h" |
29 | | #include "yb/util/result.h" |
30 | | |
31 | | DEFINE_int32(process_split_tablet_candidates_interval_msec, 0, |
32 | | "The minimum time between automatic splitting attempts. The actual splitting time " |
33 | | "between runs is also affected by catalog_manager_bg_task_wait_ms, which controls how " |
34 | | "long the bg tasks thread sleeps at the end of each loop. The top-level automatic " |
35 | | "tablet splitting method, which checks for the time since last run, is run once per " |
36 | | "loop."); |
37 | | DEFINE_int32(max_queued_split_candidates, 0, |
38 | | "DEPRECATED. The max number of pending tablet split candidates we will hold onto. We " |
39 | | "potentially iterate through every candidate in the queue for each tablet we process " |
40 | | "in a tablet report so this size should be kept relatively small to avoid any " |
41 | | "issues."); |
42 | | |
43 | | DECLARE_bool(enable_automatic_tablet_splitting); |
44 | | |
45 | | DEFINE_uint64(outstanding_tablet_split_limit, 1, |
46 | | "Limit of the number of outstanding tablet splits. Limitation is disabled if this " |
47 | | "value is set to 0."); |
48 | | |
49 | | DECLARE_bool(TEST_validate_all_tablet_candidates); |
50 | | |
51 | | DEFINE_bool(enable_tablet_split_of_pitr_tables, true, |
52 | | "When set, it enables automatic tablet splitting of tables covered by " |
53 | | "Point In Time Restore schedules."); |
54 | | TAG_FLAG(enable_tablet_split_of_pitr_tables, runtime); |
55 | | |
56 | | DEFINE_bool(enable_tablet_split_of_xcluster_replicated_tables, false, |
57 | | "When set, it enables automatic tablet splitting for tables that are part of an " |
58 | | "xCluster replication setup"); |
59 | | TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, runtime); |
60 | | TAG_FLAG(enable_tablet_split_of_xcluster_replicated_tables, hidden); |
61 | | |
62 | | DEFINE_uint64(tablet_split_limit_per_table, 256, |
63 | | "Limit of the number of tablets per table for tablet splitting. Limitation is " |
64 | | "disabled if this value is set to 0."); |
65 | | |
66 | | namespace yb { |
67 | | namespace master { |
68 | | |
69 | | using strings::Substitute; |
70 | | using namespace std::literals; |
71 | | |
72 | | TabletSplitManager::TabletSplitManager( |
73 | | TabletSplitCandidateFilterIf* filter, |
74 | | TabletSplitDriverIf* driver, |
75 | | XClusterSplitDriverIf* xcluster_split_driver): |
76 | | filter_(filter), |
77 | | driver_(driver), |
78 | | xcluster_split_driver_(xcluster_split_driver), |
79 | 5.45k | last_run_time_(CoarseDuration::zero()) {} |
80 | | |
81 | 11.5M | Status TabletSplitManager::ValidateSplitCandidateTable(const TableInfo& table) { |
82 | 11.5M | if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) { |
83 | 0 | return Status::OK(); |
84 | 0 | } |
85 | 11.5M | if (table.is_deleted()) { |
86 | 0 | VLOG(1) << Substitute("Table is deleted; ignoring for splitting. table_id: $0", table.id()); |
87 | 184k | return STATUS_FORMAT( |
88 | 184k | NotSupported, |
89 | 184k | "Table is deleted; ignoring for splitting. table_id: $0", table.id()); |
90 | 184k | } |
91 | | // Check if this table is covered by a PITR schedule. |
92 | 11.3M | if (!FLAGS_enable_tablet_split_of_pitr_tables && |
93 | 0 | VERIFY_RESULT(filter_->IsTablePartOfSomeSnapshotSchedule(table))) { |
94 | 0 | VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of" |
95 | 0 | " some active PITR schedule, table_id: $0", table.id()); |
96 | 0 | return STATUS_FORMAT( |
97 | 0 | NotSupported, |
98 | 0 | "Tablet splitting is not supported for tables that are a part of" |
99 | 0 | " some active PITR schedule, table_id: $0", table.id()); |
100 | 0 | } |
101 | | // Check if this table is part of a CDC stream. |
102 | 11.3M | if (PREDICT_TRUE(!FLAGS_enable_tablet_split_of_xcluster_replicated_tables) && |
103 | 11.3M | filter_->IsCdcEnabled(table)) { |
104 | 0 | VLOG(1) << Substitute("Tablet splitting is not supported for tables that are a part of" |
105 | 0 | " a CDC stream, table_id: $0", table.id()); |
106 | 4 | return STATUS_FORMAT( |
107 | 4 | NotSupported, |
108 | 4 | "Tablet splitting is not supported for tables that are a part of" |
109 | 4 | " a CDC stream, tablet_id: $0", table.id()); |
110 | 4 | } |
111 | 11.3M | if (table.GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) { |
112 | 0 | VLOG(1) << Substitute("Tablet splitting is not supported for transaction status tables, " |
113 | 0 | "table_id: $0", table.id()); |
114 | 29.1k | return STATUS_FORMAT( |
115 | 29.1k | NotSupported, |
116 | 29.1k | "Tablet splitting is not supported for transaction status tables, table_id: $0", |
117 | 29.1k | table.id()); |
118 | 29.1k | } |
119 | 11.3M | if (table.GetTableType() == REDIS_TABLE_TYPE) { |
120 | 0 | VLOG(1) << Substitute("Tablet splitting is not supported for YEDIS tables, table_id: $0", |
121 | 0 | table.id()); |
122 | 4.59k | return STATUS_FORMAT( |
123 | 4.59k | NotSupported, |
124 | 4.59k | "Tablet splitting is not supported for YEDIS tables, table_id: $0", table.id()); |
125 | 4.59k | } |
126 | 11.3M | if (FLAGS_tablet_split_limit_per_table != 0 && |
127 | 11.3M | table.NumPartitions() >= FLAGS_tablet_split_limit_per_table) { |
128 | | // TODO(tsplit): Avoid having tablet servers scan tablets of tables that have already |
129 | | // reached the per-table tablet split limit (#6220). |
130 | 0 | VLOG(1) << Substitute("Too many tablets for the table, table_id: $0, limit: $1", |
131 | 0 | table.id(), FLAGS_tablet_split_limit_per_table); |
132 | 0 | return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::REACHED_SPLIT_LIMIT), |
133 | 0 | "Too many tablets for the table, table_id: $0, limit: $1", |
134 | 0 | table.id(), FLAGS_tablet_split_limit_per_table); |
135 | 0 | } |
136 | 11.3M | if (table.IsBackfilling()) { |
137 | 0 | VLOG(1) << Substitute("Backfill operation in progress, table_id: $0", table.id()); |
138 | 2.41k | return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::SPLIT_OR_BACKFILL_IN_PROGRESS), |
139 | 2.41k | "Backfill operation in progress, table_id: $0", table.id()); |
140 | 2.41k | } |
141 | 11.2M | return Status::OK(); |
142 | 11.2M | } |
143 | | |
144 | 546k | Status TabletSplitManager::ValidateSplitCandidateTablet(const TabletInfo& tablet) { |
145 | 546k | if (PREDICT_FALSE(FLAGS_TEST_validate_all_tablet_candidates)) { |
146 | 0 | return Status::OK(); |
147 | 0 | } |
148 | 546k | if (tablet.colocated()) { |
149 | 1.22k | return STATUS_FORMAT( |
150 | 1.22k | NotSupported, "Tablet splitting is not supported for colocated tables, tablet_id: $0", |
151 | 1.22k | tablet.tablet_id()); |
152 | 1.22k | } |
153 | 545k | { |
154 | 545k | auto tablet_state = tablet.LockForRead()->pb.state(); |
155 | 545k | if (tablet_state != SysTabletsEntryPB::RUNNING) { |
156 | 4 | return STATUS_EC_FORMAT(IllegalState, MasterError(MasterErrorPB::TABLET_NOT_RUNNING), |
157 | 4 | "Tablet is not in running state: $0", |
158 | 4 | tablet_state); |
159 | 4 | } |
160 | 545k | } |
161 | 545k | return Status::OK(); |
162 | 545k | } |
163 | | |
164 | 94 | bool AllReplicasHaveFinishedCompaction(const TabletInfo& tablet_info) { |
165 | 94 | auto replica_map = tablet_info.GetReplicaLocations(); |
166 | 186 | for (auto const& replica : *replica_map) { |
167 | 186 | if (replica.second.drive_info.may_have_orphaned_post_split_data) { |
168 | 26 | return false; |
169 | 26 | } |
170 | 186 | } |
171 | 68 | return true; |
172 | 94 | } |
173 | | |
174 | 11.7M | bool TabletSplitManager::ShouldSplitTablet(const TabletInfo& tablet) { |
175 | 11.7M | auto tablet_lock = tablet.LockForRead(); |
176 | | // If there is no leader for this tablet, skip it for now. |
177 | 11.7M | auto drive_info_opt = tablet.GetLeaderReplicaDriveInfo(); |
178 | 11.7M | if (!drive_info_opt.ok()) { |
179 | 11.1M | return false; |
180 | 11.1M | } |
181 | 546k | if (ValidateSplitCandidateTablet(tablet).ok() && |
182 | 545k | filter_->ShouldSplitValidCandidate(tablet, drive_info_opt.get()) && |
183 | 46 | AllReplicasHaveFinishedCompaction(tablet)) { |
184 | 40 | return true; |
185 | 40 | } |
186 | 546k | return false; |
187 | 546k | } |
188 | | |
189 | 89.3k | void TabletSplitManager::ScheduleSplits(const unordered_set<TabletId>& splits_to_schedule) { |
190 | 35 | for (const auto& tablet_id : splits_to_schedule) { |
191 | 35 | auto s = driver_->SplitTablet(tablet_id, false /* select_all_tablets_for_split */); |
192 | 35 | if (!s.ok()) { |
193 | 0 | WARN_NOT_OK(s, Format("Failed to start/restart split for tablet_id: $0.", tablet_id)); |
194 | 35 | } else { |
195 | 35 | LOG(INFO) << Substitute("Scheduled split for tablet_id: $0.", tablet_id); |
196 | 35 | } |
197 | 35 | } |
198 | 89.3k | } |
199 | | |
200 | 89.3k | void TabletSplitManager::DoSplitting(const TableInfoMap& table_info_map) { |
201 | | // Splits which are tracked by an AsyncGetTabletSplitKey or AsyncSplitTablet task. |
202 | 89.3k | unordered_set<TabletId> splits_with_task; |
203 | | // Splits for which at least one child tablet is still undergoing compaction. |
204 | 89.3k | unordered_set<TabletId> compacting_splits; |
205 | | // Splits that need to be started / restarted. |
206 | 89.3k | unordered_set<TabletId> splits_to_schedule; |
207 | | // New split candidates. The chosen candidates are eventually added to splits_to_schedule. |
208 | 89.3k | TabletInfos new_split_candidates; |
209 | | |
210 | | // Helper method to determine if more splits can be scheduled, or if we should exit early. |
211 | 23.0M | const auto can_split_more = [&]() { |
212 | 23.0M | uint64_t outstanding_splits = splits_with_task.size() + |
213 | 23.0M | compacting_splits.size() + |
214 | 23.0M | splits_to_schedule.size(); |
215 | 23.0M | return outstanding_splits < FLAGS_outstanding_tablet_split_limit; |
216 | 23.0M | }; |
217 | | |
218 | | // TODO(asrivastava): We might want to loop over all running tables when determining outstanding |
219 | | // splits, to avoid missing outstanding splits for tables that have recently become invalid for |
220 | | // splitting. This is most critical for tables that frequently switch between being valid and |
221 | | // invalid for splitting (e.g. for tables with frequent PITR schedules). |
222 | | // https://github.com/yugabyte/yugabyte-db/issues/11459 |
223 | 89.3k | vector<TableInfoPtr> valid_tables; |
224 | 11.5M | for (const auto& table : table_info_map) { |
225 | 11.5M | if (ValidateSplitCandidateTable(*table.second).ok()) { |
226 | 11.2M | valid_tables.push_back(table.second); |
227 | 11.2M | } |
228 | 11.5M | } |
229 | | |
230 | 11.2M | for (const auto& table : valid_tables) { |
231 | 217k | for (const auto& task : table->GetTasks()) { |
232 | | // These tasks will retry automatically until they succeed or fail. |
233 | 217k | if (task->type() == yb::server::MonitoredTask::ASYNC_GET_TABLET_SPLIT_KEY || |
234 | 217k | task->type() == yb::server::MonitoredTask::ASYNC_SPLIT_TABLET) { |
235 | 0 | const TabletId tablet_id = static_cast<AsyncTabletLeaderTask*>(task.get())->tablet_id(); |
236 | 0 | splits_with_task.insert(tablet_id); |
237 | 0 | LOG(INFO) << Substitute("Found split with ongoing task. Task type: $0. " |
238 | 0 | "Split parent id: $1.", task->type_name(), tablet_id); |
239 | 0 | if (!can_split_more()) { |
240 | 0 | return; |
241 | 0 | } |
242 | 0 | } |
243 | 217k | } |
244 | 11.2M | } |
245 | | |
246 | 11.2M | for (const auto& table : valid_tables) { |
247 | 11.7M | for (const auto& tablet : table->GetTablets()) { |
248 | 11.7M | if (!can_split_more()) { |
249 | 45 | break; |
250 | 45 | } |
251 | 11.7M | if (splits_with_task.count(tablet->id())) { |
252 | 0 | continue; |
253 | 0 | } |
254 | | |
255 | 11.7M | auto tablet_lock = tablet->LockForRead(); |
256 | | // Ignore a tablet as a new split candidate if it is part of an outstanding split. |
257 | 11.7M | bool ignore_as_candidate = false; |
258 | 11.7M | if (tablet_lock->pb.has_split_parent_tablet_id()) { |
259 | 73 | const TabletId& parent_id = tablet_lock->pb.split_parent_tablet_id(); |
260 | 73 | if (splits_with_task.count(parent_id) != 0) { |
261 | 0 | continue; |
262 | 0 | } |
263 | 73 | if (!tablet_lock->is_running()) { |
264 | | // Recently split child is not running; restart the split. |
265 | 25 | ignore_as_candidate = true; |
266 | 25 | LOG(INFO) << Substitute("Found split child ($0) that is not running. Adding parent ($1) " |
267 | 25 | "to list of splits to reschedule.", tablet->id(), parent_id); |
268 | 25 | splits_to_schedule.insert(parent_id); |
269 | 48 | } else if (!AllReplicasHaveFinishedCompaction(*tablet)) { |
270 | | // This (running) tablet is the child of a split and is still compacting. We assume that |
271 | | // this split will eventually complete for both tablets. |
272 | 20 | ignore_as_candidate = true; |
273 | 20 | LOG(INFO) << Substitute("Found split child ($0) that is compacting. Adding parent ($1) " |
274 | 20 | " to list of compacting splits.", tablet->id(), parent_id); |
275 | 20 | compacting_splits.insert(parent_id); |
276 | 20 | } |
277 | 73 | if (splits_to_schedule.count(parent_id) != 0 && compacting_splits.count(parent_id) != 0) { |
278 | | // It's possible that one child subtablet leads us to insert the parent tablet id into |
279 | | // splits_to_schedule, and another leads us to insert into compacting_splits. In this |
280 | | // case, it means one of the children is live, thus both children have been created and |
281 | | // the split RPC does not need to be scheduled. |
282 | 0 | LOG(INFO) << Substitute("Found compacting split child ($0), so removing split parent " |
283 | 0 | "($1) from splits to schedule.", tablet->id(), parent_id); |
284 | 0 | splits_to_schedule.erase(parent_id); |
285 | 0 | } |
286 | 73 | } |
287 | 11.7M | if (!ignore_as_candidate && ShouldSplitTablet(*tablet)) { |
288 | 40 | new_split_candidates.push_back(tablet); |
289 | 40 | } |
290 | 11.7M | } |
291 | 11.2M | if (!can_split_more()) { |
292 | 45 | break; |
293 | 45 | } |
294 | 11.2M | } |
295 | | |
296 | | // Add any new splits to the set of splits to schedule (while respecting the max number of |
297 | | // outstanding splits). |
298 | 19 | for (const auto& tablet : new_split_candidates) { |
299 | 19 | if (!can_split_more()) { |
300 | 9 | break; |
301 | 9 | } |
302 | 10 | splits_to_schedule.insert(tablet->id()); |
303 | 10 | } |
304 | | |
305 | 89.3k | ScheduleSplits(splits_to_schedule); |
306 | 89.3k | } |
307 | | |
308 | 90.0k | void TabletSplitManager::MaybeDoSplitting(const TableInfoMap& table_info_map) { |
309 | 90.0k | if (!FLAGS_enable_automatic_tablet_splitting || FLAGS_outstanding_tablet_split_limit == 0) { |
310 | 685 | return; |
311 | 685 | } |
312 | | |
313 | 89.3k | auto time_since_last_run = CoarseMonoClock::Now() - last_run_time_; |
314 | 89.3k | if (time_since_last_run < (FLAGS_process_split_tablet_candidates_interval_msec * 1ms)) { |
315 | 0 | return; |
316 | 0 | } |
317 | 89.3k | DoSplitting(table_info_map); |
318 | 89.3k | last_run_time_ = CoarseMonoClock::Now(); |
319 | 89.3k | } |
320 | | |
321 | | void TabletSplitManager::ProcessSplitTabletResult( |
322 | | const Status& status, |
323 | | const TableId& split_table_id, |
324 | 43 | const SplitTabletIds& split_tablet_ids) { |
325 | 43 | if (!status.ok()) { |
326 | 1 | LOG(WARNING) << "AsyncSplitTablet task failed with status: " << status; |
327 | 42 | } else { |
328 | | // TODO(JHE) Handle failure cases here (github issue #11030). |
329 | | // Update the xCluster tablet mapping. |
330 | 42 | Status s = xcluster_split_driver_->UpdateXClusterConsumerOnTabletSplit( |
331 | 42 | split_table_id, split_tablet_ids); |
332 | 42 | WARN_NOT_OK(s, Format( |
333 | 42 | "Encountered an error while updating the xCluster consumer tablet mapping. " |
334 | 42 | "Table id: $0, Split Tablets: $1", |
335 | 42 | split_table_id, split_tablet_ids.ToString())); |
336 | | // Also process tablet splits for producer side splits. |
337 | 42 | s = xcluster_split_driver_->UpdateXClusterProducerOnTabletSplit( |
338 | 42 | split_table_id, split_tablet_ids); |
339 | 42 | WARN_NOT_OK(s, Format( |
340 | 42 | "Encountered an error while updating the xCluster producer tablet mapping. " |
341 | 42 | "Table id: $0, Split Tablets: $1", |
342 | 42 | split_table_id, split_tablet_ids.ToString())); |
343 | 42 | } |
344 | 43 | } |
345 | | |
346 | | } // namespace master |
347 | | } // namespace yb |