/Users/deen/code/yugabyte-db/src/yb/tserver/ts_tablet_manager.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include "yb/tserver/ts_tablet_manager.h" |
34 | | |
35 | | #include <algorithm> |
36 | | #include <chrono> |
37 | | #include <memory> |
38 | | #include <mutex> |
39 | | #include <string> |
40 | | #include <thread> |
41 | | #include <vector> |
42 | | |
43 | | #include <boost/container/static_vector.hpp> |
44 | | #include <boost/optional/optional.hpp> |
45 | | #include <glog/logging.h> |
46 | | |
47 | | #include "yb/client/client.h" |
48 | | |
49 | | #include "yb/common/wire_protocol.h" |
50 | | |
51 | | #include "yb/consensus/consensus.h" |
52 | | #include "yb/consensus/multi_raft_batcher.h" |
53 | | #include "yb/consensus/consensus_meta.h" |
54 | | #include "yb/consensus/log.h" |
55 | | #include "yb/consensus/log_anchor_registry.h" |
56 | | #include "yb/consensus/metadata.pb.h" |
57 | | #include "yb/consensus/opid_util.h" |
58 | | #include "yb/consensus/quorum_util.h" |
59 | | #include "yb/consensus/raft_consensus.h" |
60 | | #include "yb/consensus/retryable_requests.h" |
61 | | #include "yb/consensus/state_change_context.h" |
62 | | |
63 | | #include "yb/docdb/docdb_rocksdb_util.h" |
64 | | |
65 | | #include "yb/fs/fs_manager.h" |
66 | | |
67 | | #include "yb/gutil/bind.h" |
68 | | #include "yb/gutil/stl_util.h" |
69 | | #include "yb/gutil/strings/substitute.h" |
70 | | #include "yb/gutil/sysinfo.h" |
71 | | |
72 | | #include "yb/master/master_heartbeat.pb.h" |
73 | | #include "yb/master/sys_catalog.h" |
74 | | |
75 | | #include "yb/rpc/messenger.h" |
76 | | #include "yb/rpc/poller.h" |
77 | | |
78 | | #include "yb/tablet/metadata.pb.h" |
79 | | #include "yb/tablet/operations/split_operation.h" |
80 | | #include "yb/tablet/tablet.h" |
81 | | #include "yb/tablet/tablet.pb.h" |
82 | | #include "yb/tablet/tablet_bootstrap_if.h" |
83 | | #include "yb/tablet/tablet_metadata.h" |
84 | | #include "yb/tablet/tablet_options.h" |
85 | | #include "yb/tablet/tablet_peer.h" |
86 | | |
87 | | #include "yb/tserver/heartbeater.h" |
88 | | #include "yb/tserver/remote_bootstrap_client.h" |
89 | | #include "yb/tserver/remote_bootstrap_session.h" |
90 | | #include "yb/tserver/tablet_server.h" |
91 | | #include "yb/tserver/tserver.pb.h" |
92 | | |
93 | | #include "yb/util/debug/long_operation_tracker.h" |
94 | | #include "yb/util/debug/trace_event.h" |
95 | | #include "yb/util/env.h" |
96 | | #include "yb/util/fault_injection.h" |
97 | | #include "yb/util/flag_tags.h" |
98 | | #include "yb/util/format.h" |
99 | | #include "yb/util/logging.h" |
100 | | #include "yb/util/mem_tracker.h" |
101 | | #include "yb/util/metrics.h" |
102 | | #include "yb/util/pb_util.h" |
103 | | #include "yb/util/scope_exit.h" |
104 | | #include "yb/util/shared_lock.h" |
105 | | #include "yb/util/status_format.h" |
106 | | #include "yb/util/status_log.h" |
107 | | #include "yb/util/stopwatch.h" |
108 | | #include "yb/util/trace.h" |
109 | | |
110 | | using namespace std::literals; |
111 | | using namespace std::placeholders; |
112 | | |
113 | | DEFINE_int32(num_tablets_to_open_simultaneously, 0, |
114 | | "Number of threads available to open tablets during startup. If this " |
115 | | "is set to 0 (the default), then the number of bootstrap threads will " |
116 | | "be set based on the number of data directories. If the data directories " |
117 | | "are on some very fast storage device such as SSD or a RAID array, it " |
118 | | "may make sense to manually tune this."); |
119 | | TAG_FLAG(num_tablets_to_open_simultaneously, advanced); |
120 | | |
121 | | DEFINE_int32(tablet_start_warn_threshold_ms, 500, |
122 | | "If a tablet takes more than this number of millis to start, issue " |
123 | | "a warning with a trace."); |
124 | | TAG_FLAG(tablet_start_warn_threshold_ms, hidden); |
125 | | |
126 | | DEFINE_int32(cleanup_split_tablets_interval_sec, 60, |
127 | | "Interval at which tablet manager tries to cleanup split tablets which are no longer " |
128 | | "needed. Setting this to 0 disables cleanup of split tablets."); |
129 | | |
130 | | DEFINE_test_flag(double, fault_crash_after_blocks_deleted, 0.0, |
131 | | "Fraction of the time when the tablet will crash immediately " |
132 | | "after deleting the data blocks during tablet deletion."); |
133 | | |
134 | | DEFINE_test_flag(double, fault_crash_after_wal_deleted, 0.0, |
135 | | "Fraction of the time when the tablet will crash immediately " |
136 | | "after deleting the WAL segments during tablet deletion."); |
137 | | |
138 | | DEFINE_test_flag(double, fault_crash_after_cmeta_deleted, 0.0, |
139 | | "Fraction of the time when the tablet will crash immediately " |
140 | | "after deleting the consensus metadata during tablet deletion."); |
141 | | |
142 | | DEFINE_test_flag(double, fault_crash_after_rb_files_fetched, 0.0, |
143 | | "Fraction of the time when the tablet will crash immediately " |
144 | | "after fetching the files during a remote bootstrap but before " |
145 | | "marking the superblock as TABLET_DATA_READY."); |
146 | | |
147 | | DEFINE_test_flag(double, fault_crash_in_split_after_log_copied, 0.0, |
148 | | "Fraction of the time when the tablet will crash immediately after initiating a " |
149 | | "Log::CopyTo from parent to child tablet, but before marking the child tablet as " |
150 | | "TABLET_DATA_READY."); |
151 | | |
152 | | DEFINE_test_flag(bool, simulate_already_present_in_remote_bootstrap, false, |
153 | | "If true, return an AlreadyPresent error in remote bootstrap after starting the " |
154 | | "remote bootstrap client."); |
155 | | |
156 | | DEFINE_test_flag(double, fault_crash_in_split_before_log_flushed, 0.0, |
157 | | "Fraction of the time when the tablet will crash immediately before flushing a " |
158 | | "parent tablet's kSplit operation."); |
159 | | |
160 | | DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_greater_than, 0, |
161 | | "If greater than zero, this process will crash if we detect more than the " |
162 | | "specified number of remote bootstrap sessions."); |
163 | | |
164 | | DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_per_table_greater_than, 0, |
165 | | "If greater than zero, this process will crash if for any table we exceed the " |
166 | | "specified number of remote bootstrap sessions"); |
167 | | |
168 | | DEFINE_test_flag(bool, crash_before_apply_tablet_split_op, false, |
169 | | "Crash inside TSTabletManager::ApplyTabletSplit before doing anything"); |
170 | | |
171 | | DEFINE_test_flag(bool, force_single_tablet_failure, false, |
172 | | "Force exactly one tablet to a failed state."); |
173 | | |
174 | | DEFINE_test_flag(int32, apply_tablet_split_inject_delay_ms, 0, |
175 | | "Inject delay into TSTabletManager::ApplyTabletSplit."); |
176 | | |
177 | | DEFINE_test_flag(bool, skip_deleting_split_tablets, false, |
178 | | "Skip deleting tablets which have been split."); |
179 | | |
180 | | DEFINE_test_flag(bool, skip_post_split_compaction, false, |
181 | | "Skip processing post split compaction."); |
182 | | |
183 | | DEFINE_int32(verify_tablet_data_interval_sec, 0, |
184 | | "The tick interval time for the tablet data integrity verification background task. " |
185 | | "This defaults to 0, which means disable the background task."); |
186 | | |
187 | | DEFINE_int32(cleanup_metrics_interval_sec, 60, |
188 | | "The tick interval time for the metrics cleanup background task. " |
189 | | "If set to 0, it disables the background task."); |
190 | | |
191 | | DEFINE_bool(skip_tablet_data_verification, false, |
192 | | "Skip checking tablet data for corruption."); |
193 | | |
194 | | DEFINE_int32(read_pool_max_threads, 128, |
195 | | "The maximum number of threads allowed for read_pool_. This pool is used " |
196 | | "to run multiple read operations, that are part of the same tablet rpc, " |
197 | | "in parallel."); |
198 | | DEFINE_int32(read_pool_max_queue_size, 128, |
199 | | "The maximum number of tasks that can be held in the queue for read_pool_. This pool " |
200 | | "is used to run multiple read operations, that are part of the same tablet rpc, " |
201 | | "in parallel."); |
202 | | |
203 | | DEFINE_int32(post_split_trigger_compaction_pool_max_threads, 1, |
204 | | "The maximum number of threads allowed for post_split_trigger_compaction_pool_. This " |
205 | | "pool is used to run compactions on tablets after they have been split and still " |
206 | | "contain irrelevant data from the tablet they were sourced from."); |
207 | | DEFINE_int32(post_split_trigger_compaction_pool_max_queue_size, 16, |
208 | | "The maximum number of tasks that can be held in the pool for " |
209 | | "post_split_trigger_compaction_pool_. This pool is used to run compactions on tablets " |
210 | | "after they have been split and still contain irrelevant data from the tablet they " |
211 | | "were sourced from."); |
212 | | |
213 | | DEFINE_test_flag(int32, sleep_after_tombstoning_tablet_secs, 0, |
214 | | "Whether we sleep in LogAndTombstone after calling DeleteTabletData."); |
215 | | |
216 | | constexpr int kTServerYbClientDefaultTimeoutMs = 60 * 1000; |
217 | | |
218 | | DEFINE_int32(tserver_yb_client_default_timeout_ms, kTServerYbClientDefaultTimeoutMs, |
219 | | "Default timeout for the YBClient embedded into the tablet server that is used " |
220 | | "for distributed transactions."); |
221 | | |
222 | | DEFINE_bool(enable_restart_transaction_status_tablets_first, true, |
223 | | "Set to true to prioritize bootstrapping transaction status tablets first."); |
224 | | |
225 | | DECLARE_string(rocksdb_compact_flush_rate_limit_sharing_mode); |
226 | | |
227 | | namespace yb { |
228 | | namespace tserver { |
229 | | |
230 | | METRIC_DEFINE_coarse_histogram(server, op_apply_queue_length, "Operation Apply Queue Length", |
231 | | MetricUnit::kTasks, |
232 | | "Number of operations waiting to be applied to the tablet. " |
233 | | "High queue lengths indicate that the server is unable to process " |
234 | | "operations as fast as they are being written to the WAL."); |
235 | | |
236 | | METRIC_DEFINE_coarse_histogram(server, op_apply_queue_time, "Operation Apply Queue Time", |
237 | | MetricUnit::kMicroseconds, |
238 | | "Time that operations spent waiting in the apply queue before being " |
239 | | "processed. High queue times indicate that the server is unable to " |
240 | | "process operations as fast as they are being written to the WAL."); |
241 | | |
242 | | METRIC_DEFINE_coarse_histogram(server, op_apply_run_time, "Operation Apply Run Time", |
243 | | MetricUnit::kMicroseconds, |
244 | | "Time that operations spent being applied to the tablet. " |
245 | | "High values may indicate that the server is under-provisioned or " |
246 | | "that operations consist of very large batches."); |
247 | | |
248 | | METRIC_DEFINE_coarse_histogram(server, op_read_queue_length, "Operation Read op Queue Length", |
249 | | MetricUnit::kTasks, |
250 | | "Number of operations waiting to be applied to the tablet. " |
251 | | "High queue lengths indicate that the server is unable to process " |
252 | | "operations as fast as they are being written to the WAL."); |
253 | | |
254 | | METRIC_DEFINE_coarse_histogram(server, op_read_queue_time, "Operation Read op Queue Time", |
255 | | MetricUnit::kMicroseconds, |
256 | | "Time that operations spent waiting in the read queue before being " |
257 | | "processed. High queue times indicate that the server is unable to " |
258 | | "process operations as fast as they are being written to the WAL."); |
259 | | |
260 | | METRIC_DEFINE_coarse_histogram(server, op_read_run_time, "Operation Read op Run Time", |
261 | | MetricUnit::kMicroseconds, |
262 | | "Time that operations spent being applied to the tablet. " |
263 | | "High values may indicate that the server is under-provisioned or " |
264 | | "that operations consist of very large batches."); |
265 | | |
266 | | METRIC_DEFINE_coarse_histogram(server, ts_bootstrap_time, "TServer Bootstrap Time", |
267 | | MetricUnit::kMicroseconds, |
268 | | "Time that the tablet server takes to bootstrap all of its tablets."); |
269 | | |
270 | | THREAD_POOL_METRICS_DEFINE( |
271 | | server, post_split_trigger_compaction_pool, "Thread pool for tablet compaction jobs."); |
272 | | |
273 | | THREAD_POOL_METRICS_DEFINE( |
274 | | server, admin_triggered_compaction_pool, "Thread pool for tablet compaction jobs."); |
275 | | |
276 | | using consensus::ConsensusMetadata; |
277 | | using consensus::ConsensusStatePB; |
278 | | using consensus::RaftConfigPB; |
279 | | using consensus::RaftPeerPB; |
280 | | using consensus::StartRemoteBootstrapRequestPB; |
281 | | using log::Log; |
282 | | using master::ReportedTabletPB; |
283 | | using master::TabletReportPB; |
284 | | using master::TabletReportUpdatesPB; |
285 | | using std::shared_ptr; |
286 | | using std::string; |
287 | | using std::unordered_set; |
288 | | using std::vector; |
289 | | using strings::Substitute; |
290 | | using tablet::BOOTSTRAPPING; |
291 | | using tablet::NOT_STARTED; |
292 | | using tablet::RaftGroupMetadata; |
293 | | using tablet::RaftGroupMetadataPtr; |
294 | | using tablet::RaftGroupStatePB; |
295 | | using tablet::RUNNING; |
296 | | using tablet::TABLET_DATA_COPYING; |
297 | | using tablet::TABLET_DATA_DELETED; |
298 | | using tablet::TABLET_DATA_INIT_STARTED; |
299 | | using tablet::TABLET_DATA_READY; |
300 | | using tablet::TABLET_DATA_SPLIT_COMPLETED; |
301 | | using tablet::TABLET_DATA_TOMBSTONED; |
302 | | using tablet::TabletDataState; |
303 | | using tablet::TabletPeer; |
304 | | using tablet::TabletPeerPtr; |
305 | | using tablet::TabletStatusListener; |
306 | | using tablet::TabletStatusPB; |
307 | | |
308 | | constexpr int32_t kDefaultTserverBlockCacheSizePercentage = 50; |
309 | | |
310 | 0 | void TSTabletManager::VerifyTabletData() { |
311 | 0 | LOG_WITH_PREFIX(INFO) << "Beginning tablet data verification checks"; |
312 | 0 | for (const TabletPeerPtr& peer : GetTabletPeers()) { |
313 | 0 | if (peer->state() == RUNNING) { |
314 | 0 | if (PREDICT_FALSE(FLAGS_skip_tablet_data_verification)) { |
315 | 0 | LOG_WITH_PREFIX(INFO) |
316 | 0 | << Format("Skipped tablet data verification check on $0", peer->tablet_id()); |
317 | 0 | } else { |
318 | 0 | Status s = peer->tablet()->VerifyDataIntegrity(); |
319 | 0 | if (!s.ok()) { |
320 | 0 | LOG(WARNING) << "Tablet data integrity verification failed on " << peer->tablet_id() |
321 | 0 | << ": " << s; |
322 | 0 | } |
323 | 0 | } |
324 | 0 | } |
325 | 0 | } |
326 | 0 | } |
327 | | |
328 | 88.0k | void TSTabletManager::CleanupOldMetrics() { |
329 | 88.0k | VLOG(2) << "Cleaning up old metrics"0 ; |
330 | 88.0k | metric_registry_->RetireOldMetrics(); |
331 | 88.0k | } |
332 | | |
333 | | TSTabletManager::TSTabletManager(FsManager* fs_manager, |
334 | | TabletServer* server, |
335 | | MetricRegistry* metric_registry) |
336 | | : fs_manager_(fs_manager), |
337 | | server_(server), |
338 | | metric_registry_(metric_registry), |
339 | 9.27k | state_(MANAGER_INITIALIZING) { |
340 | 9.27k | ThreadPoolMetrics metrics = { |
341 | 9.27k | METRIC_op_apply_queue_length.Instantiate(server_->metric_entity()), |
342 | 9.27k | METRIC_op_apply_queue_time.Instantiate(server_->metric_entity()), |
343 | 9.27k | METRIC_op_apply_run_time.Instantiate(server_->metric_entity()) |
344 | 9.27k | }; |
345 | 9.27k | tablet_options_.ServerMetricEntity = server_->metric_entity(); |
346 | 9.27k | CHECK_OK(ThreadPoolBuilder("apply") |
347 | 9.27k | .set_metrics(std::move(metrics)) |
348 | 9.27k | .Build(&apply_pool_)); |
349 | | |
350 | | // This pool is shared by all replicas hosted by this server. |
351 | | // |
352 | | // Some submitted tasks use blocking IO, so we configure no upper bound on |
353 | | // the maximum number of threads in each pool (otherwise the default value of |
354 | | // "number of CPUs" may cause blocking tasks to starve other "fast" tasks). |
355 | | // However, the effective upper bound is the number of replicas as each will |
356 | | // submit its own tasks via a dedicated token. |
357 | 9.27k | CHECK_OK(ThreadPoolBuilder("consensus") |
358 | 9.27k | .set_min_threads(1) |
359 | 9.27k | .unlimited_threads() |
360 | 9.27k | .Build(&raft_pool_)); |
361 | 9.27k | CHECK_OK(ThreadPoolBuilder("prepare") |
362 | 9.27k | .set_min_threads(1) |
363 | 9.27k | .unlimited_threads() |
364 | 9.27k | .Build(&tablet_prepare_pool_)); |
365 | 9.27k | CHECK_OK(ThreadPoolBuilder("append") |
366 | 9.27k | .set_min_threads(1) |
367 | 9.27k | .unlimited_threads() |
368 | 9.27k | .set_idle_timeout(MonoDelta::FromMilliseconds(10000)) |
369 | 9.27k | .Build(&append_pool_)); |
370 | 9.27k | CHECK_OK(ThreadPoolBuilder("log-alloc") |
371 | 9.27k | .set_min_threads(1) |
372 | 9.27k | .unlimited_threads() |
373 | 9.27k | .Build(&allocation_pool_)); |
374 | 9.27k | ThreadPoolMetrics read_metrics = { |
375 | 9.27k | METRIC_op_read_queue_length.Instantiate(server_->metric_entity()), |
376 | 9.27k | METRIC_op_read_queue_time.Instantiate(server_->metric_entity()), |
377 | 9.27k | METRIC_op_read_run_time.Instantiate(server_->metric_entity()) |
378 | 9.27k | }; |
379 | 9.27k | CHECK_OK(ThreadPoolBuilder("read-parallel") |
380 | 9.27k | .set_max_threads(FLAGS_read_pool_max_threads) |
381 | 9.27k | .set_max_queue_size(FLAGS_read_pool_max_queue_size) |
382 | 9.27k | .set_metrics(std::move(read_metrics)) |
383 | 9.27k | .Build(&read_pool_)); |
384 | 9.27k | CHECK_OK(ThreadPoolBuilder("tablet-split-compaction") |
385 | 9.27k | .set_max_threads(FLAGS_post_split_trigger_compaction_pool_max_threads) |
386 | 9.27k | .set_max_queue_size(FLAGS_post_split_trigger_compaction_pool_max_queue_size) |
387 | 9.27k | .set_metrics(THREAD_POOL_METRICS_INSTANCE( |
388 | 9.27k | server_->metric_entity(), post_split_trigger_compaction_pool)) |
389 | 9.27k | .Build(&post_split_trigger_compaction_pool_)); |
390 | 9.27k | CHECK_OK(ThreadPoolBuilder("admin-compaction") |
391 | 9.27k | .set_max_threads(std::max(docdb::GetGlobalRocksDBPriorityThreadPoolSize(), 0)) |
392 | 9.27k | .set_metrics(THREAD_POOL_METRICS_INSTANCE( |
393 | 9.27k | server_->metric_entity(), admin_triggered_compaction_pool)) |
394 | 9.27k | .Build(&admin_triggered_compaction_pool_)); |
395 | | |
396 | 9.27k | mem_manager_ = std::make_shared<TabletMemoryManager>( |
397 | 9.27k | &tablet_options_, |
398 | 9.27k | server_->mem_tracker(), |
399 | 9.27k | kDefaultTserverBlockCacheSizePercentage, |
400 | 9.27k | server_->metric_entity(), |
401 | 9.27k | [this](){ return GetTabletPeers(); }410 ); |
402 | 9.27k | } |
403 | | |
404 | 164 | TSTabletManager::~TSTabletManager() { |
405 | 164 | } |
406 | | |
407 | 8.74k | Status TSTabletManager::Init() { |
408 | 8.74k | CHECK_EQ(state(), MANAGER_INITIALIZING); |
409 | | |
410 | 8.74k | async_client_init_.emplace( |
411 | 8.74k | "tserver_client", 0 /* num_reactors */, |
412 | 8.74k | FLAGS_tserver_yb_client_default_timeout_ms / 1000, server_->permanent_uuid(), |
413 | 8.74k | &server_->options(), server_->metric_entity(), server_->mem_tracker(), |
414 | 8.74k | server_->messenger()); |
415 | | |
416 | 8.74k | async_client_init_->AddPostCreateHook([this](client::YBClient* client) { |
417 | 7.97k | auto* tserver = server(); |
418 | 7.97k | if (tserver != nullptr && tserver->proxy() != nullptr) { |
419 | 7.97k | client->SetLocalTabletServer(tserver->permanent_uuid(), tserver->proxy(), tserver); |
420 | 7.97k | } |
421 | 7.97k | }); |
422 | | |
423 | 8.74k | tablet_options_.env = server_->GetEnv(); |
424 | 8.74k | tablet_options_.rocksdb_env = server_->GetRocksDBEnv(); |
425 | 8.74k | tablet_options_.listeners = server_->options().listeners; |
426 | 8.74k | if (docdb::GetRocksDBRateLimiterSharingMode() == docdb::RateLimiterSharingMode::TSERVER) { |
427 | 8.74k | tablet_options_.rate_limiter = docdb::CreateRocksDBRateLimiter(); |
428 | 8.74k | } |
429 | | |
430 | | // Start the threadpool we'll use to open tablets. |
431 | | // This has to be done in Init() instead of the constructor, since the |
432 | | // FsManager isn't initialized until this point. |
433 | 8.74k | int max_bootstrap_threads = FLAGS_num_tablets_to_open_simultaneously; |
434 | 8.74k | if (max_bootstrap_threads == 0) { |
435 | 8.74k | int num_cpus = base::NumCPUs(); |
436 | 8.74k | if (num_cpus <= 2) { |
437 | 0 | max_bootstrap_threads = 2; |
438 | 8.74k | } else { |
439 | 8.74k | max_bootstrap_threads = min( |
440 | 8.74k | num_cpus - 1, narrow_cast<int>(fs_manager_->GetDataRootDirs().size()) * 8); |
441 | 8.74k | } |
442 | 8.74k | LOG_WITH_PREFIX(INFO) << "max_bootstrap_threads=" << max_bootstrap_threads; |
443 | 8.74k | } |
444 | 8.74k | ThreadPoolMetrics bootstrap_metrics = { |
445 | 8.74k | nullptr, |
446 | 8.74k | nullptr, |
447 | 8.74k | METRIC_ts_bootstrap_time.Instantiate(server_->metric_entity()) |
448 | 8.74k | }; |
449 | 8.74k | RETURN_NOT_OK(ThreadPoolBuilder("tablet-bootstrap") |
450 | 8.74k | .set_max_threads(max_bootstrap_threads) |
451 | 8.74k | .set_metrics(std::move(bootstrap_metrics)) |
452 | 8.74k | .Build(&open_tablet_pool_)); |
453 | | |
454 | 8.74k | CleanupCheckpoints(); |
455 | | |
456 | | // Search for tablets in the metadata dir. |
457 | 8.74k | vector<string> tablet_ids; |
458 | 8.74k | RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablet_ids)); |
459 | | |
460 | 8.74k | InitLocalRaftPeerPB(); |
461 | | |
462 | 8.74k | multi_raft_manager_ = std::make_unique<consensus::MultiRaftManager>(server_->messenger(), |
463 | 8.74k | &server_->proxy_cache(), |
464 | 8.74k | local_peer_pb_.cloud_info()); |
465 | | |
466 | 8.74k | deque<RaftGroupMetadataPtr> metas; |
467 | | |
468 | | // First, load all of the tablet metadata. We do this before we start |
469 | | // submitting the actual OpenTablet() tasks so that we don't have to compete |
470 | | // for disk resources, etc, with bootstrap processes and running tablets. |
471 | 8.74k | MonoTime start(MonoTime::Now()); |
472 | 8.74k | for (const string& tablet_id : tablet_ids) { |
473 | 217 | RaftGroupMetadataPtr meta; |
474 | 217 | RETURN_NOT_OK_PREPEND(OpenTabletMeta(tablet_id, &meta), |
475 | 217 | "Failed to open tablet metadata for tablet: " + tablet_id); |
476 | 217 | if (PREDICT_FALSE(!CanServeTabletData(meta->tablet_data_state()))) { |
477 | 8 | RETURN_NOT_OK(HandleNonReadyTabletOnStartup(meta)); |
478 | 8 | continue; |
479 | 8 | } |
480 | 209 | RegisterDataAndWalDir( |
481 | 209 | fs_manager_, meta->table_id(), meta->raft_group_id(), meta->data_root_dir(), |
482 | 209 | meta->wal_root_dir()); |
483 | 209 | if (FLAGS_enable_restart_transaction_status_tablets_first) { |
484 | | // Prioritize bootstrapping transaction status tablets first. |
485 | 209 | if (meta->table_type() == TRANSACTION_STATUS_TABLE_TYPE) { |
486 | 21 | metas.push_front(meta); |
487 | 188 | } else { |
488 | 188 | metas.push_back(meta); |
489 | 188 | } |
490 | 209 | } else { |
491 | 0 | metas.push_back(meta); |
492 | 0 | } |
493 | 209 | } |
494 | | |
495 | 8.74k | MonoDelta elapsed = MonoTime::Now().GetDeltaSince(start); |
496 | 8.74k | LOG(INFO) << "Loaded metadata for " << tablet_ids.size() << " tablet in " |
497 | 8.74k | << elapsed.ToMilliseconds() << " ms"; |
498 | | |
499 | | // Now submit the "Open" task for each. |
500 | 8.74k | for (const RaftGroupMetadataPtr& meta : metas) { |
501 | 209 | scoped_refptr<TransitionInProgressDeleter> deleter; |
502 | 209 | RETURN_NOT_OK(StartTabletStateTransition( |
503 | 209 | meta->raft_group_id(), "opening tablet", &deleter)); |
504 | | |
505 | 209 | TabletPeerPtr tablet_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
506 | 209 | RETURN_NOT_OK(open_tablet_pool_->SubmitFunc( |
507 | 209 | std::bind(&TSTabletManager::OpenTablet, this, meta, deleter))); |
508 | 209 | } |
509 | | |
510 | 8.74k | { |
511 | 8.74k | std::lock_guard<RWMutex> lock(mutex_); |
512 | 8.74k | state_ = MANAGER_RUNNING; |
513 | 8.74k | } |
514 | | |
515 | 8.74k | RETURN_NOT_OK(mem_manager_->Init()); |
516 | | |
517 | 8.74k | tablets_cleaner_ = std::make_unique<rpc::Poller>( |
518 | 8.74k | LogPrefix(), std::bind(&TSTabletManager::CleanupSplitTablets, this)); |
519 | | |
520 | 8.74k | verify_tablet_data_poller_ = std::make_unique<rpc::Poller>( |
521 | 8.74k | LogPrefix(), std::bind(&TSTabletManager::VerifyTabletData, this)); |
522 | | |
523 | 8.74k | metrics_cleaner_ = std::make_unique<rpc::Poller>( |
524 | 8.74k | LogPrefix(), std::bind(&TSTabletManager::CleanupOldMetrics, this)); |
525 | | |
526 | 8.74k | return Status::OK(); |
527 | 8.74k | } |
528 | | |
529 | 8.74k | void TSTabletManager::CleanupCheckpoints() { |
530 | 8.84k | for (const auto& data_root : fs_manager_->GetDataRootDirs()) { |
531 | 8.84k | auto tables_dir = JoinPathSegments(data_root, FsManager::kRocksDBDirName); |
532 | 8.84k | auto tables = fs_manager_->env()->GetChildren(tables_dir, ExcludeDots::kTrue); |
533 | 8.84k | if (!tables.ok()) { |
534 | 0 | LOG_WITH_PREFIX(WARNING) |
535 | 0 | << "Failed to get tables in " << tables_dir << ": " << tables.status(); |
536 | 0 | continue; |
537 | 0 | } |
538 | 8.84k | for (const auto& table : *tables) { |
539 | 94 | auto table_dir = JoinPathSegments(tables_dir, table); |
540 | 94 | auto tablets = fs_manager_->env()->GetChildren(table_dir, ExcludeDots::kTrue); |
541 | 94 | if (!tablets.ok()) { |
542 | 0 | LOG_WITH_PREFIX(WARNING) |
543 | 0 | << "Failed to get tablets in " << table_dir << ": " << tablets.status(); |
544 | 0 | continue; |
545 | 0 | } |
546 | 566 | for (const auto& tablet : *tablets)94 { |
547 | 566 | auto checkpoints_dir = JoinPathSegments( |
548 | 566 | table_dir, tablet, RemoteBootstrapSession::kCheckpointsDir); |
549 | 566 | if (fs_manager_->env()->FileExists(checkpoints_dir)) { |
550 | 0 | LOG_WITH_PREFIX(INFO) << "Cleaning up checkpoints dir: " << yb::ToString(checkpoints_dir); |
551 | 0 | auto status = fs_manager_->env()->DeleteRecursively(checkpoints_dir); |
552 | 0 | WARN_NOT_OK(status, Format("Cleanup of checkpoints dir $0 failed", checkpoints_dir)); |
553 | 0 | } |
554 | 566 | } |
555 | 94 | } |
556 | 8.84k | } |
557 | 8.74k | } |
558 | | |
559 | 8.58k | Status TSTabletManager::Start() { |
560 | 8.58k | async_client_init_->Start(); |
561 | 8.58k | if (FLAGS_cleanup_split_tablets_interval_sec > 0) { |
562 | 8.58k | tablets_cleaner_->Start( |
563 | 8.58k | &server_->messenger()->scheduler(), FLAGS_cleanup_split_tablets_interval_sec * 1s); |
564 | 8.58k | LOG(INFO) << "Split tablets cleanup monitor started..."; |
565 | 8.58k | } else { |
566 | 0 | LOG(INFO) |
567 | 0 | << "Split tablets cleanup is disabled by cleanup_split_tablets_interval_sec flag set to 0"; |
568 | 0 | } |
569 | 8.58k | if (FLAGS_verify_tablet_data_interval_sec > 0) { |
570 | 0 | verify_tablet_data_poller_->Start( |
571 | 0 | &server_->messenger()->scheduler(), FLAGS_verify_tablet_data_interval_sec * 1s); |
572 | 0 | LOG(INFO) << "Tablet data verification task started..."; |
573 | 8.58k | } else { |
574 | 8.58k | LOG(INFO) |
575 | 8.58k | << "Tablet data verification is disabled by verify_tablet_data_interval_sec flag set to 0"; |
576 | 8.58k | } |
577 | 8.58k | if (FLAGS_cleanup_metrics_interval_sec > 0) { |
578 | 8.58k | metrics_cleaner_->Start( |
579 | 8.58k | &server_->messenger()->scheduler(), FLAGS_cleanup_metrics_interval_sec * 1s); |
580 | 8.58k | LOG(INFO) << "Old metrics cleanup task started..."; |
581 | 8.58k | } else { |
582 | 0 | LOG(INFO) |
583 | 0 | << "Old metrics cleanup is disabled by cleanup_metrics_interval_sec flag set to 0"; |
584 | 0 | } |
585 | | |
586 | 8.58k | return Status::OK(); |
587 | 8.58k | } |
588 | | |
589 | 354k | void TSTabletManager::CleanupSplitTablets() { |
590 | 354k | VLOG_WITH_PREFIX_AND_FUNC0 (3) << "looking for tablets to cleanup..."0 ; |
591 | 354k | auto tablet_peers = GetTabletPeers(); |
592 | 1.63M | for (const auto& tablet_peer : tablet_peers) { |
593 | 1.63M | if (tablet_peer->CanBeDeleted()) { |
594 | 13 | const auto& tablet_id = tablet_peer->tablet_id(); |
595 | 13 | if (PREDICT_FALSE(FLAGS_TEST_skip_deleting_split_tablets)) { |
596 | 7 | LOG_WITH_PREFIX(INFO) << Format("Skipped triggering delete of tablet $0", tablet_id); |
597 | 7 | } else { |
598 | 6 | LOG_WITH_PREFIX(INFO) << Format("Triggering delete of tablet $0", tablet_id); |
599 | 6 | client().DeleteNotServingTablet( |
600 | 6 | tablet_peer->tablet_id(), [tablet_id](const Status& status) { |
601 | 6 | LOG(INFO) << Format("Tablet $0 deletion result: $1", tablet_id, status); |
602 | 6 | }); |
603 | 6 | } |
604 | 13 | } |
605 | 1.63M | } |
606 | 354k | } |
607 | | |
608 | 14 | Status TSTabletManager::WaitForAllBootstrapsToFinish() { |
609 | 14 | CHECK_EQ(state(), MANAGER_RUNNING); |
610 | | |
611 | 14 | open_tablet_pool_->Wait(); |
612 | | |
613 | 14 | Status s = Status::OK(); |
614 | | |
615 | 14 | SharedLock<RWMutex> shared_lock(mutex_); |
616 | 29 | for (const TabletMap::value_type& entry : tablet_map_) { |
617 | 29 | if (entry.second->state() == tablet::FAILED) { |
618 | 1 | if (s.ok()) { |
619 | 1 | s = entry.second->error(); |
620 | 1 | } |
621 | 1 | } |
622 | 29 | } |
623 | | |
624 | 14 | return s; |
625 | 14 | } |
626 | | |
627 | | Result<scoped_refptr<TransitionInProgressDeleter>> |
628 | 139k | TSTabletManager::StartTabletStateTransitionForCreation(const TabletId& tablet_id) { |
629 | 139k | scoped_refptr<TransitionInProgressDeleter> deleter; |
630 | 139k | SharedLock<RWMutex> lock(mutex_); |
631 | 139k | TRACE("Acquired tablet manager lock"); |
632 | | |
633 | | // Sanity check that the tablet isn't already registered. |
634 | 139k | TabletPeerPtr junk; |
635 | 139k | if (LookupTabletUnlocked(tablet_id, &junk)) { |
636 | 1 | return STATUS(AlreadyPresent, "Tablet already registered", tablet_id); |
637 | 1 | } |
638 | | |
639 | 139k | RETURN_NOT_OK(StartTabletStateTransition(tablet_id, "creating tablet", &deleter)); |
640 | | |
641 | 139k | return deleter; |
642 | 139k | } |
643 | | |
644 | | Result<TabletPeerPtr> TSTabletManager::CreateNewTablet( |
645 | | const tablet::TableInfoPtr& table_info, |
646 | | const string& tablet_id, |
647 | | const Partition& partition, |
648 | | RaftConfigPB config, |
649 | | const bool colocated, |
650 | 139k | const std::vector<SnapshotScheduleId>& snapshot_schedules) { |
651 | 139k | if (state() != MANAGER_RUNNING) { |
652 | 1 | return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state()); |
653 | 1 | } |
654 | 139k | CHECK(IsRaftConfigMember(server_->instance_pb().permanent_uuid(), config)); |
655 | | |
656 | 556k | for (int i = 0; i < config.peers_size(); ++i416k ) { |
657 | 416k | const auto& config_peer = config.peers(i); |
658 | 416k | CHECK(config_peer.has_member_type()); |
659 | 416k | } |
660 | | |
661 | | // Set the initial opid_index for a RaftConfigPB to -1. |
662 | 139k | config.set_opid_index(consensus::kInvalidOpIdIndex); |
663 | | |
664 | 139k | scoped_refptr<TransitionInProgressDeleter> deleter = |
665 | 139k | VERIFY_RESULT139k (StartTabletStateTransitionForCreation(tablet_id));139k |
666 | | |
667 | | // Create the metadata. |
668 | 139k | TRACE("Creating new metadata..."); |
669 | 139k | string data_root_dir; |
670 | 139k | string wal_root_dir; |
671 | 139k | GetAndRegisterDataAndWalDir( |
672 | 139k | fs_manager_, table_info->table_id, tablet_id, &data_root_dir, &wal_root_dir); |
673 | 139k | auto create_result = RaftGroupMetadata::CreateNew(tablet::RaftGroupMetadataData { |
674 | 139k | .fs_manager = fs_manager_, |
675 | 139k | .table_info = table_info, |
676 | 139k | .raft_group_id = tablet_id, |
677 | 139k | .partition = partition, |
678 | 139k | .tablet_data_state = TABLET_DATA_READY, |
679 | 139k | .colocated = colocated, |
680 | 139k | .snapshot_schedules = snapshot_schedules, |
681 | 139k | }, data_root_dir, wal_root_dir); |
682 | 139k | if (!create_result.ok()) { |
683 | 9 | UnregisterDataWalDir(table_info->table_id, tablet_id, data_root_dir, wal_root_dir); |
684 | 9 | } |
685 | 139k | RETURN_NOT_OK_PREPEND(create_result, "Couldn't create tablet metadata") |
686 | 139k | RaftGroupMetadataPtr meta = std::move(*create_result); |
687 | 139k | LOG(INFO) << TabletLogPrefix(tablet_id) |
688 | 139k | << "Created tablet metadata for table: " << table_info->table_id; |
689 | | |
690 | | // We must persist the consensus metadata to disk before starting a new |
691 | | // tablet's TabletPeer and Consensus implementation. |
692 | 139k | std::unique_ptr<ConsensusMetadata> cmeta; |
693 | 139k | RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager_, tablet_id, fs_manager_->uuid(), |
694 | 139k | config, consensus::kMinimumTerm, &cmeta), |
695 | 139k | "Unable to create new ConsensusMeta for tablet " + tablet_id); |
696 | 139k | TabletPeerPtr new_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
697 | | |
698 | | // We can run this synchronously since there is nothing to bootstrap. |
699 | 139k | RETURN_NOT_OK( |
700 | 139k | open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter))); |
701 | | |
702 | 139k | return new_peer; |
703 | 139k | } |
704 | | |
705 | | struct TabletCreationMetaData { |
706 | | TabletId tablet_id; |
707 | | scoped_refptr<TransitionInProgressDeleter> transition_deleter; |
708 | | Partition partition; |
709 | | docdb::KeyBounds key_bounds; |
710 | | RaftGroupMetadataPtr raft_group_metadata; |
711 | | }; |
712 | | |
713 | | namespace { |
714 | | |
715 | | // Creates SplitTabletsCreationMetaData for two new tablets for `tablet` splitting based on request. |
716 | | SplitTabletsCreationMetaData PrepareTabletCreationMetaDataForSplit( |
717 | 68 | const tablet::SplitTabletRequestPB& request, const tablet::Tablet& tablet) { |
718 | 68 | SplitTabletsCreationMetaData metas; |
719 | | |
720 | 68 | const auto& split_partition_key = request.split_partition_key(); |
721 | 68 | const auto& split_encoded_key = request.split_encoded_key(); |
722 | | |
723 | 68 | std::shared_ptr<Partition> source_partition = tablet.metadata()->partition(); |
724 | 68 | const auto source_key_bounds = *tablet.doc_db().key_bounds; |
725 | | |
726 | 68 | { |
727 | 68 | TabletCreationMetaData meta; |
728 | 68 | meta.tablet_id = request.new_tablet1_id(); |
729 | 68 | meta.partition = *source_partition; |
730 | 68 | meta.key_bounds = source_key_bounds; |
731 | 68 | meta.partition.set_partition_key_end(split_partition_key); |
732 | 68 | meta.key_bounds.upper.Reset(split_encoded_key); |
733 | 68 | metas.push_back(meta); |
734 | 68 | } |
735 | | |
736 | 68 | { |
737 | 68 | TabletCreationMetaData meta; |
738 | 68 | meta.tablet_id = request.new_tablet2_id(); |
739 | 68 | meta.partition = *source_partition; |
740 | 68 | meta.key_bounds = source_key_bounds; |
741 | 68 | meta.partition.set_partition_key_start(split_partition_key); |
742 | 68 | meta.key_bounds.lower.Reset(split_encoded_key); |
743 | 68 | metas.push_back(meta); |
744 | 68 | } |
745 | | |
746 | 68 | return metas; |
747 | 68 | } |
748 | | |
749 | | } // namespace |
750 | | |
751 | | Status TSTabletManager::StartSubtabletsSplit( |
752 | 68 | const RaftGroupMetadata& source_tablet_meta, SplitTabletsCreationMetaData* tcmetas) { |
753 | 68 | auto* const env = fs_manager_->env(); |
754 | | |
755 | 68 | auto iter = tcmetas->begin(); |
756 | 204 | while (iter != tcmetas->end()) { |
757 | 136 | const auto& subtablet_id = iter->tablet_id; |
758 | | |
759 | 136 | auto transition_deleter_result = StartTabletStateTransitionForCreation(subtablet_id); |
760 | 136 | if (transition_deleter_result.ok()) { |
761 | 136 | iter->transition_deleter = *transition_deleter_result; |
762 | 136 | } else if (0 transition_deleter_result.status().IsAlreadyPresent()0 ) { |
763 | | // State transition for sub tablet with subtablet_id could be already registered because its |
764 | | // remote bootstrap (from already split parent tablet leader) is in progress. |
765 | 0 | iter = tcmetas->erase(iter); |
766 | 0 | continue; |
767 | 0 | } else { |
768 | 0 | return transition_deleter_result.status(); |
769 | 0 | } |
770 | | |
771 | | // Try to load metadata from previous not completed split. |
772 | 136 | auto load_result = RaftGroupMetadata::Load(fs_manager_, subtablet_id); |
773 | 136 | if (load_result.ok() && CanServeTabletData((*load_result)->tablet_data_state())2 ) { |
774 | | // Sub tablet has been already created and ready during previous split attempt at this node or |
775 | | // as a result of remote bootstrap from another node, no need to re-create. |
776 | 0 | iter = tcmetas->erase(iter); |
777 | 0 | continue; |
778 | 0 | } |
779 | | |
780 | | // Delete on-disk data for new tablet IDs in case it is present as a leftover from previously |
781 | | // failed tablet split attempt. |
782 | | // TODO(tsplit): add test for that. |
783 | 136 | const auto data_dir = source_tablet_meta.GetSubRaftGroupDataDir(subtablet_id); |
784 | 136 | if (env->FileExists(data_dir)) { |
785 | 0 | RETURN_NOT_OK_PREPEND( |
786 | 0 | env->DeleteRecursively(data_dir), |
787 | 0 | Format("Unable to recursively delete data dir for tablet $0", subtablet_id)); |
788 | 0 | } |
789 | 136 | RETURN_NOT_OK(Log::DeleteOnDiskData( |
790 | 136 | env, subtablet_id, source_tablet_meta.GetSubRaftGroupWalDir(subtablet_id), |
791 | 136 | fs_manager_->uuid())); |
792 | 136 | RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(fs_manager_, subtablet_id)); |
793 | | |
794 | 136 | ++iter; |
795 | 136 | } |
796 | 68 | return Status::OK(); |
797 | 68 | } |
798 | | |
799 | | void TSTabletManager::CreatePeerAndOpenTablet( |
800 | | const tablet::RaftGroupMetadataPtr& meta, |
801 | 134 | const scoped_refptr<TransitionInProgressDeleter>& deleter) { |
802 | 134 | Status s = ResultToStatus(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
803 | 134 | if (!s.ok()) { |
804 | 0 | s = s.CloneAndPrepend("Failed to create and register tablet peer"); |
805 | 0 | if (s.IsShutdownInProgress()) { |
806 | | // If shutdown is in progress, it is not a failure to not being able to create and register |
807 | | // tablet peer. |
808 | 0 | LOG_WITH_PREFIX(WARNING) << s; |
809 | 0 | } else { |
810 | 0 | LOG_WITH_PREFIX(DFATAL) << s; |
811 | 0 | } |
812 | 0 | return; |
813 | 0 | } |
814 | 134 | s = open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)); |
815 | 134 | if (!s.ok()) { |
816 | 0 | LOG(DFATAL) << Format("Failed to schedule opening tablet $0: $1", meta->table_id(), s); |
817 | 0 | return; |
818 | 0 | } |
819 | 134 | } |
820 | | |
821 | 70 | Status TSTabletManager::ApplyTabletSplit(tablet::SplitOperation* operation, log::Log* raft_log) { |
822 | 70 | if (PREDICT_FALSE(FLAGS_TEST_crash_before_apply_tablet_split_op)) { |
823 | 2 | LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_apply_tablet_split_op"; |
824 | 2 | } |
825 | | |
826 | 70 | if (state() != MANAGER_RUNNING) { |
827 | 0 | return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state()); |
828 | 0 | } |
829 | | |
830 | 70 | auto* tablet = CHECK_NOTNULL(operation->tablet()); |
831 | 70 | const auto tablet_id = tablet->tablet_id(); |
832 | 70 | const auto* request = operation->request(); |
833 | 70 | SCHECK_EQ( |
834 | 70 | request->tablet_id(), tablet_id, IllegalState, |
835 | 70 | Format( |
836 | 70 | "Unexpected SPLIT_OP $0 designated for tablet $1 to be applied to tablet $2", |
837 | 70 | operation->op_id(), request->tablet_id(), tablet_id)); |
838 | 70 | SCHECK( |
839 | 70 | tablet_id != request->new_tablet1_id() && tablet_id != request->new_tablet2_id(), |
840 | 70 | IllegalState, |
841 | 70 | Format( |
842 | 70 | "One of SPLIT_OP $0 destination tablet IDs ($1, $2) is the same as source tablet ID $3", |
843 | 70 | operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id(), tablet_id)); |
844 | | |
845 | 70 | LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation apply started"; |
846 | | |
847 | 70 | if (raft_log == nullptr) { |
848 | 68 | auto tablet_peer = VERIFY_RESULT(LookupTablet(tablet_id)); |
849 | 0 | raft_log = tablet_peer->raft_consensus()->log().get(); |
850 | 68 | } |
851 | | |
852 | 70 | MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_before_log_flushed); |
853 | | |
854 | 70 | RETURN_NOT_OK(raft_log->FlushIndex()); |
855 | | |
856 | 70 | auto& meta = *CHECK_NOTNULL(tablet->metadata()); |
857 | | |
858 | | // TODO(tsplit): We can later implement better per-disk distribution during compaction of split |
859 | | // tablets. |
860 | 70 | const auto table_id = meta.table_id(); |
861 | 70 | const auto data_root_dir = |
862 | 70 | VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kData, table_id, tablet_id)); |
863 | 0 | const auto wal_root_dir = |
864 | 70 | VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kWal, table_id, tablet_id)); |
865 | | |
866 | 70 | if (FLAGS_TEST_apply_tablet_split_inject_delay_ms > 0) { |
867 | 2 | LOG(INFO) << "TEST: ApplyTabletSplit: injecting delay of " |
868 | 2 | << FLAGS_TEST_apply_tablet_split_inject_delay_ms << " ms for " |
869 | 2 | << AsString(*operation); |
870 | 2 | std::this_thread::sleep_for(FLAGS_TEST_apply_tablet_split_inject_delay_ms * 1ms); |
871 | 2 | LOG(INFO) << "TEST: ApplyTabletSplit: delay finished"; |
872 | 2 | } |
873 | | |
874 | 70 | auto tcmetas = PrepareTabletCreationMetaDataForSplit(*request, *tablet); |
875 | | |
876 | 70 | RETURN_NOT_OK(StartSubtabletsSplit(meta, &tcmetas)); |
877 | | |
878 | 136 | for (const auto& tcmeta : tcmetas)70 { |
879 | 136 | RegisterDataAndWalDir(fs_manager_, table_id, tcmeta.tablet_id, data_root_dir, wal_root_dir); |
880 | 136 | } |
881 | | |
882 | 70 | bool successfully_completed = false; |
883 | 70 | auto se = ScopeExit([&] { |
884 | 67 | if (!successfully_completed) { |
885 | 0 | for (const auto& tcmeta : tcmetas) { |
886 | 0 | UnregisterDataWalDir(table_id, tcmeta.tablet_id, data_root_dir, wal_root_dir); |
887 | 0 | } |
888 | 0 | } |
889 | 67 | }); |
890 | | |
891 | 70 | std::unique_ptr<ConsensusMetadata> cmeta; |
892 | 70 | RETURN_NOT_OK(ConsensusMetadata::Load(fs_manager_, tablet_id, fs_manager_->uuid(), &cmeta)); |
893 | | |
894 | 135 | for (auto& tcmeta : tcmetas)70 { |
895 | 135 | const auto& new_tablet_id = tcmeta.tablet_id; |
896 | | |
897 | | // Copy raft group metadata. |
898 | 135 | tcmeta.raft_group_metadata = VERIFY_RESULT(tablet->CreateSubtablet( |
899 | 0 | new_tablet_id, tcmeta.partition, tcmeta.key_bounds, operation->op_id(), |
900 | 0 | operation->hybrid_time())); |
901 | 135 | LOG_WITH_PREFIX(INFO) << "Created raft group metadata for table: " << table_id |
902 | 135 | << " tablet: " << new_tablet_id; |
903 | | |
904 | | // Copy consensus metadata. |
905 | | // Here we reuse the same cmeta instance for both new tablets. This is safe, because: |
906 | | // 1) Their consensus metadata only differ by tablet id. |
907 | | // 2) Flush() will save it into a new path corresponding to tablet id we set before flushing. |
908 | 135 | cmeta->set_tablet_id(new_tablet_id); |
909 | 135 | cmeta->set_split_parent_tablet_id(tablet_id); |
910 | 135 | RETURN_NOT_OK(cmeta->Flush()); |
911 | | |
912 | 135 | const auto& dest_wal_dir = tcmeta.raft_group_metadata->wal_dir(); |
913 | 135 | RETURN_NOT_OK(raft_log->CopyTo(dest_wal_dir)); |
914 | | |
915 | 135 | MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_after_log_copied); |
916 | | |
917 | 135 | tcmeta.raft_group_metadata->set_tablet_data_state(TABLET_DATA_READY); |
918 | 135 | RETURN_NOT_OK(tcmeta.raft_group_metadata->Flush()); |
919 | 135 | } |
920 | | |
921 | 70 | meta.SetSplitDone(operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id()); |
922 | 70 | RETURN_NOT_OK(meta.Flush()); |
923 | | |
924 | 70 | tablet->SplitDone(); |
925 | | |
926 | 134 | for (auto& tcmeta : tcmetas) { |
927 | | // Call CreatePeerAndOpenTablet asynchronously to avoid write-locking TSTabletManager::mutex_ |
928 | | // here since apply of SPLIT_OP is done under ReplicaState lock and this could lead to deadlock |
929 | | // in case of reverse lock order in some other thread. |
930 | | // See https://github.com/yugabyte/yugabyte-db/issues/4312 for more details. |
931 | 134 | RETURN_NOT_OK(apply_pool_->SubmitFunc(std::bind( |
932 | 134 | &TSTabletManager::CreatePeerAndOpenTablet, this, tcmeta.raft_group_metadata, |
933 | 134 | tcmeta.transition_deleter))); |
934 | 134 | } |
935 | | |
936 | 70 | successfully_completed = true; |
937 | 70 | LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation has been applied"; |
938 | 70 | return Status::OK(); |
939 | 70 | } |
940 | | |
// Builds the log prefix "T <tablet_id> P <uuid>: " for the given tablet and peer UUID.
string LogPrefix(const string& tablet_id, const string& uuid) {
  string prefix("T ");
  prefix += tablet_id;
  prefix += " P ";
  prefix += uuid;
  prefix += ": ";
  return prefix;
}
944 | | |
945 | | Status CheckLeaderTermNotLower( |
946 | | const string& tablet_id, |
947 | | const string& uuid, |
948 | | int64_t leader_term, |
949 | 153 | int64_t last_logged_term) { |
950 | 153 | if (PREDICT_FALSE(leader_term < last_logged_term)) { |
951 | 0 | Status s = STATUS(InvalidArgument, |
952 | 0 | Substitute("Leader has replica of tablet $0 with term $1 lower than last " |
953 | 0 | "logged term $2 on local replica. Rejecting remote bootstrap request", |
954 | 0 | tablet_id, leader_term, last_logged_term)); |
955 | 0 | LOG(WARNING) << LogPrefix(tablet_id, uuid) << "Remote bootstrap: " << s; |
956 | 0 | return s; |
957 | 0 | } |
958 | 153 | return Status::OK(); |
959 | 153 | } |
960 | | |
961 | | Status HandleReplacingStaleTablet( |
962 | | RaftGroupMetadataPtr meta, |
963 | | TabletPeerPtr old_tablet_peer, |
964 | | const string& tablet_id, |
965 | | const string& uuid, |
966 | 153 | const int64_t& leader_term) { |
967 | 153 | TabletDataState data_state = meta->tablet_data_state(); |
968 | 153 | switch (data_state) { |
969 | 0 | case TABLET_DATA_COPYING: { |
970 | | // This should not be possible due to the transition_in_progress_ "lock". |
971 | 0 | LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: " |
972 | 0 | << "Found tablet in TABLET_DATA_COPYING state during StartRemoteBootstrap()"; |
973 | 0 | } |
974 | 153 | case TABLET_DATA_TOMBSTONED: { |
975 | 153 | RETURN_NOT_OK(old_tablet_peer->CheckShutdownOrNotStarted()); |
976 | 153 | int64_t last_logged_term = meta->tombstone_last_logged_opid().term; |
977 | 153 | RETURN_NOT_OK(CheckLeaderTermNotLower(tablet_id, |
978 | 153 | uuid, |
979 | 153 | leader_term, |
980 | 153 | last_logged_term)); |
981 | 153 | break; |
982 | 153 | } |
983 | 153 | case TABLET_DATA_SPLIT_COMPLETED: |
984 | 0 | case TABLET_DATA_READY: { |
985 | 0 | if (tablet_id == master::kSysCatalogTabletId) { |
986 | 0 | LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: " |
987 | 0 | << "Found tablet in " << TabletDataState_Name(data_state) |
988 | 0 | << " state during StartRemoteBootstrap()"; |
989 | 0 | } |
990 | | // There's a valid race here that can lead us to come here: |
991 | | // 1. Leader sends a second remote bootstrap request as a result of receiving a |
992 | | // TABLET_NOT_FOUND from this tserver while it was in the middle of a remote bootstrap. |
993 | | // 2. The remote bootstrap request arrives after the first one is finished, and it is able to |
994 | | // grab the mutex. |
995 | | // 3. This tserver finds that it already has the metadata for the tablet, and determines that |
996 | | // it needs to replace the tablet setting replacing_tablet to true. |
997 | | // In this case, the master can simply ignore this error. |
998 | 0 | return STATUS_FORMAT( |
999 | 0 | IllegalState, "Tablet $0 in $1 state", tablet_id, TabletDataState_Name(data_state)); |
1000 | 0 | } |
1001 | 0 | default: { |
1002 | 0 | return STATUS(IllegalState, |
1003 | 0 | Substitute("Found tablet $0 in unexpected state $1 for remote bootstrap.", |
1004 | 0 | tablet_id, TabletDataState_Name(data_state))); |
1005 | 0 | } |
1006 | 153 | } |
1007 | | |
1008 | 153 | return Status::OK(); |
1009 | 153 | } |
1010 | | |
1011 | 10.3k | Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { |
1012 | | // To prevent racing against Shutdown, we increment this as soon as we start. This should be done |
1013 | | // before checking for ClosingUnlocked, as on shutdown, we proceed in reverse: |
1014 | | // - first mark as closing |
1015 | | // - then wait for num_tablets_being_remote_bootstrapped_ == 0 |
1016 | 10.3k | ++num_tablets_being_remote_bootstrapped_; |
1017 | 10.3k | auto private_addr = req.source_private_addr()[0].host(); |
1018 | 10.3k | auto decrement_num_rbs_se = ScopeExit([this, &private_addr](){ |
1019 | 10.3k | { |
1020 | 10.3k | std::lock_guard<RWMutex> lock(mutex_); |
1021 | 10.3k | auto iter = bootstrap_source_addresses_.find(private_addr); |
1022 | 10.3k | if (iter != bootstrap_source_addresses_.end()) { |
1023 | 10.0k | bootstrap_source_addresses_.erase(iter); |
1024 | 10.0k | } |
1025 | 10.3k | } |
1026 | 10.3k | --num_tablets_being_remote_bootstrapped_; |
1027 | 10.3k | }); |
1028 | | |
1029 | 10.3k | LongOperationTracker tracker("StartRemoteBootstrap", 5s); |
1030 | | |
1031 | 10.3k | const string& tablet_id = req.tablet_id(); |
1032 | 10.3k | const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid(); |
1033 | 10.3k | HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort( |
1034 | 10.3k | req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(), |
1035 | 10.3k | server_->MakeCloudInfoPB())); |
1036 | 10.3k | int64_t leader_term = req.caller_term(); |
1037 | | |
1038 | 10.3k | const string kLogPrefix = TabletLogPrefix(tablet_id); |
1039 | | |
1040 | 10.3k | TabletPeerPtr old_tablet_peer; |
1041 | 10.3k | RaftGroupMetadataPtr meta; |
1042 | 10.3k | bool replacing_tablet = false; |
1043 | 10.3k | scoped_refptr<TransitionInProgressDeleter> deleter; |
1044 | 10.3k | { |
1045 | 10.3k | std::lock_guard<RWMutex> lock(mutex_); |
1046 | 10.3k | bootstrap_source_addresses_.emplace(private_addr); |
1047 | 10.3k | if (ClosingUnlocked()) { |
1048 | 0 | auto result = STATUS_FORMAT( |
1049 | 0 | IllegalState, "StartRemoteBootstrap in wrong state: $0", |
1050 | 0 | TSTabletManagerStatePB_Name(state_)); |
1051 | 0 | LOG(WARNING) << kLogPrefix << result; |
1052 | 0 | return result; |
1053 | 0 | } |
1054 | | |
1055 | 10.3k | if (LookupTabletUnlocked(tablet_id, &old_tablet_peer)) { |
1056 | 8.58k | meta = old_tablet_peer->tablet_metadata(); |
1057 | 8.58k | replacing_tablet = true; |
1058 | 8.58k | } |
1059 | 10.3k | RETURN_NOT_OK(StartTabletStateTransition( |
1060 | 10.3k | tablet_id, Substitute("remote bootstrapping tablet from peer $0", bootstrap_peer_uuid), |
1061 | 10.3k | &deleter)); |
1062 | 10.3k | } |
1063 | | |
1064 | 1.96k | if (replacing_tablet) { |
1065 | | // Make sure the existing tablet peer is shut down and tombstoned. |
1066 | 153 | RETURN_NOT_OK(HandleReplacingStaleTablet(meta, |
1067 | 153 | old_tablet_peer, |
1068 | 153 | tablet_id, |
1069 | 153 | fs_manager_->uuid(), |
1070 | 153 | leader_term)); |
1071 | 153 | } |
1072 | | |
1073 | 1.96k | string init_msg = kLogPrefix + Substitute("Initiating remote bootstrap from Peer $0 ($1)", |
1074 | 1.96k | bootstrap_peer_uuid, bootstrap_peer_addr.ToString()); |
1075 | 1.96k | LOG(INFO) << init_msg; |
1076 | 1.96k | TRACE(init_msg); |
1077 | | |
1078 | 1.96k | auto rb_client = std::make_unique<RemoteBootstrapClient>(tablet_id, fs_manager_); |
1079 | | |
1080 | | // Download and persist the remote superblock in TABLET_DATA_COPYING state. |
1081 | 1.96k | if (replacing_tablet) { |
1082 | 153 | RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term)); |
1083 | 153 | } |
1084 | 1.96k | RETURN_NOT_OK(rb_client->Start(bootstrap_peer_uuid, |
1085 | 1.96k | &server_->proxy_cache(), |
1086 | 1.96k | bootstrap_peer_addr, |
1087 | 1.96k | &meta, |
1088 | 1.96k | this)); |
1089 | | |
1090 | | // From this point onward, the superblock is persisted in TABLET_DATA_COPYING |
1091 | | // state, and we need to tombstone the tablet if additional steps prior to |
1092 | | // getting to a TABLET_DATA_READY state fail. |
1093 | | |
1094 | 1.96k | if (PREDICT_FALSE(FLAGS_TEST_simulate_already_present_in_remote_bootstrap)) { |
1095 | 2 | LOG_WITH_PREFIX(INFO) |
1096 | 2 | << "Simulating AlreadyPresent error in TSTabletManager::StartRemoteBootstrap."; |
1097 | 2 | return STATUS(AlreadyPresent, "failed"); |
1098 | 2 | } |
1099 | | |
1100 | | // Registering a non-initialized TabletPeer offers visibility through the Web UI. |
1101 | 1.96k | TabletPeerPtr tablet_peer = VERIFY_RESULT( |
1102 | 1.96k | CreateAndRegisterTabletPeer(meta, replacing_tablet ? REPLACEMENT_PEER : NEW_PEER)); |
1103 | 0 | MarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(), |
1104 | 1.96k | tablet_peer->tablet_metadata()->table_id()); |
1105 | | |
1106 | | // TODO: If we ever make this method asynchronous, we need to move this code somewhere else. |
1107 | 1.96k | auto se = ScopeExit([this, tablet_peer] { |
1108 | 1.94k | UnmarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(), |
1109 | 1.94k | tablet_peer->tablet_metadata()->table_id()); |
1110 | 1.94k | }); |
1111 | | |
1112 | | // Download all of the remote files. |
1113 | 1.96k | TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer->status_listener()), |
1114 | 1.96k | meta, |
1115 | 1.96k | fs_manager_->uuid(), |
1116 | 1.96k | "Remote bootstrap: Unable to fetch data from remote peer " + |
1117 | 1.96k | bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")", |
1118 | 1.96k | this); |
1119 | | |
1120 | 1.95k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_rb_files_fetched); |
1121 | | |
1122 | | // Write out the last files to make the new replica visible and update the |
1123 | | // TabletDataState in the superblock to TABLET_DATA_READY. |
1124 | | // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a |
1125 | | // ChangeConfig request (to change this server's role from PRE_VOTER or PRE_OBSERVER to VOTER or |
1126 | | // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could |
1127 | | // have successfully submitted the ChangeConfig request and failed to respond in time) |
1128 | | // and check the committed config until we find that this server's role has changed, or until we |
1129 | | // time out which will cause us to tombstone the tablet. |
1130 | 1.95k | TOMBSTONE_NOT_OK(rb_client->Finish(), |
1131 | 1.95k | meta, |
1132 | 1.95k | fs_manager_->uuid(), |
1133 | 1.95k | "Remote bootstrap: Failed calling Finish()", |
1134 | 1.95k | this); |
1135 | | |
1136 | 1.95k | LOG(INFO) << kLogPrefix << "Remote bootstrap: Opening tablet"; |
1137 | | |
1138 | | // TODO(hector): ENG-3173: We need to simulate a failure in OpenTablet during remote bootstrap |
1139 | | // and verify that this tablet server gets remote bootstrapped again by the leader. We also need |
1140 | | // to check what happens when this server receives raft consensus requests since at this point, |
1141 | | // this tablet server could be a voter (if the ChangeRole request in Finish succeeded and its |
1142 | | // initial role was PRE_VOTER). |
1143 | 1.95k | OpenTablet(meta, nullptr); |
1144 | | // If OpenTablet fails, tablet_peer->error() will be set. |
1145 | 1.95k | RETURN_NOT_OK(ShutdownAndTombstoneTabletPeerNotOk( |
1146 | 1.95k | tablet_peer->error(), tablet_peer, meta, fs_manager_->uuid(), |
1147 | 1.95k | "Remote bootstrap: OpenTablet() failed", this)); |
1148 | | |
1149 | 1.95k | auto status = rb_client->VerifyChangeRoleSucceeded(tablet_peer->shared_consensus()); |
1150 | 1.95k | if (!status.ok()) { |
1151 | | // If for some reason this tserver wasn't promoted (e.g. from PRE-VOTER to VOTER), the leader |
1152 | | // will find out and do the CHANGE_CONFIG. |
1153 | 3 | LOG(WARNING) << kLogPrefix << "Remote bootstrap finished. " |
1154 | 3 | << "Failure calling VerifyChangeRoleSucceeded: " |
1155 | 3 | << status.ToString(); |
1156 | 1.95k | } else { |
1157 | 1.95k | LOG(INFO) << kLogPrefix << "Remote bootstrap for tablet ended successfully"; |
1158 | 1.95k | } |
1159 | | |
1160 | 1.95k | WARN_NOT_OK(rb_client->Remove(), "Remove remote bootstrap sessions failed"); |
1161 | | |
1162 | 1.95k | return Status::OK(); |
1163 | 1.95k | } |
1164 | | |
1165 | | // Create and register a new TabletPeer, given tablet metadata. |
1166 | | Result<TabletPeerPtr> TSTabletManager::CreateAndRegisterTabletPeer( |
1167 | 142k | const RaftGroupMetadataPtr& meta, RegisterTabletPeerMode mode) { |
1168 | 142k | TabletPeerPtr tablet_peer(new tablet::TabletPeer( |
1169 | 142k | meta, |
1170 | 142k | local_peer_pb_, |
1171 | 142k | scoped_refptr<server::Clock>(server_->clock()), |
1172 | 142k | fs_manager_->uuid(), |
1173 | 142k | Bind(&TSTabletManager::ApplyChange, Unretained(this), meta->raft_group_id()), |
1174 | 142k | metric_registry_, |
1175 | 142k | this, |
1176 | 142k | async_client_init_->get_client_future())); |
1177 | 142k | RETURN_NOT_OK(RegisterTablet(meta->raft_group_id(), tablet_peer, mode)); |
1178 | 142k | return tablet_peer; |
1179 | 142k | } |
1180 | | |
1181 | | Status TSTabletManager::DeleteTablet( |
1182 | | const string& tablet_id, |
1183 | | TabletDataState delete_type, |
1184 | | const boost::optional<int64_t>& cas_config_opid_index_less_or_equal, |
1185 | | bool hide_only, |
1186 | 77.5k | boost::optional<TabletServerErrorPB::Code>* error_code) { |
1187 | | |
1188 | 77.5k | if (delete_type != TABLET_DATA_DELETED && delete_type != TABLET_DATA_TOMBSTONED3.48k ) { |
1189 | 0 | return STATUS(InvalidArgument, "DeleteTablet() requires an argument that is one of " |
1190 | 0 | "TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED", |
1191 | 0 | Substitute("Given: $0 ($1)", |
1192 | 0 | TabletDataState_Name(delete_type), delete_type)); |
1193 | 0 | } |
1194 | | |
1195 | 77.5k | TRACE("Deleting tablet $0", tablet_id); |
1196 | | |
1197 | 77.5k | TabletPeerPtr tablet_peer; |
1198 | 77.5k | scoped_refptr<TransitionInProgressDeleter> deleter; |
1199 | 77.5k | { |
1200 | | // Acquire the lock in exclusive mode as we'll add a entry to the |
1201 | | // transition_in_progress_ map. |
1202 | 77.5k | std::lock_guard<RWMutex> lock(mutex_); |
1203 | 77.5k | TRACE("Acquired tablet manager lock"); |
1204 | 77.5k | RETURN_NOT_OK(CheckRunningUnlocked(error_code)); |
1205 | | |
1206 | 77.5k | if (!LookupTabletUnlocked(tablet_id, &tablet_peer)) { |
1207 | 45 | *error_code = TabletServerErrorPB::TABLET_NOT_FOUND; |
1208 | 45 | return STATUS(NotFound, "Tablet not found", tablet_id); |
1209 | 45 | } |
1210 | | // Sanity check that the tablet's deletion isn't already in progress |
1211 | 77.5k | Status s = StartTabletStateTransition(tablet_id, "deleting tablet", &deleter); |
1212 | 77.5k | if (PREDICT_FALSE(!s.ok())) { |
1213 | 1.87k | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1214 | 1.87k | return s; |
1215 | 1.87k | } |
1216 | 77.5k | } |
1217 | | |
1218 | | // If the tablet is already deleted, the CAS check isn't possible because |
1219 | | // consensus and therefore the log is not available. |
1220 | 75.6k | TabletDataState data_state = tablet_peer->tablet_metadata()->tablet_data_state(); |
1221 | 75.6k | bool tablet_deleted = (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED); |
1222 | | |
1223 | | // If a tablet peer is in the FAILED state, then we need to be able to tombstone or delete this |
1224 | | // tablet. If the tablet is tombstoned, then this TS can be remote bootstrapped with the same |
1225 | | // tablet. |
1226 | 75.6k | bool tablet_failed = tablet_peer->state() == RaftGroupStatePB::FAILED; |
1227 | | |
1228 | | // They specified an "atomic" delete. Check the committed config's opid_index. |
1229 | | // TODO: There's actually a race here between the check and shutdown, but |
1230 | | // it's tricky to fix. We could try checking again after the shutdown and |
1231 | | // restarting the tablet if the local replica committed a higher config |
1232 | | // change op during that time, or potentially something else more invasive. |
1233 | 75.6k | if (cas_config_opid_index_less_or_equal && !tablet_deleted1.65k && !tablet_failed1.56k ) { |
1234 | 1.56k | shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus(); |
1235 | 1.56k | if (!consensus) { |
1236 | 0 | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1237 | 0 | return STATUS(IllegalState, "Consensus not available. Tablet shutting down"); |
1238 | 0 | } |
1239 | 1.56k | RaftConfigPB committed_config = consensus->CommittedConfig(); |
1240 | 1.56k | if (committed_config.opid_index() > *cas_config_opid_index_less_or_equal) { |
1241 | 79 | *error_code = TabletServerErrorPB::CAS_FAILED; |
1242 | 79 | return STATUS(IllegalState, Substitute("Request specified cas_config_opid_index_less_or_equal" |
1243 | 79 | " of $0 but the committed config has opid_index of $1", |
1244 | 79 | *cas_config_opid_index_less_or_equal, |
1245 | 79 | committed_config.opid_index())); |
1246 | 79 | } |
1247 | 1.56k | } |
1248 | | |
1249 | 75.5k | RaftGroupMetadataPtr meta = tablet_peer->tablet_metadata(); |
1250 | 75.5k | if (hide_only) { |
1251 | 36 | meta->SetHidden(true); |
1252 | 36 | return meta->Flush(); |
1253 | 36 | } |
1254 | | // No matter if the tablet was deleted (drop table), or tombstoned (potentially moved to a |
1255 | | // different TS), we do not need to flush rocksdb anymore, as this data is irrelevant. |
1256 | | // |
1257 | | // Note: This might change for PITR. |
1258 | 75.5k | bool delete_data = delete_type == TABLET_DATA_DELETED || delete_type == TABLET_DATA_TOMBSTONED1.58k ; |
1259 | 75.5k | RETURN_NOT_OK(tablet_peer->Shutdown(tablet::IsDropTable(delete_data))); |
1260 | | |
1261 | 75.5k | yb::OpId last_logged_opid = tablet_peer->GetLatestLogEntryOpId(); |
1262 | | |
1263 | 75.5k | Status s = DeleteTabletData(meta, |
1264 | 75.5k | delete_type, |
1265 | 75.5k | fs_manager_->uuid(), |
1266 | 75.5k | last_logged_opid, |
1267 | 75.5k | this); |
1268 | 75.5k | if (PREDICT_FALSE(!s.ok())) { |
1269 | 0 | s = s.CloneAndPrepend(Substitute("Unable to delete on-disk data from tablet $0", |
1270 | 0 | tablet_id)); |
1271 | 0 | LOG(WARNING) << s.ToString(); |
1272 | 0 | tablet_peer->SetFailed(s); |
1273 | 0 | return s; |
1274 | 0 | } |
1275 | | |
1276 | 75.5k | tablet_peer->status_listener()->StatusMessage("Deleted tablet blocks from disk"); |
1277 | | |
1278 | | // We only remove DELETED tablets from the tablet map. |
1279 | 75.5k | if (delete_type == TABLET_DATA_DELETED) { |
1280 | 73.9k | std::lock_guard<RWMutex> lock(mutex_); |
1281 | 73.9k | RETURN_NOT_OK(CheckRunningUnlocked(error_code)); |
1282 | 73.9k | CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id0 ; |
1283 | 73.9k | dirty_tablets_.erase(tablet_id); |
1284 | 73.9k | } |
1285 | | |
1286 | | // We unregister TOMBSTONED tablets in addition to DELETED tablets because they do not have |
1287 | | // any more data on disk, so we shouldn't count these tablets when load balancing the disks. |
1288 | 75.5k | UnregisterDataWalDir(meta->table_id(), |
1289 | 75.5k | tablet_id, |
1290 | 75.5k | meta->data_root_dir(), |
1291 | 75.5k | meta->wal_root_dir()); |
1292 | | |
1293 | 75.5k | return Status::OK(); |
1294 | 75.5k | } |
1295 | | |
1296 | | Status TSTabletManager::CheckRunningUnlocked( |
1297 | 151k | boost::optional<TabletServerErrorPB::Code>* error_code) const { |
1298 | 151k | if (state_ == MANAGER_RUNNING) { |
1299 | 151k | return Status::OK(); |
1300 | 151k | } |
1301 | 0 | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1302 | 0 | return STATUS(ServiceUnavailable, Substitute("Tablet Manager is not running: $0", |
1303 | 151k | TSTabletManagerStatePB_Name(state_))); |
1304 | 151k | } |
1305 | | |
1306 | | // NO_THREAD_SAFETY_ANALYSIS because this analysis does not work with unique_lock. |
1307 | | Status TSTabletManager::StartTabletStateTransition( |
1308 | | const string& tablet_id, |
1309 | | const string& reason, |
1310 | 228k | scoped_refptr<TransitionInProgressDeleter>* deleter) NO_THREAD_SAFETY_ANALYSIS { |
1311 | 228k | std::unique_lock<std::mutex> lock(transition_in_progress_mutex_); |
1312 | 228k | const auto emplace_result = transition_in_progress_.emplace(tablet_id, reason); |
1313 | 228k | if (!emplace_result.second) { |
1314 | 10.3k | return STATUS_FORMAT( |
1315 | 10.3k | AlreadyPresent, "State transition of tablet $0 already in progress: $1", tablet_id, |
1316 | 10.3k | *emplace_result.first); |
1317 | 10.3k | } |
1318 | 217k | deleter->reset(new TransitionInProgressDeleter( |
1319 | 217k | &transition_in_progress_, &transition_in_progress_mutex_, tablet_id)); |
1320 | 217k | return Status::OK(); |
1321 | 228k | } |
1322 | | |
1323 | 68 | bool TSTabletManager::IsTabletInTransition(const TabletId& tablet_id) const { |
1324 | 68 | std::unique_lock<std::mutex> lock(transition_in_progress_mutex_); |
1325 | 68 | return ContainsKey(transition_in_progress_, tablet_id); |
1326 | 68 | } |
1327 | | |
1328 | | Status TSTabletManager::OpenTabletMeta(const string& tablet_id, |
1329 | 217 | RaftGroupMetadataPtr* metadata) { |
1330 | 217 | LOG(INFO) << "Loading metadata for tablet " << tablet_id; |
1331 | 217 | TRACE("Loading metadata..."); |
1332 | 217 | auto load_result = RaftGroupMetadata::Load(fs_manager_, tablet_id); |
1333 | 217 | RETURN_NOT_OK_PREPEND(load_result, |
1334 | 217 | Format("Failed to load tablet metadata for tablet id $0", tablet_id)); |
1335 | 217 | TRACE("Metadata loaded"); |
1336 | 217 | metadata->swap(*load_result); |
1337 | 217 | return Status::OK(); |
1338 | 217 | } |
1339 | | |
1340 | | void TSTabletManager::OpenTablet(const RaftGroupMetadataPtr& meta, |
1341 | 142k | const scoped_refptr<TransitionInProgressDeleter>& deleter) { |
1342 | 142k | string tablet_id = meta->raft_group_id(); |
1343 | 142k | TRACE_EVENT1("tserver", "TSTabletManager::OpenTablet", |
1344 | 142k | "tablet_id", tablet_id); |
1345 | | |
1346 | 142k | TabletPeerPtr tablet_peer; |
1347 | 18.4E | CHECK(LookupTablet(tablet_id, &tablet_peer)) |
1348 | 18.4E | << "Tablet not registered prior to OpenTabletAsync call: " << tablet_id; |
1349 | | |
1350 | 142k | tablet::TabletPtr tablet; |
1351 | 142k | scoped_refptr<Log> log; |
1352 | 142k | const string kLogPrefix = TabletLogPrefix(tablet_id); |
1353 | | |
1354 | 142k | LOG(INFO) << kLogPrefix << "Bootstrapping tablet"; |
1355 | 142k | TRACE("Bootstrapping tablet"); |
1356 | | |
1357 | 142k | consensus::ConsensusBootstrapInfo bootstrap_info; |
1358 | 142k | consensus::RetryableRequests retryable_requests(kLogPrefix); |
1359 | 142k | yb::OpId split_op_id; |
1360 | | |
1361 | 142k | LOG_TIMING_PREFIX(INFO, kLogPrefix, "bootstrapping tablet") { |
1362 | | // Read flag before CAS to avoid TSAN race conflict with GetAllFlags. |
1363 | 142k | if (GetAtomicFlag(&FLAGS_TEST_force_single_tablet_failure) && |
1364 | 142k | CompareAndSetFlag(&FLAGS_TEST_force_single_tablet_failure, |
1365 | 3 | true /* expected */, false /* val */)) { |
1366 | 3 | LOG(ERROR) << "Setting the state of a tablet to FAILED"; |
1367 | 3 | tablet_peer->SetFailed(STATUS(InternalError, "Setting tablet to failed state for test", |
1368 | 3 | tablet_id)); |
1369 | 3 | return; |
1370 | 3 | } |
1371 | | |
1372 | | // TODO: handle crash mid-creation of tablet? do we ever end up with a |
1373 | | // partially created tablet here? |
1374 | 142k | auto s = tablet_peer->SetBootstrapping(); |
1375 | 142k | if (!s.ok()) { |
1376 | 0 | LOG(ERROR) << kLogPrefix << "Tablet failed to set bootstrapping: " << s; |
1377 | 0 | tablet_peer->SetFailed(s); |
1378 | 0 | return; |
1379 | 0 | } |
1380 | | |
1381 | 142k | tablet::TabletInitData tablet_init_data = { |
1382 | 142k | .metadata = meta, |
1383 | 142k | .client_future = async_client_init_->get_client_future(), |
1384 | 142k | .clock = scoped_refptr<server::Clock>(server_->clock()), |
1385 | 142k | .parent_mem_tracker = MemTracker::FindOrCreateTracker("Tablets", server_->mem_tracker()), |
1386 | 142k | .block_based_table_mem_tracker = mem_manager_->block_based_table_mem_tracker(), |
1387 | 142k | .metric_registry = metric_registry_, |
1388 | 142k | .log_anchor_registry = tablet_peer->log_anchor_registry(), |
1389 | 142k | .tablet_options = tablet_options_, |
1390 | 142k | .log_prefix_suffix = " P " + tablet_peer->permanent_uuid(), |
1391 | 142k | .transaction_participant_context = tablet_peer.get(), |
1392 | 142k | .local_tablet_filter = std::bind(&TSTabletManager::PreserveLocalLeadersOnly, this, _1), |
1393 | 142k | .transaction_coordinator_context = tablet_peer.get(), |
1394 | 142k | .txns_enabled = tablet::TransactionsEnabled::kTrue, |
1395 | | // We are assuming we're never dealing with the system catalog tablet in TSTabletManager. |
1396 | 142k | .is_sys_catalog = tablet::IsSysCatalogTablet::kFalse, |
1397 | 142k | .snapshot_coordinator = nullptr, |
1398 | 142k | .tablet_splitter = this, |
1399 | 142k | .allowed_history_cutoff_provider = std::bind( |
1400 | 142k | &TSTabletManager::AllowedHistoryCutoff, this, _1), |
1401 | 142k | }; |
1402 | 142k | tablet::BootstrapTabletData data = { |
1403 | 142k | .tablet_init_data = tablet_init_data, |
1404 | 142k | .listener = tablet_peer->status_listener(), |
1405 | 142k | .append_pool = append_pool(), |
1406 | 142k | .allocation_pool = allocation_pool_.get(), |
1407 | 142k | .retryable_requests = &retryable_requests, |
1408 | 142k | }; |
1409 | 142k | s = BootstrapTablet(data, &tablet, &log, &bootstrap_info); |
1410 | 142k | if (!s.ok()) { |
1411 | 13 | LOG(ERROR) << kLogPrefix << "Tablet failed to bootstrap: " << s; |
1412 | 13 | tablet_peer->SetFailed(s); |
1413 | 13 | return; |
1414 | 13 | } |
1415 | 142k | } |
1416 | | |
1417 | 142k | MonoTime start(MonoTime::Now()); |
1418 | 142k | LOG_TIMING_PREFIX(INFO, kLogPrefix, "starting tablet") { |
1419 | 142k | TRACE("Initializing tablet peer"); |
1420 | 142k | auto s = tablet_peer->InitTabletPeer( |
1421 | 142k | tablet, |
1422 | 142k | server_->mem_tracker(), |
1423 | 142k | server_->messenger(), |
1424 | 142k | &server_->proxy_cache(), |
1425 | 142k | log, |
1426 | 142k | tablet->GetTableMetricsEntity(), |
1427 | 142k | tablet->GetTabletMetricsEntity(), |
1428 | 142k | raft_pool(), |
1429 | 142k | tablet_prepare_pool(), |
1430 | 142k | &retryable_requests, |
1431 | 142k | multi_raft_manager_.get()); |
1432 | | |
1433 | 142k | if (!s.ok()) { |
1434 | 2 | LOG(ERROR) << kLogPrefix << "Tablet failed to init: " |
1435 | 2 | << s.ToString(); |
1436 | 2 | tablet_peer->SetFailed(s); |
1437 | 2 | return; |
1438 | 2 | } |
1439 | | |
1440 | 142k | TRACE("Starting tablet peer"); |
1441 | 142k | s = tablet_peer->Start(bootstrap_info); |
1442 | 142k | if (!s.ok()) { |
1443 | 0 | LOG(ERROR) << kLogPrefix << "Tablet failed to start: " |
1444 | 0 | << s.ToString(); |
1445 | 0 | tablet_peer->SetFailed(s); |
1446 | 0 | return; |
1447 | 0 | } |
1448 | | |
1449 | 142k | tablet_peer->RegisterMaintenanceOps(server_->maintenance_manager()); |
1450 | 142k | } |
1451 | | |
1452 | 142k | auto elapsed_ms = MonoTime::Now().GetDeltaSince(start).ToMilliseconds(); |
1453 | 142k | if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) { |
1454 | 112 | LOG(WARNING) << kLogPrefix << "Tablet startup took " << elapsed_ms << "ms"; |
1455 | 112 | if (Trace::CurrentTrace()) { |
1456 | 0 | LOG(WARNING) << kLogPrefix << "Trace:" << std::endl |
1457 | 0 | << Trace::CurrentTrace()->DumpToString(true); |
1458 | 0 | } |
1459 | 112 | } |
1460 | | |
1461 | 142k | if (PREDICT_TRUE(!FLAGS_TEST_skip_post_split_compaction)) { |
1462 | 142k | WARN_NOT_OK( |
1463 | 142k | tablet->TriggerPostSplitCompactionIfNeeded([&]() { |
1464 | 142k | return post_split_trigger_compaction_pool_->NewToken(ThreadPool::ExecutionMode::SERIAL); |
1465 | 142k | }), |
1466 | 142k | "Failed to submit compaction for post-split tablet."); |
1467 | 142k | } else { |
1468 | 3 | LOG(INFO) << "Skipping post split compaction " << meta->raft_group_id(); |
1469 | 3 | } |
1470 | | |
1471 | 142k | if (tablet->ShouldDisableLbMove()) { |
1472 | 134 | std::lock_guard<RWMutex> lock(mutex_); |
1473 | 134 | tablets_blocked_from_lb_.insert(tablet->tablet_id()); |
1474 | 134 | VLOG(2) << TabletLogPrefix(tablet->tablet_id()) |
1475 | 0 | << " marking as maybe being compacted after split."; |
1476 | 134 | } |
1477 | 142k | } |
1478 | | |
1479 | 0 | Status TSTabletManager::TriggerCompactionAndWait(const TabletPtrs& tablets) { |
1480 | 0 | CountDownLatch latch(tablets.size()); |
1481 | 0 | auto token = admin_triggered_compaction_pool_->NewToken(ThreadPool::ExecutionMode::CONCURRENT); |
1482 | 0 | for (auto tablet : tablets) { |
1483 | 0 | RETURN_NOT_OK(token->SubmitFunc([&latch, tablet]() { |
1484 | 0 | WARN_NOT_OK(tablet->ForceFullRocksDBCompact(), "Failed to submit compaction for tablet."); |
1485 | 0 | latch.CountDown(); |
1486 | 0 | })); |
1487 | 0 | } |
1488 | 0 | latch.Wait(); |
1489 | 0 | return Status::OK(); |
1490 | 0 | } |
1491 | | |
1492 | 193 | void TSTabletManager::StartShutdown() { |
1493 | 193 | { |
1494 | 193 | std::lock_guard<RWMutex> lock(mutex_); |
1495 | 193 | switch (state_) { |
1496 | 0 | case MANAGER_QUIESCING: { |
1497 | 0 | VLOG(1) << "Tablet manager shut down already in progress.."; |
1498 | 0 | return; |
1499 | 0 | } |
1500 | 0 | case MANAGER_SHUTDOWN: { |
1501 | 0 | VLOG(1) << "Tablet manager has already been shut down."; |
1502 | 0 | return; |
1503 | 0 | } |
1504 | 0 | case MANAGER_INITIALIZING: |
1505 | 193 | case MANAGER_RUNNING: { |
1506 | 193 | LOG_WITH_PREFIX(INFO) << "Shutting down tablet manager..."; |
1507 | 193 | state_ = MANAGER_QUIESCING; |
1508 | 193 | break; |
1509 | 0 | } |
1510 | 0 | default: { |
1511 | 0 | LOG(FATAL) << "Invalid state: " << TSTabletManagerStatePB_Name(state_); |
1512 | 0 | } |
1513 | 193 | } |
1514 | 193 | } |
1515 | | |
1516 | 193 | tablets_cleaner_->Shutdown(); |
1517 | | |
1518 | 193 | verify_tablet_data_poller_->Shutdown(); |
1519 | | |
1520 | 193 | metrics_cleaner_->Shutdown(); |
1521 | | |
1522 | 193 | async_client_init_->Shutdown(); |
1523 | | |
1524 | 193 | mem_manager_->Shutdown(); |
1525 | | |
1526 | | // Wait for all RBS operations to finish. |
1527 | 193 | const MonoDelta kSingleWait = 10ms; |
1528 | 193 | const MonoDelta kReportInterval = 5s; |
1529 | 193 | const MonoDelta kMaxWait = 30s; |
1530 | 193 | MonoDelta waited = MonoDelta::kZero; |
1531 | 193 | MonoDelta next_report_time = kReportInterval; |
1532 | 193 | while (int remaining_rbs = num_tablets_being_remote_bootstrapped_ > 0) { |
1533 | 0 | if (waited >= next_report_time) { |
1534 | 0 | if (waited >= kMaxWait) { |
1535 | 0 | std::string addr = ""; |
1536 | 0 | for (auto iter = bootstrap_source_addresses_.begin(); |
1537 | 0 | iter != bootstrap_source_addresses_.end(); |
1538 | 0 | iter++) { |
1539 | 0 | if (iter == bootstrap_source_addresses_.begin()) { |
1540 | 0 | addr += *iter; |
1541 | 0 | } else { |
1542 | 0 | addr += "," + *iter; |
1543 | 0 | } |
1544 | 0 | } |
1545 | 0 | LOG_WITH_PREFIX(DFATAL) |
1546 | 0 | << "Waited for " << waited << "ms. Still had " |
1547 | 0 | << remaining_rbs << " pending remote bootstraps: " + addr; |
1548 | 0 | } else { |
1549 | 0 | LOG_WITH_PREFIX(WARNING) |
1550 | 0 | << "Still waiting for " << remaining_rbs |
1551 | 0 | << " ongoing RemoteBootstraps to finish after " << waited; |
1552 | 0 | } |
1553 | 0 | next_report_time = std::min(kMaxWait, waited + kReportInterval); |
1554 | 0 | } |
1555 | 0 | SleepFor(kSingleWait); |
1556 | 0 | waited += kSingleWait; |
1557 | 0 | } |
1558 | | |
1559 | | // Shut down the bootstrap pool, so new tablets are registered after this point. |
1560 | 193 | open_tablet_pool_->Shutdown(); |
1561 | | |
1562 | | // Take a snapshot of the peers list -- that way we don't have to hold |
1563 | | // on to the lock while shutting them down, which might cause a lock |
1564 | | // inversion. (see KUDU-308 for example). |
1565 | 193 | for (const TabletPeerPtr& peer : GetTabletPeers()) { |
1566 | 98 | if (peer->StartShutdown()) { |
1567 | 97 | shutting_down_peers_.push_back(peer); |
1568 | 97 | } |
1569 | 98 | } |
1570 | 193 | } |
1571 | | |
1572 | 89 | void TSTabletManager::CompleteShutdown() { |
1573 | 97 | for (const TabletPeerPtr& peer : shutting_down_peers_) { |
1574 | 97 | peer->CompleteShutdown(); |
1575 | 97 | } |
1576 | | |
1577 | | // Shut down the apply pool. |
1578 | 89 | apply_pool_->Shutdown(); |
1579 | | |
1580 | 89 | if (raft_pool_) { |
1581 | 89 | raft_pool_->Shutdown(); |
1582 | 89 | } |
1583 | 89 | if (tablet_prepare_pool_) { |
1584 | 89 | tablet_prepare_pool_->Shutdown(); |
1585 | 89 | } |
1586 | 89 | if (append_pool_) { |
1587 | 89 | append_pool_->Shutdown(); |
1588 | 89 | } |
1589 | 89 | if (post_split_trigger_compaction_pool_) { |
1590 | 89 | post_split_trigger_compaction_pool_->Shutdown(); |
1591 | 89 | } |
1592 | 89 | if (admin_triggered_compaction_pool_) { |
1593 | 89 | admin_triggered_compaction_pool_->Shutdown(); |
1594 | 89 | } |
1595 | | |
1596 | 89 | { |
1597 | 89 | std::lock_guard<RWMutex> l(mutex_); |
1598 | 89 | tablet_map_.clear(); |
1599 | 89 | dirty_tablets_.clear(); |
1600 | | |
1601 | 89 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
1602 | 89 | table_data_assignment_map_.clear(); |
1603 | 89 | table_wal_assignment_map_.clear(); |
1604 | | |
1605 | 89 | state_ = MANAGER_SHUTDOWN; |
1606 | 89 | } |
1607 | 89 | } |
1608 | | |
1609 | 177k | std::string TSTabletManager::LogPrefix() const { |
1610 | 177k | return "P " + fs_manager_->uuid() + ": "; |
1611 | 177k | } |
1612 | | |
1613 | 292k | std::string TSTabletManager::TabletLogPrefix(const TabletId& tablet_id) const { |
1614 | 292k | return tserver::LogPrefix(tablet_id, fs_manager_->uuid()); |
1615 | 292k | } |
1616 | | |
1617 | 152k | bool TSTabletManager::ClosingUnlocked() const { |
1618 | 152k | return state_ == MANAGER_QUIESCING || state_ == MANAGER_SHUTDOWN; |
1619 | 152k | } |
1620 | | |
1621 | | Status TSTabletManager::RegisterTablet(const TabletId& tablet_id, |
1622 | | const TabletPeerPtr& tablet_peer, |
1623 | 141k | RegisterTabletPeerMode mode) { |
1624 | 141k | std::lock_guard<RWMutex> lock(mutex_); |
1625 | 141k | if (ClosingUnlocked()) { |
1626 | 0 | auto result = STATUS_FORMAT( |
1627 | 0 | ShutdownInProgress, "Unable to register tablet peer: $0: closing", tablet_id); |
1628 | 0 | LOG(WARNING) << result; |
1629 | 0 | return result; |
1630 | 0 | } |
1631 | | |
1632 | | // If we are replacing a tablet peer, we delete the existing one first. |
1633 | 141k | if (mode == REPLACEMENT_PEER && tablet_map_.erase(tablet_id) != 1152 ) { |
1634 | 0 | auto result = STATUS_FORMAT( |
1635 | 0 | NotFound, "Unable to remove previous tablet peer $0: not registered", tablet_id); |
1636 | 0 | LOG(WARNING) << result; |
1637 | 0 | return result; |
1638 | 0 | } |
1639 | 141k | if (!InsertIfNotPresent(&tablet_map_, tablet_id, tablet_peer)) { |
1640 | 0 | auto result = STATUS_FORMAT( |
1641 | 0 | AlreadyPresent, "Unable to register tablet peer $0: already registered", tablet_id); |
1642 | 0 | LOG(WARNING) << result; |
1643 | 0 | return result; |
1644 | 0 | } |
1645 | | |
1646 | 141k | LOG_WITH_PREFIX(INFO) << "Registered tablet " << tablet_id; |
1647 | | |
1648 | 141k | return Status::OK(); |
1649 | 141k | } |
1650 | | |
1651 | | bool TSTabletManager::LookupTablet(const string& tablet_id, |
1652 | 38.0M | TabletPeerPtr* tablet_peer) const { |
1653 | 38.0M | SharedLock<RWMutex> shared_lock(mutex_); |
1654 | 38.0M | return LookupTabletUnlocked(tablet_id, tablet_peer); |
1655 | 38.0M | } |
1656 | | |
1657 | | Result<std::shared_ptr<tablet::TabletPeer>> TSTabletManager::LookupTablet( |
1658 | 68 | const TabletId& tablet_id) const { |
1659 | 68 | TabletPeerPtr tablet_peer; |
1660 | 68 | SCHECK(LookupTablet(tablet_id, &tablet_peer), NotFound, Format("Tablet $0 not found", tablet_id)); |
1661 | 68 | return tablet_peer; |
1662 | 68 | } |
1663 | | |
1664 | | bool TSTabletManager::LookupTabletUnlocked(const string& tablet_id, |
1665 | 38.3M | TabletPeerPtr* tablet_peer) const { |
1666 | 38.3M | const TabletPeerPtr* found = FindOrNull(tablet_map_, tablet_id); |
1667 | 38.3M | if (!found) { |
1668 | 145k | return false; |
1669 | 145k | } |
1670 | 38.1M | *tablet_peer = *found; |
1671 | 38.1M | return true; |
1672 | 38.3M | } |
1673 | | |
1674 | | Status TSTabletManager::GetTabletPeer(const string& tablet_id, |
1675 | 37.8M | TabletPeerPtr* tablet_peer) const { |
1676 | 37.8M | if (!LookupTablet(tablet_id, tablet_peer)) { |
1677 | 3.32k | return STATUS(NotFound, "Tablet not found", tablet_id); |
1678 | 3.32k | } |
1679 | 37.8M | TabletDataState data_state = (*tablet_peer)->tablet_metadata()->tablet_data_state(); |
1680 | 37.8M | if (!CanServeTabletData(data_state)) { |
1681 | 9.06k | return STATUS( |
1682 | 9.06k | IllegalState, "Tablet data state not ready: " + TabletDataState_Name(data_state), |
1683 | 9.06k | tablet_id); |
1684 | 9.06k | } |
1685 | 37.8M | return Status::OK(); |
1686 | 37.8M | } |
1687 | | |
1688 | 24.5M | const NodeInstancePB& TSTabletManager::NodeInstance() const { |
1689 | 24.5M | return server_->instance_pb(); |
1690 | 24.5M | } |
1691 | | |
1692 | 1 | Status TSTabletManager::GetRegistration(ServerRegistrationPB* reg) const { |
1693 | 1 | return server_->GetRegistration(reg, server::RpcOnly::kTrue); |
1694 | 1 | } |
1695 | | |
1696 | 1.42M | TSTabletManager::TabletPeers TSTabletManager::GetTabletPeers(TabletPtrs* tablet_ptrs) const { |
1697 | 1.42M | SharedLock<RWMutex> shared_lock(mutex_); |
1698 | 1.42M | TabletPeers peers; |
1699 | 1.42M | GetTabletPeersUnlocked(&peers); |
1700 | 1.42M | if (tablet_ptrs) { |
1701 | 8 | for (const auto& peer : peers) { |
1702 | 8 | if (!peer) continue0 ; |
1703 | 8 | auto tablet_ptr = peer->shared_tablet(); |
1704 | 8 | if (tablet_ptr) { |
1705 | 8 | tablet_ptrs->push_back(tablet_ptr); |
1706 | 8 | } |
1707 | 8 | } |
1708 | 8 | } |
1709 | 1.42M | return peers; |
1710 | 1.42M | } |
1711 | | |
1712 | 1.43M | void TSTabletManager::GetTabletPeersUnlocked(TabletPeers* tablet_peers) const { |
1713 | 1.43M | DCHECK(tablet_peers != nullptr); |
1714 | | // See AppendKeysFromMap for why this is done. |
1715 | 1.43M | if (tablet_peers->empty()) { |
1716 | 1.43M | tablet_peers->reserve(tablet_map_.size()); |
1717 | 1.43M | } |
1718 | 4.12M | for (const auto& entry : tablet_map_) { |
1719 | 4.12M | if (entry.second != nullptr) { |
1720 | 4.12M | tablet_peers->push_back(entry.second); |
1721 | 4.12M | } |
1722 | 4.12M | } |
1723 | 1.43M | } |
1724 | | |
1725 | 411k | void TSTabletManager::PreserveLocalLeadersOnly(std::vector<const TabletId*>* tablet_ids) const { |
1726 | 411k | SharedLock<decltype(mutex_)> shared_lock(mutex_); |
1727 | 8.48M | auto filter = [this](const TabletId* id) REQUIRES_SHARED(mutex_) { |
1728 | 8.48M | auto it = tablet_map_.find(*id); |
1729 | 8.48M | if (it == tablet_map_.end()) { |
1730 | 722 | return true; |
1731 | 722 | } |
1732 | 8.48M | auto leader_status = it->second->LeaderStatus(); |
1733 | 8.48M | return leader_status != consensus::LeaderStatus::LEADER_AND_READY; |
1734 | 8.48M | }; |
1735 | 411k | tablet_ids->erase(std::remove_if(tablet_ids->begin(), tablet_ids->end(), filter), |
1736 | 411k | tablet_ids->end()); |
1737 | 411k | } |
1738 | | |
1739 | | void TSTabletManager::ApplyChange(const string& tablet_id, |
1740 | 592k | shared_ptr<consensus::StateChangeContext> context) { |
1741 | 592k | WARN_NOT_OK( |
1742 | 592k | apply_pool_->SubmitFunc( |
1743 | 592k | std::bind(&TSTabletManager::MarkTabletDirty, this, tablet_id, context)), |
1744 | 592k | "Unable to run MarkDirty callback") |
1745 | 592k | } |
1746 | | |
1747 | | void TSTabletManager::MarkTabletDirty(const TabletId& tablet_id, |
1748 | 592k | std::shared_ptr<consensus::StateChangeContext> context) { |
1749 | 592k | std::lock_guard<RWMutex> lock(mutex_); |
1750 | 592k | MarkDirtyUnlocked(tablet_id, context); |
1751 | 592k | } |
1752 | | |
1753 | | void TSTabletManager::MarkTabletBeingRemoteBootstrapped( |
1754 | 1.96k | const TabletId& tablet_id, const TableId& table_id) { |
1755 | 1.96k | std::lock_guard<RWMutex> lock(mutex_); |
1756 | 1.96k | tablets_being_remote_bootstrapped_.insert(tablet_id); |
1757 | 1.96k | tablets_being_remote_bootstrapped_per_table_[table_id].insert(tablet_id); |
1758 | 1.96k | MaybeDoChecksForTests(table_id); |
1759 | 1.96k | LOG(INFO) << "Concurrent remote bootstrap sessions: " |
1760 | 1.96k | << tablets_being_remote_bootstrapped_.size() |
1761 | 1.96k | << "Concurrent remote bootstrap sessions for table " << table_id |
1762 | 1.96k | << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size(); |
1763 | 1.96k | } |
1764 | | |
1765 | | void TSTabletManager::UnmarkTabletBeingRemoteBootstrapped( |
1766 | 1.94k | const TabletId& tablet_id, const TableId& table_id) { |
1767 | 1.94k | std::lock_guard<RWMutex> lock(mutex_); |
1768 | 1.94k | tablets_being_remote_bootstrapped_.erase(tablet_id); |
1769 | 1.94k | tablets_being_remote_bootstrapped_per_table_[table_id].erase(tablet_id); |
1770 | 1.94k | } |
1771 | | |
1772 | 0 | size_t TSTabletManager::TEST_GetNumDirtyTablets() const { |
1773 | 0 | SharedLock<RWMutex> lock(mutex_); |
1774 | 0 | return dirty_tablets_.size(); |
1775 | 0 | } |
1776 | | |
1777 | | Status TSTabletManager::GetNumTabletsPendingBootstrap( |
1778 | 9 | IsTabletServerReadyResponsePB* resp) const { |
1779 | 9 | if (state() != MANAGER_RUNNING) { |
1780 | 0 | resp->set_num_tablets_not_running(INT_MAX); |
1781 | 0 | resp->set_total_tablets(INT_MAX); |
1782 | 0 | return Status::OK(); |
1783 | 0 | } |
1784 | | |
1785 | 9 | SharedLock<RWMutex> shared_lock(mutex_); |
1786 | 9 | int num_pending = 0; |
1787 | 9 | int total_tablets = 0; |
1788 | 9 | for (const auto& entry : tablet_map_) { |
1789 | 0 | RaftGroupStatePB state = entry.second->state(); |
1790 | 0 | TabletDataState data_state = entry.second->data_state(); |
1791 | | // Do not count tablets that will never get to RUNNING state. |
1792 | 0 | if (!CanServeTabletData(data_state)) { |
1793 | 0 | continue; |
1794 | 0 | } |
1795 | 0 | bool not_started_or_bootstrap = state == NOT_STARTED || state == BOOTSTRAPPING; |
1796 | 0 | if (not_started_or_bootstrap || state == RUNNING) { |
1797 | 0 | total_tablets++; |
1798 | 0 | } |
1799 | 0 | if (not_started_or_bootstrap) { |
1800 | 0 | num_pending++; |
1801 | 0 | } |
1802 | 0 | } |
1803 | | |
1804 | 9 | LOG(INFO) << num_pending << " tablets pending bootstrap out of " << total_tablets; |
1805 | 9 | resp->set_num_tablets_not_running(num_pending); |
1806 | 9 | resp->set_total_tablets(total_tablets); |
1807 | | |
1808 | 9 | return Status::OK(); |
1809 | 9 | } |
1810 | | |
1811 | 5.25M | int TSTabletManager::GetNumLiveTablets() const { |
1812 | 5.25M | int count = 0; |
1813 | 5.25M | SharedLock<RWMutex> lock(mutex_); |
1814 | 24.9M | for (const auto& entry : tablet_map_) { |
1815 | 24.9M | RaftGroupStatePB state = entry.second->state(); |
1816 | 24.9M | if (state == BOOTSTRAPPING || |
1817 | 24.9M | state == RUNNING24.5M ) { |
1818 | 23.9M | count++; |
1819 | 23.9M | } |
1820 | 24.9M | } |
1821 | 5.25M | return count; |
1822 | 5.25M | } |
1823 | | |
1824 | 5.25M | int TSTabletManager::GetLeaderCount() const { |
1825 | 5.25M | int count = 0; |
1826 | 5.25M | SharedLock<RWMutex> lock(mutex_); |
1827 | 24.9M | for (const auto& entry : tablet_map_) { |
1828 | 24.9M | consensus::LeaderStatus leader_status = entry.second->LeaderStatus(/* allow_stale =*/ true); |
1829 | 24.9M | if (leader_status != consensus::LeaderStatus::NOT_LEADER) { |
1830 | 7.09M | count++; |
1831 | 7.09M | } |
1832 | 24.9M | } |
1833 | 5.25M | return count; |
1834 | 5.25M | } |
1835 | | |
1836 | | void TSTabletManager::MarkDirtyUnlocked(const TabletId& tablet_id, |
1837 | 592k | std::shared_ptr<consensus::StateChangeContext> context) { |
1838 | 592k | TabletReportState* state = FindOrNull(dirty_tablets_, tablet_id); |
1839 | 592k | if (state != nullptr) { |
1840 | 250k | CHECK_GE(next_report_seq_, state->change_seq); |
1841 | 250k | state->change_seq = next_report_seq_; |
1842 | 342k | } else { |
1843 | 342k | TabletReportState state; |
1844 | 342k | state.change_seq = next_report_seq_; |
1845 | 342k | InsertOrDie(&dirty_tablets_, tablet_id, state); |
1846 | 342k | } |
1847 | 592k | VLOG(2) << TabletLogPrefix(tablet_id) |
1848 | 0 | << "Marking dirty. Reason: " << AsString(context) |
1849 | 0 | << ". Will report this tablet to the Master in the next heartbeat " |
1850 | 0 | << "as part of report #" << next_report_seq_; |
1851 | 592k | server_->heartbeater()->TriggerASAP(); |
1852 | 592k | } |
1853 | | |
1854 | 8.74k | void TSTabletManager::InitLocalRaftPeerPB() { |
1855 | 8.74k | DCHECK_EQ(state(), MANAGER_INITIALIZING); |
1856 | 8.74k | local_peer_pb_.set_permanent_uuid(fs_manager_->uuid()); |
1857 | 8.74k | ServerRegistrationPB reg; |
1858 | 8.74k | CHECK_OK(server_->GetRegistration(®, server::RpcOnly::kTrue)); |
1859 | 8.74k | TakeRegistration(®, &local_peer_pb_); |
1860 | 8.74k | } |
1861 | | |
1862 | | void TSTabletManager::CreateReportedTabletPB(const TabletPeerPtr& tablet_peer, |
1863 | 458k | ReportedTabletPB* reported_tablet) { |
1864 | 458k | reported_tablet->set_tablet_id(tablet_peer->tablet_id()); |
1865 | 458k | reported_tablet->set_state(tablet_peer->state()); |
1866 | 458k | reported_tablet->set_tablet_data_state(tablet_peer->tablet_metadata()->tablet_data_state()); |
1867 | 458k | if (tablet_peer->state() == tablet::FAILED) { |
1868 | 0 | AppStatusPB* error_status = reported_tablet->mutable_error(); |
1869 | 0 | StatusToPB(tablet_peer->error(), error_status); |
1870 | 0 | } |
1871 | 458k | reported_tablet->set_schema_version(tablet_peer->tablet_metadata()->schema_version()); |
1872 | | |
1873 | 458k | { |
1874 | 458k | auto tablet_ptr = tablet_peer->shared_tablet(); |
1875 | 458k | if (tablet_ptr != nullptr) { |
1876 | 454k | reported_tablet->set_should_disable_lb_move(tablet_ptr->ShouldDisableLbMove()); |
1877 | 454k | } |
1878 | 458k | } |
1879 | 458k | reported_tablet->set_fs_data_dir(tablet_peer->tablet_metadata()->data_root_dir()); |
1880 | | |
1881 | | // We cannot get consensus state information unless the TabletPeer is running. |
1882 | 458k | shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus(); |
1883 | 458k | if (consensus) { |
1884 | 454k | *reported_tablet->mutable_committed_consensus_state() = |
1885 | 454k | consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED); |
1886 | 454k | } |
1887 | | |
1888 | | // Set the hide status of the tablet. |
1889 | 458k | reported_tablet->set_is_hidden(tablet_peer->tablet_metadata()->hidden()); |
1890 | 458k | } |
1891 | | |
1892 | 5.24M | void TSTabletManager::GenerateTabletReport(TabletReportPB* report, bool include_bootstrap) { |
1893 | 5.24M | report->Clear(); |
1894 | | // Creating the tablet report can be slow in the case that it is in the |
1895 | | // middle of flushing its consensus metadata. We don't want to hold |
1896 | | // lock_ for too long, even in read mode, since it can cause other readers |
1897 | | // to block if there is a waiting writer (see KUDU-2193). So, we just make |
1898 | | // a local copy of the set of replicas. |
1899 | 5.24M | vector<std::shared_ptr<TabletPeer>> to_report; |
1900 | 5.24M | TabletIdSet tablet_ids; |
1901 | 5.24M | size_t dirty_count, report_limit; |
1902 | 5.24M | { |
1903 | 5.24M | std::lock_guard<RWMutex> write_lock(mutex_); |
1904 | 5.24M | uint32_t cur_report_seq = next_report_seq_++; |
1905 | 5.24M | report->set_sequence_number(cur_report_seq); |
1906 | | |
1907 | 5.24M | TabletIdSet::iterator i = tablets_blocked_from_lb_.begin(); |
1908 | 5.24M | while (i != tablets_blocked_from_lb_.end()) { |
1909 | 189 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, *i); |
1910 | 189 | if (tablet_peer) { |
1911 | 185 | const auto tablet = (*tablet_peer)->shared_tablet(); |
1912 | | // If tablet is null, one of two things may be true: |
1913 | | // 1. TabletPeer::InitTabletPeer was not called yet |
1914 | | // |
1915 | | // Skip and keep tablet in tablets_blocked_from_lb_ till call InitTabletPeer. |
1916 | | // |
1917 | | // 2. TabletPeer::CompleteShutdown was called |
1918 | | // |
1919 | | // Tablet will be removed from tablets_blocked_from_lb_ with next GenerateTabletReport |
1920 | | // since tablet_peer will be removed from tablet_map_ |
1921 | 185 | if (tablet == nullptr) { |
1922 | 0 | ++i; |
1923 | 0 | continue; |
1924 | 0 | } |
1925 | 185 | const std::string& tablet_id = tablet->tablet_id(); |
1926 | 185 | if (!tablet->ShouldDisableLbMove()) { |
1927 | 122 | i = tablets_blocked_from_lb_.erase(i); |
1928 | 122 | VLOG(1) << "Tablet " << tablet_id << " is no longer blocked from load-balancing."0 ; |
1929 | 122 | InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq}); |
1930 | 122 | } else { |
1931 | 63 | ++i; |
1932 | 63 | } |
1933 | 185 | } else { |
1934 | 4 | VLOG(1) << "Tablet " << *i |
1935 | 0 | << " was marked as blocked from load balancing but was not found"; |
1936 | 4 | i = tablets_blocked_from_lb_.erase(i); |
1937 | 4 | } |
1938 | 189 | } |
1939 | | |
1940 | 5.24M | if (include_bootstrap) { |
1941 | 5.24M | for (auto const& tablet_id : tablets_being_remote_bootstrapped_) { |
1942 | 18.3k | VLOG(1) << "Tablet " << tablet_id << " being remote bootstrapped and marked for report"0 ; |
1943 | 18.3k | InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq}); |
1944 | 18.3k | } |
1945 | 5.24M | } |
1946 | 5.24M | for (const DirtyMap::value_type& dirty_entry : dirty_tablets_) { |
1947 | 455k | const TabletId& tablet_id = dirty_entry.first; |
1948 | 455k | tablet_ids.insert(tablet_id); |
1949 | 455k | } |
1950 | | |
1951 | 5.24M | for (auto const& tablet_id : tablet_ids) { |
1952 | 455k | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
1953 | 455k | if (tablet_peer) { |
1954 | | // Dirty entry, report on it. |
1955 | 455k | to_report.push_back(*tablet_peer); |
1956 | 455k | } else { |
1957 | | // Tell the Master that this tablet was removed from the TServer side. |
1958 | 0 | report->add_removed_tablet_ids(tablet_id); |
1959 | | // Don't count this as a 'dirty_tablet_' because the Master may not have it either. |
1960 | 0 | dirty_tablets_.erase(tablet_id); |
1961 | 0 | } |
1962 | 455k | } |
1963 | 5.24M | dirty_count = dirty_tablets_.size(); |
1964 | 5.24M | report_limit = report_limit_; |
1965 | 5.24M | } |
1966 | 5.24M | for (const auto& replica : to_report) { |
1967 | 455k | CreateReportedTabletPB(replica, report->add_updated_tablets()); |
1968 | | // Enforce a max tablet limit on reported tablets. |
1969 | 455k | if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) break0 ; |
1970 | 455k | } |
1971 | 5.24M | report->set_remaining_tablet_count( |
1972 | 5.24M | narrow_cast<int>(dirty_count - report->updated_tablets_size())); |
1973 | 5.24M | } |
1974 | | |
1975 | 8.22k | void TSTabletManager::StartFullTabletReport(TabletReportPB* report) { |
1976 | 8.22k | report->Clear(); |
1977 | | // Creating the tablet report can be slow in the case that it is in the |
1978 | | // middle of flushing its consensus metadata. We don't want to hold |
1979 | | // lock_ for too long, even in read mode, since it can cause other readers |
1980 | | // to block if there is a waiting writer (see KUDU-2193). So, we just make |
1981 | | // a local copy of the set of replicas. |
1982 | 8.22k | vector<std::shared_ptr<TabletPeer>> to_report; |
1983 | 8.22k | size_t dirty_count, report_limit; |
1984 | 8.22k | { |
1985 | 8.22k | std::lock_guard<RWMutex> write_lock(mutex_); |
1986 | 8.22k | uint32_t cur_report_seq = next_report_seq_++; |
1987 | 8.22k | report->set_sequence_number(cur_report_seq); |
1988 | 8.22k | GetTabletPeersUnlocked(&to_report); |
1989 | | // Mark all tablets as dirty, to be cleaned when reading the heartbeat response. |
1990 | 8.22k | for (const auto& peer : to_report) { |
1991 | 2.76k | InsertOrUpdate(&dirty_tablets_, peer->tablet_id(), TabletReportState{cur_report_seq}); |
1992 | 2.76k | } |
1993 | 8.22k | dirty_count = dirty_tablets_.size(); |
1994 | 8.22k | report_limit = report_limit_; |
1995 | 8.22k | } |
1996 | 8.22k | for (const auto& replica : to_report) { |
1997 | 2.76k | CreateReportedTabletPB(replica, report->add_updated_tablets()); |
1998 | | // Enforce a max tablet limit on reported tablets. |
1999 | 2.76k | if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) break0 ; |
2000 | 2.76k | } |
2001 | 8.22k | report->set_remaining_tablet_count( |
2002 | 8.22k | narrow_cast<int32_t>(dirty_count - report->updated_tablets_size())); |
2003 | 8.22k | } |
2004 | | |
2005 | | void TSTabletManager::MarkTabletReportAcknowledged(uint32_t acked_seq, |
2006 | | const TabletReportUpdatesPB& updates, |
2007 | 4.82M | bool dirty_check) { |
2008 | 4.82M | std::lock_guard<RWMutex> l(mutex_); |
2009 | | |
2010 | 4.82M | CHECK_LT(acked_seq, next_report_seq_); |
2011 | | |
2012 | | // Clear the "dirty" state for any tablets processed in this report. |
2013 | 4.82M | for (auto const & tablet : updates.tablets()) { |
2014 | 454k | auto it = dirty_tablets_.find(tablet.tablet_id()); |
2015 | 454k | if (it != dirty_tablets_.end()) { |
2016 | 454k | const TabletReportState& state = it->second; |
2017 | 454k | if (state.change_seq <= acked_seq) { |
2018 | | // This entry has not changed since this tablet report, we no longer need to track it |
2019 | | // as dirty. Next modification will be re-added with a higher sequence number. |
2020 | 355k | dirty_tablets_.erase(it); |
2021 | 355k | } |
2022 | 454k | } |
2023 | 454k | } |
2024 | 4.82M | #ifndef NDEBUG |
2025 | | // Verify dirty_tablets_ always processes all tablet changes. |
2026 | 4.82M | if (dirty_check) { |
2027 | 4.82M | for (auto const & d : dirty_tablets_) { |
2028 | 269k | if (d.second.change_seq <= acked_seq) { |
2029 | 0 | LOG(DFATAL) << "Dirty Tablet should have been reported but wasn't: " |
2030 | 0 | << d.first << "@" << d.second.change_seq << " <= " << acked_seq; |
2031 | 0 | } |
2032 | 269k | } |
2033 | 4.82M | } |
2034 | 4.82M | #endif |
2035 | 4.82M | } |
2036 | | |
2037 | | Status TSTabletManager::HandleNonReadyTabletOnStartup( |
2038 | 8 | const RaftGroupMetadataPtr& meta) { |
2039 | 8 | const string& tablet_id = meta->raft_group_id(); |
2040 | 8 | TabletDataState data_state = meta->tablet_data_state(); |
2041 | 8 | CHECK(data_state == TABLET_DATA_DELETED || |
2042 | 0 | data_state == TABLET_DATA_TOMBSTONED || |
2043 | 0 | data_state == TABLET_DATA_COPYING || |
2044 | 0 | data_state == TABLET_DATA_INIT_STARTED) |
2045 | 0 | << "Unexpected TabletDataState in tablet " << tablet_id << ": " |
2046 | 0 | << TabletDataState_Name(data_state) << " (" << data_state << ")"; |
2047 | | |
2048 | 8 | if (data_state == TABLET_DATA_COPYING) { |
2049 | | // We tombstone tablets that failed to remotely bootstrap. |
2050 | 0 | data_state = TABLET_DATA_TOMBSTONED; |
2051 | 0 | } |
2052 | | |
2053 | 8 | if (data_state == TABLET_DATA_INIT_STARTED) { |
2054 | | // We delete tablets that failed to completely initialize after a split. |
2055 | | // TODO(tsplit): https://github.com/yugabyte/yugabyte-db/issues/8013 |
2056 | 1 | data_state = TABLET_DATA_DELETED; |
2057 | 1 | } |
2058 | | |
2059 | 8 | const string kLogPrefix = TabletLogPrefix(tablet_id); |
2060 | | |
2061 | | // If the tablet is already fully tombstoned with no remaining data or WAL, |
2062 | | // then no need to roll anything forward. |
2063 | 8 | bool skip_deletion = meta->IsTombstonedWithNoRocksDBData() && |
2064 | 8 | !Log::HasOnDiskData(meta->fs_manager(), meta->wal_dir())2 ; |
2065 | | |
2066 | 8 | LOG_IF(WARNING, !skip_deletion) |
2067 | 6 | << kLogPrefix << "Tablet Manager startup: Rolling forward tablet deletion " |
2068 | 6 | << "of type " << TabletDataState_Name(data_state); |
2069 | | |
2070 | 8 | if (!skip_deletion) { |
2071 | | // Passing no OpId will retain the last_logged_opid that was previously in the metadata. |
2072 | 6 | RETURN_NOT_OK(DeleteTabletData(meta, data_state, fs_manager_->uuid(), yb::OpId())); |
2073 | 6 | } |
2074 | | |
2075 | | // We only delete the actual superblock of a TABLET_DATA_DELETED tablet on startup. |
2076 | | // TODO: Consider doing this after a fixed delay, instead of waiting for a restart. |
2077 | | // See KUDU-941. |
2078 | 8 | if (data_state == TABLET_DATA_DELETED) { |
2079 | 6 | LOG(INFO) << kLogPrefix << "Deleting tablet superblock"; |
2080 | 6 | return meta->DeleteSuperBlock(); |
2081 | 6 | } |
2082 | | |
2083 | | // Register TOMBSTONED tablets so that they get reported to the Master, which |
2084 | | // allows us to permanently delete replica tombstones when a table gets deleted. |
2085 | 2 | if (data_state == TABLET_DATA_TOMBSTONED) { |
2086 | 2 | RETURN_NOT_OK(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
2087 | 2 | } |
2088 | | |
2089 | 2 | return Status::OK(); |
2090 | 2 | } |
2091 | | |
2092 | | void TSTabletManager::GetAndRegisterDataAndWalDir(FsManager* fs_manager, |
2093 | | const string& table_id, |
2094 | | const string& tablet_id, |
2095 | | string* data_root_dir, |
2096 | 141k | string* wal_root_dir) { |
2097 | | // Skip sys catalog table and kudu table from modifying the map. |
2098 | 141k | if (table_id == master::kSysCatalogTableId) { |
2099 | 0 | return; |
2100 | 0 | } |
2101 | 141k | LOG(INFO) << "Get and update data/wal directory assignment map for table: " \ |
2102 | 141k | << table_id << " and tablet " << tablet_id; |
2103 | 141k | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2104 | | // Initialize the map if the directory mapping does not exist. |
2105 | 141k | auto data_root_dirs = fs_manager->GetDataRootDirs(); |
2106 | 18.4E | CHECK(!data_root_dirs.empty()) << "No data root directories found"; |
2107 | 141k | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2108 | 141k | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2109 | 23.4k | for (string data_root_iter : data_root_dirs) { |
2110 | 23.4k | unordered_set<string> tablet_id_set; |
2111 | 23.4k | table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set; |
2112 | 23.4k | } |
2113 | 23.3k | } |
2114 | | // Find the data directory with the least count of tablets for this table. |
2115 | 141k | table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2116 | 141k | auto data_assignment_value_map = table_data_assignment_iter->second; |
2117 | 141k | string min_dir; |
2118 | 141k | uint64_t min_dir_count = kuint64max; |
2119 | 284k | for (auto it = data_assignment_value_map.begin(); it != data_assignment_value_map.end(); ++it142k ) { |
2120 | 142k | if (min_dir_count > it->second.size()) { |
2121 | 142k | min_dir = it->first; |
2122 | 142k | min_dir_count = it->second.size(); |
2123 | 142k | } |
2124 | 142k | } |
2125 | 141k | *data_root_dir = min_dir; |
2126 | | // Increment the count for min_dir. |
2127 | 141k | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(min_dir); |
2128 | 141k | data_assignment_value_iter->second.insert(tablet_id); |
2129 | | |
2130 | | // Find the wal directory with the least count of tablets for this table. |
2131 | 141k | min_dir = ""; |
2132 | 141k | min_dir_count = kuint64max; |
2133 | 141k | auto wal_root_dirs = fs_manager->GetWalRootDirs(); |
2134 | 18.4E | CHECK(!wal_root_dirs.empty()) << "No wal root directories found"; |
2135 | 141k | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2136 | 141k | if (table_wal_assignment_iter == table_wal_assignment_map_.end()) { |
2137 | 23.4k | for (string wal_root_iter : wal_root_dirs) { |
2138 | 23.4k | unordered_set<string> tablet_id_set; |
2139 | 23.4k | table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set; |
2140 | 23.4k | } |
2141 | 23.3k | } |
2142 | 141k | table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2143 | 141k | auto wal_assignment_value_map = table_wal_assignment_iter->second; |
2144 | 284k | for (auto it = wal_assignment_value_map.begin(); it != wal_assignment_value_map.end(); ++it142k ) { |
2145 | 142k | if (min_dir_count > it->second.size()) { |
2146 | 142k | min_dir = it->first; |
2147 | 142k | min_dir_count = it->second.size(); |
2148 | 142k | } |
2149 | 142k | } |
2150 | 141k | *wal_root_dir = min_dir; |
2151 | 141k | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(min_dir); |
2152 | 141k | wal_assignment_value_iter->second.insert(tablet_id); |
2153 | 141k | } |
2154 | | |
2155 | | void TSTabletManager::RegisterDataAndWalDir(FsManager* fs_manager, |
2156 | | const string& table_id, |
2157 | | const string& tablet_id, |
2158 | | const string& data_root_dir, |
2159 | 497 | const string& wal_root_dir) { |
2160 | | // Skip sys catalog table from modifying the map. |
2161 | 497 | if (table_id == master::kSysCatalogTableId) { |
2162 | 0 | return; |
2163 | 0 | } |
2164 | 497 | LOG(INFO) << "Update data/wal directory assignment map for table: " |
2165 | 497 | << table_id << " and tablet " << tablet_id; |
2166 | 497 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2167 | | // Initialize the map if the directory mapping does not exist. |
2168 | 497 | auto data_root_dirs = fs_manager->GetDataRootDirs(); |
2169 | 497 | CHECK(!data_root_dirs.empty()) << "No data root directories found"0 ; |
2170 | 497 | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2171 | 497 | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2172 | 116 | for (string data_root_iter : data_root_dirs) { |
2173 | 116 | unordered_set<string> tablet_id_set; |
2174 | 116 | table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set; |
2175 | 116 | } |
2176 | 79 | } |
2177 | | // Increment the count for data_root_dir. |
2178 | 497 | table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2179 | 497 | auto data_assignment_value_map = table_data_assignment_iter->second; |
2180 | 497 | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir); |
2181 | 497 | if (data_assignment_value_iter == table_data_assignment_map_[table_id].end()) { |
2182 | 0 | unordered_set<string> tablet_id_set; |
2183 | 0 | tablet_id_set.insert(tablet_id); |
2184 | 0 | table_data_assignment_map_[table_id][data_root_dir] = tablet_id_set; |
2185 | 497 | } else { |
2186 | 497 | data_assignment_value_iter->second.insert(tablet_id); |
2187 | 497 | } |
2188 | | |
2189 | 497 | auto wal_root_dirs = fs_manager->GetWalRootDirs(); |
2190 | 497 | CHECK(!wal_root_dirs.empty()) << "No wal root directories found"0 ; |
2191 | 497 | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2192 | 497 | if (table_wal_assignment_iter == table_wal_assignment_map_.end()) { |
2193 | 116 | for (string wal_root_iter : wal_root_dirs) { |
2194 | 116 | unordered_set<string> tablet_id_set; |
2195 | 116 | table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set; |
2196 | 116 | } |
2197 | 79 | } |
2198 | | // Increment the count for wal_root_dir. |
2199 | 497 | table_wal_assignment_map_[table_id][wal_root_dir].insert(tablet_id); |
2200 | 497 | table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2201 | 497 | auto wal_assignment_value_map = table_wal_assignment_iter->second; |
2202 | 497 | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir); |
2203 | 497 | if (wal_assignment_value_iter == table_wal_assignment_map_[table_id].end()) { |
2204 | 0 | unordered_set<string> tablet_id_set; |
2205 | 0 | tablet_id_set.insert(tablet_id); |
2206 | 0 | table_wal_assignment_map_[table_id][wal_root_dir] = tablet_id_set; |
2207 | 497 | } else { |
2208 | 497 | wal_assignment_value_iter->second.insert(tablet_id); |
2209 | 497 | } |
2210 | 497 | } |
2211 | | |
2212 | | TSTabletManager::TableDiskAssignmentMap* TSTabletManager::GetTableDiskAssignmentMapUnlocked( |
2213 | 136 | TabletDirType dir_type) { |
2214 | 136 | switch (dir_type) { |
2215 | 68 | case TabletDirType::kData: |
2216 | 68 | return &table_data_assignment_map_; |
2217 | 68 | case TabletDirType::kWal: |
2218 | 68 | return &table_wal_assignment_map_; |
2219 | 136 | } |
2220 | 0 | FATAL_INVALID_ENUM_VALUE(TabletDirType, dir_type); |
2221 | 0 | } |
2222 | | |
2223 | | Result<const std::string&> TSTabletManager::GetAssignedRootDirForTablet( |
2224 | 136 | TabletDirType dir_type, const TableId& table_id, const TabletId& tablet_id) { |
2225 | 136 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2226 | | |
2227 | 136 | TableDiskAssignmentMap* table_assignment_map = GetTableDiskAssignmentMapUnlocked(dir_type); |
2228 | 136 | auto tablets_by_root_dir = table_assignment_map->find(table_id); |
2229 | 136 | if (tablets_by_root_dir == table_assignment_map->end()) { |
2230 | 0 | return STATUS_FORMAT( |
2231 | 0 | IllegalState, "Table ID $0 is not in $1 table assignment map", table_id, dir_type); |
2232 | 0 | } |
2233 | 136 | for (auto& data_dir_and_tablets : tablets_by_root_dir->second) { |
2234 | 136 | if (data_dir_and_tablets.second.count(tablet_id) > 0) { |
2235 | 136 | return data_dir_and_tablets.first; |
2236 | 136 | } |
2237 | 136 | } |
2238 | 0 | return STATUS_FORMAT( |
2239 | 136 | IllegalState, "Tablet ID $0 is not found in $1 assignment map for table $2", tablet_id, |
2240 | 136 | dir_type, table_id); |
2241 | 136 | } |
2242 | | |
2243 | | void TSTabletManager::UnregisterDataWalDir(const string& table_id, |
2244 | | const string& tablet_id, |
2245 | | const string& data_root_dir, |
2246 | 75.5k | const string& wal_root_dir) { |
2247 | | // Skip sys catalog table from modifying the map. |
2248 | 75.5k | if (table_id == master::kSysCatalogTableId) { |
2249 | 0 | return; |
2250 | 0 | } |
2251 | 75.5k | LOG(INFO) << "Unregister data/wal directory assignment map for table: " |
2252 | 75.5k | << table_id << " and tablet " << tablet_id; |
2253 | 75.5k | std::lock_guard<std::mutex> lock(dir_assignment_mutex_); |
2254 | 75.5k | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2255 | 75.5k | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2256 | | // It is possible that we can't find an assignment for the table if the operations followed in |
2257 | | // this order: |
2258 | | // 1. The only tablet for a table gets tombstoned, and UnregisterDataWalDir removes it from |
2259 | | // the maps. |
2260 | | // 2. TSTabletManager gets restarted (so the maps are cleared). |
2261 | | // 3. During TsTabletManager initialization, the tombstoned TABLET won't get registered, |
2262 | | // so if a DeleteTablet request with type DELETED gets sent, UnregisterDataWalDir won't |
2263 | | // find the table. |
2264 | | |
2265 | | // Check that both maps should be consistent. |
2266 | 0 | DCHECK(table_wal_assignment_map_.find(table_id) == table_wal_assignment_map_.end()); |
2267 | 0 | } |
2268 | 75.5k | if (table_data_assignment_iter != table_data_assignment_map_.end()75.5k ) { |
2269 | 75.5k | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir); |
2270 | 75.5k | DCHECK(data_assignment_value_iter != table_data_assignment_map_[table_id].end()) |
2271 | 0 | << "No data directory index found for table: " << table_id; |
2272 | 75.5k | if (data_assignment_value_iter != table_data_assignment_map_[table_id].end()) { |
2273 | 75.5k | data_assignment_value_iter->second.erase(tablet_id); |
2274 | 75.5k | } else { |
2275 | 0 | LOG(WARNING) << "Tablet " << tablet_id << " not in the set for data directory " |
2276 | 0 | << data_root_dir << "for table " << table_id; |
2277 | 0 | } |
2278 | 75.5k | } |
2279 | 75.5k | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2280 | 75.5k | if (table_wal_assignment_iter != table_wal_assignment_map_.end()75.5k ) { |
2281 | 75.5k | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir); |
2282 | 75.5k | DCHECK(wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) |
2283 | 0 | << "No wal directory index found for table: " << table_id; |
2284 | 75.5k | if (wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) { |
2285 | 75.5k | wal_assignment_value_iter->second.erase(tablet_id); |
2286 | 75.5k | } else { |
2287 | 0 | LOG(WARNING) << "Tablet " << tablet_id << " not in the set for wal directory " |
2288 | 0 | << wal_root_dir << "for table " << table_id; |
2289 | 0 | } |
2290 | 75.5k | } |
2291 | 75.5k | } |
2292 | | |
2293 | 6 | client::YBClient& TSTabletManager::client() { |
2294 | 6 | return *async_client_init_->client(); |
2295 | 6 | } |
2296 | | |
2297 | 83.7k | const std::shared_future<client::YBClient*>& TSTabletManager::client_future() { |
2298 | 83.7k | return async_client_init_->get_client_future(); |
2299 | 83.7k | } |
2300 | | |
2301 | 1.96k | void TSTabletManager::MaybeDoChecksForTests(const TableId& table_id) { |
2302 | | // First check that the global RBS limits are respected if the flag is non-zero. |
2303 | 1.96k | if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than > 0) && |
2304 | 1.96k | tablets_being_remote_bootstrapped_.size() > |
2305 | 24 | FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) { |
2306 | 0 | string tablets; |
2307 | | // The purpose of limiting the number of remote bootstraps is to cap how much |
2308 | | // network bandwidth all the RBS sessions use. |
2309 | | // When we finish transferring the files, we wait until the role of the new peer |
2310 | | // has been changed from PRE_VOTER to VOTER before we remove the tablet_id |
2311 | | // from tablets_being_remote_bootstrapped_. Since it's possible to be here |
2312 | | // because a few tablets are already open, and in the RUNNING state, but still |
2313 | | // in the tablets_being_remote_bootstrapped_ list, we check the state of each |
2314 | | // tablet before deciding if the load balancer has violated the concurrent RBS limit. |
2315 | 0 | size_t count = 0; |
2316 | 0 | for (const auto& tablet_id : tablets_being_remote_bootstrapped_) { |
2317 | 0 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
2318 | 0 | if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) { |
2319 | 0 | continue; |
2320 | 0 | } |
2321 | 0 | if (!tablets.empty()) { |
2322 | 0 | tablets += ", "; |
2323 | 0 | } |
2324 | 0 | tablets += tablet_id; |
2325 | 0 | count++; |
2326 | 0 | } |
2327 | 0 | if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) { |
2328 | 0 | LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap sessions." |
2329 | 0 | << " Specified: " << FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than |
2330 | 0 | << ", number concurrent remote bootstrap sessions: " |
2331 | 0 | << tablets_being_remote_bootstrapped_.size() << ", for tablets: " << tablets; |
2332 | 0 | } |
2333 | 0 | } |
2334 | | |
2335 | | // Check that the per-table RBS limits are respected if the flag is non-zero. |
2336 | 1.96k | if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than > 0) && |
2337 | 1.96k | tablets_being_remote_bootstrapped_per_table_[table_id].size() > |
2338 | 24 | FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) { |
2339 | 0 | string tablets; |
2340 | 0 | size_t count = 0; |
2341 | 0 | for (const auto& tablet_id : tablets_being_remote_bootstrapped_per_table_[table_id]) { |
2342 | 0 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
2343 | 0 | if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) { |
2344 | 0 | continue; |
2345 | 0 | } |
2346 | 0 | if (!tablets.empty()) { |
2347 | 0 | tablets += ", "; |
2348 | 0 | } |
2349 | 0 | tablets += tablet_id; |
2350 | 0 | count++; |
2351 | 0 | } |
2352 | 0 | if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) { |
2353 | 0 | LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap " |
2354 | 0 | << "sessions per table. Specified: " |
2355 | 0 | << FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than |
2356 | 0 | << ", number of concurrent remote bootstrap sessions for table " << table_id |
2357 | 0 | << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size() |
2358 | 0 | << ", for tablets: " << tablets; |
2359 | 0 | } |
2360 | 0 | } |
2361 | 1.96k | } |
2362 | | |
2363 | 4.82M | Status TSTabletManager::UpdateSnapshotsInfo(const master::TSSnapshotsInfoPB& info) { |
2364 | 4.82M | bool restorations_updated; |
2365 | 4.82M | RestorationCompleteTimeMap restoration_complete_time; |
2366 | 4.82M | { |
2367 | 4.82M | std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_); |
2368 | 4.82M | ++snapshot_schedules_version_; |
2369 | 4.82M | snapshot_schedule_allowed_history_cutoff_.clear(); |
2370 | 4.82M | for (const auto& schedule : info.schedules()) { |
2371 | 454 | auto schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id())); |
2372 | 0 | snapshot_schedule_allowed_history_cutoff_.emplace( |
2373 | 454 | schedule_id, HybridTime::FromPB(schedule.last_snapshot_hybrid_time())); |
2374 | 454 | missing_snapshot_schedules_.erase(schedule_id); |
2375 | 454 | } |
2376 | 4.82M | HybridTime restorations_update_ht(info.last_restorations_update_ht()); |
2377 | 4.82M | restorations_updated = restorations_update_ht != last_restorations_update_ht_; |
2378 | 4.82M | if (restorations_updated) { |
2379 | 8.16k | last_restorations_update_ht_ = restorations_update_ht; |
2380 | 8.16k | for (const auto& entry : info.restorations()) { |
2381 | 15 | auto id = VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(entry.id())); |
2382 | 0 | auto complete_time = HybridTime::FromPB(entry.complete_time_ht()); |
2383 | 15 | restoration_complete_time.emplace(id, complete_time); |
2384 | 15 | } |
2385 | 8.16k | } |
2386 | 4.82M | } |
2387 | 4.82M | if (!restorations_updated) { |
2388 | 4.81M | return Status::OK(); |
2389 | 4.81M | } |
2390 | 8.16k | std::vector<tablet::TabletPtr> tablets; |
2391 | 8.16k | { |
2392 | 8.16k | SharedLock<RWMutex> shared_lock(mutex_); |
2393 | 8.16k | tablets.reserve(tablet_map_.size()); |
2394 | 8.16k | for (const auto& entry : tablet_map_) { |
2395 | 2.97k | auto tablet = entry.second->shared_tablet(); |
2396 | 2.97k | if (tablet) { |
2397 | 2.71k | tablets.push_back(tablet); |
2398 | 2.71k | } |
2399 | 2.97k | } |
2400 | 8.16k | } |
2401 | 8.16k | for (const auto& tablet : tablets) { |
2402 | 2.71k | RETURN_NOT_OK(tablet->CheckRestorations(restoration_complete_time)); |
2403 | 2.71k | } |
2404 | 8.16k | return Status::OK(); |
2405 | 8.16k | } |
2406 | | |
2407 | 464 | HybridTime TSTabletManager::AllowedHistoryCutoff(tablet::RaftGroupMetadata* metadata) { |
2408 | 464 | auto schedules = metadata->SnapshotSchedules(); |
2409 | 464 | if (schedules.empty()) { |
2410 | 464 | return HybridTime::kMax; |
2411 | 464 | } |
2412 | 0 | std::vector<SnapshotScheduleId> schedules_to_remove; |
2413 | 0 | auto se = ScopeExit([&schedules_to_remove, metadata]() { |
2414 | 0 | if (schedules_to_remove.empty()) { |
2415 | 0 | return; |
2416 | 0 | } |
2417 | 0 | bool any_removed = false; |
2418 | 0 | for (const auto& schedule_id : schedules_to_remove) { |
2419 | 0 | any_removed = metadata->RemoveSnapshotSchedule(schedule_id) || any_removed; |
2420 | 0 | } |
2421 | 0 | if (any_removed) { |
2422 | 0 | WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata"); |
2423 | 0 | } |
2424 | 0 | }); |
2425 | 0 | std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_); |
2426 | 0 | HybridTime result = HybridTime::kMax; |
2427 | 0 | for (const auto& schedule_id : schedules) { |
2428 | 0 | auto it = snapshot_schedule_allowed_history_cutoff_.find(schedule_id); |
2429 | 0 | if (it == snapshot_schedule_allowed_history_cutoff_.end()) { |
2430 | | // We don't know this schedule. |
2431 | 0 | auto emplace_result = missing_snapshot_schedules_.emplace( |
2432 | 0 | schedule_id, snapshot_schedules_version_); |
2433 | 0 | if (!emplace_result.second && |
2434 | 0 | emplace_result.first->second + 2 <= snapshot_schedules_version_) { |
2435 | | // We don't know this schedule, and there are already 2 rounds of heartbeat passed |
2436 | | // after we first time found that we don't know this schedule. |
2437 | | // So it means that schedule was deleted. |
2438 | | // One round is not enough, because schedule could be added after heartbeat processed on |
2439 | | // master, but response not yet received on TServer. |
2440 | 0 | schedules_to_remove.push_back(schedule_id); |
2441 | 0 | continue; |
2442 | 0 | } |
2443 | 0 | return HybridTime::kMin; |
2444 | 0 | } |
2445 | 0 | if (!it->second) { |
2446 | | // Schedules does not have snapshots yet. |
2447 | 0 | return HybridTime::kMin; |
2448 | 0 | } |
2449 | 0 | result = std::min(result, it->second); |
2450 | 0 | } |
2451 | 0 | return result; |
2452 | 0 | } |
2453 | | |
2454 | | Status DeleteTabletData(const RaftGroupMetadataPtr& meta, |
2455 | | TabletDataState data_state, |
2456 | | const string& uuid, |
2457 | | const yb::OpId& last_logged_opid, |
2458 | 75.5k | TSTabletManager* ts_manager) { |
2459 | 75.5k | const string& tablet_id = meta->raft_group_id(); |
2460 | 75.5k | const string kLogPrefix = LogPrefix(tablet_id, uuid); |
2461 | 75.5k | LOG(INFO) << kLogPrefix << "Deleting tablet data with delete state " |
2462 | 75.5k | << TabletDataState_Name(data_state); |
2463 | 75.5k | CHECK(data_state == TABLET_DATA_DELETED || |
2464 | 0 | data_state == TABLET_DATA_TOMBSTONED) |
2465 | 0 | << "Unexpected data_state to delete tablet " << meta->raft_group_id() << ": " |
2466 | 0 | << TabletDataState_Name(data_state) << " (" << data_state << ")"; |
2467 | | |
2468 | | // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid |
2469 | | // that was previously in the metadata. |
2470 | 75.5k | RETURN_NOT_OK(meta->DeleteTabletData(data_state, last_logged_opid)); |
2471 | 75.5k | LOG(INFO) << kLogPrefix << "Tablet deleted. Last logged OpId: " |
2472 | 75.5k | << meta->tombstone_last_logged_opid(); |
2473 | 75.5k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_blocks_deleted); |
2474 | | |
2475 | 75.5k | RETURN_NOT_OK(Log::DeleteOnDiskData( |
2476 | 75.5k | meta->fs_manager()->env(), meta->raft_group_id(), meta->wal_dir(), |
2477 | 75.5k | meta->fs_manager()->uuid())); |
2478 | 75.5k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_wal_deleted); |
2479 | | |
2480 | | // We do not delete the superblock or the consensus metadata when tombstoning |
2481 | | // a tablet. |
2482 | 75.5k | if (data_state == TABLET_DATA_TOMBSTONED) { |
2483 | 1.59k | return Status::OK(); |
2484 | 1.59k | } |
2485 | | |
2486 | | // Only TABLET_DATA_DELETED tablets get this far. |
2487 | 73.9k | RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(meta->fs_manager(), meta->raft_group_id())); |
2488 | 73.9k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_cmeta_deleted); |
2489 | | |
2490 | 73.9k | return Status::OK(); |
2491 | 73.9k | } |
2492 | | |
2493 | | void LogAndTombstone(const RaftGroupMetadataPtr& meta, |
2494 | | const std::string& msg, |
2495 | | const std::string& uuid, |
2496 | | const Status& s, |
2497 | 3 | TSTabletManager* ts_manager) { |
2498 | 3 | const string& tablet_id = meta->raft_group_id(); |
2499 | 3 | const string kLogPrefix = LogPrefix(tablet_id, uuid); |
2500 | 3 | LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString(); |
2501 | | |
2502 | | // Tombstone the tablet when remote bootstrap fails. |
2503 | 3 | LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed remote bootstrap"; |
2504 | 3 | Status delete_status = DeleteTabletData(meta, |
2505 | 3 | TABLET_DATA_TOMBSTONED, |
2506 | 3 | uuid, |
2507 | 3 | yb::OpId(), |
2508 | 3 | ts_manager); |
2509 | | |
2510 | 3 | if (PREDICT_FALSE(FLAGS_TEST_sleep_after_tombstoning_tablet_secs > 0)) { |
2511 | | // We sleep here so that the test can verify that the state of the tablet is |
2512 | | // TABLET_DATA_TOMBSTONED. |
2513 | 0 | LOG(INFO) << "Sleeping after remote bootstrap failed"; |
2514 | 0 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_sleep_after_tombstoning_tablet_secs)); |
2515 | 0 | } |
2516 | | |
2517 | 3 | if (PREDICT_FALSE(!delete_status.ok())) { |
2518 | | // This failure should only either indicate a bug or an IO error. |
2519 | 0 | LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after remote bootstrap: " |
2520 | 0 | << delete_status.ToString(); |
2521 | 0 | } |
2522 | | |
2523 | | // Remove the child tracker if present. |
2524 | 3 | if (ts_manager != nullptr) { |
2525 | 3 | auto tracker = MemTracker::FindTracker( |
2526 | 3 | Format("tablet-$0", meta->raft_group_id()), ts_manager->server()->mem_tracker()); |
2527 | 3 | if (tracker) { |
2528 | 0 | tracker->UnregisterFromParent(); |
2529 | 0 | } |
2530 | 3 | } |
2531 | 3 | } |
2532 | | |
2533 | | TransitionInProgressDeleter::TransitionInProgressDeleter( |
2534 | | TransitionInProgressMap* map, std::mutex* mutex, const TabletId& tablet_id) |
2535 | 217k | : in_progress_(map), mutex_(mutex), tablet_id_(tablet_id) {} |
2536 | | |
2537 | 217k | TransitionInProgressDeleter::~TransitionInProgressDeleter() { |
2538 | 217k | std::string transition; |
2539 | 217k | { |
2540 | 217k | std::unique_lock<std::mutex> lock(*mutex_); |
2541 | 217k | const auto iter = in_progress_->find(tablet_id_); |
2542 | 217k | CHECK(iter != in_progress_->end()); |
2543 | 217k | transition = iter->second; |
2544 | 217k | in_progress_->erase(iter); |
2545 | 217k | } |
2546 | 217k | LOG(INFO) << "Deleted transition in progress " << transition |
2547 | 217k | << " for tablet " << tablet_id_; |
2548 | 217k | } |
2549 | | |
2550 | | Status ShutdownAndTombstoneTabletPeerNotOk( |
2551 | | const Status& status, const tablet::TabletPeerPtr& tablet_peer, |
2552 | | const tablet::RaftGroupMetadataPtr& meta, const std::string& uuid, const char* msg, |
2553 | 2.01k | TSTabletManager* ts_tablet_manager) { |
2554 | 2.01k | if (status.ok()) { |
2555 | 2.01k | return status; |
2556 | 2.01k | } |
2557 | | // If shutdown was initiated by someone else we should not wait for shutdown to complete. |
2558 | 0 | if (tablet_peer && tablet_peer->StartShutdown()) { |
2559 | 0 | tablet_peer->CompleteShutdown(); |
2560 | 0 | } |
2561 | 0 | tserver::LogAndTombstone(meta, msg, uuid, status, ts_tablet_manager); |
2562 | 0 | return status; |
2563 | 2.01k | } |
2564 | | |
2565 | | } // namespace tserver |
2566 | | } // namespace yb |