/Users/deen/code/yugabyte-db/src/yb/tserver/ts_tablet_manager.cc
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include "yb/tserver/ts_tablet_manager.h" |
34 | | |
35 | | #include <algorithm> |
36 | | #include <chrono> |
37 | | #include <memory> |
38 | | #include <mutex> |
39 | | #include <string> |
40 | | #include <thread> |
41 | | #include <vector> |
42 | | |
43 | | #include <boost/container/static_vector.hpp> |
44 | | #include <boost/optional/optional.hpp> |
45 | | #include <glog/logging.h> |
46 | | |
47 | | #include "yb/client/client.h" |
48 | | |
49 | | #include "yb/common/wire_protocol.h" |
50 | | |
51 | | #include "yb/consensus/consensus.h" |
52 | | #include "yb/consensus/multi_raft_batcher.h" |
53 | | #include "yb/consensus/consensus_meta.h" |
54 | | #include "yb/consensus/log.h" |
55 | | #include "yb/consensus/log_anchor_registry.h" |
56 | | #include "yb/consensus/metadata.pb.h" |
57 | | #include "yb/consensus/opid_util.h" |
58 | | #include "yb/consensus/quorum_util.h" |
59 | | #include "yb/consensus/raft_consensus.h" |
60 | | #include "yb/consensus/retryable_requests.h" |
61 | | #include "yb/consensus/state_change_context.h" |
62 | | |
63 | | #include "yb/docdb/docdb_rocksdb_util.h" |
64 | | |
65 | | #include "yb/fs/fs_manager.h" |
66 | | |
67 | | #include "yb/gutil/bind.h" |
68 | | #include "yb/gutil/stl_util.h" |
69 | | #include "yb/gutil/strings/substitute.h" |
70 | | #include "yb/gutil/sysinfo.h" |
71 | | |
72 | | #include "yb/master/master_heartbeat.pb.h" |
73 | | #include "yb/master/sys_catalog.h" |
74 | | |
75 | | #include "yb/rpc/messenger.h" |
76 | | #include "yb/rpc/poller.h" |
77 | | |
78 | | #include "yb/tablet/metadata.pb.h" |
79 | | #include "yb/tablet/operations/split_operation.h" |
80 | | #include "yb/tablet/tablet.h" |
81 | | #include "yb/tablet/tablet.pb.h" |
82 | | #include "yb/tablet/tablet_bootstrap_if.h" |
83 | | #include "yb/tablet/tablet_metadata.h" |
84 | | #include "yb/tablet/tablet_options.h" |
85 | | #include "yb/tablet/tablet_peer.h" |
86 | | |
87 | | #include "yb/tserver/heartbeater.h" |
88 | | #include "yb/tserver/remote_bootstrap_client.h" |
89 | | #include "yb/tserver/remote_bootstrap_session.h" |
90 | | #include "yb/tserver/tablet_server.h" |
91 | | #include "yb/tserver/tserver.pb.h" |
92 | | |
93 | | #include "yb/util/debug/long_operation_tracker.h" |
94 | | #include "yb/util/debug/trace_event.h" |
95 | | #include "yb/util/env.h" |
96 | | #include "yb/util/fault_injection.h" |
97 | | #include "yb/util/flag_tags.h" |
98 | | #include "yb/util/format.h" |
99 | | #include "yb/util/logging.h" |
100 | | #include "yb/util/mem_tracker.h" |
101 | | #include "yb/util/metrics.h" |
102 | | #include "yb/util/pb_util.h" |
103 | | #include "yb/util/scope_exit.h" |
104 | | #include "yb/util/shared_lock.h" |
105 | | #include "yb/util/status_format.h" |
106 | | #include "yb/util/status_log.h" |
107 | | #include "yb/util/stopwatch.h" |
108 | | #include "yb/util/trace.h" |
109 | | |
110 | | using namespace std::literals; |
111 | | using namespace std::placeholders; |
112 | | |
113 | | DEFINE_int32(num_tablets_to_open_simultaneously, 0, |
114 | | "Number of threads available to open tablets during startup. If this " |
115 | | "is set to 0 (the default), then the number of bootstrap threads will " |
116 | | "be set based on the number of data directories. If the data directories " |
117 | | "are on some very fast storage device such as SSD or a RAID array, it " |
118 | | "may make sense to manually tune this."); |
119 | | TAG_FLAG(num_tablets_to_open_simultaneously, advanced); |
120 | | |
121 | | DEFINE_int32(tablet_start_warn_threshold_ms, 500, |
122 | | "If a tablet takes more than this number of millis to start, issue " |
123 | | "a warning with a trace."); |
124 | | TAG_FLAG(tablet_start_warn_threshold_ms, hidden); |
125 | | |
126 | | DEFINE_int32(cleanup_split_tablets_interval_sec, 60, |
127 | | "Interval at which tablet manager tries to cleanup split tablets which are no longer " |
128 | | "needed. Setting this to 0 disables cleanup of split tablets."); |
129 | | |
130 | | DEFINE_test_flag(double, fault_crash_after_blocks_deleted, 0.0, |
131 | | "Fraction of the time when the tablet will crash immediately " |
132 | | "after deleting the data blocks during tablet deletion."); |
133 | | |
134 | | DEFINE_test_flag(double, fault_crash_after_wal_deleted, 0.0, |
135 | | "Fraction of the time when the tablet will crash immediately " |
136 | | "after deleting the WAL segments during tablet deletion."); |
137 | | |
138 | | DEFINE_test_flag(double, fault_crash_after_cmeta_deleted, 0.0, |
139 | | "Fraction of the time when the tablet will crash immediately " |
140 | | "after deleting the consensus metadata during tablet deletion."); |
141 | | |
142 | | DEFINE_test_flag(double, fault_crash_after_rb_files_fetched, 0.0, |
143 | | "Fraction of the time when the tablet will crash immediately " |
144 | | "after fetching the files during a remote bootstrap but before " |
145 | | "marking the superblock as TABLET_DATA_READY."); |
146 | | |
147 | | DEFINE_test_flag(double, fault_crash_in_split_after_log_copied, 0.0, |
148 | | "Fraction of the time when the tablet will crash immediately after initiating a " |
149 | | "Log::CopyTo from parent to child tablet, but before marking the child tablet as " |
150 | | "TABLET_DATA_READY."); |
151 | | |
152 | | DEFINE_test_flag(bool, simulate_already_present_in_remote_bootstrap, false, |
153 | | "If true, return an AlreadyPresent error in remote bootstrap after starting the " |
154 | | "remote bootstrap client."); |
155 | | |
156 | | DEFINE_test_flag(double, fault_crash_in_split_before_log_flushed, 0.0, |
157 | | "Fraction of the time when the tablet will crash immediately before flushing a " |
158 | | "parent tablet's kSplit operation."); |
159 | | |
160 | | DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_greater_than, 0, |
161 | | "If greater than zero, this process will crash if we detect more than the " |
162 | | "specified number of remote bootstrap sessions."); |
163 | | |
164 | | DEFINE_test_flag(uint64, crash_if_remote_bootstrap_sessions_per_table_greater_than, 0, |
165 | | "If greater than zero, this process will crash if for any table we exceed the " |
166 | | "specified number of remote bootstrap sessions"); |
167 | | |
168 | | DEFINE_test_flag(bool, crash_before_apply_tablet_split_op, false, |
169 | | "Crash inside TSTabletManager::ApplyTabletSplit before doing anything"); |
170 | | |
171 | | DEFINE_test_flag(bool, force_single_tablet_failure, false, |
172 | | "Force exactly one tablet to a failed state."); |
173 | | |
174 | | DEFINE_test_flag(int32, apply_tablet_split_inject_delay_ms, 0, |
175 | | "Inject delay into TSTabletManager::ApplyTabletSplit."); |
176 | | |
177 | | DEFINE_test_flag(bool, skip_deleting_split_tablets, false, |
178 | | "Skip deleting tablets which have been split."); |
179 | | |
180 | | DEFINE_test_flag(bool, skip_post_split_compaction, false, |
181 | | "Skip processing post split compaction."); |
182 | | |
183 | | DEFINE_int32(verify_tablet_data_interval_sec, 0, |
184 | | "The tick interval time for the tablet data integrity verification background task. " |
185 | | "This defaults to 0, which means disable the background task."); |
186 | | |
187 | | DEFINE_int32(cleanup_metrics_interval_sec, 60, |
188 | | "The tick interval time for the metrics cleanup background task. " |
189 | | "If set to 0, it disables the background task."); |
190 | | |
191 | | DEFINE_bool(skip_tablet_data_verification, false, |
192 | | "Skip checking tablet data for corruption."); |
193 | | |
194 | | DEFINE_int32(read_pool_max_threads, 128, |
195 | | "The maximum number of threads allowed for read_pool_. This pool is used " |
196 | | "to run multiple read operations, that are part of the same tablet rpc, " |
197 | | "in parallel."); |
198 | | DEFINE_int32(read_pool_max_queue_size, 128, |
199 | | "The maximum number of tasks that can be held in the queue for read_pool_. This pool " |
200 | | "is used to run multiple read operations, that are part of the same tablet rpc, " |
201 | | "in parallel."); |
202 | | |
203 | | DEFINE_int32(post_split_trigger_compaction_pool_max_threads, 1, |
204 | | "The maximum number of threads allowed for post_split_trigger_compaction_pool_. This " |
205 | | "pool is used to run compactions on tablets after they have been split and still " |
206 | | "contain irrelevant data from the tablet they were sourced from."); |
207 | | DEFINE_int32(post_split_trigger_compaction_pool_max_queue_size, 16, |
208 | | "The maximum number of tasks that can be held in the pool for " |
209 | | "post_split_trigger_compaction_pool_. This pool is used to run compactions on tablets " |
210 | | "after they have been split and still contain irrelevant data from the tablet they " |
211 | | "were sourced from."); |
212 | | |
213 | | DEFINE_test_flag(int32, sleep_after_tombstoning_tablet_secs, 0, |
214 | | "Whether we sleep in LogAndTombstone after calling DeleteTabletData."); |
215 | | |
216 | | constexpr int kTServerYbClientDefaultTimeoutMs = 60 * 1000; |
217 | | |
218 | | DEFINE_int32(tserver_yb_client_default_timeout_ms, kTServerYbClientDefaultTimeoutMs, |
219 | | "Default timeout for the YBClient embedded into the tablet server that is used " |
220 | | "for distributed transactions."); |
221 | | |
222 | | DEFINE_bool(enable_restart_transaction_status_tablets_first, true, |
223 | | "Set to true to prioritize bootstrapping transaction status tablets first."); |
224 | | |
225 | | DECLARE_string(rocksdb_compact_flush_rate_limit_sharing_mode); |
226 | | |
227 | | namespace yb { |
228 | | namespace tserver { |
229 | | |
230 | | METRIC_DEFINE_coarse_histogram(server, op_apply_queue_length, "Operation Apply Queue Length", |
231 | | MetricUnit::kTasks, |
232 | | "Number of operations waiting to be applied to the tablet. " |
233 | | "High queue lengths indicate that the server is unable to process " |
234 | | "operations as fast as they are being written to the WAL."); |
235 | | |
236 | | METRIC_DEFINE_coarse_histogram(server, op_apply_queue_time, "Operation Apply Queue Time", |
237 | | MetricUnit::kMicroseconds, |
238 | | "Time that operations spent waiting in the apply queue before being " |
239 | | "processed. High queue times indicate that the server is unable to " |
240 | | "process operations as fast as they are being written to the WAL."); |
241 | | |
242 | | METRIC_DEFINE_coarse_histogram(server, op_apply_run_time, "Operation Apply Run Time", |
243 | | MetricUnit::kMicroseconds, |
244 | | "Time that operations spent being applied to the tablet. " |
245 | | "High values may indicate that the server is under-provisioned or " |
246 | | "that operations consist of very large batches."); |
247 | | |
248 | | METRIC_DEFINE_coarse_histogram(server, op_read_queue_length, "Operation Read op Queue Length", |
249 | | MetricUnit::kTasks, |
250 | | "Number of operations waiting to be applied to the tablet. " |
251 | | "High queue lengths indicate that the server is unable to process " |
252 | | "operations as fast as they are being written to the WAL."); |
253 | | |
254 | | METRIC_DEFINE_coarse_histogram(server, op_read_queue_time, "Operation Read op Queue Time", |
255 | | MetricUnit::kMicroseconds, |
256 | | "Time that operations spent waiting in the read queue before being " |
257 | | "processed. High queue times indicate that the server is unable to " |
258 | | "process operations as fast as they are being written to the WAL."); |
259 | | |
260 | | METRIC_DEFINE_coarse_histogram(server, op_read_run_time, "Operation Read op Run Time", |
261 | | MetricUnit::kMicroseconds, |
262 | | "Time that operations spent being applied to the tablet. " |
263 | | "High values may indicate that the server is under-provisioned or " |
264 | | "that operations consist of very large batches."); |
265 | | |
266 | | METRIC_DEFINE_coarse_histogram(server, ts_bootstrap_time, "TServer Bootstrap Time", |
267 | | MetricUnit::kMicroseconds, |
268 | | "Time that the tablet server takes to bootstrap all of its tablets."); |
269 | | |
270 | | THREAD_POOL_METRICS_DEFINE( |
271 | | server, post_split_trigger_compaction_pool, "Thread pool for post-split tablet compaction jobs."); |
272 | | |
273 | | THREAD_POOL_METRICS_DEFINE( |
274 | | server, admin_triggered_compaction_pool, "Thread pool for admin-triggered tablet compaction jobs."); |
275 | | |
276 | | using consensus::ConsensusMetadata; |
277 | | using consensus::ConsensusStatePB; |
278 | | using consensus::RaftConfigPB; |
279 | | using consensus::RaftPeerPB; |
280 | | using consensus::StartRemoteBootstrapRequestPB; |
281 | | using log::Log; |
282 | | using master::ReportedTabletPB; |
283 | | using master::TabletReportPB; |
284 | | using master::TabletReportUpdatesPB; |
285 | | using std::shared_ptr; |
286 | | using std::string; |
287 | | using std::unordered_set; |
288 | | using std::vector; |
289 | | using strings::Substitute; |
290 | | using tablet::BOOTSTRAPPING; |
291 | | using tablet::NOT_STARTED; |
292 | | using tablet::RaftGroupMetadata; |
293 | | using tablet::RaftGroupMetadataPtr; |
294 | | using tablet::RaftGroupStatePB; |
295 | | using tablet::RUNNING; |
296 | | using tablet::TABLET_DATA_COPYING; |
297 | | using tablet::TABLET_DATA_DELETED; |
298 | | using tablet::TABLET_DATA_INIT_STARTED; |
299 | | using tablet::TABLET_DATA_READY; |
300 | | using tablet::TABLET_DATA_SPLIT_COMPLETED; |
301 | | using tablet::TABLET_DATA_TOMBSTONED; |
302 | | using tablet::TabletDataState; |
303 | | using tablet::TabletPeer; |
304 | | using tablet::TabletPeerPtr; |
305 | | using tablet::TabletStatusListener; |
306 | | using tablet::TabletStatusPB; |
307 | | |
308 | | constexpr int32_t kDefaultTserverBlockCacheSizePercentage = 50; |
309 | | |
310 | 0 | void TSTabletManager::VerifyTabletData() { |
311 | 0 | LOG_WITH_PREFIX(INFO) << "Beginning tablet data verification checks"; |
312 | 0 | for (const TabletPeerPtr& peer : GetTabletPeers()) { |
313 | 0 | if (peer->state() == RUNNING) { |
314 | 0 | if (PREDICT_FALSE(FLAGS_skip_tablet_data_verification)) { |
315 | 0 | LOG_WITH_PREFIX(INFO) |
316 | 0 | << Format("Skipped tablet data verification check on $0", peer->tablet_id()); |
317 | 0 | } else { |
318 | 0 | Status s = peer->tablet()->VerifyDataIntegrity(); |
319 | 0 | if (!s.ok()) { |
320 | 0 | LOG(WARNING) << "Tablet data integrity verification failed on " << peer->tablet_id() |
321 | 0 | << ": " << s; |
322 | 0 | } |
323 | 0 | } |
324 | 0 | } |
325 | 0 | } |
326 | 0 | } |
327 | | |
328 | 1.72k | void TSTabletManager::CleanupOldMetrics() { |
329 | 0 | VLOG(2) << "Cleaning up old metrics"; |
330 | 1.72k | metric_registry_->RetireOldMetrics(); |
331 | 1.72k | } |
332 | | |
333 | | TSTabletManager::TSTabletManager(FsManager* fs_manager, |
334 | | TabletServer* server, |
335 | | MetricRegistry* metric_registry) |
336 | | : fs_manager_(fs_manager), |
337 | | server_(server), |
338 | | metric_registry_(metric_registry), |
339 | 6.10k | state_(MANAGER_INITIALIZING) { |
340 | 6.10k | ThreadPoolMetrics metrics = { |
341 | 6.10k | METRIC_op_apply_queue_length.Instantiate(server_->metric_entity()), |
342 | 6.10k | METRIC_op_apply_queue_time.Instantiate(server_->metric_entity()), |
343 | 6.10k | METRIC_op_apply_run_time.Instantiate(server_->metric_entity()) |
344 | 6.10k | }; |
345 | 6.10k | tablet_options_.ServerMetricEntity = server_->metric_entity(); |
346 | 6.10k | CHECK_OK(ThreadPoolBuilder("apply") |
347 | 6.10k | .set_metrics(std::move(metrics)) |
348 | 6.10k | .Build(&apply_pool_)); |
349 | | |
350 | | // This pool is shared by all replicas hosted by this server. |
351 | | // |
352 | | // Some submitted tasks use blocking IO, so we configure no upper bound on |
353 | | // the maximum number of threads in each pool (otherwise the default value of |
354 | | // "number of CPUs" may cause blocking tasks to starve other "fast" tasks). |
355 | | // However, the effective upper bound is the number of replicas as each will |
356 | | // submit its own tasks via a dedicated token. |
357 | 6.10k | CHECK_OK(ThreadPoolBuilder("consensus") |
358 | 6.10k | .set_min_threads(1) |
359 | 6.10k | .unlimited_threads() |
360 | 6.10k | .Build(&raft_pool_)); |
361 | 6.10k | CHECK_OK(ThreadPoolBuilder("prepare") |
362 | 6.10k | .set_min_threads(1) |
363 | 6.10k | .unlimited_threads() |
364 | 6.10k | .Build(&tablet_prepare_pool_)); |
365 | 6.10k | CHECK_OK(ThreadPoolBuilder("append") |
366 | 6.10k | .set_min_threads(1) |
367 | 6.10k | .unlimited_threads() |
368 | 6.10k | .set_idle_timeout(MonoDelta::FromMilliseconds(10000)) |
369 | 6.10k | .Build(&append_pool_)); |
370 | 6.10k | CHECK_OK(ThreadPoolBuilder("log-alloc") |
371 | 6.10k | .set_min_threads(1) |
372 | 6.10k | .unlimited_threads() |
373 | 6.10k | .Build(&allocation_pool_)); |
374 | 6.10k | ThreadPoolMetrics read_metrics = { |
375 | 6.10k | METRIC_op_read_queue_length.Instantiate(server_->metric_entity()), |
376 | 6.10k | METRIC_op_read_queue_time.Instantiate(server_->metric_entity()), |
377 | 6.10k | METRIC_op_read_run_time.Instantiate(server_->metric_entity()) |
378 | 6.10k | }; |
379 | 6.10k | CHECK_OK(ThreadPoolBuilder("read-parallel") |
380 | 6.10k | .set_max_threads(FLAGS_read_pool_max_threads) |
381 | 6.10k | .set_max_queue_size(FLAGS_read_pool_max_queue_size) |
382 | 6.10k | .set_metrics(std::move(read_metrics)) |
383 | 6.10k | .Build(&read_pool_)); |
384 | 6.10k | CHECK_OK(ThreadPoolBuilder("tablet-split-compaction") |
385 | 6.10k | .set_max_threads(FLAGS_post_split_trigger_compaction_pool_max_threads) |
386 | 6.10k | .set_max_queue_size(FLAGS_post_split_trigger_compaction_pool_max_queue_size) |
387 | 6.10k | .set_metrics(THREAD_POOL_METRICS_INSTANCE( |
388 | 6.10k | server_->metric_entity(), post_split_trigger_compaction_pool)) |
389 | 6.10k | .Build(&post_split_trigger_compaction_pool_)); |
390 | 6.10k | CHECK_OK(ThreadPoolBuilder("admin-compaction") |
391 | 6.10k | .set_max_threads(std::max(docdb::GetGlobalRocksDBPriorityThreadPoolSize(), 0)) |
392 | 6.10k | .set_metrics(THREAD_POOL_METRICS_INSTANCE( |
393 | 6.10k | server_->metric_entity(), admin_triggered_compaction_pool)) |
394 | 6.10k | .Build(&admin_triggered_compaction_pool_)); |
395 | | |
396 | 6.10k | mem_manager_ = std::make_shared<TabletMemoryManager>( |
397 | 6.10k | &tablet_options_, |
398 | 6.10k | server_->mem_tracker(), |
399 | 6.10k | kDefaultTserverBlockCacheSizePercentage, |
400 | 6.10k | server_->metric_entity(), |
401 | 88 | [this](){ return GetTabletPeers(); }); |
402 | 6.10k | } |
403 | | |
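The comment inside the constructor above explains why the consensus, prepare, append, and log-alloc pools are built with unlimited_threads(): each replica serializes its own work through a dedicated pool token, so concurrency is effectively bounded by the replica count, not a thread cap. A minimal stand-alone sketch of that token pattern follows; this is not YugabyteDB's ThreadPool/token API, and SerialToken plus the detached-thread "pool" are purely illustrative.

#include <chrono>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

// One replica's token: tasks submitted here run one at a time, even though
// the underlying "pool" (detached threads standing in for an unbounded
// thread pool) has no thread cap.
class SerialToken {
 public:
  void Submit(std::function<void()> task) {
    std::lock_guard<std::mutex> lock(mu_);
    queue_.push(std::move(task));
    if (!running_) {
      running_ = true;
      RunNextLocked();
    }
  }

 private:
  // Precondition: mu_ is held. Hands the next queued task to the "pool".
  void RunNextLocked() {
    auto task = std::move(queue_.front());
    queue_.pop();
    std::thread([this, task] {
      task();
      std::lock_guard<std::mutex> lock(mu_);
      if (queue_.empty()) {
        running_ = false;
      } else {
        RunNextLocked();
      }
    }).detach();
  }

  std::mutex mu_;
  std::queue<std::function<void()>> queue_;
  bool running_ = false;
};

int main() {
  SerialToken replica_token;
  for (int i = 0; i < 3; ++i) {
    // Runs in submission order despite the unbounded thread supply.
    replica_token.Submit([i] { std::cout << "replica task " << i << "\n"; });
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // crude demo wait
}

With N replicas each holding one such token, at most N tasks run concurrently, which is the "effective upper bound" the constructor comment describes.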
404 | 111 | TSTabletManager::~TSTabletManager() { |
405 | 111 | } |
406 | | |
407 | 5.81k | Status TSTabletManager::Init() { |
408 | 5.81k | CHECK_EQ(state(), MANAGER_INITIALIZING); |
409 | | |
410 | 5.81k | async_client_init_.emplace( |
411 | 5.81k | "tserver_client", 0 /* num_reactors */, |
412 | 5.81k | FLAGS_tserver_yb_client_default_timeout_ms / 1000, server_->permanent_uuid(), |
413 | 5.81k | &server_->options(), server_->metric_entity(), server_->mem_tracker(), |
414 | 5.81k | server_->messenger()); |
415 | | |
416 | 5.48k | async_client_init_->AddPostCreateHook([this](client::YBClient* client) { |
417 | 5.48k | auto* tserver = server(); |
418 | 5.48k | if (tserver != nullptr && tserver->proxy() != nullptr) { |
419 | 5.48k | client->SetLocalTabletServer(tserver->permanent_uuid(), tserver->proxy(), tserver); |
420 | 5.48k | } |
421 | 5.48k | }); |
422 | | |
423 | 5.81k | tablet_options_.env = server_->GetEnv(); |
424 | 5.81k | tablet_options_.rocksdb_env = server_->GetRocksDBEnv(); |
425 | 5.81k | tablet_options_.listeners = server_->options().listeners; |
426 | 5.81k | if (docdb::GetRocksDBRateLimiterSharingMode() == docdb::RateLimiterSharingMode::TSERVER) { |
427 | 4.66k | tablet_options_.rate_limiter = docdb::CreateRocksDBRateLimiter(); |
428 | 4.66k | } |
429 | | |
430 | | // Start the threadpool we'll use to open tablets. |
431 | | // This has to be done in Init() instead of the constructor, since the |
432 | | // FsManager isn't initialized until this point. |
433 | 5.81k | int max_bootstrap_threads = FLAGS_num_tablets_to_open_simultaneously; |
434 | 5.81k | if (max_bootstrap_threads == 0) { |
435 | 5.81k | int num_cpus = base::NumCPUs(); |
436 | 5.81k | if (num_cpus <= 2) { |
437 | 0 | max_bootstrap_threads = 2; |
438 | 5.81k | } else { |
439 | 5.81k | max_bootstrap_threads = min( |
440 | 5.81k | num_cpus - 1, narrow_cast<int>(fs_manager_->GetDataRootDirs().size()) * 8); |
441 | 5.81k | } |
442 | 5.81k | LOG_WITH_PREFIX(INFO) << "max_bootstrap_threads=" << max_bootstrap_threads; |
443 | 5.81k | } |
444 | 5.81k | ThreadPoolMetrics bootstrap_metrics = { |
445 | 5.81k | nullptr, |
446 | 5.81k | nullptr, |
447 | 5.81k | METRIC_ts_bootstrap_time.Instantiate(server_->metric_entity()) |
448 | 5.81k | }; |
449 | 5.81k | RETURN_NOT_OK(ThreadPoolBuilder("tablet-bootstrap") |
450 | 5.81k | .set_max_threads(max_bootstrap_threads) |
451 | 5.81k | .set_metrics(std::move(bootstrap_metrics)) |
452 | 5.81k | .Build(&open_tablet_pool_)); |
453 | | |
454 | 5.81k | CleanupCheckpoints(); |
455 | | |
456 | | // Search for tablets in the metadata dir. |
457 | 5.81k | vector<string> tablet_ids; |
458 | 5.81k | RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablet_ids)); |
459 | | |
460 | 5.81k | InitLocalRaftPeerPB(); |
461 | | |
462 | 5.81k | multi_raft_manager_ = std::make_unique<consensus::MultiRaftManager>(server_->messenger(), |
463 | 5.81k | &server_->proxy_cache(), |
464 | 5.81k | local_peer_pb_.cloud_info()); |
465 | | |
466 | 5.81k | deque<RaftGroupMetadataPtr> metas; |
467 | | |
468 | | // First, load all of the tablet metadata. We do this before we start |
469 | | // submitting the actual OpenTablet() tasks so that we don't have to compete |
470 | | // for disk resources, etc, with bootstrap processes and running tablets. |
471 | 5.81k | MonoTime start(MonoTime::Now()); |
472 | 228 | for (const string& tablet_id : tablet_ids) { |
473 | 228 | RaftGroupMetadataPtr meta; |
474 | 228 | RETURN_NOT_OK_PREPEND(OpenTabletMeta(tablet_id, &meta), |
475 | 228 | "Failed to open tablet metadata for tablet: " + tablet_id); |
476 | 228 | if (PREDICT_FALSE(!CanServeTabletData(meta->tablet_data_state()))) { |
477 | 9 | RETURN_NOT_OK(HandleNonReadyTabletOnStartup(meta)); |
478 | 9 | continue; |
479 | 219 | } |
480 | 219 | RegisterDataAndWalDir( |
481 | 219 | fs_manager_, meta->table_id(), meta->raft_group_id(), meta->data_root_dir(), |
482 | 219 | meta->wal_root_dir()); |
483 | 219 | if (FLAGS_enable_restart_transaction_status_tablets_first) { |
484 | | // Prioritize bootstrapping transaction status tablets first. |
485 | 219 | if (meta->table_type() == TRANSACTION_STATUS_TABLE_TYPE) { |
486 | 27 | metas.push_front(meta); |
487 | 192 | } else { |
488 | 192 | metas.push_back(meta); |
489 | 192 | } |
490 | 0 | } else { |
491 | 0 | metas.push_back(meta); |
492 | 0 | } |
493 | 219 | } |
494 | | |
495 | 5.81k | MonoDelta elapsed = MonoTime::Now().GetDeltaSince(start); |
496 | 5.81k | LOG(INFO) << "Loaded metadata for " << tablet_ids.size() << " tablets in " |
497 | 5.81k | << elapsed.ToMilliseconds() << " ms"; |
498 | | |
499 | | // Now submit the "Open" task for each. |
500 | 219 | for (const RaftGroupMetadataPtr& meta : metas) { |
501 | 219 | scoped_refptr<TransitionInProgressDeleter> deleter; |
502 | 219 | RETURN_NOT_OK(StartTabletStateTransition( |
503 | 219 | meta->raft_group_id(), "opening tablet", &deleter)); |
504 | | |
505 | 219 | TabletPeerPtr tablet_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
506 | 219 | RETURN_NOT_OK(open_tablet_pool_->SubmitFunc( |
507 | 219 | std::bind(&TSTabletManager::OpenTablet, this, meta, deleter))); |
508 | 219 | } |
509 | | |
510 | 5.81k | { |
511 | 5.81k | std::lock_guard<RWMutex> lock(mutex_); |
512 | 5.81k | state_ = MANAGER_RUNNING; |
513 | 5.81k | } |
514 | | |
515 | 5.81k | RETURN_NOT_OK(mem_manager_->Init()); |
516 | | |
517 | 5.81k | tablets_cleaner_ = std::make_unique<rpc::Poller>( |
518 | 5.81k | LogPrefix(), std::bind(&TSTabletManager::CleanupSplitTablets, this)); |
519 | | |
520 | 5.81k | verify_tablet_data_poller_ = std::make_unique<rpc::Poller>( |
521 | 5.81k | LogPrefix(), std::bind(&TSTabletManager::VerifyTabletData, this)); |
522 | | |
523 | 5.81k | metrics_cleaner_ = std::make_unique<rpc::Poller>( |
524 | 5.81k | LogPrefix(), std::bind(&TSTabletManager::CleanupOldMetrics, this)); |
525 | | |
526 | 5.81k | return Status::OK(); |
527 | 5.81k | } |
528 | | |
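Init() above sizes the bootstrap pool with a simple heuristic: an explicit FLAGS_num_tablets_to_open_simultaneously value wins; otherwise the thread count is min(num_cpus - 1, num_data_dirs * 8), with a floor of 2 on very small machines. A worked sketch of just that arithmetic; the CPU and data-directory counts below are made-up inputs.

#include <algorithm>
#include <iostream>

// Mirrors the default sizing logic in Init().
int MaxBootstrapThreads(int flag_value, int num_cpus, int num_data_dirs) {
  if (flag_value != 0) return flag_value;  // explicit flag wins
  if (num_cpus <= 2) return 2;             // floor for very small machines
  return std::min(num_cpus - 1, num_data_dirs * 8);
}

int main() {
  std::cout << MaxBootstrapThreads(0, 16, 1) << "\n";   // 8: bounded by one data dir
  std::cout << MaxBootstrapThreads(0, 16, 4) << "\n";   // 15: bounded by CPUs
  std::cout << MaxBootstrapThreads(12, 16, 4) << "\n";  // 12: flag override
}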
529 | 5.81k | void TSTabletManager::CleanupCheckpoints() { |
530 | 5.91k | for (const auto& data_root : fs_manager_->GetDataRootDirs()) { |
531 | 5.91k | auto tables_dir = JoinPathSegments(data_root, FsManager::kRocksDBDirName); |
532 | 5.91k | auto tables = fs_manager_->env()->GetChildren(tables_dir, ExcludeDots::kTrue); |
533 | 5.91k | if (!tables.ok()) { |
534 | 0 | LOG_WITH_PREFIX(WARNING) |
535 | 0 | << "Failed to get tables in " << tables_dir << ": " << tables.status(); |
536 | 0 | continue; |
537 | 0 | } |
538 | 5.91k | for (const auto& table : *tables) { |
539 | 95 | auto table_dir = JoinPathSegments(tables_dir, table); |
540 | 95 | auto tablets = fs_manager_->env()->GetChildren(table_dir, ExcludeDots::kTrue); |
541 | 95 | if (!tablets.ok()) { |
542 | 0 | LOG_WITH_PREFIX(WARNING) |
543 | 0 | << "Failed to get tablets in " << table_dir << ": " << tablets.status(); |
544 | 0 | continue; |
545 | 0 | } |
546 | 578 | for (const auto& tablet : *tablets) { |
547 | 578 | auto checkpoints_dir = JoinPathSegments( |
548 | 578 | table_dir, tablet, RemoteBootstrapSession::kCheckpointsDir); |
549 | 578 | if (fs_manager_->env()->FileExists(checkpoints_dir)) { |
550 | 2 | LOG_WITH_PREFIX(INFO) << "Cleaning up checkpoints dir: " << yb::ToString(checkpoints_dir); |
551 | 2 | auto status = fs_manager_->env()->DeleteRecursively(checkpoints_dir); |
552 | 2 | WARN_NOT_OK(status, Format("Cleanup of checkpoints dir $0 failed", checkpoints_dir)); |
553 | 2 | } |
554 | 578 | } |
555 | 95 | } |
556 | 5.91k | } |
557 | 5.81k | } |
558 | | |
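CleanupCheckpoints() above walks <data_root>/rocksdb/<table>/<tablet> and removes any leftover checkpoints directory from interrupted remote bootstrap sessions, warning and continuing on failure. A rough equivalent using std::filesystem instead of the Env/FsManager abstractions; the path layout is taken from the code above and error handling is simplified.

#include <filesystem>
#include <iostream>
#include <system_error>
#include <vector>

namespace fs = std::filesystem;

// Best-effort sweep: failures are reported but never abort the walk.
void CleanupCheckpoints(const std::vector<fs::path>& data_roots) {
  for (const auto& data_root : data_roots) {
    const fs::path tables_dir = data_root / "rocksdb";
    std::error_code ec;
    for (const auto& table : fs::directory_iterator(tables_dir, ec)) {
      if (!table.is_directory()) continue;
      for (const auto& tablet : fs::directory_iterator(table.path(), ec)) {
        if (!tablet.is_directory()) continue;
        const fs::path checkpoints = tablet.path() / "checkpoints";
        if (fs::exists(checkpoints)) {
          std::cout << "Cleaning up checkpoints dir: " << checkpoints << "\n";
          fs::remove_all(checkpoints, ec);  // best effort, like WARN_NOT_OK
          if (ec) std::cout << "Cleanup failed: " << ec.message() << "\n";
        }
      }
    }
  }
}

int main() {
  CleanupCheckpoints({fs::path("/tmp/yb-data")});  // illustrative path
}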
559 | 5.80k | Status TSTabletManager::Start() { |
560 | 5.80k | async_client_init_->Start(); |
561 | 5.80k | if (FLAGS_cleanup_split_tablets_interval_sec > 0) { |
562 | 5.80k | tablets_cleaner_->Start( |
563 | 5.80k | &server_->messenger()->scheduler(), FLAGS_cleanup_split_tablets_interval_sec * 1s); |
564 | 5.80k | LOG(INFO) << "Split tablets cleanup monitor started..."; |
565 | 0 | } else { |
566 | 0 | LOG(INFO) |
567 | 0 | << "Split tablets cleanup is disabled by cleanup_split_tablets_interval_sec flag set to 0"; |
568 | 0 | } |
569 | 5.80k | if (FLAGS_verify_tablet_data_interval_sec > 0) { |
570 | 0 | verify_tablet_data_poller_->Start( |
571 | 0 | &server_->messenger()->scheduler(), FLAGS_verify_tablet_data_interval_sec * 1s); |
572 | 0 | LOG(INFO) << "Tablet data verification task started..."; |
573 | 5.80k | } else { |
574 | 5.80k | LOG(INFO) |
575 | 5.80k | << "Tablet data verification is disabled by verify_tablet_data_interval_sec flag set to 0"; |
576 | 5.80k | } |
577 | 5.80k | if (FLAGS_cleanup_metrics_interval_sec > 0) { |
578 | 5.80k | metrics_cleaner_->Start( |
579 | 5.80k | &server_->messenger()->scheduler(), FLAGS_cleanup_metrics_interval_sec * 1s); |
580 | 5.80k | LOG(INFO) << "Old metrics cleanup task started..."; |
581 | 0 | } else { |
582 | 0 | LOG(INFO) |
583 | 0 | << "Old metrics cleanup is disabled by cleanup_metrics_interval_sec flag set to 0"; |
584 | 0 | } |
585 | | |
586 | 5.80k | return Status::OK(); |
587 | 5.80k | } |
588 | | |
589 | 3.73k | void TSTabletManager::CleanupSplitTablets() { |
590 | 0 | VLOG_WITH_PREFIX_AND_FUNC(3) << "looking for tablets to clean up..."; |
591 | 3.73k | auto tablet_peers = GetTabletPeers(); |
592 | 24.1k | for (const auto& tablet_peer : tablet_peers) { |
593 | 24.1k | if (tablet_peer->CanBeDeleted()) { |
594 | 15 | const auto& tablet_id = tablet_peer->tablet_id(); |
595 | 15 | if (PREDICT_FALSE(FLAGS_TEST_skip_deleting_split_tablets)) { |
596 | 7 | LOG_WITH_PREFIX(INFO) << Format("Skipped triggering delete of tablet $0", tablet_id); |
597 | 8 | } else { |
598 | 8 | LOG_WITH_PREFIX(INFO) << Format("Triggering delete of tablet $0", tablet_id); |
599 | 8 | client().DeleteNotServingTablet( |
600 | 8 | tablet_peer->tablet_id(), [tablet_id](const Status& status) { |
601 | 8 | LOG(INFO) << Format("Tablet $0 deletion result: $1", tablet_id, status); |
602 | 8 | }); |
603 | 8 | } |
604 | 15 | } |
605 | 24.1k | } |
606 | 3.73k | } |
607 | | |
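Note how the completion callback in CleanupSplitTablets() above captures tablet_id by value: the deletion result is logged asynchronously, possibly long after the loop iteration that issued the request has ended. A minimal model of that pattern; DeleteAsync is a hypothetical stand-in for client().DeleteNotServingTablet.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the asynchronous delete RPC.
void DeleteAsync(const std::string& tablet_id,
                 std::function<void(const std::string&)> callback) {
  callback("OK");  // pretend the RPC completed
}

int main() {
  const std::vector<std::string> deletable = {"tablet-1", "tablet-2"};
  for (const auto& tablet_id : deletable) {
    // Capture by value: the callback may fire after this iteration ends.
    DeleteAsync(tablet_id, [tablet_id](const std::string& status) {
      std::cout << "Tablet " << tablet_id << " deletion result: " << status << "\n";
    });
  }
}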
608 | 14 | Status TSTabletManager::WaitForAllBootstrapsToFinish() { |
609 | 14 | CHECK_EQ(state(), MANAGER_RUNNING); |
610 | | |
611 | 14 | open_tablet_pool_->Wait(); |
612 | | |
613 | 14 | Status s = Status::OK(); |
614 | | |
615 | 14 | SharedLock<RWMutex> shared_lock(mutex_); |
616 | 29 | for (const TabletMap::value_type& entry : tablet_map_) { |
617 | 29 | if (entry.second->state() == tablet::FAILED) { |
618 | 1 | if (s.ok()) { |
619 | 1 | s = entry.second->error(); |
620 | 1 | } |
621 | 1 | } |
622 | 29 | } |
623 | | |
624 | 14 | return s; |
625 | 14 | } |
626 | | |
627 | | Result<scoped_refptr<TransitionInProgressDeleter>> |
628 | 81.9k | TSTabletManager::StartTabletStateTransitionForCreation(const TabletId& tablet_id) { |
629 | 81.9k | scoped_refptr<TransitionInProgressDeleter> deleter; |
630 | 81.9k | SharedLock<RWMutex> lock(mutex_); |
631 | 81.9k | TRACE("Acquired tablet manager lock"); |
632 | | |
633 | | // Sanity check that the tablet isn't already registered. |
634 | 81.9k | TabletPeerPtr junk; |
635 | 81.9k | if (LookupTabletUnlocked(tablet_id, &junk)) { |
636 | 4 | return STATUS(AlreadyPresent, "Tablet already registered", tablet_id); |
637 | 4 | } |
638 | | |
639 | 81.9k | RETURN_NOT_OK(StartTabletStateTransition(tablet_id, "creating tablet", &deleter)); |
640 | | |
641 | 81.9k | return deleter; |
642 | 81.9k | } |
643 | | |
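StartTabletStateTransition and TransitionInProgressDeleter, used above and throughout this file, implement a simple mutual-exclusion scheme: the tablet id is parked in a transition map for as long as the returned RAII handle lives, so a concurrent create, delete, or remote bootstrap of the same tablet is rejected up front. A sketch of the idea, with assumed names and locking rather than the actual implementation:

#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>

// While a guard is alive, the tablet id stays in `transitions`, blocking a
// second concurrent transition for the same tablet.
class TransitionGuard {
 public:
  TransitionGuard(std::map<std::string, std::string>* transitions,
                  std::mutex* mu, std::string tablet_id)
      : transitions_(transitions), mu_(mu), tablet_id_(std::move(tablet_id)) {}

  ~TransitionGuard() {
    std::lock_guard<std::mutex> lock(*mu_);
    transitions_->erase(tablet_id_);  // transition finished; allow new ones
  }

 private:
  std::map<std::string, std::string>* transitions_;
  std::mutex* mu_;
  std::string tablet_id_;
};

// Returns nullptr if a transition for this tablet is already in progress.
std::unique_ptr<TransitionGuard> StartTransition(
    std::map<std::string, std::string>* transitions, std::mutex* mu,
    const std::string& tablet_id, const std::string& reason) {
  std::lock_guard<std::mutex> lock(*mu);
  if (!transitions->emplace(tablet_id, reason).second) {
    return nullptr;  // e.g. "deleting tablet" is still running
  }
  return std::make_unique<TransitionGuard>(transitions, mu, tablet_id);
}

int main() {
  std::map<std::string, std::string> transitions;
  std::mutex mu;
  auto first = StartTransition(&transitions, &mu, "tablet-1", "creating tablet");
  auto second = StartTransition(&transitions, &mu, "tablet-1", "deleting tablet");
  std::cout << (first != nullptr) << " " << (second != nullptr) << "\n";  // 1 0
}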
644 | | Result<TabletPeerPtr> TSTabletManager::CreateNewTablet( |
645 | | const tablet::TableInfoPtr& table_info, |
646 | | const string& tablet_id, |
647 | | const Partition& partition, |
648 | | RaftConfigPB config, |
649 | | const bool colocated, |
650 | 82.1k | const std::vector<SnapshotScheduleId>& snapshot_schedules) { |
651 | 82.1k | if (state() != MANAGER_RUNNING) { |
652 | 0 | return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state()); |
653 | 0 | } |
654 | 82.1k | CHECK(IsRaftConfigMember(server_->instance_pb().permanent_uuid(), config)); |
655 | | |
656 | 326k | for (int i = 0; i < config.peers_size(); ++i) { |
657 | 244k | const auto& config_peer = config.peers(i); |
658 | 244k | CHECK(config_peer.has_member_type()); |
659 | 244k | } |
660 | | |
661 | | // Set the initial opid_index for a RaftConfigPB to -1. |
662 | 82.1k | config.set_opid_index(consensus::kInvalidOpIdIndex); |
663 | | |
664 | 82.1k | scoped_refptr<TransitionInProgressDeleter> deleter = |
665 | 82.1k | VERIFY_RESULT(StartTabletStateTransitionForCreation(tablet_id)); |
666 | | |
667 | | // Create the metadata. |
668 | 82.1k | TRACE("Creating new metadata..."); |
669 | 82.1k | string data_root_dir; |
670 | 82.1k | string wal_root_dir; |
671 | 82.1k | GetAndRegisterDataAndWalDir( |
672 | 82.1k | fs_manager_, table_info->table_id, tablet_id, &data_root_dir, &wal_root_dir); |
673 | 82.1k | auto create_result = RaftGroupMetadata::CreateNew(tablet::RaftGroupMetadataData { |
674 | 82.1k | .fs_manager = fs_manager_, |
675 | 82.1k | .table_info = table_info, |
676 | 82.1k | .raft_group_id = tablet_id, |
677 | 82.1k | .partition = partition, |
678 | 82.1k | .tablet_data_state = TABLET_DATA_READY, |
679 | 82.1k | .colocated = colocated, |
680 | 82.1k | .snapshot_schedules = snapshot_schedules, |
681 | 82.1k | }, data_root_dir, wal_root_dir); |
682 | 82.1k | if (!create_result.ok()) { |
683 | 2 | UnregisterDataWalDir(table_info->table_id, tablet_id, data_root_dir, wal_root_dir); |
684 | 2 | } |
685 | 82.1k | RETURN_NOT_OK_PREPEND(create_result, "Couldn't create tablet metadata"); |
686 | 82.1k | RaftGroupMetadataPtr meta = std::move(*create_result); |
687 | 82.1k | LOG(INFO) << TabletLogPrefix(tablet_id) |
688 | 82.1k | << "Created tablet metadata for table: " << table_info->table_id; |
689 | | |
690 | | // We must persist the consensus metadata to disk before starting a new |
691 | | // tablet's TabletPeer and Consensus implementation. |
692 | 82.1k | std::unique_ptr<ConsensusMetadata> cmeta; |
693 | 82.1k | RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager_, tablet_id, fs_manager_->uuid(), |
694 | 82.1k | config, consensus::kMinimumTerm, &cmeta), |
695 | 82.1k | "Unable to create new ConsensusMeta for tablet " + tablet_id); |
696 | 82.1k | TabletPeerPtr new_peer = VERIFY_RESULT(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
697 | | |
698 | | // There is nothing to bootstrap, so the OpenTablet task submitted here completes quickly. |
699 | 82.1k | RETURN_NOT_OK( |
700 | 82.1k | open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter))); |
701 | | |
702 | 82.1k | return new_peer; |
703 | 82.1k | } |
704 | | |
705 | | struct TabletCreationMetaData { |
706 | | TabletId tablet_id; |
707 | | scoped_refptr<TransitionInProgressDeleter> transition_deleter; |
708 | | Partition partition; |
709 | | docdb::KeyBounds key_bounds; |
710 | | RaftGroupMetadataPtr raft_group_metadata; |
711 | | }; |
712 | | |
713 | | namespace { |
714 | | |
715 | | // Creates SplitTabletsCreationMetaData for the two new tablets produced by splitting `tablet`, per the request. |
716 | | SplitTabletsCreationMetaData PrepareTabletCreationMetaDataForSplit( |
717 | 44 | const tablet::SplitTabletRequestPB& request, const tablet::Tablet& tablet) { |
718 | 44 | SplitTabletsCreationMetaData metas; |
719 | | |
720 | 44 | const auto& split_partition_key = request.split_partition_key(); |
721 | 44 | const auto& split_encoded_key = request.split_encoded_key(); |
722 | | |
723 | 44 | std::shared_ptr<Partition> source_partition = tablet.metadata()->partition(); |
724 | 44 | const auto source_key_bounds = *tablet.doc_db().key_bounds; |
725 | | |
726 | 44 | { |
727 | 44 | TabletCreationMetaData meta; |
728 | 44 | meta.tablet_id = request.new_tablet1_id(); |
729 | 44 | meta.partition = *source_partition; |
730 | 44 | meta.key_bounds = source_key_bounds; |
731 | 44 | meta.partition.set_partition_key_end(split_partition_key); |
732 | 44 | meta.key_bounds.upper.Reset(split_encoded_key); |
733 | 44 | metas.push_back(meta); |
734 | 44 | } |
735 | | |
736 | 44 | { |
737 | 44 | TabletCreationMetaData meta; |
738 | 44 | meta.tablet_id = request.new_tablet2_id(); |
739 | 44 | meta.partition = *source_partition; |
740 | 44 | meta.key_bounds = source_key_bounds; |
741 | 44 | meta.partition.set_partition_key_start(split_partition_key); |
742 | 44 | meta.key_bounds.lower.Reset(split_encoded_key); |
743 | 44 | metas.push_back(meta); |
744 | 44 | } |
745 | | |
746 | 44 | return metas; |
747 | 44 | } |
748 | | |
749 | | } // namespace |
750 | | |
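PrepareTabletCreationMetaDataForSplit() above derives both children from the parent: each starts as a copy of the parent's partition and key bounds, then the split key becomes child 1's exclusive end and child 2's inclusive start. A toy worked example of that derivation; Partition here is a simplified stand-in for the real partition/key-bounds pair.

#include <iostream>
#include <string>

// Simplified stand-in for the real Partition plus docdb::KeyBounds.
struct Partition {
  std::string start;  // inclusive
  std::string end;    // exclusive; "" means unbounded
};

int main() {
  const Partition parent{"a", "z"};
  const std::string split_key = "m";

  Partition child1 = parent;
  child1.end = split_key;    // child 1 covers [a, m)
  Partition child2 = parent;
  child2.start = split_key;  // child 2 covers [m, z)

  std::cout << "child1: [" << child1.start << ", " << child1.end << ")\n"
            << "child2: [" << child2.start << ", " << child2.end << ")\n";
}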
751 | | Status TSTabletManager::StartSubtabletsSplit( |
752 | 0 | const RaftGroupMetadata& source_tablet_meta, SplitTabletsCreationMetaData* tcmetas) { |
753 | 0 | auto* const env = fs_manager_->env(); |
754 | | |
755 | 0 | auto iter = tcmetas->begin(); |
756 | 0 | while (iter != tcmetas->end()) { |
757 | 0 | const auto& subtablet_id = iter->tablet_id; |
758 | | |
759 | 0 | auto transition_deleter_result = StartTabletStateTransitionForCreation(subtablet_id); |
760 | 0 | if (transition_deleter_result.ok()) { |
761 | 0 | iter->transition_deleter = *transition_deleter_result; |
762 | 0 | } else if (transition_deleter_result.status().IsAlreadyPresent()) { |
763 | | // A state transition for subtablet_id could already be registered because its remote |
764 | | // bootstrap (from the leader of the already-split parent tablet) is in progress. |
765 | 0 | iter = tcmetas->erase(iter); |
766 | 0 | continue; |
767 | 0 | } else { |
768 | 0 | return transition_deleter_result.status(); |
769 | 0 | } |
770 | | |
771 | | // Try to load metadata left over from a previous, incomplete split. |
772 | 0 | auto load_result = RaftGroupMetadata::Load(fs_manager_, subtablet_id); |
773 | 0 | if (load_result.ok() && CanServeTabletData((*load_result)->tablet_data_state())) { |
774 | | // The subtablet was already created and made ready during a previous split attempt on this |
775 | | // node, or via remote bootstrap from another node; there is no need to re-create it. |
776 | 0 | iter = tcmetas->erase(iter); |
777 | 0 | continue; |
778 | 0 | } |
779 | | |
780 | | // Delete on-disk data for the new tablet IDs in case it is present as a leftover from a |
781 | | // previously failed tablet split attempt. |
782 | | // TODO(tsplit): add test for that. |
783 | 0 | const auto data_dir = source_tablet_meta.GetSubRaftGroupDataDir(subtablet_id); |
784 | 0 | if (env->FileExists(data_dir)) { |
785 | 0 | RETURN_NOT_OK_PREPEND( |
786 | 0 | env->DeleteRecursively(data_dir), |
787 | 0 | Format("Unable to recursively delete data dir for tablet $0", subtablet_id)); |
788 | 0 | } |
789 | 0 | RETURN_NOT_OK(Log::DeleteOnDiskData( |
790 | 0 | env, subtablet_id, source_tablet_meta.GetSubRaftGroupWalDir(subtablet_id), |
791 | 0 | fs_manager_->uuid())); |
792 | 0 | RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(fs_manager_, subtablet_id)); |
793 | | |
794 | 0 | ++iter; |
795 | 0 | } |
796 | 0 | return Status::OK(); |
797 | 0 | } |
798 | | |
799 | | void TSTabletManager::CreatePeerAndOpenTablet( |
800 | | const tablet::RaftGroupMetadataPtr& meta, |
801 | 86 | const scoped_refptr<TransitionInProgressDeleter>& deleter) { |
802 | 86 | Status s = ResultToStatus(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
803 | 86 | if (!s.ok()) { |
804 | 0 | s = s.CloneAndPrepend("Failed to create and register tablet peer"); |
805 | 0 | if (s.IsShutdownInProgress()) { |
806 | | // If shutdown is in progress, failing to create and register the tablet peer is |
807 | | // expected rather than an error. |
808 | 0 | LOG_WITH_PREFIX(WARNING) << s; |
809 | 0 | } else { |
810 | 0 | LOG_WITH_PREFIX(DFATAL) << s; |
811 | 0 | } |
812 | 0 | return; |
813 | 0 | } |
814 | 86 | s = open_tablet_pool_->SubmitFunc(std::bind(&TSTabletManager::OpenTablet, this, meta, deleter)); |
815 | 86 | if (!s.ok()) { |
816 | 0 | LOG(DFATAL) << Format("Failed to schedule opening tablet $0: $1", meta->table_id(), s); |
817 | 0 | return; |
818 | 0 | } |
819 | 86 | } |
820 | | |
821 | 47 | Status TSTabletManager::ApplyTabletSplit(tablet::SplitOperation* operation, log::Log* raft_log) { |
822 | 47 | if (PREDICT_FALSE(FLAGS_TEST_crash_before_apply_tablet_split_op)) { |
823 | 2 | LOG(FATAL) << "Crashing due to FLAGS_TEST_crash_before_apply_tablet_split_op"; |
824 | 2 | } |
825 | | |
826 | 47 | if (state() != MANAGER_RUNNING) { |
827 | 0 | return STATUS_FORMAT(IllegalState, "Manager is not running: $0", state()); |
828 | 0 | } |
829 | | |
830 | 47 | auto* tablet = CHECK_NOTNULL(operation->tablet()); |
831 | 47 | const auto tablet_id = tablet->tablet_id(); |
832 | 47 | const auto* request = operation->request(); |
833 | 47 | SCHECK_EQ( |
834 | 47 | request->tablet_id(), tablet_id, IllegalState, |
835 | 47 | Format( |
836 | 47 | "Unexpected SPLIT_OP $0 designated for tablet $1 to be applied to tablet $2", |
837 | 47 | operation->op_id(), request->tablet_id(), tablet_id)); |
838 | 47 | SCHECK( |
839 | 47 | tablet_id != request->new_tablet1_id() && tablet_id != request->new_tablet2_id(), |
840 | 47 | IllegalState, |
841 | 47 | Format( |
842 | 47 | "One of SPLIT_OP $0 destination tablet IDs ($1, $2) is the same as source tablet ID $3", |
843 | 47 | operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id(), tablet_id)); |
844 | | |
845 | 47 | LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation apply started"; |
846 | | |
847 | 47 | if (raft_log == nullptr) { |
848 | 44 | auto tablet_peer = VERIFY_RESULT(LookupTablet(tablet_id)); |
849 | 44 | raft_log = tablet_peer->raft_consensus()->log().get(); |
850 | 44 | } |
851 | | |
852 | 47 | MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_before_log_flushed); |
853 | | |
854 | 47 | RETURN_NOT_OK(raft_log->FlushIndex()); |
855 | | |
856 | 47 | auto& meta = *CHECK_NOTNULL(tablet->metadata()); |
857 | | |
858 | | // TODO(tsplit): We can later implement better per-disk distribution during compaction of split |
859 | | // tablets. |
860 | 47 | const auto table_id = meta.table_id(); |
861 | 47 | const auto data_root_dir = |
862 | 47 | VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kData, table_id, tablet_id)); |
863 | 47 | const auto wal_root_dir = |
864 | 47 | VERIFY_RESULT(GetAssignedRootDirForTablet(TabletDirType::kWal, table_id, tablet_id)); |
865 | | |
866 | 47 | if (FLAGS_TEST_apply_tablet_split_inject_delay_ms > 0) { |
867 | 2 | LOG(INFO) << "TEST: ApplyTabletSplit: injecting delay of " |
868 | 2 | << FLAGS_TEST_apply_tablet_split_inject_delay_ms << " ms for " |
869 | 2 | << AsString(*operation); |
870 | 2 | std::this_thread::sleep_for(FLAGS_TEST_apply_tablet_split_inject_delay_ms * 1ms); |
871 | 2 | LOG(INFO) << "TEST: ApplyTabletSplit: delay finished"; |
872 | 2 | } |
873 | | |
874 | 47 | auto tcmetas = PrepareTabletCreationMetaDataForSplit(*request, *tablet); |
875 | | |
876 | 47 | RETURN_NOT_OK(StartSubtabletsSplit(meta, &tcmetas)); |
877 | | |
878 | 88 | for (const auto& tcmeta : tcmetas) { |
879 | 88 | RegisterDataAndWalDir(fs_manager_, table_id, tcmeta.tablet_id, data_root_dir, wal_root_dir); |
880 | 88 | } |
881 | | |
882 | 47 | bool successfully_completed = false; |
883 | 43 | auto se = ScopeExit([&] { |
884 | 43 | if (!successfully_completed) { |
885 | 0 | for (const auto& tcmeta : tcmetas) { |
886 | 0 | UnregisterDataWalDir(table_id, tcmeta.tablet_id, data_root_dir, wal_root_dir); |
887 | 0 | } |
888 | 0 | } |
889 | 43 | }); |
890 | | |
891 | 47 | std::unique_ptr<ConsensusMetadata> cmeta; |
892 | 47 | RETURN_NOT_OK(ConsensusMetadata::Load(fs_manager_, tablet_id, fs_manager_->uuid(), &cmeta)); |
893 | | |
894 | 87 | for (auto& tcmeta : tcmetas) { |
895 | 87 | const auto& new_tablet_id = tcmeta.tablet_id; |
896 | | |
897 | | // Copy raft group metadata. |
898 | 87 | tcmeta.raft_group_metadata = VERIFY_RESULT(tablet->CreateSubtablet( |
899 | 87 | new_tablet_id, tcmeta.partition, tcmeta.key_bounds, operation->op_id(), |
900 | 87 | operation->hybrid_time())); |
901 | 87 | LOG_WITH_PREFIX(INFO) << "Created raft group metadata for table: " << table_id |
902 | 87 | << " tablet: " << new_tablet_id; |
903 | | |
904 | | // Copy consensus metadata. |
905 | | // Here we reuse the same cmeta instance for both new tablets. This is safe, because: |
906 | | // 1) Their consensus metadata only differ by tablet id. |
907 | | // 2) Flush() will save it into a new path corresponding to the tablet id we set before flushing. |
908 | 87 | cmeta->set_tablet_id(new_tablet_id); |
909 | 87 | cmeta->set_split_parent_tablet_id(tablet_id); |
910 | 87 | RETURN_NOT_OK(cmeta->Flush()); |
911 | | |
912 | 87 | const auto& dest_wal_dir = tcmeta.raft_group_metadata->wal_dir(); |
913 | 87 | RETURN_NOT_OK(raft_log->CopyTo(dest_wal_dir)); |
914 | | |
915 | 87 | MAYBE_FAULT(FLAGS_TEST_fault_crash_in_split_after_log_copied); |
916 | | |
917 | 87 | tcmeta.raft_group_metadata->set_tablet_data_state(TABLET_DATA_READY); |
918 | 87 | RETURN_NOT_OK(tcmeta.raft_group_metadata->Flush()); |
919 | 87 | } |
920 | | |
921 | 47 | meta.SetSplitDone(operation->op_id(), request->new_tablet1_id(), request->new_tablet2_id()); |
922 | 47 | RETURN_NOT_OK(meta.Flush()); |
923 | | |
924 | 47 | tablet->SplitDone(); |
925 | | |
926 | 86 | for (auto& tcmeta : tcmetas) { |
927 | | // Call CreatePeerAndOpenTablet asynchronously to avoid write-locking TSTabletManager::mutex_ |
928 | | // here since apply of SPLIT_OP is done under ReplicaState lock and this could lead to deadlock |
929 | | // in case of reverse lock order in some other thread. |
930 | | // See https://github.com/yugabyte/yugabyte-db/issues/4312 for more details. |
931 | 86 | RETURN_NOT_OK(apply_pool_->SubmitFunc(std::bind( |
932 | 86 | &TSTabletManager::CreatePeerAndOpenTablet, this, tcmeta.raft_group_metadata, |
933 | 86 | tcmeta.transition_deleter))); |
934 | 86 | } |
935 | | |
936 | 47 | successfully_completed = true; |
937 | 47 | LOG_WITH_PREFIX(INFO) << "Tablet " << tablet_id << " split operation has been applied"; |
938 | 47 | return Status::OK(); |
939 | 47 | } |
940 | | |
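ApplyTabletSplit() above registers the children's data/WAL directories up front and relies on a ScopeExit block plus a successfully_completed flag to roll that registration back on any early return. yb::ScopeExit (yb/util/scope_exit.h) is the real helper; the simplified stand-in below just shows the idiom.

#include <functional>
#include <iostream>

// Simplified stand-in for yb::ScopeExit.
class ScopeExit {
 public:
  explicit ScopeExit(std::function<void()> fn) : fn_(std::move(fn)) {}
  ~ScopeExit() { fn_(); }
 private:
  std::function<void()> fn_;
};

void RegisterDirs() { std::cout << "dirs registered\n"; }
void UnregisterDirs() { std::cout << "dirs rolled back\n"; }

bool ApplySplit(bool fail_midway) {
  RegisterDirs();
  bool successfully_completed = false;
  ScopeExit cleanup([&successfully_completed] {
    if (!successfully_completed) UnregisterDirs();  // undo on early return
  });
  if (fail_midway) return false;  // cleanup rolls back the registration
  successfully_completed = true;  // cleanup becomes a no-op
  return true;
}

int main() {
  ApplySplit(true);   // prints "dirs registered" then "dirs rolled back"
  ApplySplit(false);  // prints "dirs registered" only
}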
941 | 217k | string LogPrefix(const string& tablet_id, const string& uuid) { |
942 | 217k | return "T " + tablet_id + " P " + uuid + ": "; |
943 | 217k | } |
944 | | |
945 | | Status CheckLeaderTermNotLower( |
946 | | const string& tablet_id, |
947 | | const string& uuid, |
948 | | int64_t leader_term, |
949 | 102 | int64_t last_logged_term) { |
950 | 102 | if (PREDICT_FALSE(leader_term < last_logged_term)) { |
951 | 0 | Status s = STATUS(InvalidArgument, |
952 | 0 | Substitute("Leader has replica of tablet $0 with term $1 lower than last " |
953 | 0 | "logged term $2 on local replica. Rejecting remote bootstrap request", |
954 | 0 | tablet_id, leader_term, last_logged_term)); |
955 | 0 | LOG(WARNING) << LogPrefix(tablet_id, uuid) << "Remote bootstrap: " << s; |
956 | 0 | return s; |
957 | 0 | } |
958 | 102 | return Status::OK(); |
959 | 102 | } |
960 | | |
961 | | Status HandleReplacingStaleTablet( |
962 | | RaftGroupMetadataPtr meta, |
963 | | TabletPeerPtr old_tablet_peer, |
964 | | const string& tablet_id, |
965 | | const string& uuid, |
966 | 102 | const int64_t& leader_term) { |
967 | 102 | TabletDataState data_state = meta->tablet_data_state(); |
968 | 102 | switch (data_state) { |
969 | 0 | case TABLET_DATA_COPYING: { |
970 | | // This should not be possible due to the transition_in_progress_ "lock". |
971 | 0 | LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: " |
972 | 0 | << "Found tablet in TABLET_DATA_COPYING state during StartRemoteBootstrap()"; |
973 | 0 | } |
974 | 102 | case TABLET_DATA_TOMBSTONED: { |
975 | 102 | RETURN_NOT_OK(old_tablet_peer->CheckShutdownOrNotStarted()); |
976 | 102 | int64_t last_logged_term = meta->tombstone_last_logged_opid().term; |
977 | 102 | RETURN_NOT_OK(CheckLeaderTermNotLower(tablet_id, |
978 | 102 | uuid, |
979 | 102 | leader_term, |
980 | 102 | last_logged_term)); |
981 | 102 | break; |
982 | 102 | } |
983 | 0 | case TABLET_DATA_SPLIT_COMPLETED: |
984 | 0 | case TABLET_DATA_READY: { |
985 | 0 | if (tablet_id == master::kSysCatalogTabletId) { |
986 | 0 | LOG(FATAL) << LogPrefix(tablet_id, uuid) << " Remote bootstrap: " |
987 | 0 | << "Found tablet in " << TabletDataState_Name(data_state) |
988 | 0 | << " state during StartRemoteBootstrap()"; |
989 | 0 | } |
990 | | // There's a valid race that can lead us here: |
991 | | // 1. Leader sends a second remote bootstrap request as a result of receiving a |
992 | | // TABLET_NOT_FOUND from this tserver while it was in the middle of a remote bootstrap. |
993 | | // 2. The remote bootstrap request arrives after the first one is finished, and it is able to |
994 | | // grab the mutex. |
995 | | // 3. This tserver finds that it already has the metadata for the tablet, and determines that |
996 | | // it needs to replace the tablet setting replacing_tablet to true. |
997 | | // In this case, the master can simply ignore this error. |
998 | 0 | return STATUS_FORMAT( |
999 | 0 | IllegalState, "Tablet $0 in $1 state", tablet_id, TabletDataState_Name(data_state)); |
1000 | 0 | } |
1001 | 0 | default: { |
1002 | 0 | return STATUS(IllegalState, |
1003 | 102 | Substitute("Found tablet $0 in unexpected state $1 for remote bootstrap.", |
1004 | 102 | tablet_id, TabletDataState_Name(data_state))); |
1005 | 102 | } |
1006 | 102 | } |
1007 | | |
1008 | 102 | return Status::OK(); |
1009 | 102 | } |
1010 | | |
1011 | 4.70k | Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { |
1012 | | // To prevent racing against Shutdown, we increment this as soon as we start. This should be done |
1013 | | // before checking for ClosingUnlocked, as on shutdown, we proceed in reverse: |
1014 | | // - first mark as closing |
1015 | | // - then wait for num_tablets_being_remote_bootstrapped_ == 0 |
1016 | 4.70k | ++num_tablets_being_remote_bootstrapped_; |
1017 | 4.70k | auto private_addr = req.source_private_addr()[0].host(); |
1018 | 4.69k | auto decrement_num_rbs_se = ScopeExit([this, &private_addr](){ |
1019 | 4.69k | { |
1020 | 4.69k | std::lock_guard<RWMutex> lock(mutex_); |
1021 | 4.69k | auto iter = bootstrap_source_addresses_.find(private_addr); |
1022 | 4.69k | if (iter != bootstrap_source_addresses_.end()) { |
1023 | 4.51k | bootstrap_source_addresses_.erase(iter); |
1024 | 4.51k | } |
1025 | 4.69k | } |
1026 | 4.69k | --num_tablets_being_remote_bootstrapped_; |
1027 | 4.69k | }); |
1028 | | |
1029 | 4.70k | LongOperationTracker tracker("StartRemoteBootstrap", 5s); |
1030 | | |
1031 | 4.70k | const string& tablet_id = req.tablet_id(); |
1032 | 4.70k | const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid(); |
1033 | 4.70k | HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort( |
1034 | 4.70k | req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(), |
1035 | 4.70k | server_->MakeCloudInfoPB())); |
1036 | 4.70k | int64_t leader_term = req.caller_term(); |
1037 | | |
1038 | 4.70k | const string kLogPrefix = TabletLogPrefix(tablet_id); |
1039 | | |
1040 | 4.70k | TabletPeerPtr old_tablet_peer; |
1041 | 4.70k | RaftGroupMetadataPtr meta; |
1042 | 4.70k | bool replacing_tablet = false; |
1043 | 4.70k | scoped_refptr<TransitionInProgressDeleter> deleter; |
1044 | 4.70k | { |
1045 | 4.70k | std::lock_guard<RWMutex> lock(mutex_); |
1046 | 4.70k | bootstrap_source_addresses_.emplace(private_addr); |
1047 | 4.70k | if (ClosingUnlocked()) { |
1048 | 0 | auto result = STATUS_FORMAT( |
1049 | 0 | IllegalState, "StartRemoteBootstrap in wrong state: $0", |
1050 | 0 | TSTabletManagerStatePB_Name(state_)); |
1051 | 0 | LOG(WARNING) << kLogPrefix << result; |
1052 | 0 | return result; |
1053 | 0 | } |
1054 | | |
1055 | 4.70k | if (LookupTabletUnlocked(tablet_id, &old_tablet_peer)) { |
1056 | 3.40k | meta = old_tablet_peer->tablet_metadata(); |
1057 | 3.40k | replacing_tablet = true; |
1058 | 3.40k | } |
1059 | 4.70k | RETURN_NOT_OK(StartTabletStateTransition( |
1060 | 4.70k | tablet_id, Substitute("remote bootstrapping tablet from peer $0", bootstrap_peer_uuid), |
1061 | 4.70k | &deleter)); |
1062 | 4.70k | } |
1063 | | |
1064 | 1.40k | if (replacing_tablet) { |
1065 | | // Make sure the existing tablet peer is shut down and tombstoned. |
1066 | 102 | RETURN_NOT_OK(HandleReplacingStaleTablet(meta, |
1067 | 102 | old_tablet_peer, |
1068 | 102 | tablet_id, |
1069 | 102 | fs_manager_->uuid(), |
1070 | 102 | leader_term)); |
1071 | 102 | } |
1072 | | |
1073 | 1.40k | string init_msg = kLogPrefix + Substitute("Initiating remote bootstrap from Peer $0 ($1)", |
1074 | 1.40k | bootstrap_peer_uuid, bootstrap_peer_addr.ToString()); |
1075 | 1.40k | LOG(INFO) << init_msg; |
1076 | 1.40k | TRACE(init_msg); |
1077 | | |
1078 | 1.40k | auto rb_client = std::make_unique<RemoteBootstrapClient>(tablet_id, fs_manager_); |
1079 | | |
1080 | | // Download and persist the remote superblock in TABLET_DATA_COPYING state. |
1081 | 1.40k | if (replacing_tablet) { |
1082 | 102 | RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term)); |
1083 | 102 | } |
1084 | 1.40k | RETURN_NOT_OK(rb_client->Start(bootstrap_peer_uuid, |
1085 | 1.40k | &server_->proxy_cache(), |
1086 | 1.40k | bootstrap_peer_addr, |
1087 | 1.40k | &meta, |
1088 | 1.40k | this)); |
1089 | | |
1090 | | // From this point onward, the superblock is persisted in TABLET_DATA_COPYING |
1091 | | // state, and we need to tombstone the tablet if additional steps prior to |
1092 | | // getting to a TABLET_DATA_READY state fail. |
1093 | | |
1094 | 960 | if (PREDICT_FALSE(FLAGS_TEST_simulate_already_present_in_remote_bootstrap)) { |
1095 | 2 | LOG_WITH_PREFIX(INFO) |
1096 | 2 | << "Simulating AlreadyPresent error in TSTabletManager::StartRemoteBootstrap."; |
1097 | 2 | return STATUS(AlreadyPresent, "failed"); |
1098 | 2 | } |
1099 | | |
1100 | | // Registering a non-initialized TabletPeer offers visibility through the Web UI. |
1101 | 958 | TabletPeerPtr tablet_peer = VERIFY_RESULT( |
1102 | 958 | CreateAndRegisterTabletPeer(meta, replacing_tablet ? REPLACEMENT_PEER : NEW_PEER)); |
1103 | 958 | MarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(), |
1104 | 958 | tablet_peer->tablet_metadata()->table_id()); |
1105 | | |
1106 | | // TODO: If we ever make this method asynchronous, we need to move this code somewhere else. |
1107 | 943 | auto se = ScopeExit([this, tablet_peer] { |
1108 | 943 | UnmarkTabletBeingRemoteBootstrapped(tablet_peer->tablet_id(), |
1109 | 943 | tablet_peer->tablet_metadata()->table_id()); |
1110 | 943 | }); |
1111 | | |
1112 | | // Download all of the remote files. |
1113 | 958 | TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer->status_listener()), |
1114 | 958 | meta, |
1115 | 958 | fs_manager_->uuid(), |
1116 | 958 | "Remote bootstrap: Unable to fetch data from remote peer " + |
1117 | 958 | bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")", |
1118 | 958 | this); |
1119 | | |
1120 | 956 | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_rb_files_fetched); |
1121 | | |
1122 | | // Write out the last files to make the new replica visible and update the |
1123 | | // TabletDataState in the superblock to TABLET_DATA_READY. |
1124 | | // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a |
1125 | | // ChangeConfig request (to change this server's role from PRE_VOTER or PRE_OBSERVER to VOTER or |
1126 | | // OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader could |
1127 | | // have successfully submitted the ChangeConfig request and failed to respond in time) |
1128 | | // and check the committed config until we find that this server's role has changed, or until we |
1129 | | // time out which will cause us to tombstone the tablet. |
1130 | 956 | TOMBSTONE_NOT_OK(rb_client->Finish(), |
1131 | 956 | meta, |
1132 | 956 | fs_manager_->uuid(), |
1133 | 956 | "Remote bootstrap: Failed calling Finish()", |
1134 | 956 | this); |
1135 | | |
1136 | 956 | LOG(INFO) << kLogPrefix << "Remote bootstrap: Opening tablet"; |
1137 | | |
1138 | | // TODO(hector): ENG-3173: We need to simulate a failure in OpenTablet during remote bootstrap |
1139 | | // and verify that this tablet server gets remote bootstrapped again by the leader. We also need |
1140 | | // to check what happens when this server receives raft consensus requests since at this point, |
1141 | | // this tablet server could be a voter (if the ChangeRole request in Finish succeeded and its |
1142 | | // initial role was PRE_VOTER). |
1143 | 956 | OpenTablet(meta, nullptr); |
1144 | | // If OpenTablet fails, tablet_peer->error() will be set. |
1145 | 956 | RETURN_NOT_OK(ShutdownAndTombstoneTabletPeerNotOk( |
1146 | 956 | tablet_peer->error(), tablet_peer, meta, fs_manager_->uuid(), |
1147 | 956 | "Remote bootstrap: OpenTablet() failed", this)); |
1148 | | |
1149 | 956 | auto status = rb_client->VerifyChangeRoleSucceeded(tablet_peer->shared_consensus()); |
1150 | 956 | if (!status.ok()) { |
1151 | | // If for some reason this tserver wasn't promoted (e.g. from PRE_VOTER to VOTER), the leader
1152 | | // will find out and do the CHANGE_CONFIG. |
1153 | 3 | LOG(WARNING) << kLogPrefix << "Remote bootstrap finished. " |
1154 | 3 | << "Failure calling VerifyChangeRoleSucceeded: " |
1155 | 3 | << status.ToString(); |
1156 | 953 | } else { |
1157 | 953 | LOG(INFO) << kLogPrefix << "Remote bootstrap for tablet ended successfully"; |
1158 | 953 | } |
1159 | | |
1160 | 956 | WARN_NOT_OK(rb_client->Remove(), "Remove remote bootstrap sessions failed"); |
1161 | | |
1162 | 956 | return Status::OK(); |
1163 | 956 | } |
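     | |
     | | // The TOMBSTONE_NOT_OK calls above follow a cleanup-then-propagate shape:
     | | // evaluate a Status-returning expression and, on failure, run a cleanup
     | | // action (here, tombstoning) before returning the error. A minimal
     | | // self-contained sketch of that shape; SketchStatus and the macro name are
     | | // illustrative stand-ins, not the real yb definitions.
     | |
     | | #include <string>
     | |
     | | struct SketchStatus {  // Stand-in for yb::Status.
     | |   bool ok() const { return msg.empty(); }
     | |   std::string msg;
     | | };
     | |
     | | #define RETURN_AND_CLEANUP_NOT_OK(expr, cleanup)  \
     | |   do {                                            \
     | |     SketchStatus _s = (expr);                     \
     | |     if (!_s.ok()) {                               \
     | |       cleanup; /* e.g. tombstone the tablet */    \
     | |       return _s;                                  \
     | |     }                                             \
     | |   } while (false)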
1164 | | |
1165 | | // Create and register a new TabletPeer, given tablet metadata. |
1166 | | Result<TabletPeerPtr> TSTabletManager::CreateAndRegisterTabletPeer( |
1167 | 83.4k | const RaftGroupMetadataPtr& meta, RegisterTabletPeerMode mode) { |
1168 | 83.4k | TabletPeerPtr tablet_peer(new tablet::TabletPeer( |
1169 | 83.4k | meta, |
1170 | 83.4k | local_peer_pb_, |
1171 | 83.4k | scoped_refptr<server::Clock>(server_->clock()), |
1172 | 83.4k | fs_manager_->uuid(), |
1173 | 83.4k | Bind(&TSTabletManager::ApplyChange, Unretained(this), meta->raft_group_id()), |
1174 | 83.4k | metric_registry_, |
1175 | 83.4k | this, |
1176 | 83.4k | async_client_init_->get_client_future())); |
1177 | 83.4k | RETURN_NOT_OK(RegisterTablet(meta->raft_group_id(), tablet_peer, mode)); |
1178 | 83.4k | return tablet_peer; |
1179 | 83.4k | } |
1180 | | |
1181 | | Status TSTabletManager::DeleteTablet( |
1182 | | const string& tablet_id, |
1183 | | TabletDataState delete_type, |
1184 | | const boost::optional<int64_t>& cas_config_opid_index_less_or_equal, |
1185 | | bool hide_only, |
1186 | 49.3k | boost::optional<TabletServerErrorPB::Code>* error_code) { |
1187 | | |
1188 | 49.3k | if (delete_type != TABLET_DATA_DELETED && delete_type != TABLET_DATA_TOMBSTONED) { |
1189 | 0 | return STATUS(InvalidArgument, "DeleteTablet() requires an argument that is one of " |
1190 | 0 | "TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED", |
1191 | 0 | Substitute("Given: $0 ($1)", |
1192 | 0 | TabletDataState_Name(delete_type), delete_type)); |
1193 | 0 | } |
1194 | | |
1195 | 49.3k | TRACE("Deleting tablet $0", tablet_id); |
1196 | | |
1197 | 49.3k | TabletPeerPtr tablet_peer; |
1198 | 49.3k | scoped_refptr<TransitionInProgressDeleter> deleter; |
1199 | 49.3k | { |
1200 | | // Acquire the lock in exclusive mode as we'll add an entry to the
1201 | | // transition_in_progress_ map. |
1202 | 49.3k | std::lock_guard<RWMutex> lock(mutex_); |
1203 | 49.3k | TRACE("Acquired tablet manager lock"); |
1204 | 49.3k | RETURN_NOT_OK(CheckRunningUnlocked(error_code)); |
1205 | | |
1206 | 49.3k | if (!LookupTabletUnlocked(tablet_id, &tablet_peer)) { |
1207 | 29 | *error_code = TabletServerErrorPB::TABLET_NOT_FOUND; |
1208 | 29 | return STATUS(NotFound, "Tablet not found", tablet_id); |
1209 | 29 | } |
1210 | | // Sanity check that the tablet's deletion isn't already in progress.
1211 | 49.3k | Status s = StartTabletStateTransition(tablet_id, "deleting tablet", &deleter); |
1212 | 49.3k | if (PREDICT_FALSE(!s.ok())) { |
1213 | 1.59k | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1214 | 1.59k | return s; |
1215 | 1.59k | } |
1216 | 47.7k | } |
1217 | | |
1218 | | // If the tablet is already deleted, the CAS check isn't possible because |
1219 | | // consensus and therefore the log is not available. |
1220 | 47.7k | TabletDataState data_state = tablet_peer->tablet_metadata()->tablet_data_state(); |
1221 | 47.7k | bool tablet_deleted = (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED); |
1222 | | |
1223 | | // If a tablet peer is in the FAILED state, then we need to be able to tombstone or delete this |
1224 | | // tablet. If the tablet is tombstoned, then this TS can be remote bootstrapped with the same |
1225 | | // tablet. |
1226 | 47.7k | bool tablet_failed = tablet_peer->state() == RaftGroupStatePB::FAILED; |
1227 | | |
1228 | | // The caller specified an "atomic" delete. Check the committed config's opid_index.
1229 | | // TODO: There's actually a race here between the check and shutdown, but |
1230 | | // it's tricky to fix. We could try checking again after the shutdown and |
1231 | | // restarting the tablet if the local replica committed a higher config |
1232 | | // change op during that time, or potentially something else more invasive. |
1233 | 47.7k | if (cas_config_opid_index_less_or_equal && !tablet_deleted && !tablet_failed) { |
1234 | 839 | shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus(); |
1235 | 839 | if (!consensus) { |
1236 | 0 | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1237 | 0 | return STATUS(IllegalState, "Consensus not available. Tablet shutting down"); |
1238 | 0 | } |
1239 | 839 | RaftConfigPB committed_config = consensus->CommittedConfig(); |
1240 | 839 | if (committed_config.opid_index() > *cas_config_opid_index_less_or_equal) { |
1241 | 146 | *error_code = TabletServerErrorPB::CAS_FAILED; |
1242 | 146 | return STATUS(IllegalState, Substitute("Request specified cas_config_opid_index_less_or_equal" |
1243 | 146 | " of $0 but the committed config has opid_index of $1", |
1244 | 146 | *cas_config_opid_index_less_or_equal, |
1245 | 146 | committed_config.opid_index())); |
1246 | 146 | } |
1247 | 47.6k | } |
1248 | | |
1249 | 47.6k | RaftGroupMetadataPtr meta = tablet_peer->tablet_metadata(); |
1250 | 47.6k | if (hide_only) { |
1251 | 0 | meta->SetHidden(true); |
1252 | 0 | return meta->Flush(); |
1253 | 0 | } |
1254 | | // Whether the tablet was deleted (drop table) or tombstoned (potentially moved to a
1255 | | // different TS), we no longer need to flush RocksDB, as this data is irrelevant.
1256 | | // |
1257 | | // Note: This might change for PITR. |
1258 | 47.6k | bool delete_data = delete_type == TABLET_DATA_DELETED || delete_type == TABLET_DATA_TOMBSTONED; |
1259 | 47.6k | RETURN_NOT_OK(tablet_peer->Shutdown(tablet::IsDropTable(delete_data))); |
1260 | | |
1261 | 47.6k | yb::OpId last_logged_opid = tablet_peer->GetLatestLogEntryOpId(); |
1262 | | |
1263 | 47.6k | Status s = DeleteTabletData(meta, |
1264 | 47.6k | delete_type, |
1265 | 47.6k | fs_manager_->uuid(), |
1266 | 47.6k | last_logged_opid, |
1267 | 47.6k | this); |
1268 | 47.6k | if (PREDICT_FALSE(!s.ok())) { |
1269 | 0 | s = s.CloneAndPrepend(Substitute("Unable to delete on-disk data from tablet $0", |
1270 | 0 | tablet_id)); |
1271 | 0 | LOG(WARNING) << s.ToString(); |
1272 | 0 | tablet_peer->SetFailed(s); |
1273 | 0 | return s; |
1274 | 0 | } |
1275 | | |
1276 | 47.6k | tablet_peer->status_listener()->StatusMessage("Deleted tablet blocks from disk"); |
1277 | | |
1278 | | // We only remove DELETED tablets from the tablet map. |
1279 | 47.6k | if (delete_type == TABLET_DATA_DELETED) { |
1280 | 46.8k | std::lock_guard<RWMutex> lock(mutex_); |
1281 | 46.8k | RETURN_NOT_OK(CheckRunningUnlocked(error_code)); |
1282 | 0 | CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id; |
1283 | 46.8k | dirty_tablets_.erase(tablet_id); |
1284 | 46.8k | } |
1285 | | |
1286 | | // We unregister TOMBSTONED tablets in addition to DELETED tablets because they do not have |
1287 | | // any more data on disk, so we shouldn't count these tablets when load balancing the disks. |
1288 | 47.6k | UnregisterDataWalDir(meta->table_id(), |
1289 | 47.6k | tablet_id, |
1290 | 47.6k | meta->data_root_dir(), |
1291 | 47.6k | meta->wal_root_dir()); |
1292 | | |
1293 | 47.6k | return Status::OK(); |
1294 | 47.6k | } |
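     | |
     | | // The "atomic" delete above is a compare-and-swap on the committed Raft
     | | // config's opid_index: the delete proceeds only if no newer config change
     | | // was committed after the caller observed the index. A minimal sketch of
     | | // that predicate (hypothetical helper, not part of this file):
     | |
     | | #include <cstdint>
     | | #include <optional>
     | |
     | | bool CasDeleteAllowed(int64_t committed_opid_index,
     | |                       std::optional<int64_t> cas_less_or_equal) {
     | |   if (!cas_less_or_equal) return true;  // No CAS requested: unconditional.
     | |   // Reject (the CAS_FAILED case above) if the committed config is newer.
     | |   return committed_opid_index <= *cas_less_or_equal;
     | | }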
1295 | | |
1296 | | Status TSTabletManager::CheckRunningUnlocked( |
1297 | 96.2k | boost::optional<TabletServerErrorPB::Code>* error_code) const { |
1298 | 96.2k | if (state_ == MANAGER_RUNNING) { |
1299 | 96.2k | return Status::OK(); |
1300 | 96.2k | } |
1301 | 0 | *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; |
1302 | 0 | return STATUS(ServiceUnavailable, Substitute("Tablet Manager is not running: $0", |
1303 | 0 | TSTabletManagerStatePB_Name(state_))); |
1304 | 0 | } |
1305 | | |
1306 | | // NO_THREAD_SAFETY_ANALYSIS because this analysis does not work with unique_lock. |
1307 | | Status TSTabletManager::StartTabletStateTransition( |
1308 | | const string& tablet_id, |
1309 | | const string& reason, |
1310 | 136k | scoped_refptr<TransitionInProgressDeleter>* deleter) NO_THREAD_SAFETY_ANALYSIS { |
1311 | 136k | std::unique_lock<std::mutex> lock(transition_in_progress_mutex_); |
1312 | 136k | const auto emplace_result = transition_in_progress_.emplace(tablet_id, reason); |
1313 | 136k | if (!emplace_result.second) { |
1314 | 4.89k | return STATUS_FORMAT( |
1315 | 4.89k | AlreadyPresent, "State transition of tablet $0 already in progress: $1", tablet_id, |
1316 | 4.89k | *emplace_result.first); |
1317 | 4.89k | } |
1318 | 131k | deleter->reset(new TransitionInProgressDeleter( |
1319 | 131k | &transition_in_progress_, &transition_in_progress_mutex_, tablet_id)); |
1320 | 131k | return Status::OK(); |
1321 | 131k | } |
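     | |
     | | // The deleter returned above is an RAII guard: the tablet id is registered
     | | // on creation and erased again when the guard is destroyed, so a second
     | | // transition on the same tablet is rejected while the first is running. A
     | | // simplified self-contained sketch of the pattern (stand-in types):
     | |
     | | #include <mutex>
     | | #include <set>
     | | #include <string>
     | |
     | | class TransitionGuard {
     | |  public:
     | |   TransitionGuard(std::set<std::string>* in_progress, std::mutex* mu,
     | |                   std::string id)
     | |       : in_progress_(in_progress), mu_(mu), id_(std::move(id)) {
     | |     std::lock_guard<std::mutex> l(*mu_);
     | |     ok_ = in_progress_->insert(id_).second;  // False: already in progress.
     | |   }
     | |   ~TransitionGuard() {
     | |     if (ok_) {
     | |       std::lock_guard<std::mutex> l(*mu_);
     | |       in_progress_->erase(id_);  // Transition done; allow the next one.
     | |     }
     | |   }
     | |   bool ok() const { return ok_; }
     | |
     | |  private:
     | |   std::set<std::string>* in_progress_;
     | |   std::mutex* mu_;
     | |   std::string id_;
     | |   bool ok_ = false;
     | | };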
1322 | | |
1323 | 68 | bool TSTabletManager::IsTabletInTransition(const TabletId& tablet_id) const { |
1324 | 68 | std::unique_lock<std::mutex> lock(transition_in_progress_mutex_); |
1325 | 68 | return ContainsKey(transition_in_progress_, tablet_id); |
1326 | 68 | } |
1327 | | |
1328 | | Status TSTabletManager::OpenTabletMeta(const string& tablet_id, |
1329 | 228 | RaftGroupMetadataPtr* metadata) { |
1330 | 228 | LOG(INFO) << "Loading metadata for tablet " << tablet_id; |
1331 | 228 | TRACE("Loading metadata..."); |
1332 | 228 | auto load_result = RaftGroupMetadata::Load(fs_manager_, tablet_id); |
1333 | 228 | RETURN_NOT_OK_PREPEND(load_result, |
1334 | 228 | Format("Failed to load tablet metadata for tablet id $0", tablet_id)); |
1335 | 228 | TRACE("Metadata loaded"); |
1336 | 228 | metadata->swap(*load_result); |
1337 | 228 | return Status::OK(); |
1338 | 228 | } |
1339 | | |
1340 | | void TSTabletManager::OpenTablet(const RaftGroupMetadataPtr& meta, |
1341 | 83.4k | const scoped_refptr<TransitionInProgressDeleter>& deleter) { |
1342 | 83.4k | string tablet_id = meta->raft_group_id(); |
1343 | 83.4k | TRACE_EVENT1("tserver", "TSTabletManager::OpenTablet", |
1344 | 83.4k | "tablet_id", tablet_id); |
1345 | | |
1346 | 83.4k | TabletPeerPtr tablet_peer; |
1347 | 18.4E | CHECK(LookupTablet(tablet_id, &tablet_peer)) |
1348 | 18.4E | << "Tablet not registered prior to OpenTabletAsync call: " << tablet_id; |
1349 | | |
1350 | 83.4k | tablet::TabletPtr tablet; |
1351 | 83.4k | scoped_refptr<Log> log; |
1352 | 83.4k | const string kLogPrefix = TabletLogPrefix(tablet_id); |
1353 | | |
1354 | 83.4k | LOG(INFO) << kLogPrefix << "Bootstrapping tablet"; |
1355 | 83.4k | TRACE("Bootstrapping tablet"); |
1356 | | |
1357 | 83.4k | consensus::ConsensusBootstrapInfo bootstrap_info; |
1358 | 83.4k | consensus::RetryableRequests retryable_requests(kLogPrefix); |
1359 | 83.4k | yb::OpId split_op_id; |
1360 | | |
1361 | 83.4k | LOG_TIMING_PREFIX(INFO, kLogPrefix, "bootstrapping tablet") { |
1362 | | // Read flag before CAS to avoid TSAN race conflict with GetAllFlags. |
1363 | 83.4k | if (GetAtomicFlag(&FLAGS_TEST_force_single_tablet_failure) && |
1364 | 3 | CompareAndSetFlag(&FLAGS_TEST_force_single_tablet_failure, |
1365 | 3 | true /* expected */, false /* val */)) { |
1366 | 3 | LOG(ERROR) << "Setting the state of a tablet to FAILED"; |
1367 | 3 | tablet_peer->SetFailed(STATUS(InternalError, "Setting tablet to failed state for test", |
1368 | 3 | tablet_id)); |
1369 | 3 | return; |
1370 | 3 | } |
1371 | | |
1372 | | // TODO: handle crash mid-creation of tablet? do we ever end up with a |
1373 | | // partially created tablet here? |
1374 | 83.4k | auto s = tablet_peer->SetBootstrapping(); |
1375 | 83.4k | if (!s.ok()) { |
1376 | 0 | LOG(ERROR) << kLogPrefix << "Tablet failed to set bootstrapping: " << s; |
1377 | 0 | tablet_peer->SetFailed(s); |
1378 | 0 | return; |
1379 | 0 | } |
1380 | | |
1381 | 83.4k | tablet::TabletInitData tablet_init_data = { |
1382 | 83.4k | .metadata = meta, |
1383 | 83.4k | .client_future = async_client_init_->get_client_future(), |
1384 | 83.4k | .clock = scoped_refptr<server::Clock>(server_->clock()), |
1385 | 83.4k | .parent_mem_tracker = MemTracker::FindOrCreateTracker("Tablets", server_->mem_tracker()), |
1386 | 83.4k | .block_based_table_mem_tracker = mem_manager_->block_based_table_mem_tracker(), |
1387 | 83.4k | .metric_registry = metric_registry_, |
1388 | 83.4k | .log_anchor_registry = tablet_peer->log_anchor_registry(), |
1389 | 83.4k | .tablet_options = tablet_options_, |
1390 | 83.4k | .log_prefix_suffix = " P " + tablet_peer->permanent_uuid(), |
1391 | 83.4k | .transaction_participant_context = tablet_peer.get(), |
1392 | 83.4k | .local_tablet_filter = std::bind(&TSTabletManager::PreserveLocalLeadersOnly, this, _1), |
1393 | 83.4k | .transaction_coordinator_context = tablet_peer.get(), |
1394 | 83.4k | .txns_enabled = tablet::TransactionsEnabled::kTrue, |
1395 | | // We are assuming we're never dealing with the system catalog tablet in TSTabletManager. |
1396 | 83.4k | .is_sys_catalog = tablet::IsSysCatalogTablet::kFalse, |
1397 | 83.4k | .snapshot_coordinator = nullptr, |
1398 | 83.4k | .tablet_splitter = this, |
1399 | 83.4k | .allowed_history_cutoff_provider = std::bind( |
1400 | 83.4k | &TSTabletManager::AllowedHistoryCutoff, this, _1), |
1401 | 83.4k | }; |
1402 | 83.4k | tablet::BootstrapTabletData data = { |
1403 | 83.4k | .tablet_init_data = tablet_init_data, |
1404 | 83.4k | .listener = tablet_peer->status_listener(), |
1405 | 83.4k | .append_pool = append_pool(), |
1406 | 83.4k | .allocation_pool = allocation_pool_.get(), |
1407 | 83.4k | .retryable_requests = &retryable_requests, |
1408 | 83.4k | }; |
1409 | 83.4k | s = BootstrapTablet(data, &tablet, &log, &bootstrap_info); |
1410 | 83.4k | if (!s.ok()) { |
1411 | 1 | LOG(ERROR) << kLogPrefix << "Tablet failed to bootstrap: " << s; |
1412 | 1 | tablet_peer->SetFailed(s); |
1413 | 1 | return; |
1414 | 1 | } |
1415 | 83.4k | } |
1416 | | |
1417 | 83.4k | MonoTime start(MonoTime::Now()); |
1418 | 83.4k | LOG_TIMING_PREFIX(INFO, kLogPrefix, "starting tablet") { |
1419 | 83.4k | TRACE("Initializing tablet peer"); |
1420 | 83.4k | auto s = tablet_peer->InitTabletPeer( |
1421 | 83.4k | tablet, |
1422 | 83.4k | server_->mem_tracker(), |
1423 | 83.4k | server_->messenger(), |
1424 | 83.4k | &server_->proxy_cache(), |
1425 | 83.4k | log, |
1426 | 83.4k | tablet->GetTableMetricsEntity(), |
1427 | 83.4k | tablet->GetTabletMetricsEntity(), |
1428 | 83.4k | raft_pool(), |
1429 | 83.4k | tablet_prepare_pool(), |
1430 | 83.4k | &retryable_requests, |
1431 | 83.4k | multi_raft_manager_.get()); |
1432 | | |
1433 | 83.4k | if (!s.ok()) { |
1434 | 1 | LOG(ERROR) << kLogPrefix << "Tablet failed to init: " |
1435 | 1 | << s.ToString(); |
1436 | 1 | tablet_peer->SetFailed(s); |
1437 | 1 | return; |
1438 | 1 | } |
1439 | | |
1440 | 83.4k | TRACE("Starting tablet peer"); |
1441 | 83.4k | s = tablet_peer->Start(bootstrap_info); |
1442 | 83.4k | if (!s.ok()) { |
1443 | 2 | LOG(ERROR) << kLogPrefix << "Tablet failed to start: " |
1444 | 2 | << s.ToString(); |
1445 | 2 | tablet_peer->SetFailed(s); |
1446 | 2 | return; |
1447 | 2 | } |
1448 | | |
1449 | 83.4k | tablet_peer->RegisterMaintenanceOps(server_->maintenance_manager()); |
1450 | 83.4k | } |
1451 | | |
1452 | 83.4k | auto elapsed_ms = MonoTime::Now().GetDeltaSince(start).ToMilliseconds(); |
1453 | 83.4k | if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) { |
1454 | 11 | LOG(WARNING) << kLogPrefix << "Tablet startup took " << elapsed_ms << "ms"; |
1455 | 11 | if (Trace::CurrentTrace()) { |
1456 | 0 | LOG(WARNING) << kLogPrefix << "Trace:" << std::endl |
1457 | 0 | << Trace::CurrentTrace()->DumpToString(true); |
1458 | 0 | } |
1459 | 11 | } |
1460 | | |
1461 | 83.4k | if (PREDICT_TRUE(!FLAGS_TEST_skip_post_split_compaction)) { |
1462 | 83.3k | WARN_NOT_OK( |
1463 | 83.3k | tablet->TriggerPostSplitCompactionIfNeeded([&]() { |
1464 | 83.3k | return post_split_trigger_compaction_pool_->NewToken(ThreadPool::ExecutionMode::SERIAL); |
1465 | 83.3k | }), |
1466 | 83.3k | "Failed to submit compaction for post-split tablet."); |
1467 | 30 | } else { |
1468 | 30 | LOG(INFO) << "Skipping post split compaction " << meta->raft_group_id(); |
1469 | 30 | } |
1470 | | |
1471 | 83.4k | if (tablet->ShouldDisableLbMove()) { |
1472 | 84 | std::lock_guard<RWMutex> lock(mutex_); |
1473 | 84 | tablets_blocked_from_lb_.insert(tablet->tablet_id()); |
1474 | 0 | VLOG(2) << TabletLogPrefix(tablet->tablet_id()) |
1475 | 0 | << " marking as maybe being compacted after split."; |
1476 | 84 | } |
1477 | 83.4k | } |
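     | |
     | | // OpenTablet's error handling reduces to: run the phases (bootstrap, init,
     | | // start) in order, and on the first failure mark the peer FAILED and stop,
     | | // with no rollback; the peer stays registered and reports its error. A
     | | // condensed model under stand-in types (not the real yb classes):
     | |
     | | #include <functional>
     | | #include <string>
     | | #include <vector>
     | |
     | | struct PhaseStatus {  // Stand-in for yb::Status.
     | |   bool ok;
     | |   std::string msg;
     | | };
     | |
     | | void RunPhases(const std::vector<std::function<PhaseStatus()>>& phases,
     | |                const std::function<void(const std::string&)>& mark_failed) {
     | |   for (const auto& phase : phases) {
     | |     PhaseStatus s = phase();
     | |     if (!s.ok) {
     | |       mark_failed(s.msg);  // Peer becomes FAILED; no partial rollback.
     | |       return;
     | |     }
     | |   }
     | | }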
1478 | | |
1479 | 0 | Status TSTabletManager::TriggerCompactionAndWait(const TabletPtrs& tablets) { |
1480 | 0 | CountDownLatch latch(tablets.size()); |
1481 | 0 | auto token = admin_triggered_compaction_pool_->NewToken(ThreadPool::ExecutionMode::CONCURRENT); |
1482 | 0 | for (auto tablet : tablets) { |
1483 | 0 | RETURN_NOT_OK(token->SubmitFunc([&latch, tablet]() { |
1484 | 0 | WARN_NOT_OK(tablet->ForceFullRocksDBCompact(), "Failed to submit compaction for tablet."); |
1485 | 0 | latch.CountDown(); |
1486 | 0 | })); |
1487 | 0 | } |
1488 | 0 | latch.Wait(); |
1489 | 0 | return Status::OK(); |
1490 | 0 | } |
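     | |
     | | // The function above is fan-out/fan-in: submit one compaction per tablet,
     | | // then block on a latch until all of them complete. A portable sketch of
     | | // the same shape, using plain std::thread in place of the pool token and
     | | // CountDownLatch:
     | |
     | | #include <functional>
     | | #include <thread>
     | | #include <vector>
     | |
     | | void RunAllAndWait(const std::vector<std::function<void()>>& tasks) {
     | |   std::vector<std::thread> workers;
     | |   workers.reserve(tasks.size());
     | |   for (const auto& task : tasks) workers.emplace_back(task);
     | |   for (auto& w : workers) w.join();  // Equivalent of latch.Wait().
     | | }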
1491 | | |
1492 | 76 | void TSTabletManager::StartShutdown() { |
1493 | 76 | { |
1494 | 76 | std::lock_guard<RWMutex> lock(mutex_); |
1495 | 76 | switch (state_) { |
1496 | 0 | case MANAGER_QUIESCING: { |
1497 | 0 | VLOG(1) << "Tablet manager shut down already in progress.."; |
1498 | 0 | return; |
1499 | 0 | } |
1500 | 0 | case MANAGER_SHUTDOWN: { |
1501 | 0 | VLOG(1) << "Tablet manager has already been shut down."; |
1502 | 0 | return; |
1503 | 0 | } |
1504 | 0 | case MANAGER_INITIALIZING: |
1505 | 76 | case MANAGER_RUNNING: { |
1506 | 76 | LOG_WITH_PREFIX(INFO) << "Shutting down tablet manager..."; |
1507 | 76 | state_ = MANAGER_QUIESCING; |
1508 | 76 | break; |
1509 | 0 | } |
1510 | 0 | default: { |
1511 | 0 | LOG(FATAL) << "Invalid state: " << TSTabletManagerStatePB_Name(state_); |
1512 | 0 | } |
1513 | 76 | } |
1514 | 76 | } |
1515 | | |
1516 | 76 | tablets_cleaner_->Shutdown(); |
1517 | | |
1518 | 76 | verify_tablet_data_poller_->Shutdown(); |
1519 | | |
1520 | 76 | metrics_cleaner_->Shutdown(); |
1521 | | |
1522 | 76 | async_client_init_->Shutdown(); |
1523 | | |
1524 | 76 | mem_manager_->Shutdown(); |
1525 | | |
1526 | | // Wait for all RBS operations to finish. |
1527 | 76 | const MonoDelta kSingleWait = 10ms; |
1528 | 76 | const MonoDelta kReportInterval = 5s; |
1529 | 76 | const MonoDelta kMaxWait = 30s; |
1530 | 76 | MonoDelta waited = MonoDelta::kZero; |
1531 | 76 | MonoDelta next_report_time = kReportInterval; |
1532 | 76 | while (int remaining_rbs = num_tablets_being_remote_bootstrapped_) {  // Non-zero: keep waiting.
1533 | 0 | if (waited >= next_report_time) { |
1534 | 0 | if (waited >= kMaxWait) { |
1535 | 0 | std::string addr = ""; |
1536 | 0 | for (auto iter = bootstrap_source_addresses_.begin(); |
1537 | 0 | iter != bootstrap_source_addresses_.end(); |
1538 | 0 | iter++) { |
1539 | 0 | if (iter == bootstrap_source_addresses_.begin()) { |
1540 | 0 | addr += *iter; |
1541 | 0 | } else { |
1542 | 0 | addr += "," + *iter; |
1543 | 0 | } |
1544 | 0 | } |
1545 | 0 | LOG_WITH_PREFIX(DFATAL) |
1546 | 0 | << "Waited for " << waited << "ms. Still had " |
1547 | 0 | << remaining_rbs << " pending remote bootstraps: " + addr; |
1548 | 0 | } else { |
1549 | 0 | LOG_WITH_PREFIX(WARNING) |
1550 | 0 | << "Still waiting for " << remaining_rbs |
1551 | 0 | << " ongoing RemoteBootstraps to finish after " << waited; |
1552 | 0 | } |
1553 | 0 | next_report_time = std::min(kMaxWait, waited + kReportInterval); |
1554 | 0 | } |
1555 | 0 | SleepFor(kSingleWait); |
1556 | 0 | waited += kSingleWait; |
1557 | 0 | } |
1558 | | |
1559 | | // Shut down the bootstrap pool so that no new tablets are registered after this point.
1560 | 76 | open_tablet_pool_->Shutdown(); |
1561 | | |
1562 | | // Take a snapshot of the peers list -- that way we don't have to hold |
1563 | | // on to the lock while shutting them down, which might cause a lock |
1564 | | // inversion. (see KUDU-308 for example). |
1565 | 92 | for (const TabletPeerPtr& peer : GetTabletPeers()) { |
1566 | 92 | if (peer->StartShutdown()) { |
1567 | 91 | shutting_down_peers_.push_back(peer); |
1568 | 91 | } |
1569 | 92 | } |
1570 | 76 | } |
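     | |
     | | // The RBS wait above is a bounded poll loop: check a counter, log progress
     | | // at a fixed interval, and give up loudly after a deadline. A standalone
     | | // sketch of the same loop with std::chrono in place of MonoDelta
     | | // (illustrative only):
     | |
     | | #include <algorithm>
     | | #include <atomic>
     | | #include <chrono>
     | | #include <iostream>
     | | #include <thread>
     | |
     | | void WaitForZero(const std::atomic<int>& counter) {
     | |   using std::chrono::milliseconds;
     | |   const milliseconds kPoll(10), kReport(5000), kMax(30000);
     | |   milliseconds waited(0), next_report = kReport;
     | |   while (counter.load() > 0) {
     | |     if (waited >= next_report) {
     | |       std::cerr << "Still waiting on " << counter.load() << " tasks after "
     | |                 << waited.count() << "ms\n";
     | |       if (waited >= kMax) break;  // Mirrors the DFATAL give-up above.
     | |       next_report = std::min(kMax, waited + kReport);
     | |     }
     | |     std::this_thread::sleep_for(kPoll);
     | |     waited += kPoll;
     | |   }
     | | }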
1571 | | |
1572 | 73 | void TSTabletManager::CompleteShutdown() { |
1573 | 91 | for (const TabletPeerPtr& peer : shutting_down_peers_) { |
1574 | 91 | peer->CompleteShutdown(); |
1575 | 91 | } |
1576 | | |
1577 | | // Shut down the apply pool. |
1578 | 73 | apply_pool_->Shutdown(); |
1579 | | |
1580 | 73 | if (raft_pool_) { |
1581 | 73 | raft_pool_->Shutdown(); |
1582 | 73 | } |
1583 | 73 | if (tablet_prepare_pool_) { |
1584 | 73 | tablet_prepare_pool_->Shutdown(); |
1585 | 73 | } |
1586 | 73 | if (append_pool_) { |
1587 | 73 | append_pool_->Shutdown(); |
1588 | 73 | } |
1589 | 73 | if (post_split_trigger_compaction_pool_) { |
1590 | 73 | post_split_trigger_compaction_pool_->Shutdown(); |
1591 | 73 | } |
1592 | 73 | if (admin_triggered_compaction_pool_) { |
1593 | 73 | admin_triggered_compaction_pool_->Shutdown(); |
1594 | 73 | } |
1595 | | |
1596 | 73 | { |
1597 | 73 | std::lock_guard<RWMutex> l(mutex_); |
1598 | 73 | tablet_map_.clear(); |
1599 | 73 | dirty_tablets_.clear(); |
1600 | | |
1601 | 73 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
1602 | 73 | table_data_assignment_map_.clear(); |
1603 | 73 | table_wal_assignment_map_.clear(); |
1604 | | |
1605 | 73 | state_ = MANAGER_SHUTDOWN; |
1606 | 73 | } |
1607 | 73 | } |
1608 | | |
1609 | 106k | std::string TSTabletManager::LogPrefix() const { |
1610 | 106k | return "P " + fs_manager_->uuid() + ": "; |
1611 | 106k | } |
1612 | | |
1613 | 170k | std::string TSTabletManager::TabletLogPrefix(const TabletId& tablet_id) const { |
1614 | 170k | return tserver::LogPrefix(tablet_id, fs_manager_->uuid()); |
1615 | 170k | } |
1616 | | |
1617 | 88.1k | bool TSTabletManager::ClosingUnlocked() const { |
1618 | 88.1k | return state_ == MANAGER_QUIESCING || state_ == MANAGER_SHUTDOWN; |
1619 | 88.1k | } |
1620 | | |
1621 | | Status TSTabletManager::RegisterTablet(const TabletId& tablet_id, |
1622 | | const TabletPeerPtr& tablet_peer, |
1623 | 83.2k | RegisterTabletPeerMode mode) { |
1624 | 83.2k | std::lock_guard<RWMutex> lock(mutex_); |
1625 | 83.2k | if (ClosingUnlocked()) { |
1626 | 0 | auto result = STATUS_FORMAT( |
1627 | 0 | ShutdownInProgress, "Unable to register tablet peer: $0: closing", tablet_id); |
1628 | 0 | LOG(WARNING) << result; |
1629 | 0 | return result; |
1630 | 0 | } |
1631 | | |
1632 | | // If we are replacing a tablet peer, we delete the existing one first. |
1633 | 83.2k | if (mode == REPLACEMENT_PEER && tablet_map_.erase(tablet_id) != 1) { |
1634 | 0 | auto result = STATUS_FORMAT( |
1635 | 0 | NotFound, "Unable to remove previous tablet peer $0: not registered", tablet_id); |
1636 | 0 | LOG(WARNING) << result; |
1637 | 0 | return result; |
1638 | 0 | } |
1639 | 83.2k | if (!InsertIfNotPresent(&tablet_map_, tablet_id, tablet_peer)) { |
1640 | 0 | auto result = STATUS_FORMAT( |
1641 | 0 | AlreadyPresent, "Unable to register tablet peer $0: already registered", tablet_id); |
1642 | 0 | LOG(WARNING) << result; |
1643 | 0 | return result; |
1644 | 0 | } |
1645 | | |
1646 | 83.2k | LOG_WITH_PREFIX(INFO) << "Registered tablet " << tablet_id; |
1647 | | |
1648 | 83.2k | return Status::OK(); |
1649 | 83.2k | } |
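     | |
     | | // Registration semantics above, condensed: REPLACEMENT_PEER must displace
     | | // an existing entry, while NEW_PEER must not collide with one. A simplified
     | | // self-contained sketch over a plain map (stand-in types):
     | |
     | | #include <map>
     | | #include <memory>
     | | #include <string>
     | |
     | | enum SketchRegisterMode { kNewPeer, kReplacementPeer };
     | | struct SketchPeer {};  // Stand-in for tablet::TabletPeer.
     | |
     | | bool RegisterPeerSketch(std::map<std::string, std::shared_ptr<SketchPeer>>* peers,
     | |                         const std::string& tablet_id,
     | |                         std::shared_ptr<SketchPeer> peer,
     | |                         SketchRegisterMode mode) {
     | |   if (mode == kReplacementPeer && peers->erase(tablet_id) != 1) {
     | |     return false;  // Nothing to replace (the NotFound case above).
     | |   }
     | |   // emplace fails on a duplicate (the AlreadyPresent case above).
     | |   return peers->emplace(tablet_id, std::move(peer)).second;
     | | }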
1650 | | |
1651 | | bool TSTabletManager::LookupTablet(const string& tablet_id, |
1652 | 16.7M | TabletPeerPtr* tablet_peer) const { |
1653 | 16.7M | SharedLock<RWMutex> shared_lock(mutex_); |
1654 | 16.7M | return LookupTabletUnlocked(tablet_id, tablet_peer); |
1655 | 16.7M | } |
1656 | | |
1657 | | Result<std::shared_ptr<tablet::TabletPeer>> TSTabletManager::LookupTablet( |
1658 | 44 | const TabletId& tablet_id) const { |
1659 | 44 | TabletPeerPtr tablet_peer; |
1660 | 44 | SCHECK(LookupTablet(tablet_id, &tablet_peer), NotFound, Format("Tablet $0 not found", tablet_id)); |
1661 | 44 | return tablet_peer; |
1662 | 44 | } |
1663 | | |
1664 | | bool TSTabletManager::LookupTabletUnlocked(const string& tablet_id, |
1665 | 16.9M | TabletPeerPtr* tablet_peer) const { |
1666 | 16.9M | const TabletPeerPtr* found = FindOrNull(tablet_map_, tablet_id); |
1667 | 16.9M | if (!found) { |
1668 | 86.2k | return false; |
1669 | 86.2k | } |
1670 | 16.8M | *tablet_peer = *found; |
1671 | 16.8M | return true; |
1672 | 16.8M | } |
1673 | | |
1674 | | Status TSTabletManager::GetTabletPeer(const string& tablet_id, |
1675 | 16.6M | TabletPeerPtr* tablet_peer) const { |
1676 | 16.6M | if (!LookupTablet(tablet_id, tablet_peer)) { |
1677 | 2.47k | return STATUS(NotFound, "Tablet not found", tablet_id); |
1678 | 2.47k | } |
1679 | 16.6M | TabletDataState data_state = (*tablet_peer)->tablet_metadata()->tablet_data_state(); |
1680 | 16.6M | if (!CanServeTabletData(data_state)) { |
1681 | 3.73k | return STATUS( |
1682 | 3.73k | IllegalState, "Tablet data state not ready: " + TabletDataState_Name(data_state), |
1683 | 3.73k | tablet_id); |
1684 | 3.73k | } |
1685 | 16.6M | return Status::OK(); |
1686 | 16.6M | } |
1687 | | |
1688 | 9.51M | const NodeInstancePB& TSTabletManager::NodeInstance() const { |
1689 | 9.51M | return server_->instance_pb(); |
1690 | 9.51M | } |
1691 | | |
1692 | 1 | Status TSTabletManager::GetRegistration(ServerRegistrationPB* reg) const { |
1693 | 1 | return server_->GetRegistration(reg, server::RpcOnly::kTrue); |
1694 | 1 | } |
1695 | | |
1696 | 42.7k | TSTabletManager::TabletPeers TSTabletManager::GetTabletPeers(TabletPtrs* tablet_ptrs) const { |
1697 | 42.7k | SharedLock<RWMutex> shared_lock(mutex_); |
1698 | 42.7k | TabletPeers peers; |
1699 | 42.7k | GetTabletPeersUnlocked(&peers); |
1700 | 42.7k | if (tablet_ptrs) { |
1701 | 4 | for (const auto& peer : peers) { |
1702 | 4 | if (!peer) continue; |
1703 | 4 | auto tablet_ptr = peer->shared_tablet(); |
1704 | 4 | if (tablet_ptr) { |
1705 | 4 | tablet_ptrs->push_back(tablet_ptr); |
1706 | 4 | } |
1707 | 4 | } |
1708 | 4 | } |
1709 | 42.7k | return peers; |
1710 | 42.7k | } |
1711 | | |
1712 | 48.3k | void TSTabletManager::GetTabletPeersUnlocked(TabletPeers* tablet_peers) const { |
1713 | 48.3k | DCHECK(tablet_peers != nullptr); |
1714 | | // See AppendKeysFromMap for why this is done. |
1715 | 48.3k | if (tablet_peers->empty()) { |
1716 | 48.3k | tablet_peers->reserve(tablet_map_.size()); |
1717 | 48.3k | } |
1718 | 341k | for (const auto& entry : tablet_map_) { |
1719 | 341k | if (entry.second != nullptr) { |
1720 | 341k | tablet_peers->push_back(entry.second); |
1721 | 341k | } |
1722 | 341k | } |
1723 | 48.3k | } |
1724 | | |
1725 | 243k | void TSTabletManager::PreserveLocalLeadersOnly(std::vector<const TabletId*>* tablet_ids) const { |
1726 | 243k | SharedLock<decltype(mutex_)> shared_lock(mutex_); |
1727 | 4.86M | auto filter = [this](const TabletId* id) REQUIRES_SHARED(mutex_) { |
1728 | 4.86M | auto it = tablet_map_.find(*id); |
1729 | 4.86M | if (it == tablet_map_.end()) { |
1730 | 60 | return true; |
1731 | 60 | } |
1732 | 4.86M | auto leader_status = it->second->LeaderStatus(); |
1733 | 4.86M | return leader_status != consensus::LeaderStatus::LEADER_AND_READY; |
1734 | 4.86M | }; |
1735 | 243k | tablet_ids->erase(std::remove_if(tablet_ids->begin(), tablet_ids->end(), filter), |
1736 | 243k | tablet_ids->end()); |
1737 | 243k | } |
1738 | | |
1739 | | void TSTabletManager::ApplyChange(const string& tablet_id, |
1740 | 338k | shared_ptr<consensus::StateChangeContext> context) { |
1741 | 338k | WARN_NOT_OK( |
1742 | 338k | apply_pool_->SubmitFunc( |
1743 | 338k | std::bind(&TSTabletManager::MarkTabletDirty, this, tablet_id, context)), |
1744 | 338k | "Unable to run MarkDirty callback") |
1745 | 338k | } |
1746 | | |
1747 | | void TSTabletManager::MarkTabletDirty(const TabletId& tablet_id, |
1748 | 338k | std::shared_ptr<consensus::StateChangeContext> context) { |
1749 | 338k | std::lock_guard<RWMutex> lock(mutex_); |
1750 | 338k | MarkDirtyUnlocked(tablet_id, context); |
1751 | 338k | } |
1752 | | |
1753 | | void TSTabletManager::MarkTabletBeingRemoteBootstrapped( |
1754 | 958 | const TabletId& tablet_id, const TableId& table_id) { |
1755 | 958 | std::lock_guard<RWMutex> lock(mutex_); |
1756 | 958 | tablets_being_remote_bootstrapped_.insert(tablet_id); |
1757 | 958 | tablets_being_remote_bootstrapped_per_table_[table_id].insert(tablet_id); |
1758 | 958 | MaybeDoChecksForTests(table_id); |
1759 | 958 | LOG(INFO) << "Concurrent remote bootstrap sessions: " |
1760 | 958 | << tablets_being_remote_bootstrapped_.size() |
1761 | 958 | << "Concurrent remote bootstrap sessions for table " << table_id |
1762 | 958 | << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size(); |
1763 | 958 | } |
1764 | | |
1765 | | void TSTabletManager::UnmarkTabletBeingRemoteBootstrapped( |
1766 | 943 | const TabletId& tablet_id, const TableId& table_id) { |
1767 | 943 | std::lock_guard<RWMutex> lock(mutex_); |
1768 | 943 | tablets_being_remote_bootstrapped_.erase(tablet_id); |
1769 | 943 | tablets_being_remote_bootstrapped_per_table_[table_id].erase(tablet_id); |
1770 | 943 | } |
1771 | | |
1772 | 0 | size_t TSTabletManager::TEST_GetNumDirtyTablets() const { |
1773 | 0 | SharedLock<RWMutex> lock(mutex_); |
1774 | 0 | return dirty_tablets_.size(); |
1775 | 0 | } |
1776 | | |
1777 | | Status TSTabletManager::GetNumTabletsPendingBootstrap( |
1778 | 9 | IsTabletServerReadyResponsePB* resp) const { |
1779 | 9 | if (state() != MANAGER_RUNNING) { |
1780 | 0 | resp->set_num_tablets_not_running(INT_MAX); |
1781 | 0 | resp->set_total_tablets(INT_MAX); |
1782 | 0 | return Status::OK(); |
1783 | 0 | } |
1784 | | |
1785 | 9 | SharedLock<RWMutex> shared_lock(mutex_); |
1786 | 9 | int num_pending = 0; |
1787 | 9 | int total_tablets = 0; |
1788 | 0 | for (const auto& entry : tablet_map_) { |
1789 | 0 | RaftGroupStatePB state = entry.second->state(); |
1790 | 0 | TabletDataState data_state = entry.second->data_state(); |
1791 | | // Do not count tablets that will never get to RUNNING state. |
1792 | 0 | if (!CanServeTabletData(data_state)) { |
1793 | 0 | continue; |
1794 | 0 | } |
1795 | 0 | bool not_started_or_bootstrap = state == NOT_STARTED || state == BOOTSTRAPPING; |
1796 | 0 | if (not_started_or_bootstrap || state == RUNNING) { |
1797 | 0 | total_tablets++; |
1798 | 0 | } |
1799 | 0 | if (not_started_or_bootstrap) { |
1800 | 0 | num_pending++; |
1801 | 0 | } |
1802 | 0 | } |
1803 | | |
1804 | 9 | LOG(INFO) << num_pending << " tablets pending bootstrap out of " << total_tablets; |
1805 | 9 | resp->set_num_tablets_not_running(num_pending); |
1806 | 9 | resp->set_total_tablets(total_tablets); |
1807 | | |
1808 | 9 | return Status::OK(); |
1809 | 9 | } |
1810 | | |
1811 | 403k | int TSTabletManager::GetNumLiveTablets() const { |
1812 | 403k | int count = 0; |
1813 | 403k | SharedLock<RWMutex> lock(mutex_); |
1814 | 7.03M | for (const auto& entry : tablet_map_) { |
1815 | 7.03M | RaftGroupStatePB state = entry.second->state(); |
1816 | 7.03M | if (state == BOOTSTRAPPING || |
1817 | 6.79M | state == RUNNING) { |
1818 | 6.71M | count++; |
1819 | 6.71M | } |
1820 | 7.03M | } |
1821 | 403k | return count; |
1822 | 403k | } |
1823 | | |
1824 | 403k | int TSTabletManager::GetLeaderCount() const { |
1825 | 403k | int count = 0; |
1826 | 403k | SharedLock<RWMutex> lock(mutex_); |
1827 | 7.03M | for (const auto& entry : tablet_map_) { |
1828 | 7.03M | consensus::LeaderStatus leader_status = entry.second->LeaderStatus(/* allow_stale =*/ true); |
1829 | 7.03M | if (leader_status != consensus::LeaderStatus::NOT_LEADER) { |
1830 | 1.91M | count++; |
1831 | 1.91M | } |
1832 | 7.03M | } |
1833 | 403k | return count; |
1834 | 403k | } |
1835 | | |
1836 | | void TSTabletManager::MarkDirtyUnlocked(const TabletId& tablet_id, |
1837 | 338k | std::shared_ptr<consensus::StateChangeContext> context) { |
1838 | 338k | TabletReportState* state = FindOrNull(dirty_tablets_, tablet_id); |
1839 | 338k | if (state != nullptr) { |
1840 | 142k | CHECK_GE(next_report_seq_, state->change_seq); |
1841 | 142k | state->change_seq = next_report_seq_; |
1842 | 196k | } else { |
1843 | 196k | TabletReportState state; |
1844 | 196k | state.change_seq = next_report_seq_; |
1845 | 196k | InsertOrDie(&dirty_tablets_, tablet_id, state); |
1846 | 196k | } |
1847 | 0 | VLOG(2) << TabletLogPrefix(tablet_id) |
1848 | 0 | << "Marking dirty. Reason: " << AsString(context) |
1849 | 0 | << ". Will report this tablet to the Master in the next heartbeat " |
1850 | 0 | << "as part of report #" << next_report_seq_; |
1851 | 338k | server_->heartbeater()->TriggerASAP(); |
1852 | 338k | } |
1853 | | |
1854 | 5.81k | void TSTabletManager::InitLocalRaftPeerPB() { |
1855 | 5.81k | DCHECK_EQ(state(), MANAGER_INITIALIZING); |
1856 | 5.81k | local_peer_pb_.set_permanent_uuid(fs_manager_->uuid()); |
1857 | 5.81k | ServerRegistrationPB reg; |
1858 | 5.81k | CHECK_OK(server_->GetRegistration(®, server::RpcOnly::kTrue)); |
1859 | 5.81k | TakeRegistration(®, &local_peer_pb_); |
1860 | 5.81k | } |
1861 | | |
1862 | | void TSTabletManager::CreateReportedTabletPB(const TabletPeerPtr& tablet_peer, |
1863 | 263k | ReportedTabletPB* reported_tablet) { |
1864 | 263k | reported_tablet->set_tablet_id(tablet_peer->tablet_id()); |
1865 | 263k | reported_tablet->set_state(tablet_peer->state()); |
1866 | 263k | reported_tablet->set_tablet_data_state(tablet_peer->tablet_metadata()->tablet_data_state()); |
1867 | 263k | if (tablet_peer->state() == tablet::FAILED) { |
1868 | 0 | AppStatusPB* error_status = reported_tablet->mutable_error(); |
1869 | 0 | StatusToPB(tablet_peer->error(), error_status); |
1870 | 0 | } |
1871 | 263k | reported_tablet->set_schema_version(tablet_peer->tablet_metadata()->schema_version()); |
1872 | | |
1873 | 263k | { |
1874 | 263k | auto tablet_ptr = tablet_peer->shared_tablet(); |
1875 | 263k | if (tablet_ptr != nullptr) { |
1876 | 260k | reported_tablet->set_should_disable_lb_move(tablet_ptr->ShouldDisableLbMove()); |
1877 | 260k | } |
1878 | 263k | } |
1879 | 263k | reported_tablet->set_fs_data_dir(tablet_peer->tablet_metadata()->data_root_dir()); |
1880 | | |
1881 | | // We cannot get consensus state information unless the TabletPeer is running. |
1882 | 263k | shared_ptr<consensus::Consensus> consensus = tablet_peer->shared_consensus(); |
1883 | 263k | if (consensus) { |
1884 | 260k | *reported_tablet->mutable_committed_consensus_state() = |
1885 | 260k | consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED); |
1886 | 260k | } |
1887 | | |
1888 | | // Set the hide status of the tablet. |
1889 | 263k | reported_tablet->set_is_hidden(tablet_peer->tablet_metadata()->hidden()); |
1890 | 263k | } |
1891 | | |
1892 | 398k | void TSTabletManager::GenerateTabletReport(TabletReportPB* report, bool include_bootstrap) { |
1893 | 398k | report->Clear(); |
1894 | | // Creating the tablet report can be slow in the case that it is in the |
1895 | | // middle of flushing its consensus metadata. We don't want to hold |
1896 | | // lock_ for too long, even in read mode, since it can cause other readers |
1897 | | // to block if there is a waiting writer (see KUDU-2193). So, we just make |
1898 | | // a local copy of the set of replicas. |
1899 | 398k | vector<std::shared_ptr<TabletPeer>> to_report; |
1900 | 398k | TabletIdSet tablet_ids; |
1901 | 398k | size_t dirty_count, report_limit; |
1902 | 398k | { |
1903 | 398k | std::lock_guard<RWMutex> write_lock(mutex_); |
1904 | 398k | uint32_t cur_report_seq = next_report_seq_++; |
1905 | 398k | report->set_sequence_number(cur_report_seq); |
1906 | | |
1907 | 398k | TabletIdSet::iterator i = tablets_blocked_from_lb_.begin(); |
1908 | 398k | while (i != tablets_blocked_from_lb_.end()) { |
1909 | 134 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, *i); |
1910 | 134 | if (tablet_peer) { |
1911 | 134 | const auto tablet = (*tablet_peer)->shared_tablet(); |
1912 | | // If tablet is null, one of two things may be true: |
1913 | | // 1. TabletPeer::InitTabletPeer was not called yet |
1914 | | // |
1915 | | // Skip it and keep the tablet in tablets_blocked_from_lb_ until InitTabletPeer is called.
1916 | | // |
1917 | | // 2. TabletPeer::CompleteShutdown was called |
1918 | | // |
1919 | | // The tablet will be removed from tablets_blocked_from_lb_ on the next
1920 | | // GenerateTabletReport, since tablet_peer will have been removed from tablet_map_.
1921 | 134 | if (tablet == nullptr) { |
1922 | 0 | ++i; |
1923 | 0 | continue; |
1924 | 0 | } |
1925 | 134 | const std::string& tablet_id = tablet->tablet_id(); |
1926 | 134 | if (!tablet->ShouldDisableLbMove()) { |
1927 | 84 | i = tablets_blocked_from_lb_.erase(i); |
1928 | 0 | VLOG(1) << "Tablet " << tablet_id << " is no longer blocked from load-balancing."; |
1929 | 84 | InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq}); |
1930 | 50 | } else { |
1931 | 50 | ++i; |
1932 | 50 | } |
1933 | 0 | } else { |
1934 | 0 | VLOG(1) << "Tablet " << *i |
1935 | 0 | << " was marked as blocked from load balancing but was not found"; |
1936 | 0 | i = tablets_blocked_from_lb_.erase(i); |
1937 | 0 | } |
1938 | 134 | } |
1939 | | |
1940 | 398k | if (include_bootstrap) { |
1941 | 9.62k | for (auto const& tablet_id : tablets_being_remote_bootstrapped_) { |
1942 | 0 | VLOG(1) << "Tablet " << tablet_id << " being remote bootstrapped and marked for report"; |
1943 | 9.62k | InsertOrUpdate(&dirty_tablets_, tablet_id, TabletReportState{cur_report_seq}); |
1944 | 9.62k | } |
1945 | 398k | } |
1946 | 261k | for (const DirtyMap::value_type& dirty_entry : dirty_tablets_) { |
1947 | 261k | const TabletId& tablet_id = dirty_entry.first; |
1948 | 261k | tablet_ids.insert(tablet_id); |
1949 | 261k | } |
1950 | | |
1951 | 261k | for (auto const& tablet_id : tablet_ids) { |
1952 | 261k | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
1953 | 261k | if (tablet_peer) { |
1954 | | // Dirty entry, report on it. |
1955 | 261k | to_report.push_back(*tablet_peer); |
1956 | 0 | } else { |
1957 | | // Tell the Master that this tablet was removed from the TServer side. |
1958 | 0 | report->add_removed_tablet_ids(tablet_id); |
1959 | | // Don't count this as a 'dirty_tablet_' because the Master may not have it either. |
1960 | 0 | dirty_tablets_.erase(tablet_id); |
1961 | 0 | } |
1962 | 261k | } |
1963 | 398k | dirty_count = dirty_tablets_.size(); |
1964 | 398k | report_limit = report_limit_; |
1965 | 398k | } |
1966 | 261k | for (const auto& replica : to_report) { |
1967 | 261k | CreateReportedTabletPB(replica, report->add_updated_tablets()); |
1968 | | // Enforce a max tablet limit on reported tablets. |
1969 | 261k | if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) break; |
1970 | 261k | } |
1971 | 398k | report->set_remaining_tablet_count( |
1972 | 398k | narrow_cast<int>(dirty_count - report->updated_tablets_size())); |
1973 | 398k | } |
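     | |
     | | // The dirty-tablet protocol above (and in MarkTabletReportAcknowledged
     | | // below) fits in a few lines: each report takes an increasing sequence
     | | // number, dirty entries record the sequence they were last reported in, and
     | | // an ack clears only entries not re-dirtied since that sequence. A minimal
     | | // model with stand-in types:
     | |
     | | #include <cstdint>
     | | #include <map>
     | | #include <string>
     | |
     | | struct DirtyTracker {
     | |   uint32_t next_seq = 0;
     | |   std::map<std::string, uint32_t> dirty;  // tablet id -> change sequence.
     | |
     | |   void MarkDirty(const std::string& id) { dirty[id] = next_seq; }
     | |
     | |   uint32_t BeginReport() {
     | |     uint32_t seq = next_seq++;
     | |     for (auto& entry : dirty) entry.second = seq;  // Report everything dirty.
     | |     return seq;
     | |   }
     | |
     | |   void Ack(uint32_t acked_seq) {
     | |     for (auto it = dirty.begin(); it != dirty.end();) {
     | |       if (it->second <= acked_seq) {
     | |         it = dirty.erase(it);  // Unchanged since the report: now clean.
     | |       } else {
     | |         ++it;  // Re-dirtied after the report: keep for the next one.
     | |       }
     | |     }
     | |   }
     | | };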
1974 | | |
1975 | 5.59k | void TSTabletManager::StartFullTabletReport(TabletReportPB* report) { |
1976 | 5.59k | report->Clear(); |
1977 | | // Creating the tablet report can be slow in the case that it is in the |
1978 | | // middle of flushing its consensus metadata. We don't want to hold |
1979 | | // lock_ for too long, even in read mode, since it can cause other readers |
1980 | | // to block if there is a waiting writer (see KUDU-2193). So, we just make |
1981 | | // a local copy of the set of replicas. |
1982 | 5.59k | vector<std::shared_ptr<TabletPeer>> to_report; |
1983 | 5.59k | size_t dirty_count, report_limit; |
1984 | 5.59k | { |
1985 | 5.59k | std::lock_guard<RWMutex> write_lock(mutex_); |
1986 | 5.59k | uint32_t cur_report_seq = next_report_seq_++; |
1987 | 5.59k | report->set_sequence_number(cur_report_seq); |
1988 | 5.59k | GetTabletPeersUnlocked(&to_report); |
1989 | | // Mark all tablets as dirty, to be cleaned when reading the heartbeat response. |
1990 | 1.52k | for (const auto& peer : to_report) { |
1991 | 1.52k | InsertOrUpdate(&dirty_tablets_, peer->tablet_id(), TabletReportState{cur_report_seq}); |
1992 | 1.52k | } |
1993 | 5.59k | dirty_count = dirty_tablets_.size(); |
1994 | 5.59k | report_limit = report_limit_; |
1995 | 5.59k | } |
1996 | 1.52k | for (const auto& replica : to_report) { |
1997 | 1.52k | CreateReportedTabletPB(replica, report->add_updated_tablets()); |
1998 | | // Enforce a max tablet limit on reported tablets. |
1999 | 1.52k | if (implicit_cast<size_t>(report->updated_tablets_size()) >= report_limit) break; |
2000 | 1.52k | } |
2001 | 5.59k | report->set_remaining_tablet_count( |
2002 | 5.59k | narrow_cast<int32_t>(dirty_count - report->updated_tablets_size())); |
2003 | 5.59k | } |
2004 | | |
2005 | | void TSTabletManager::MarkTabletReportAcknowledged(uint32_t acked_seq, |
2006 | | const TabletReportUpdatesPB& updates, |
2007 | 396k | bool dirty_check) { |
2008 | 396k | std::lock_guard<RWMutex> l(mutex_); |
2009 | | |
2010 | 396k | CHECK_LT(acked_seq, next_report_seq_); |
2011 | | |
2012 | | // Clear the "dirty" state for any tablets processed in this report. |
2013 | 260k | for (auto const & tablet : updates.tablets()) { |
2014 | 260k | auto it = dirty_tablets_.find(tablet.tablet_id()); |
2015 | 260k | if (it != dirty_tablets_.end()) { |
2016 | 260k | const TabletReportState& state = it->second; |
2017 | 260k | if (state.change_seq <= acked_seq) { |
2018 | | // This entry has not changed since this tablet report, we no longer need to track it |
2019 | | // as dirty. Next modification will be re-added with a higher sequence number. |
2020 | 203k | dirty_tablets_.erase(it); |
2021 | 203k | } |
2022 | 260k | } |
2023 | 260k | } |
2024 | 396k | #ifndef NDEBUG |
2025 | | // Verify dirty_tablets_ always processes all tablet changes. |
2026 | 396k | if (dirty_check) { |
2027 | 158k | for (auto const & d : dirty_tablets_) { |
2028 | 158k | if (d.second.change_seq <= acked_seq) { |
2029 | 0 | LOG(DFATAL) << "Dirty Tablet should have been reported but wasn't: " |
2030 | 0 | << d.first << "@" << d.second.change_seq << " <= " << acked_seq; |
2031 | 0 | } |
2032 | 158k | } |
2033 | 395k | } |
2034 | 396k | #endif |
2035 | 396k | } |
2036 | | |
2037 | | Status TSTabletManager::HandleNonReadyTabletOnStartup( |
2038 | 9 | const RaftGroupMetadataPtr& meta) { |
2039 | 9 | const string& tablet_id = meta->raft_group_id(); |
2040 | 9 | TabletDataState data_state = meta->tablet_data_state(); |
2041 | 0 | CHECK(data_state == TABLET_DATA_DELETED || |
2042 | 0 | data_state == TABLET_DATA_TOMBSTONED || |
2043 | 0 | data_state == TABLET_DATA_COPYING || |
2044 | 0 | data_state == TABLET_DATA_INIT_STARTED) |
2045 | 0 | << "Unexpected TabletDataState in tablet " << tablet_id << ": " |
2046 | 0 | << TabletDataState_Name(data_state) << " (" << data_state << ")"; |
2047 | | |
2048 | 9 | if (data_state == TABLET_DATA_COPYING) { |
2049 | | // We tombstone tablets that failed to remotely bootstrap. |
2050 | 0 | data_state = TABLET_DATA_TOMBSTONED; |
2051 | 0 | } |
2052 | | |
2053 | 9 | if (data_state == TABLET_DATA_INIT_STARTED) { |
2054 | | // We delete tablets that failed to completely initialize after a split. |
2055 | | // TODO(tsplit): https://github.com/yugabyte/yugabyte-db/issues/8013 |
2056 | 1 | data_state = TABLET_DATA_DELETED; |
2057 | 1 | } |
2058 | | |
2059 | 9 | const string kLogPrefix = TabletLogPrefix(tablet_id); |
2060 | | |
2061 | | // If the tablet is already fully tombstoned with no remaining data or WAL, |
2062 | | // then no need to roll anything forward. |
2063 | 9 | bool skip_deletion = meta->IsTombstonedWithNoRocksDBData() && |
2064 | 2 | !Log::HasOnDiskData(meta->fs_manager(), meta->wal_dir()); |
2065 | | |
2066 | 7 | LOG_IF(WARNING, !skip_deletion) |
2067 | 7 | << kLogPrefix << "Tablet Manager startup: Rolling forward tablet deletion " |
2068 | 7 | << "of type " << TabletDataState_Name(data_state); |
2069 | | |
2070 | 9 | if (!skip_deletion) { |
2071 | | // Passing no OpId will retain the last_logged_opid that was previously in the metadata. |
2072 | 7 | RETURN_NOT_OK(DeleteTabletData(meta, data_state, fs_manager_->uuid(), yb::OpId())); |
2073 | 7 | } |
2074 | | |
2075 | | // We only delete the actual superblock of a TABLET_DATA_DELETED tablet on startup. |
2076 | | // TODO: Consider doing this after a fixed delay, instead of waiting for a restart. |
2077 | | // See KUDU-941. |
2078 | 9 | if (data_state == TABLET_DATA_DELETED) { |
2079 | 7 | LOG(INFO) << kLogPrefix << "Deleting tablet superblock"; |
2080 | 7 | return meta->DeleteSuperBlock(); |
2081 | 7 | } |
2082 | | |
2083 | | // Register TOMBSTONED tablets so that they get reported to the Master, which |
2084 | | // allows us to permanently delete replica tombstones when a table gets deleted. |
2085 | 2 | if (data_state == TABLET_DATA_TOMBSTONED) { |
2086 | 2 | RETURN_NOT_OK(CreateAndRegisterTabletPeer(meta, NEW_PEER)); |
2087 | 2 | } |
2088 | | |
2089 | 2 | return Status::OK(); |
2090 | 2 | } |
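     | |
     | | // The roll-forward above reduces to a small state mapping, assuming only
     | | // the four non-ready states can reach this path (the CHECK enforces it):
     | | // an interrupted remote bootstrap becomes a tombstone, and an interrupted
     | | // split becomes a deletion. A sketch with a stand-in enum:
     | |
     | | enum class SketchDataState { kDeleted, kTombstoned, kCopying, kInitStarted };
     | |
     | | SketchDataState RollForwardOnStartup(SketchDataState s) {
     | |   switch (s) {
     | |     case SketchDataState::kCopying:     return SketchDataState::kTombstoned;
     | |     case SketchDataState::kInitStarted: return SketchDataState::kDeleted;
     | |     default:                            return s;  // Already terminal.
     | |   }
     | | }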
2091 | | |
2092 | | void TSTabletManager::GetAndRegisterDataAndWalDir(FsManager* fs_manager, |
2093 | | const string& table_id, |
2094 | | const string& tablet_id, |
2095 | | string* data_root_dir, |
2096 | 83.4k | string* wal_root_dir) { |
2097 | | // Skip the sys catalog table; it is not tracked in the assignment maps.
2098 | 83.4k | if (table_id == master::kSysCatalogTableId) { |
2099 | 0 | return; |
2100 | 0 | } |
2101 | 83.4k | LOG(INFO) << "Get and update data/wal directory assignment map for table: " \ |
2102 | 83.4k | << table_id << " and tablet " << tablet_id; |
2103 | 83.4k | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2104 | | // Initialize the map if the directory mapping does not exist. |
2105 | 83.4k | auto data_root_dirs = fs_manager->GetDataRootDirs(); |
2106 | 18.4E | CHECK(!data_root_dirs.empty()) << "No data root directories found"; |
2107 | 83.4k | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2108 | 83.4k | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2109 | 11.2k | for (string data_root_iter : data_root_dirs) { |
2110 | 11.2k | unordered_set<string> tablet_id_set; |
2111 | 11.2k | table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set; |
2112 | 11.2k | } |
2113 | 11.1k | } |
2114 | | // Find the data directory with the least count of tablets for this table. |
2115 | 83.4k | table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2116 | 83.4k | auto data_assignment_value_map = table_data_assignment_iter->second; |
2117 | 83.4k | string min_dir; |
2118 | 83.4k | uint64_t min_dir_count = kuint64max; |
2119 | 167k | for (auto it = data_assignment_value_map.begin(); it != data_assignment_value_map.end(); ++it) { |
2120 | 83.9k | if (min_dir_count > it->second.size()) { |
2121 | 83.6k | min_dir = it->first; |
2122 | 83.6k | min_dir_count = it->second.size(); |
2123 | 83.6k | } |
2124 | 83.9k | } |
2125 | 83.4k | *data_root_dir = min_dir; |
2126 | | // Increment the count for min_dir. |
2127 | 83.4k | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(min_dir); |
2128 | 83.4k | data_assignment_value_iter->second.insert(tablet_id); |
2129 | | |
2130 | | // Find the wal directory with the least count of tablets for this table. |
2131 | 83.4k | min_dir = ""; |
2132 | 83.4k | min_dir_count = kuint64max; |
2133 | 83.4k | auto wal_root_dirs = fs_manager->GetWalRootDirs(); |
2134 | 18.4E | CHECK(!wal_root_dirs.empty()) << "No wal root directories found"; |
2135 | 83.4k | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2136 | 83.4k | if (table_wal_assignment_iter == table_wal_assignment_map_.end()) { |
2137 | 11.2k | for (string wal_root_iter : wal_root_dirs) { |
2138 | 11.2k | unordered_set<string> tablet_id_set; |
2139 | 11.2k | table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set; |
2140 | 11.2k | } |
2141 | 11.1k | } |
2142 | 83.4k | table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2143 | 83.4k | auto wal_assignment_value_map = table_wal_assignment_iter->second; |
2144 | 167k | for (auto it = wal_assignment_value_map.begin(); it != wal_assignment_value_map.end(); ++it) { |
2145 | 83.9k | if (min_dir_count > it->second.size()) { |
2146 | 83.6k | min_dir = it->first; |
2147 | 83.6k | min_dir_count = it->second.size(); |
2148 | 83.6k | } |
2149 | 83.9k | } |
2150 | 83.4k | *wal_root_dir = min_dir; |
2151 | 83.4k | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(min_dir); |
2152 | 83.4k | wal_assignment_value_iter->second.insert(tablet_id); |
2153 | 83.4k | } |
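     | |
     | | // Both directory picks above use the same rule: choose the root directory
     | | // currently hosting the fewest tablets of this table. A self-contained
     | | // sketch of that selection (hypothetical helper, stand-in container):
     | |
     | | #include <cstddef>
     | | #include <cstdint>
     | | #include <string>
     | | #include <unordered_map>
     | | #include <unordered_set>
     | |
     | | using TabletsByDir =
     | |     std::unordered_map<std::string, std::unordered_set<std::string>>;
     | |
     | | std::string PickLeastLoadedDir(const TabletsByDir& tablets_by_dir) {
     | |   std::string min_dir;
     | |   size_t min_count = SIZE_MAX;
     | |   for (const auto& entry : tablets_by_dir) {
     | |     if (entry.second.size() < min_count) {
     | |       min_dir = entry.first;
     | |       min_count = entry.second.size();
     | |     }
     | |   }
     | |   return min_dir;  // Empty if the map is empty; callers CHECK non-empty.
     | | }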
2154 | | |
2155 | | void TSTabletManager::RegisterDataAndWalDir(FsManager* fs_manager, |
2156 | | const string& table_id, |
2157 | | const string& tablet_id, |
2158 | | const string& data_root_dir, |
2159 | 407 | const string& wal_root_dir) { |
2160 | | // Skip the sys catalog table when updating the map.
2161 | 407 | if (table_id == master::kSysCatalogTableId) { |
2162 | 0 | return; |
2163 | 0 | } |
2164 | 407 | LOG(INFO) << "Update data/wal directory assignment map for table: " |
2165 | 407 | << table_id << " and tablet " << tablet_id; |
2166 | 407 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2167 | | // Initialize the map if the directory mapping does not exist. |
2168 | 407 | auto data_root_dirs = fs_manager->GetDataRootDirs(); |
2169 | 0 | CHECK(!data_root_dirs.empty()) << "No data root directories found"; |
2170 | 407 | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2171 | 407 | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2172 | 119 | for (string data_root_iter : data_root_dirs) { |
2173 | 119 | unordered_set<string> tablet_id_set; |
2174 | 119 | table_data_assignment_map_[table_id][data_root_iter] = tablet_id_set; |
2175 | 119 | } |
2176 | 82 | } |
2177 | | // Increment the count for data_root_dir. |
2178 | 407 | table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2179 | 407 | auto data_assignment_value_map = table_data_assignment_iter->second; |
2180 | 407 | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir); |
2181 | 407 | if (data_assignment_value_iter == table_data_assignment_map_[table_id].end()) { |
2182 | 0 | unordered_set<string> tablet_id_set; |
2183 | 0 | tablet_id_set.insert(tablet_id); |
2184 | 0 | table_data_assignment_map_[table_id][data_root_dir] = tablet_id_set; |
2185 | 407 | } else { |
2186 | 407 | data_assignment_value_iter->second.insert(tablet_id); |
2187 | 407 | } |
2188 | | |
2189 | 407 | auto wal_root_dirs = fs_manager->GetWalRootDirs(); |
2190 | 0 | CHECK(!wal_root_dirs.empty()) << "No wal root directories found"; |
2191 | 407 | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2192 | 407 | if (table_wal_assignment_iter == table_wal_assignment_map_.end()) { |
2193 | 119 | for (string wal_root_iter : wal_root_dirs) { |
2194 | 119 | unordered_set<string> tablet_id_set; |
2195 | 119 | table_wal_assignment_map_[table_id][wal_root_iter] = tablet_id_set; |
2196 | 119 | } |
2197 | 82 | } |
2198 | | // Increment the count for wal_root_dir. |
2199 | 407 | table_wal_assignment_map_[table_id][wal_root_dir].insert(tablet_id); |
2200 | 407 | table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2201 | 407 | auto wal_assignment_value_map = table_wal_assignment_iter->second; |
2202 | 407 | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir); |
2203 | 407 | if (wal_assignment_value_iter == table_wal_assignment_map_[table_id].end()) { |
2204 | 0 | unordered_set<string> tablet_id_set; |
2205 | 0 | tablet_id_set.insert(tablet_id); |
2206 | 0 | table_wal_assignment_map_[table_id][wal_root_dir] = tablet_id_set; |
2207 | 407 | } else { |
2208 | 407 | wal_assignment_value_iter->second.insert(tablet_id); |
2209 | 407 | } |
2210 | 407 | } |
2211 | | |
2212 | | TSTabletManager::TableDiskAssignmentMap* TSTabletManager::GetTableDiskAssignmentMapUnlocked( |
2213 | 88 | TabletDirType dir_type) { |
2214 | 88 | switch (dir_type) { |
2215 | 44 | case TabletDirType::kData: |
2216 | 44 | return &table_data_assignment_map_; |
2217 | 44 | case TabletDirType::kWal: |
2218 | 44 | return &table_wal_assignment_map_; |
2219 | 0 | } |
2220 | 0 | FATAL_INVALID_ENUM_VALUE(TabletDirType, dir_type); |
2221 | 0 | } |
2222 | | |
2223 | | Result<const std::string&> TSTabletManager::GetAssignedRootDirForTablet( |
2224 | 88 | TabletDirType dir_type, const TableId& table_id, const TabletId& tablet_id) { |
2225 | 88 | std::lock_guard<std::mutex> dir_assignment_lock(dir_assignment_mutex_); |
2226 | | |
2227 | 88 | TableDiskAssignmentMap* table_assignment_map = GetTableDiskAssignmentMapUnlocked(dir_type); |
2228 | 88 | auto tablets_by_root_dir = table_assignment_map->find(table_id); |
2229 | 88 | if (tablets_by_root_dir == table_assignment_map->end()) { |
2230 | 0 | return STATUS_FORMAT( |
2231 | 0 | IllegalState, "Table ID $0 is not in $1 table assignment map", table_id, dir_type); |
2232 | 0 | } |
2233 | 88 | for (auto& data_dir_and_tablets : tablets_by_root_dir->second) { |
2234 | 88 | if (data_dir_and_tablets.second.count(tablet_id) > 0) { |
2235 | 88 | return data_dir_and_tablets.first; |
2236 | 88 | } |
2237 | 88 | } |
2238 | 0 | return STATUS_FORMAT( |
2239 | 88 | IllegalState, "Tablet ID $0 is not found in $1 assignment map for table $2", tablet_id, |
2240 | 88 | dir_type, table_id); |
2241 | 88 | } |
2242 | | |
2243 | | void TSTabletManager::UnregisterDataWalDir(const string& table_id, |
2244 | | const string& tablet_id, |
2245 | | const string& data_root_dir, |
2246 | 48.0k | const string& wal_root_dir) { |
2247 | | // Skip the sys catalog table when updating the map.
2248 | 48.0k | if (table_id == master::kSysCatalogTableId) { |
2249 | 0 | return; |
2250 | 0 | } |
2251 | 48.0k | LOG(INFO) << "Unregister data/wal directory assignment map for table: " |
2252 | 48.0k | << table_id << " and tablet " << tablet_id; |
2253 | 48.0k | std::lock_guard<std::mutex> lock(dir_assignment_mutex_); |
2254 | 48.0k | auto table_data_assignment_iter = table_data_assignment_map_.find(table_id); |
2255 | 48.0k | if (table_data_assignment_iter == table_data_assignment_map_.end()) { |
2256 | | // It is possible that we can't find an assignment for the table if operations happened
2257 | | // in this order:
2258 | | // 1. The only tablet for a table gets tombstoned, and UnregisterDataWalDir removes it from |
2259 | | // the maps. |
2260 | | // 2. TSTabletManager gets restarted (so the maps are cleared). |
2261 | | // 3. During TSTabletManager initialization, the tombstoned tablet won't get registered,
2262 | | // so if a DeleteTablet request with type DELETED gets sent, UnregisterDataWalDir won't |
2263 | | // find the table. |
2264 | | |
2265 | | // Sanity check that both maps are consistent.
2266 | 0 | DCHECK(table_wal_assignment_map_.find(table_id) == table_wal_assignment_map_.end()); |
2267 | 0 | } |
2268 | 48.0k | if (table_data_assignment_iter != table_data_assignment_map_.end()) { |
2269 | 48.0k | auto data_assignment_value_iter = table_data_assignment_map_[table_id].find(data_root_dir); |
2270 | 0 | DCHECK(data_assignment_value_iter != table_data_assignment_map_[table_id].end()) |
2271 | 0 | << "No data directory index found for table: " << table_id; |
2272 | 48.0k | if (data_assignment_value_iter != table_data_assignment_map_[table_id].end()) { |
2273 | 48.0k | data_assignment_value_iter->second.erase(tablet_id); |
2274 | 0 | } else { |
2275 | 0 | LOG(WARNING) << "Tablet " << tablet_id << " not in the set for data directory " |
2276 | 0 | << data_root_dir << " for table " << table_id;
2277 | 0 | } |
2278 | 48.0k | } |
2279 | 48.0k | auto table_wal_assignment_iter = table_wal_assignment_map_.find(table_id); |
2280 | 48.0k | if (table_wal_assignment_iter != table_wal_assignment_map_.end()) { |
2281 | 48.0k | auto wal_assignment_value_iter = table_wal_assignment_map_[table_id].find(wal_root_dir); |
2282 | 0 | DCHECK(wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) |
2283 | 0 | << "No wal directory index found for table: " << table_id; |
2284 | 48.0k | if (wal_assignment_value_iter != table_wal_assignment_map_[table_id].end()) { |
2285 | 48.0k | wal_assignment_value_iter->second.erase(tablet_id); |
2286 | 0 | } else { |
2287 | 0 | LOG(WARNING) << "Tablet " << tablet_id << " not in the set for wal directory " |
2288 | 0 | << wal_root_dir << " for table " << table_id;
2289 | 0 | } |
2290 | 48.0k | } |
2291 | 48.0k | } |
2292 | | |
2293 | 8 | client::YBClient& TSTabletManager::client() { |
2294 | 8 | return *async_client_init_->client(); |
2295 | 8 | } |
2296 | | |
2297 | 51.7k | const std::shared_future<client::YBClient*>& TSTabletManager::client_future() { |
2298 | 51.7k | return async_client_init_->get_client_future(); |
2299 | 51.7k | } |
2300 | | |
2301 | 958 | void TSTabletManager::MaybeDoChecksForTests(const TableId& table_id) { |
2302 | | // First check that the global RBS limits are respected if the flag is non-zero. |
2303 | 958 | if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than > 0) && |
2304 | 24 | tablets_being_remote_bootstrapped_.size() > |
2305 | 0 | FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) { |
2306 | 0 | string tablets; |
2307 | | // The purpose of limiting the number of remote bootstraps is to cap how much |
2308 | | // network bandwidth all the RBS sessions use. |
2309 | | // When we finish transferring the files, we wait until the role of the new peer |
2310 | | // has been changed from PRE_VOTER to VOTER before we remove the tablet_id |
2311 |  |     // from tablets_being_remote_bootstrapped_. Since some tablets may already be
2312 |  |     // open and in the RUNNING state while still in the
2313 |  |     // tablets_being_remote_bootstrapped_ list, we check the state of each tablet
2314 |  |     // before deciding whether the load balancer has violated the concurrent RBS limit.
2315 | 0 | size_t count = 0; |
2316 | 0 | for (const auto& tablet_id : tablets_being_remote_bootstrapped_) { |
2317 | 0 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
2318 | 0 | if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) { |
2319 | 0 | continue; |
2320 | 0 | } |
2321 | 0 | if (!tablets.empty()) { |
2322 | 0 | tablets += ", "; |
2323 | 0 | } |
2324 | 0 | tablets += tablet_id; |
2325 | 0 | count++; |
2326 | 0 | } |
2327 | 0 | if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than) { |
2328 | 0 | LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap sessions." |
2329 | 0 | << " Specified: " << FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than |
2330 | 0 | << ", number concurrent remote bootstrap sessions: " |
2331 | 0 | << tablets_being_remote_bootstrapped_.size() << ", for tablets: " << tablets; |
2332 | 0 | } |
2333 | 0 | } |
2334 | | |
2335 | | // Check that the per-table RBS limits are respected if the flag is non-zero. |
2336 | 958 | if (PREDICT_FALSE(FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than > 0) && |
2337 | 24 | tablets_being_remote_bootstrapped_per_table_[table_id].size() > |
2338 | 0 | FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) { |
2339 | 0 | string tablets; |
2340 | 0 | size_t count = 0; |
2341 | 0 | for (const auto& tablet_id : tablets_being_remote_bootstrapped_per_table_[table_id]) { |
2342 | 0 | TabletPeerPtr* tablet_peer = FindOrNull(tablet_map_, tablet_id); |
2343 | 0 | if (tablet_peer && (*tablet_peer)->state() == RaftGroupStatePB::RUNNING) { |
2344 | 0 | continue; |
2345 | 0 | } |
2346 | 0 | if (!tablets.empty()) { |
2347 | 0 | tablets += ", "; |
2348 | 0 | } |
2349 | 0 | tablets += tablet_id; |
2350 | 0 | count++; |
2351 | 0 | } |
2352 | 0 | if (count > FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than) { |
2353 | 0 | LOG(FATAL) << "Exceeded the specified maximum number of concurrent remote bootstrap " |
2354 | 0 | << "sessions per table. Specified: " |
2355 | 0 | << FLAGS_TEST_crash_if_remote_bootstrap_sessions_per_table_greater_than |
2356 | 0 | << ", number of concurrent remote bootstrap sessions for table " << table_id |
2357 | 0 | << ": " << tablets_being_remote_bootstrapped_per_table_[table_id].size() |
2358 | 0 | << ", for tablets: " << tablets; |
2359 | 0 | } |
2360 | 0 | } |
2361 | 958 | } |
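 |  |
 |  | // A sketch of how a test might arm the checks above (assuming the usual gflags test
 |  | // pattern of assigning to the FLAGS_ variable directly; the surrounding test setup
 |  | // is hypothetical):
 |  | //
 |  | //   FLAGS_TEST_crash_if_remote_bootstrap_sessions_greater_than = 1;
 |  | //   // ... start two or more concurrent remote bootstraps ...
 |  | //   // Any session beyond the limit whose tablet is not yet RUNNING makes
 |  | //   // MaybeDoChecksForTests LOG(FATAL), failing the test loudly.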
2362 | | |
2363 | 396k | Status TSTabletManager::UpdateSnapshotsInfo(const master::TSSnapshotsInfoPB& info) { |
2364 | 396k | bool restorations_updated; |
2365 | 396k | RestorationCompleteTimeMap restoration_complete_time; |
2366 | 396k | { |
2367 | 396k | std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_); |
2368 | 396k | ++snapshot_schedules_version_; |
2369 | 396k | snapshot_schedule_allowed_history_cutoff_.clear(); |
2370 | 0 | for (const auto& schedule : info.schedules()) { |
2371 | 0 | auto schedule_id = VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule.id())); |
2372 | 0 | snapshot_schedule_allowed_history_cutoff_.emplace( |
2373 | 0 | schedule_id, HybridTime::FromPB(schedule.last_snapshot_hybrid_time())); |
2374 | 0 | missing_snapshot_schedules_.erase(schedule_id); |
2375 | 0 | } |
2376 | 396k | HybridTime restorations_update_ht(info.last_restorations_update_ht()); |
2377 | 396k | restorations_updated = restorations_update_ht != last_restorations_update_ht_; |
2378 | 396k | if (restorations_updated) { |
2379 | 5.58k | last_restorations_update_ht_ = restorations_update_ht; |
2380 | 0 | for (const auto& entry : info.restorations()) { |
2381 | 0 | auto id = VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(entry.id())); |
2382 | 0 | auto complete_time = HybridTime::FromPB(entry.complete_time_ht()); |
2383 | 0 | restoration_complete_time.emplace(id, complete_time); |
2384 | 0 | } |
2385 | 5.58k | } |
2386 | 396k | } |
2387 | 396k | if (!restorations_updated) { |
2388 | 391k | return Status::OK(); |
2389 | 391k | } |
2390 | 5.58k | std::vector<tablet::TabletPtr> tablets; |
2391 | 5.58k | { |
2392 | 5.58k | SharedLock<RWMutex> shared_lock(mutex_); |
2393 | 5.58k | tablets.reserve(tablet_map_.size()); |
2394 | 1.53k | for (const auto& entry : tablet_map_) { |
2395 | 1.53k | auto tablet = entry.second->shared_tablet(); |
2396 | 1.53k | if (tablet) { |
2397 | 1.41k | tablets.push_back(tablet); |
2398 | 1.41k | } |
2399 | 1.53k | } |
2400 | 5.58k | } |
2401 | 1.41k | for (const auto& tablet : tablets) { |
2402 | 1.41k | RETURN_NOT_OK(tablet->CheckRestorations(restoration_complete_time)); |
2403 | 1.41k | } |
2404 | 5.58k | return Status::OK(); |
2405 | 5.58k | } |
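 |  |
 |  | // The function above copies tablet pointers out under the shared lock and calls
 |  | // CheckRestorations only after releasing it, so a slow per-tablet check never blocks
 |  | // other readers or writers of the tablet map. A generic sketch of the same
 |  | // copy-then-iterate pattern (ItemPtr and ExpensiveCheck are hypothetical):
 |  | Status CheckAllSketch(RWMutex* mutex, const std::map<std::string, ItemPtr>& item_map) {
 |  |   std::vector<ItemPtr> items;
 |  |   {
 |  |     SharedLock<RWMutex> lock(*mutex);  // Held only while copying pointers.
 |  |     items.reserve(item_map.size());
 |  |     for (const auto& entry : item_map) {
 |  |       items.push_back(entry.second);
 |  |     }
 |  |   }
 |  |   for (const auto& item : items) {
 |  |     RETURN_NOT_OK(item->ExpensiveCheck());  // Runs without holding the lock.
 |  |   }
 |  |   return Status::OK();
 |  | }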
2406 | | |
2407 | 412 | HybridTime TSTabletManager::AllowedHistoryCutoff(tablet::RaftGroupMetadata* metadata) { |
2408 | 412 | auto schedules = metadata->SnapshotSchedules(); |
2409 | 412 | if (schedules.empty()) { |
2410 | 412 | return HybridTime::kMax; |
2411 | 412 | } |
2412 | 0 | std::vector<SnapshotScheduleId> schedules_to_remove; |
2413 | 0 | auto se = ScopeExit([&schedules_to_remove, metadata]() { |
2414 | 0 | if (schedules_to_remove.empty()) { |
2415 | 0 | return; |
2416 | 0 | } |
2417 | 0 | bool any_removed = false; |
2418 | 0 | for (const auto& schedule_id : schedules_to_remove) { |
2419 | 0 | any_removed = metadata->RemoveSnapshotSchedule(schedule_id) || any_removed; |
2420 | 0 | } |
2421 | 0 | if (any_removed) { |
2422 | 0 | WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata"); |
2423 | 0 | } |
2424 | 0 | }); |
2425 | 0 | std::lock_guard<simple_spinlock> lock(snapshot_schedule_allowed_history_cutoff_mutex_); |
2426 | 0 | HybridTime result = HybridTime::kMax; |
2427 | 0 | for (const auto& schedule_id : schedules) { |
2428 | 0 | auto it = snapshot_schedule_allowed_history_cutoff_.find(schedule_id); |
2429 | 0 | if (it == snapshot_schedule_allowed_history_cutoff_.end()) { |
2430 | | // We don't know this schedule. |
2431 | 0 | auto emplace_result = missing_snapshot_schedules_.emplace( |
2432 | 0 | schedule_id, snapshot_schedules_version_); |
2433 | 0 | if (!emplace_result.second && |
2434 | 0 | emplace_result.first->second + 2 <= snapshot_schedules_version_) { |
2435 |  |         // We still don't know this schedule, and at least two heartbeat rounds have passed
2436 |  |         // since we first noticed that it was unknown.
2437 |  |         // That means the schedule was deleted.
2438 |  |         // One round is not enough, because the schedule could have been added after the
2439 |  |         // heartbeat was processed on the master but before the response was received on the TServer.
2440 | 0 | schedules_to_remove.push_back(schedule_id); |
2441 | 0 | continue; |
2442 | 0 | } |
2443 | 0 | return HybridTime::kMin; |
2444 | 0 | } |
2445 | 0 | if (!it->second) { |
2446 |  |       // The schedule does not have any snapshots yet.
2447 | 0 | return HybridTime::kMin; |
2448 | 0 | } |
2449 | 0 | result = std::min(result, it->second); |
2450 | 0 | } |
2451 | 0 | return result; |
2452 | 0 | } |
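 |  |
 |  | // The missing-schedule bookkeeping above amounts to: record the version at which a
 |  | // schedule was first seen missing, and declare it deleted only after the version has
 |  | // advanced by two or more heartbeat rounds. A minimal sketch (hypothetical names):
 |  | bool IsScheduleDeletedSketch(std::map<SnapshotScheduleId, int64_t>* missing,
 |  |                              const SnapshotScheduleId& id, int64_t current_version) {
 |  |   auto emplace_result = missing->emplace(id, current_version);
 |  |   // Deleted only if the schedule was already recorded missing two or more
 |  |   // versions ago; a single round could just be heartbeat/response skew.
 |  |   return !emplace_result.second && emplace_result.first->second + 2 <= current_version;
 |  | }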
2453 | | |
2454 | | Status DeleteTabletData(const RaftGroupMetadataPtr& meta, |
2455 | | TabletDataState data_state, |
2456 | | const string& uuid, |
2457 | | const yb::OpId& last_logged_opid, |
2458 | 47.6k | TSTabletManager* ts_manager) { |
2459 | 47.6k | const string& tablet_id = meta->raft_group_id(); |
2460 | 47.6k | const string kLogPrefix = LogPrefix(tablet_id, uuid); |
2461 | 47.6k | LOG(INFO) << kLogPrefix << "Deleting tablet data with delete state " |
2462 | 47.6k | << TabletDataState_Name(data_state); |
2463 | 0 | CHECK(data_state == TABLET_DATA_DELETED || |
2464 | 0 | data_state == TABLET_DATA_TOMBSTONED) |
2465 | 0 | << "Unexpected data_state to delete tablet " << meta->raft_group_id() << ": " |
2466 | 0 | << TabletDataState_Name(data_state) << " (" << data_state << ")"; |
2467 | | |
2468 | | // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid |
2469 | | // that was previously in the metadata. |
2470 | 47.6k | RETURN_NOT_OK(meta->DeleteTabletData(data_state, last_logged_opid)); |
2471 | 47.6k | LOG(INFO) << kLogPrefix << "Tablet deleted. Last logged OpId: " |
2472 | 47.6k | << meta->tombstone_last_logged_opid(); |
2473 | 47.6k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_blocks_deleted); |
2474 | | |
2475 | 47.6k | RETURN_NOT_OK(Log::DeleteOnDiskData( |
2476 | 47.6k | meta->fs_manager()->env(), meta->raft_group_id(), meta->wal_dir(), |
2477 | 47.6k | meta->fs_manager()->uuid())); |
2478 | 47.6k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_wal_deleted); |
2479 | | |
2480 | | // We do not delete the superblock or the consensus metadata when tombstoning |
2481 | | // a tablet. |
2482 | 47.6k | if (data_state == TABLET_DATA_TOMBSTONED) { |
2483 | 731 | return Status::OK(); |
2484 | 731 | } |
2485 | | |
2486 | | // Only TABLET_DATA_DELETED tablets get this far. |
2487 | 46.9k | RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(meta->fs_manager(), meta->raft_group_id())); |
2488 | 46.9k | MAYBE_FAULT(FLAGS_TEST_fault_crash_after_cmeta_deleted); |
2489 | | |
2490 | 46.9k | return Status::OK(); |
2491 | 46.9k | } |
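 |  |
 |  | // The two terminal states above differ in what survives: TABLET_DATA_TOMBSTONED
 |  | // removes the data and WAL but keeps the superblock and consensus metadata (so the
 |  | // peer can still vote and report its last-logged OpId), while TABLET_DATA_DELETED
 |  | // removes the consensus metadata as well. A sketch of a tombstoning call, with
 |  | // placeholder arguments:
 |  | //
 |  | //   RETURN_NOT_OK(DeleteTabletData(
 |  | //       meta, TABLET_DATA_TOMBSTONED, fs_manager->uuid(),
 |  | //       last_logged_opid,  // An unset OpId keeps the previously recorded value.
 |  | //       ts_tablet_manager));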
2492 | | |
2493 | | void LogAndTombstone(const RaftGroupMetadataPtr& meta, |
2494 | | const std::string& msg, |
2495 | | const std::string& uuid, |
2496 | | const Status& s, |
2497 | 3 | TSTabletManager* ts_manager) { |
2498 | 3 | const string& tablet_id = meta->raft_group_id(); |
2499 | 3 | const string kLogPrefix = LogPrefix(tablet_id, uuid); |
2500 | 3 | LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString(); |
2501 | | |
2502 | | // Tombstone the tablet when remote bootstrap fails. |
2503 | 3 | LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed remote bootstrap"; |
2504 | 3 | Status delete_status = DeleteTabletData(meta, |
2505 | 3 | TABLET_DATA_TOMBSTONED, |
2506 | 3 | uuid, |
2507 | 3 | yb::OpId(), |
2508 | 3 | ts_manager); |
2509 | | |
2510 | 3 | if (PREDICT_FALSE(FLAGS_TEST_sleep_after_tombstoning_tablet_secs > 0)) { |
2511 | | // We sleep here so that the test can verify that the state of the tablet is |
2512 | | // TABLET_DATA_TOMBSTONED. |
2513 | 0 | LOG(INFO) << "Sleeping after remote bootstrap failed"; |
2514 | 0 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_sleep_after_tombstoning_tablet_secs)); |
2515 | 0 | } |
2516 | | |
2517 | 3 | if (PREDICT_FALSE(!delete_status.ok())) { |
2518 |  |     // This failure should indicate either a bug or an IO error.
2519 | 0 | LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after remote bootstrap: " |
2520 | 0 | << delete_status.ToString(); |
2521 | 0 | } |
2522 | | |
2523 | | // Remove the child tracker if present. |
2524 | 3 | if (ts_manager != nullptr) { |
2525 | 2 | auto tracker = MemTracker::FindTracker( |
2526 | 2 | Format("tablet-$0", meta->raft_group_id()), ts_manager->server()->mem_tracker()); |
2527 | 2 | if (tracker) { |
2528 | 0 | tracker->UnregisterFromParent(); |
2529 | 0 | } |
2530 | 2 | } |
2531 | 3 | } |
2532 | | |
2533 | | TransitionInProgressDeleter::TransitionInProgressDeleter( |
2534 | | TransitionInProgressMap* map, std::mutex* mutex, const TabletId& tablet_id) |
2535 | 131k | : in_progress_(map), mutex_(mutex), tablet_id_(tablet_id) {} |
2536 | | |
2537 | 131k | TransitionInProgressDeleter::~TransitionInProgressDeleter() { |
2538 | 131k | std::string transition; |
2539 | 131k | { |
2540 | 131k | std::unique_lock<std::mutex> lock(*mutex_); |
2541 | 131k | const auto iter = in_progress_->find(tablet_id_); |
2542 | 131k | CHECK(iter != in_progress_->end()); |
2543 | 131k | transition = iter->second; |
2544 | 131k | in_progress_->erase(iter); |
2545 | 131k | } |
2546 | 131k | LOG(INFO) << "Deleted transition in progress " << transition |
2547 | 131k | << " for tablet " << tablet_id_; |
2548 | 131k | } |
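 |  |
 |  | // TransitionInProgressDeleter is an RAII guard: the map entry is inserted elsewhere
 |  | // when a transition starts, and the destructor above erases it under the same mutex,
 |  | // so the "in progress" marker cannot leak on any exit path. A generic sketch of the
 |  | // idiom (hypothetical names, not from this file):
 |  | class ScopedMapEntrySketch {
 |  |  public:
 |  |   ScopedMapEntrySketch(std::map<std::string, std::string>* map, std::mutex* mutex,
 |  |                        std::string key)
 |  |       : map_(map), mutex_(mutex), key_(std::move(key)) {}
 |  |   ~ScopedMapEntrySketch() {
 |  |     std::lock_guard<std::mutex> lock(*mutex_);
 |  |     map_->erase(key_);  // Guaranteed cleanup, even on early return or error.
 |  |   }
 |  |  private:
 |  |   std::map<std::string, std::string>* map_;
 |  |   std::mutex* mutex_;
 |  |   std::string key_;
 |  | };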
2549 | | |
2550 | | Status ShutdownAndTombstoneTabletPeerNotOk( |
2551 | | const Status& status, const tablet::TabletPeerPtr& tablet_peer, |
2552 | | const tablet::RaftGroupMetadataPtr& meta, const std::string& uuid, const char* msg, |
2553 | 995 | TSTabletManager* ts_tablet_manager) { |
2554 | 995 | if (status.ok()) { |
2555 | 994 | return status; |
2556 | 994 | } |
2557 |  |   // If shutdown was initiated by someone else, we should not wait for it to complete.
2558 | 1 | if (tablet_peer && tablet_peer->StartShutdown()) { |
2559 | 0 | tablet_peer->CompleteShutdown(); |
2560 | 0 | } |
2561 | 1 | tserver::LogAndTombstone(meta, msg, uuid, status, ts_tablet_manager); |
2562 | 1 | return status; |
2563 | 1 | } |
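 |  |
 |  | // StartShutdown() returning true means this caller initiated the shutdown and should
 |  | // therefore do the blocking CompleteShutdown(); callers that lose the race skip the
 |  | // wait. A simplified sketch of the idiom with an atomic flag (hypothetical):
 |  | //
 |  | //   std::atomic<bool> shutdown_started{false};
 |  | //   bool StartShutdown() {
 |  | //     // True only for the caller that flips false -> true.
 |  | //     return !shutdown_started.exchange(true);
 |  | //   }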
2564 | | |
2565 | | } // namespace tserver |
2566 | | } // namespace yb |